├── .env ├── .gitignore ├── .pylintrc ├── Justfile ├── LICENSE ├── README.md ├── dynamic-exp ├── best_hp.csv ├── dynamic_exp.sh └── parse_log_exmaple.py ├── hyper-params.md ├── lecarb ├── __init__.py ├── __main__.py ├── constants.py ├── dataset │ ├── dataset.py │ ├── gen_dataset.py │ └── manipulate_dataset.py ├── dtypes.py ├── estimator │ ├── bayesnet.py │ ├── deepdb │ │ ├── README.md │ │ ├── aqp_spn │ │ │ ├── __init__.py │ │ │ ├── aqp_leaves.py │ │ │ ├── aqp_spn.py │ │ │ ├── code_generation │ │ │ │ ├── convert_conditions.py │ │ │ │ ├── generate_code.py │ │ │ │ └── templates │ │ │ │ │ ├── categorical_leave.cpp │ │ │ │ │ ├── identity_leave.cpp │ │ │ │ │ ├── master.cpp │ │ │ │ │ ├── method_master.cpp │ │ │ │ │ ├── product_node.cpp │ │ │ │ │ ├── registration_master.cpp │ │ │ │ │ └── sum_node.cpp │ │ │ ├── custom_spflow │ │ │ │ ├── __init__.py │ │ │ │ ├── custom_learning.py │ │ │ │ ├── custom_structure_learning.py │ │ │ │ ├── custom_transform_structure.py │ │ │ │ ├── custom_validity.py │ │ │ │ └── utils.py │ │ │ ├── expectations.py │ │ │ ├── group_by_combination.py │ │ │ ├── ranges.py │ │ │ └── util │ │ │ │ ├── Graphics.py │ │ │ │ ├── __init__.py │ │ │ │ └── bloom_filter.py │ │ ├── data_preparation │ │ │ ├── __init__.py │ │ │ ├── join_data_preparation.py │ │ │ └── prepare_single_tables.py │ │ ├── deepdb.py │ │ ├── ensemble_compilation │ │ │ ├── __init__.py │ │ │ ├── graph_representation.py │ │ │ ├── physical_db.py │ │ │ ├── probabilistic_query.py │ │ │ ├── spn_ensemble.py │ │ │ └── utils.py │ │ ├── ensemble_creation │ │ │ ├── __init__.py │ │ │ ├── naive.py │ │ │ ├── rdc_based.py │ │ │ └── utils.py │ │ └── evaluation │ │ │ ├── aqp_evaluation.py │ │ │ ├── cardinality_evaluation.py │ │ │ ├── confidence_interval_evaluation.py │ │ │ ├── spn_statistics.py │ │ │ └── utils.py │ ├── estimator.py │ ├── feedback_kde.py │ ├── lw │ │ ├── README.md │ │ ├── common.py │ │ ├── lw_nn.py │ │ ├── lw_tree.py │ │ └── model.py │ ├── mhist.py │ ├── mscn │ │ ├── README.md │ │ ├── model.py │ │ └── mscn.py │ ├── mysql.py │ ├── naru │ │ ├── README.md │ │ ├── made.py │ │ ├── naru.py │ │ └── transformer.py │ ├── postgres.py │ ├── sample.py │ └── utils.py └── workload │ ├── __init__.py │ ├── dump_quicksel.py │ ├── gen_label.py │ ├── gen_workload.py │ ├── generator.py │ ├── merge_workload.py │ └── workload.py └── pyproject.toml /.env: -------------------------------------------------------------------------------- 1 | DATA_ROOT=data 2 | OUTPUT_ROOT=output 3 | # DATABASE_URL=postgres://card:card@localhost:6666/card 4 | DATABASE_URL=postgres://card:card@localhost:6667/card 5 | KDE_DATABASE_URL=postgres://card:card@localhost:5432/card 6 | 7 | CPU_NUM_THREADS=16 8 | OMP_NUM_THREADS=16 9 | OPENBLAS_NUM_THREADS=16 10 | MKL_NUM_THREADS=16 11 | VECLIB_MAXIMUM_THREADS=16 12 | NUMEXPR_NUM_THREADS=16 13 | 14 | PSQL=/usr/bin/psql 15 | KDE_PSQL=/usr/local/pgsql/bin/psql 16 | KDE_POSTGRES=/usr/local/pgsql/bin/postgres 17 | KDE_PG_DATA=/home/ubuntu/feedback-kde/data 18 | 19 | # MYSQL 20 | MYSQL=mysql 21 | MYSQL_HOST=localhost 22 | MYSQL_DB=card 23 | MYSQL_USER=root 24 | MYSQL_PSWD=card 25 | MYSQL_PORT=10235 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | figures/ 3 | .ipynb_checkpoints/ 4 | cluster/ 5 | nohup.out 6 | __pycache__/ 7 | result/ 8 | log/ 9 | model/ 10 | fmodels/ 11 | tmp/ 12 | tmp.* 13 | *.bk 14 | .vscode 15 | .mypy_cache 16 | .venv 17 | lecarb.egg-info 18 | *.tar 19 | poetry.lock 20 
| quicksel/ 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SFU Database Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Are We Ready For Learned Cardinality Estimation? 2 | 3 | **Our paper can be found at [arxiv](https://arxiv.org/abs/2012.06743) and [vldb](http://www.vldb.org/pvldb/vol14/p1640-wang.pdf).** 4 | 5 | ## Development Environment Setup 6 | 7 | Setup: 8 | * Install Just 9 | * MacOS: `brew install just` 10 | * Linux: `curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to /usr/local/bin` 11 | * Install Poetry: `pip install poetry` 12 | * Install Python dependencies: `just install-dependencies` 13 | 14 | We define all the commands used in this work in `Justfile`. Run `just -l` for a list of supported tasks. 15 | 16 | All the environmental configurations (e.g. data path, database configurations) are set in file `.env`. 17 | 18 | ## Dataset 19 | 20 | Download the real-world datasets and workloads from [here](https://www.dropbox.com/s/5bmvc1si5hysapf/data.tar.gz?dl=0). 21 | 22 | The path of the data is defined in `.env` as variable `DATA_ROOT`. We support dataset with different versions, typically a csv file is located at: `{DATA_ROOT}/{dataset name}/{version name}.csv`. 23 | 24 | We define the `Table` object, which contains both data, some commonly used statistics and functions for convenient usage. Please refer to `lecarb/dataset/dataset.py` for details. (Most of the methods in our repo take `Table` as the dataset input.) 25 | 26 | - Example: Given a csv file of census dataset (name: census13, version: original), generate the Table object 27 | ```bash 28 | # 1. convert csv file to pickle 29 | just csv2pkl data/census13/original.csv 30 | 31 | # 2. 
convert to Table object 32 | just pkl2table census13 original 33 | ``` 34 | 35 | - Example: Generate synthetic dataset with s=1.0, c=1.0, d=1000 (dataset name: dom1000, version: skew1.0_corr1.0) 36 | ```bash 37 | just data-gen 1.0 1.0 1000 38 | ``` 39 | To update (append to) the dataset, run a command in the following format: 40 | 41 | `just append-data-{update} {seed} {dataset} {version} {interval}`, 42 | 43 | where {update} can be chosen from `cor` and `skew`, {seed} is the random seed, {dataset} is the dataset name, {version} is the version of the data, and {interval} is between 0 and 1 and determines the ratio of data to be appended. 44 | 45 | 46 | 47 | - Example: Generate the appended dataset (name: census13, version: original) with correlated (update: cor) data: 48 | ``` 49 | just append-data-cor 123 census13 original 0.2 50 | ``` 51 | The appended data will be located at: `{DATA_ROOT}/{dataset name}/{version}+{version}_{update}_{interval}.pkl` 52 | 53 | ## Workload 54 | We adopt a unified workload generation framework to produce the synthetic queries used in all the experiments. Specifically, in our framework each query is generated through three steps: 55 | 56 | 1. Choose a set of attributes to place predicates on. 57 | 2. Select the query center for each predicate. 58 | 3. Determine the operator for each predicate (as well as widths for range predicates). 59 | 60 | We provide different implementations of each step in `lecarb/workload/generator.py` (function names start with `asf_`, `csf_` and `wsf_` respectively); users can also add customized implementations to the code for more variations. 61 | 62 | - Example: generate the workload used in the static experiment for the census dataset (workloads for the real-world datasets used in the paper are already provided [here](https://www.dropbox.com/s/5bmvc1si5hysapf/data.tar.gz?dl=0)) 63 | ```bash 64 | # generate workload for small datasets (labels are generated at the same time) 65 | just wkld-gen-base census13 original base 66 | 67 | # for large datasets, start 10 processes to generate the workload and then merge 68 | just wkld-gen-mth10 census13 original base 69 | just wkld-merge census13 original base 70 | rm data/census13/workload/base_[0-9]* 71 | ``` 72 | 73 | - Example: generate the workload for the synthetic dataset (name: dom1000, version: skew1.0_corr1.0) used in the paper; check [hyper-params.md](./hyper-params.md#preparation) for the whole preparation procedure (data generation and workload & label generation) of the micro-benchmark. 74 | ```bash 75 | # 1. generate workload (no labels generated) 76 | just wkld-vood dom1000 skew1.0_corr1.0 77 | 78 | # 2. generate labels 79 | just wkld-label dom1000 skew1.0_corr1.0 vood 80 | ``` 81 | 82 | ## Train & Test 83 | 84 | Training and test commands for all the estimators are defined in `Justfile`; for the hyper-parameters used and examples, please refer to [hyper-params.md](./hyper-params.md). 85 | 86 | Generated models are located at `{OUTPUT_ROOT}/model/{dataset name}/` and prediction results are at `{OUTPUT_ROOT}/result/{dataset name}/` in CSV format. 87 | 88 | Run `just report-error {output file name} {dataset name}` to see different error metrics of the **static** experiment result. 89 | 90 | ## Run Dynamic Experiments 91 | 92 | Code for reproducing the dynamic experiments is in `dynamic-exp/`, and the commands can be found in `Justfile`. 93 | 94 | (1) To run all dynamic experiments, run `bash dynamic-exp/dynamic_exp.sh`. It includes all commands for the dynamic experiments.
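The script redirects each estimator's output to `${log_path}/${dataset}/*.out` (with `log_path='log'` by default), so those log directories need to exist. A minimal sketch of a full run, assuming the default paths in `dynamic-exp/dynamic_exp.sh`:

```bash
# create the per-dataset log directories the script redirects into
mkdir -p log/{census13,forest10,power7,dmv11}
bash dynamic-exp/dynamic_exp.sh
```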
95 | 96 | - Example: to run the dynamic experiment for mscn on dataset 'census13', run the following command: 97 | 98 | ``` 99 | just dynamic-mscn-census13 census13 original base cor 0.2 10000 123 100 | ``` 101 | 102 | 'original' is the old version of census13. 'base' is the training workload generation method. 'cor' is the correlation change we consider for the data update. '0.2' is the appended data size (i.e. 20% of the 'original' data). '10000' is the size of the training workload. '123' is the random seed. 103 | 104 | (2) Run the following command to see different error metrics of the dynamic experiment results. 105 | 106 | `just report-error-dynamic {dataset} {stale model result file} {update model result file} {T} {model update time}`, 107 | 108 | where {model update time} can be extracted by parsing the log files. `dynamic-exp/parse_log_exmaple.py` provides example scripts for extracting {model update time}. 109 | 110 | For convenience, we put the hyperparameters of the different models in `dynamic-exp/best_hp.csv`. It copies the best hyperparameters we tested from [hyper-params.md](./hyper-params.md). 111 | 112 | ## Code References 113 | 114 | * Naru (including implementation of BayesNet): https://github.com/naru-project/naru 115 | * MSCN: https://github.com/andreaskipf/learnedcardinalities 116 | * DeepDB: https://github.com/DataManagementLab/deepdb-public 117 | * QuickSel: https://github.com/illinoisdata/quicksel 118 | * KDE-FB: https://github.com/martinkiefer/feedback-kde 119 | 120 | Our forked repos: 121 | * QuickSel: 122 | * Change: adding a new test class 123 | * Link: https://github.com/sfu-db/quicksel 124 | * KDE-FB: 125 | * Change: making the code support tables with <=15 columns (the original code is limited to <=10) 126 | * Link: https://github.com/sfu-db/feedback-kde 127 | -------------------------------------------------------------------------------- /dynamic-exp/best_hp.csv: -------------------------------------------------------------------------------- 1 | dataset_method,model 2 | census13_naru,"resmade_hid16,16,16,16_emb8_ep100_embedInembedOut_warm0" 3 | census13_mscn,mscn_hid8_sample500_ep100_bs256 4 | census13_chinn,chinn_hid64_64_64_bin200_ep500_bs128 5 | census13_chitree,chixgb_tr64_bin200 6 | census13_deepdb,spn_sample48842_rdc0.4_ms0.01 7 | forest10_naru,"resmade_hid64,64,64,64_emb8_ep100_embedInembedOut_warm4000" 8 | forest10_mscn,mscn_hid32_sample3000_ep100_bs256 9 | forest10_chinn,chinn_hid256_256_128_64_bin200_ep500_bs32 10 | forest10_chitree,chixgb_tr512_bin200 11 | forest10_deepdb,spn_sample581012_rdc0.4_ms0.005 12 | power7_naru,"resmade_hid128,128,128,128,128_emb16_ep100_embedInembedOut_warm4000" 13 | power7_mscn,mscn_hid64_sample5000_ep100_bs256 14 | power7_chinn,chinn_hid512_512_256_bin200_ep500_bs128 15 | power7_chitree,chixgb_tr256_bin200 16 | power7_deepdb,spn_sample2075259_rdc0.3_ms0.001 17 | dmv11_naru,"resmade_hid512,512,512,512_emb128_ep100_embedInembedOut_warm4000" 18 | dmv11_mscn,mscn_hid256_sample10000_ep100_bs256 19 | dmv11_chinn,chinn_hid2048_1024_512_256_bin200_ep500_bs32 20 | dmv11_chitree,chixgb_tr8192_bin200 21 | dmv11_deepdb,spn_sample1000000_rdc0.2_ms0.001 22 | -------------------------------------------------------------------------------- /dynamic-exp/dynamic_exp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #################### Dynamic exp 3 | ### Do not use 0 as the random seed, because our Postgres sets the random seed as 1/seed 4 | log_path='log' 5 | exp_num=1 6 | 
for (( i=1; i < 1+$exp_num; ++i )) 7 | do 8 | for dataset in 'census13' 'forest10' 'power7' 'dmv11' 9 | do 10 | for up in 'cor' #'skew' 11 | do 12 | ## MSCN 13 | just dynamic-mscn-${dataset} ${dataset} 'original' 'base' ${up} '0.2' '10000' "$i" >${log_path}/${dataset}/mscn_${up}-exp${i}.out 2>&1 14 | 15 | ## lw retrain 16 | just dynamic-lw-tree-${dataset}-retrain ${dataset} 'original' 'base' ${up} '0.2' '8000' "$i" >>${log_path}/${dataset}/lwtree_${up}-exp${i}.out 2>&1 17 | just dynamic-lw-nn-${dataset}-retrain ${dataset} 'original' 'base' ${up} '0.2' '16000' "$i" '500' >>${log_path}/${dataset}/lwnn_eq500_${up}-exp${i}.out 2>&1 18 | just dynamic-lw-nn-${dataset}-retrain ${dataset} 'original' 'base' ${up} '0.2' '16000' "$i" '100' >${log_path}/${dataset}/lwnn_eq100_${up}-exp${i}.out 2>&1 19 | 20 | ## Postgres 21 | just dynamic-postgres-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" >${log_path}/${dataset}/postgres_${up}-exp${i}.out 2>&1 22 | 23 | ## MySQL 24 | just dynamic-mysql-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" >${log_path}/${dataset}/mysql_${up}-exp${i}.out 2>&1 25 | 26 | ## Naru 27 | just dynamic-naru-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" '1' >>${log_path}/${dataset}/naru_eq1_${up}-exp${i}.out 2>&1 28 | just dynamic-naru-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" '7' >${log_path}/${dataset}/naru_eq7_${up}-exp${i}.out 2>&1 29 | just dynamic-naru-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" '15' >${log_path}/${dataset}/naru_eq15_${up}-exp${i}.out 2>&1 30 | ## QuickSel 31 | just dynamic-quicksel ${dataset} 'original' 'base' ${up} '0.2' "$i" >${log_path}/${dataset}/quicksel_${up}-exp${i}.out 2>&1 32 | 33 | ## DeepDB 34 | just dynamic-deepdb-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" >${log_path}/${dataset}/deepdb_${up}-exp${i}.out 2>&1 35 | done 36 | done 37 | done 38 | 39 | 40 | # epoch vs accuracy 41 | for (( i=1; i < 1+$exp_num; ++i )) 42 | do 43 | for dataset in 'census13' 'forest10' 'power7' 'dmv11' 44 | do 45 | for up in 'cor' 'skew' 46 | do 47 | ## lwNN 48 | for ep in '100' '200' '300' '400' '500' 49 | do 50 | just dynamic-lw-nn-${dataset}-retrain ${dataset} 'original' 'base' ${up} '0.2' '16000' "$i" $ep >${log_path}/${dataset}/lwnn_eq${ep}_${up}-exp${i}.out 2>&1 51 | done 52 | ## Naru 53 | for ep in '1' '5' '10' '15' '20' 54 | do 55 | just dynamic-naru-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" $ep >${log_path}/${dataset}/naru_eq${ep}_${up}-exp${i}.out 2>&1 56 | done 57 | done 58 | done 59 | done 60 | 61 | 62 | echo `date` "All Finished!" 
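### After the runs finish, the model update times can be pulled out of these logs with the
### helpers in dynamic-exp/parse_log_exmaple.py (e.g. get_lw_nn_training_time,
### get_postgres_time, get_mysql_time) and passed to `just report-error-dynamic`.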
63 | -------------------------------------------------------------------------------- /dynamic-exp/parse_log_exmaple.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from parse import * 4 | from datetime import datetime as dt 5 | 6 | # path hack 7 | sys.path.append(os.getcwd()) 8 | sys.path.append('..') 9 | 10 | TIME_FMT = '%Y-%m-%d %H:%M:%S,%f' 11 | 12 | def get_gen_query_time(logfile, training_size): 13 | '''return time_gen_train_query, time_gen_train_label''' 14 | t_tr_query = [[],[]] 15 | t_tr_label = [[],[]] 16 | # time_update_model = [[],[]] 17 | with open(logfile, 'r') as log_f: 18 | lines = log_f.readlines() 19 | for line in lines: 20 | line = line.strip() 21 | # parse time for training query update 22 | s_tr_query=parse("[{time} INFO] lecarb.workload.gen_workload: Start generate workload with {train_num:d} queries for train...", line) 23 | e_tr_query=parse("[{time} INFO] lecarb.workload.gen_workload: Start generate workload with {test_num:d} queries for valid...", line) 24 | if s_tr_query and s_tr_query['train_num'] == training_size: 25 | t_tr_query[0].append(dt.strptime(s_tr_query['time'], TIME_FMT)) 26 | if e_tr_query: 27 | t_tr_query[1].append(dt.strptime(e_tr_query['time'], TIME_FMT)) 28 | 29 | # parse time for training label update 30 | s_tr_label=parse("[{time} INFO] lecarb.workload.gen_label: Updating ground truth labels for the workload, with sample size {}...", line) 31 | e_tr_label=parse("[{time} INFO] lecarb.workload.gen_label: Dump labels to disk...", line) 32 | if s_tr_label: 33 | t_tr_label[0].append(dt.strptime(s_tr_label['time'], TIME_FMT)) 34 | if e_tr_label: 35 | t_tr_label[1].append(dt.strptime(e_tr_label['time'], TIME_FMT)) 36 | # print(t_tr_query, t_tr_label) 37 | 38 | time_gen_tr_query = 0 39 | time_gen_tr_label = 0 40 | if len(t_tr_query[0]) >= 1: 41 | time_gen_tr_query = (t_tr_query[1][0] - t_tr_query[0][0]).total_seconds() 42 | if len(t_tr_label[0]) >= 1: 43 | time_gen_tr_label = (t_tr_label[1][0] - t_tr_label[0][0]).total_seconds() 44 | return time_gen_tr_query, time_gen_tr_label 45 | 46 | def get_lw_nn_training_time(logfile): 47 | with open(logfile, 'r') as log_f: 48 | lines = log_f.readlines() 49 | for line in lines: 50 | line = line.strip() 51 | update_time=parse("[{} INFO] lecarb.estimator.lw.lw_nn: Training finished! 
Time spent since start: {train_time:f} mins", line) 52 | if update_time: 53 | return update_time['train_time'] 54 | return 0 55 | 56 | def get_postgres_time(logfile): 57 | with open(logfile, 'r') as logf: 58 | lines = logf.readlines() 59 | for line in lines: 60 | line = line.strip() 61 | # parse time for training query update 62 | update_time=parse("[{} INFO] lecarb.estimator.postgres: construct statistics finished, using {update_time:f} minutes, All statistics consumes {} MBs", line) 63 | if update_time: 64 | return update_time['update_time'] 65 | return 0 66 | 67 | def get_mysql_time(logfile): 68 | with open(logfile, 'r') as logf: 69 | lines = logf.readlines() 70 | for line in lines: 71 | line = line.strip() 72 | # parse time for training query update 73 | update_time=parse("[{} INFO] lecarb.estimator.mysql: construct statistics finished, using {update_time:f} minutes", line) 74 | if update_time: 75 | return update_time['update_time'] 76 | return 0 77 | -------------------------------------------------------------------------------- /hyper-params.md: -------------------------------------------------------------------------------- 1 | # Hyper-parameter Tuning 2 | 3 | This file contains the hyper-parameters we test and report in our paper (mapping to the parameters in Justfile commands). 4 | 5 | **NOTE:** Different hardwares may have different results, for example the max q-error on Census dataset of Naru (the same hyper-parameter and python & library version) on CPU machine is 66.0, P100 GPU machine (ComputeCanada Cedar) is 57.0 and K80 GPU machine (AWS p2.xlarge) is 58.0. The result we report for neural network methods are trained and tested on P100 GPU machine and others are on CPU. 6 | 7 | ## Static Environment 8 | 9 | ### Naru 10 | 11 | CMD: `train-naru` and `test-naru` 12 | 13 | Model Architectures: 14 | * Census 15 | * layers: 5, hc_hiddens: 16, embed_size: 8 16 | * layers: 4, hc_hiddens: 16, embed_size: 8 17 | * layers: 5, hc_hiddens: 32, embed_size: 4 18 | * layers: 4, hc_hiddens: 32, embed_size: 4 19 | * Forest 20 | * layers: 5, hc_hiddens: 32, embed_size: 8 21 | * layers: 4, hc_hiddens: 64, embed_size: 8 22 | * layers: 5, hc_hiddens: 64, embed_size: 4 23 | * layers: 4, hc_hiddens: 64, embed_size: 4 24 | * Power 25 | * layers: 5, hc_hiddens: 64, embed_size: 32 26 | * layers: 4, hc_hiddens: 64, embed_size: 32 27 | * layers: 5, hc_hiddens: 128, embed_size: 16 28 | * layers: 4, hc_hiddens: 128, embed_size: 16 29 | * DMV 30 | * layers: 5, hc_hiddens: 256, embed_size: 128 31 | * layers: 4, hc_hiddens: 512, embed_size: 128 32 | * layers: 5, hc_hiddens: 512, embed_size: 64 33 | * layers: 4, hc_hiddens: 512, embed_size: 64 34 | 35 | Others: 36 | * warmups: 0, 4000, 8000 37 | * epochs: 100 38 | * psample: 2000 39 | * we use natral order for all the dataset 40 | 41 | Selected Models: 42 | 43 | ```bash 44 | # census 45 | just train-naru census13 original 4 16 8 embed embed True 0 0 100 base 123 46 | just test-naru original-resmade_hid16,16,16,16_emb8_ep100_embedInembedOut_warm0-123 2000 census13 original base 123 47 | 48 | # forest 49 | just train-naru forest10 original 4 64 8 embed embed True 4000 0 100 base 123 50 | just test-naru original-resmade_hid64,64,64,64_emb8_ep100_embedInembedOut_warm4000-123 2000 forest10 original base 123 51 | 52 | # power 53 | just train-naru power7 original 5 128 16 embed embed True 4000 0 100 base 123 54 | just test-naru original-resmade_hid128,128,128,128,128_emb16_ep100_embedInembedOut_warm4000-123 2000 power7 original base 123 55 | 56 | # dmv 57 | just 
train-naru dmv11 original 4 512 128 embed embed True 4000 0 100 base 123 58 | just test-naru original-resmade_hid512,512,512,512_emb128_ep100_embedInembedOut_warm4000-123 2000 dmv11 original base 123 59 | ``` 60 | 61 | ### MSCN 62 | 63 | CMD: `train-mscn` and `test-mscn` 64 | 65 | Model Architectures: 66 | * Census 67 | * num_samples: 200, hid_units: 32 68 | * num_samples: 400, hid_units: 16 69 | * num_samples: 500, hid_units: 8 70 | * num_samples: 600, hid_units: 4 71 | * Forest 72 | * num_samples: 1000, hid_units: 64 73 | * num_samples: 3000, hid_units: 32 74 | * num_samples: 4000, hid_units: 16 75 | * num_samples: 5000, hid_units: 8 76 | * Power 77 | * num_samples: 1000, hid_units: 128 78 | * num_samples: 5000, hid_units: 64 79 | * num_samples: 9000, hid_units: 32 80 | * num_samples: 10000, hid_units: 16 81 | * DMV 82 | * num_samples: 1000, hid_units: 512 83 | * num_samples: 5000, hid_units: 512 84 | * num_samples: 8000, hid_units: 256 85 | * num_samples: 10000, hid_units: 256 86 | 87 | Others: 88 | * bs: 256, 512, 1024, 2048 89 | * epochs: 100 90 | 91 | Selected Models: 92 | 93 | ```bash 94 | # census 95 | just train-mscn census13 original base 500 8 100 256 100000 0 123 96 | just test-mscn original_base-mscn_hid8_sample500_ep100_bs256_100k-123 census13 original base 123 97 | 98 | # forest 99 | just train-mscn forest10 original base 3000 32 100 256 100000 0 123 100 | just test-mscn original_base-mscn_hid32_sample3000_ep100_bs256_100k-123 forest10 original base 123 101 | 102 | # power 103 | just train-mscn power7 original base 5000 64 100 256 100000 0 123 104 | just test-mscn original_base-mscn_hid64_sample5000_ep100_bs256_100k-123 power7 original base 123 105 | 106 | # dmv 107 | just train-mscn dmv11 original base 10000 256 100 256 100000 0 123 108 | just test-mscn original_base-mscn_hid256_sample10000_ep100_bs256_100k-123 dmv11 original base 123 109 | ``` 110 | 111 | ### LW-NN 112 | 113 | CMD: `train-lw-nn` and `test-lw-nn` 114 | 115 | Model Architectures: 116 | * Census 117 | * hid_units: 64_64_64_64 118 | * hid_units: 128_64_32_16 119 | * hid_units: 64_64_64 120 | * hid_units: 128_64_32 121 | * Forest 122 | * hid_units: 512_256 123 | * hid_units: 256_256_256 124 | * hid_units: 256_256_128_128 125 | * hid_units: 256_256_128_64 126 | * Power 127 | * hid_units: 512_512 128 | * hid_units: 512_256_128_64 129 | * hid_units: 256_256_256_256 130 | * hid_units: 512_512_256 131 | * DMV 132 | * hid_units: 2048_1024_512_256 133 | * hid_units: 1024_1024_1024_1024 134 | * hid_units: 2048_1024_1024 135 | * hid_units: 1024_1024_1024 136 | 137 | Others: 138 | * bs: 32, 128, 512 139 | * bins: 200 140 | * epochs: 500 141 | 142 | Selected Models: 143 | 144 | ```bash 145 | # census 146 | just train-lw-nn census13 original base 64_64_64 200 100000 128 0 123 147 | just test-lw-nn original_base-lwnn_hid64_64_64_bin200_ep500_bs128_100k-123 census13 original base True 123 148 | 149 | # forest 150 | just train-lw-nn forest10 original base 256_256_128_64 200 100000 32 0 123 151 | just test-lw-nn original_base-lwnn_hid256_256_128_64_bin200_ep500_bs32_100k-123 forest10 original base True 123 152 | 153 | # power 154 | just train-lw-nn power7 original base 512_512_256 200 100000 128 0 123 155 | just test-lw-nn original_base-lwnn_hid512_512_256_bin200_ep500_bs128_100k-123 power7 original base True 123 156 | 157 | # dmv 158 | just train-lw-nn dmv11 original base 2048_1024_512_256 200 100000 32 0 123 159 | just test-lw-nn original_base-lwnn_hid2048_1024_512_256_bin200_ep500_bs32_100k-123 dmv11 original base True 123 
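# Note: the first argument of test-lw-nn is the saved model name, which appears to encode
# the training configuration as {version}_{workload}-lwnn_hid{units}_bin{bins}_ep{epochs}_bs{batch size}_{train num}-{seed}
# (pattern inferred from the commands above; treat it as an assumption, not an official naming spec).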
160 | ``` 161 | 162 | ### LW-XGB 163 | 164 | CMD: `train-lw-tree` and `test-lw-tree` 165 | 166 | trees: 167 | * Census: 16, 32, 64 168 | * Forest: 128, 256, 512 169 | * Power: 256, 512, 1024 170 | * DMV: 2048, 4096, 8192 171 | 172 | Selected Models: 173 | 174 | ```bash 175 | # census 176 | just train-lw-tree census13 original base 64 200 100000 0 123 177 | just test-lw-tree original_base-lwxgb_tr64_bin200_100k-123 census13 original base True 123 178 | 179 | # forest 180 | just train-lw-tree forest10 original base 512 200 100000 0 123 181 | just test-lw-tree original_base-lwxgb_tr512_bin200_100k-123 forest10 original base True 123 182 | 183 | # power 184 | just train-lw-tree power7 original base 256 200 100000 0 123 185 | just test-lw-tree original_base-lwxgb_tr256_bin200_100k-123 power7 original base True 123 186 | 187 | # dmv 188 | just train-lw-tree dmv11 original base 8192 200 100000 0 123 189 | just test-lw-tree original_base-lwxgb_tr8192_bin200_100k-123 dmv11 original base True 123 190 | ``` 191 | 192 | ### DeepDB 193 | 194 | CMD: `train-deepdb` and `test-deepdb` 195 | 196 | Grid Search: 197 | * rdc_threshold: 0.2, 0.3, 0.4 198 | * ratio_min_instance_slice: 0.001, 0.005, 0.01, 0.05 199 | * hdf_sample_size: 1M, 10M 200 | 201 | Selected Models: 202 | 203 | ```bash 204 | # census 205 | just train-deepdb census13 original 1000000 0.4 0.01 0 base 123 206 | just test-deepdb original-spn_sample48842_rdc0.4_ms0.01-123 census13 original base 123 207 | 208 | # forest 209 | just train-deepdb forest10 original 1000000 0.4 0.005 0 base 123 210 | just test-deepdb original-spn_sample581012_rdc0.4_ms0.005-123 forest10 original base 123 211 | 212 | # power 213 | just train-deepdb power7 original 10000000 0.3 0.001 0 base 123 214 | just test-deepdb original-spn_sample2075259_rdc0.3_ms0.001-123 power7 original base 123 215 | 216 | # dmv 217 | just train-deepdb dmv11 original 1000000 0.2 0.001 0 base 123 218 | just test-deepdb original-spn_sample1000000_rdc0.2_ms0.001-123 dmv11 original base 123 219 | ``` 220 | 221 | ## Micro-Bencmark 222 | 223 | ### Preparation 224 | 225 | #### Data Generation 226 | 227 | CMD: `data-gen` 228 | * skew: 0.0, 0.2, 0.4, ..., 1.8, 2.0 229 | * corr: 0.0, 0.1, 0.2, ..., 0.9, 1.0 230 | * dom: 10, 100, 1000, 10000 231 | 232 | #### Workload Generation 233 | 234 | CMD: `wkld-gen-vood` and `wkld-label` 235 | 236 | Example: generate dataset and workload for dataset versions with 1000 domain values 237 | 238 | ```bash 239 | # 1. generate versions 240 | for c in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do 241 | for s in 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0; do 242 | just data-gen $s $c 1000 243 | done 244 | done 245 | 246 | # 2. generate queryset (can use any version to generate this workload since we use independent center values and the domains are the same) 247 | wkld-gen-vood dom1000 skew0.0_corr0.0 248 | 249 | # 3. generate labels for each version 250 | for c in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do 251 | for s in 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0; do 252 | just wkld-label dom1000 skew${s}_corr${c} vood 253 | done 254 | done 255 | ``` 256 | 257 | ### Model Parameters 258 | 259 | In this experiment, we train and test following models on every synthetic dataset (using `vood` workload) we generated. 
**Architecture used to report in paper** 260 | 261 | #### Naru 262 | 263 | CMD: `train-naru` and `test-naru` 264 | 265 | Model Architectures: 266 | * dom10 (domain size = 10) 267 | * layers: 5, hc_hiddens: 64, embed_size: 128 268 | * **layers: 4, hc_hiddens: 64, embed_size: 64** 269 | * layers: 5, hc_hiddens: 64, embed_size: 32 270 | * dom100 271 | * layers: 4, hc_hiddens: 32, embed_size: 128 272 | * layers: 5, hc_hiddens: 32, embed_size: 64 273 | * **layers: 5, hc_hiddens: 32, embed_size: 32** 274 | * dom1000 275 | * layers: 5, hc_hiddens: 16, embed_size: 16 276 | * layers: 5, hc_hiddens: 64, embed_size: 8 277 | * **layers: 4, hc_hiddens: 16, embed_size: 16** 278 | * dom10000 279 | * layers: 3, hc_hiddens: 64, embed_size: 2 280 | * **layers: 4, hc_hiddens: 32, embed_size: 2** 281 | * layers: 5, hc_hiddens: 32, embed_size: 2 282 | 283 | Others: 284 | * warmups: 0 285 | * epochs: 100 286 | 287 | #### MSCN 288 | 289 | CMD: `train-mscn` and `test-mscn` 290 | 291 | Model Architectures: 292 | * **num_samples: 1000, hid_units: 32** 293 | * num_samples: 3000, hid_units: 8 294 | * num_samples: 5000, hid_units: 4 295 | 296 | Others: 297 | * bs: 1024 298 | * epochs: 100 299 | * train_num: 100000 300 | 301 | #### LW-NN 302 | 303 | CMD: `train-lw-nn` and `test-lw-nn` 304 | 305 | Model Architectures: 306 | * hid_units: 256_128_64 307 | * hid_units: 128_128_128 308 | * **hid_units: 256_128_64_32** 309 | 310 | Others: 311 | * bs: 32 312 | * bins: 200 313 | * epochs: 500 314 | * train_num: 100000 315 | 316 | #### LW-Tree 317 | 318 | CMD: `train-lw-tree` and `test-lw-tree` 319 | 320 | * trees: 128 321 | * bins: 200 322 | * train_num: 100000 323 | 324 | #### DeepDB 325 | 326 | CMD: `train-deepdb` and `test-deepdb` 327 | 328 | * hdf_sample_size: 1000000 329 | * rdc_threshold: 0.3 330 | * ratio_min_instance_slice: 0.01 331 | -------------------------------------------------------------------------------- /lecarb/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging import getLogger 3 | 4 | logger = getLogger(__name__) 5 | logger.setLevel(logging.DEBUG) 6 | 7 | ch = logging.StreamHandler() 8 | ch.setLevel(logging.DEBUG) 9 | formatter = logging.Formatter("[{asctime} {levelname}] {name}: {message}", style="{") 10 | ch.setFormatter(formatter) 11 | logger.addHandler(ch) 12 | -------------------------------------------------------------------------------- /lecarb/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import torch 4 | 5 | DATA_ROOT = Path(os.environ["DATA_ROOT"]) 6 | OUTPUT_ROOT = Path(os.environ["OUTPUT_ROOT"]) 7 | MODEL_ROOT = OUTPUT_ROOT / "model" 8 | RESULT_ROOT = OUTPUT_ROOT / "result" 9 | LOG_ROOT = OUTPUT_ROOT / "log" 10 | 11 | DATABASE_URL = os.environ["DATABASE_URL"] 12 | KDE_DATABASE_URL = os.environ["KDE_DATABASE_URL"] 13 | MYSQL_HOST = os.environ["MYSQL_HOST"] 14 | MYSQL_PORT = os.environ["MYSQL_PORT"] 15 | MYSQL_DB = os.environ["MYSQL_DB"] 16 | MYSQL_USER = os.environ["MYSQL_USER"] 17 | MYSQL_PSWD = os.environ["MYSQL_PSWD"] 18 | 19 | PKL_PROTO = 4 20 | 21 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 22 | NUM_THREADS = int(os.environ.get("CPU_NUM_THREADS", os.cpu_count())) 23 | 24 | VALID_NUM_DATA_DRIVEN = 100 25 | -------------------------------------------------------------------------------- /lecarb/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | import os 
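# dataset.py defines the Column and Table abstractions used throughout the benchmark:
# a Column records the vocabulary, min/max and NaN information of one attribute, and a
# Table wraps the pickled dataframe together with per-column statistics and the
# discretization / normalization helpers.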
2 | import copy 3 | import logging 4 | import pickle 5 | from collections import OrderedDict 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.metrics import mutual_info_score 10 | from scipy.stats import entropy 11 | 12 | from ..constants import DATA_ROOT, PKL_PROTO 13 | from ..dtypes import is_categorical 14 | 15 | L = logging.getLogger(__name__) 16 | 17 | class Column(object): 18 | def __init__(self, name, data): 19 | self.name = name 20 | self.dtype = data.dtype 21 | 22 | # parse vocabulary 23 | self.vocab, self.has_nan = self.__parse_vocab(data) 24 | self.vocab_size = len(self.vocab) 25 | self.minval = self.vocab[1] if self.has_nan else self.vocab[0] 26 | self.maxval = self.vocab[-1] 27 | 28 | def __repr__(self): 29 | return f'Column({self.name}, type={self.dtype}, vocab size={self.vocab_size}, min={self.minval}, max={self.maxval}, has NaN={self.has_nan})' 30 | 31 | def __parse_vocab(self, data): 32 | # pd.isnull returns true for both np.nan and np.datetime64('NaT'). 33 | is_nan = pd.isnull(data) 34 | contains_nan = np.any(is_nan) 35 | # NOTE: np.sort puts NaT values at beginning, and NaN values at end. 36 | # For our purposes we always add any null value to the beginning. 37 | vs = np.sort(np.unique(data[~is_nan])) 38 | if contains_nan: 39 | vs = np.insert(vs, 0, np.nan) 40 | return vs, contains_nan 41 | 42 | def discretize(self, data): 43 | """Transforms data values into integers using a Column's vocabulary""" 44 | 45 | # pd.Categorical() does not allow categories be passed in an array 46 | # containing np.nan. It makes it a special case to return code -1 47 | # for NaN values. 48 | if self.has_nan: 49 | bin_ids = pd.Categorical(data, categories=self.vocab[1:]).codes 50 | # Since nan/nat bin_id is supposed to be 0 but pandas returns -1, just 51 | # add 1 to everybody 52 | bin_ids = bin_ids + 1 53 | else: 54 | # This column has no nan or nat values. 
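# pd.Categorical(...).codes maps each value to its index in self.vocab, so the
# resulting bin ids are dense integers in [0, vocab_size); any value missing from
# the vocabulary would get code -1 and trip the assertion below.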
55 | bin_ids = pd.Categorical(data, categories=self.vocab).codes 56 | 57 | bin_ids = bin_ids.astype(np.int32, copy=False) 58 | assert (bin_ids >= 0).all(), (self, data, bin_ids) 59 | return bin_ids 60 | 61 | def normalize(self, data): 62 | """Normalize data to range [0, 1]""" 63 | minval = self.minval 64 | maxval = self.maxval 65 | # if column is not numerical, use descretized value 66 | if is_categorical(self.dtype): 67 | data = self.discretize(data) 68 | minval = 0 69 | maxval = self.vocab_size - 1 70 | data = np.array(data, dtype=np.float32) 71 | if minval >= maxval: 72 | L.warning(f"column {self.name} has min value {minval} >= max value{maxval}") 73 | return np.zeros(len(data)).astype(np.float32) 74 | val_norm = (data - minval) / (maxval - minval) 75 | return val_norm.astype(np.float32) 76 | 77 | class Table(object): 78 | def __init__(self, dataset, version): 79 | self.dataset = dataset 80 | self.version = version 81 | self.name = f"{self.dataset}_{self.version}" 82 | L.info(f"start building data {self.name}...") 83 | 84 | # load data 85 | self.data = pd.read_pickle(DATA_ROOT / self.dataset / f"{self.version}.pkl") 86 | self.data_size_mb = self.data.values.nbytes / 1024 / 1024 87 | self.row_num = self.data.shape[0] 88 | self.col_num = len(self.data.columns) 89 | 90 | # parse columns 91 | self.parse_columns() 92 | L.info(f"build finished: {self}") 93 | 94 | def parse_columns(self): 95 | self.columns = OrderedDict([(col, Column(col, self.data[col])) for col in self.data.columns]) 96 | 97 | def __repr__(self): 98 | return f"Table {self.name} ({self.row_num} rows, {self.data_size_mb:.2f}MB, columns:\n{os.linesep.join([repr(c) for c in self.columns.values()])})" 99 | 100 | def get_minmax_dict(self): 101 | minmax_dict = {} 102 | for i, col in enumerate(self.columns.values()): 103 | minmax_dict[i] = (col.minval, col.maxval) 104 | return minmax_dict 105 | 106 | def normalize(self, scale=1): 107 | data = copy.deepcopy(self.data) 108 | for cname, col in self.columns.items(): 109 | data[cname] = col.normalize(data[cname].values) * scale 110 | return data 111 | 112 | def digitalize(self): 113 | data = copy.deepcopy(self.data) 114 | for cname, col in self.columns.items(): 115 | if is_categorical(col.dtype): 116 | data[cname] = col.discretize(data[cname]) 117 | elif col.has_nan: 118 | data[cname].fillna(0, inplace=True) 119 | return data 120 | 121 | def get_max_muteinfo_order(self): 122 | order = [] 123 | 124 | # find the first column with maximum entropy 125 | max_entropy = float('-inf') 126 | first_col = None 127 | for c in self.columns.keys(): 128 | e = entropy(self.data[c].value_counts()) 129 | if e > max_entropy: 130 | first_col = c 131 | max_entropy = e 132 | assert first_col is not None, (first_col, max_entropy) 133 | order.append(first_col) 134 | sep = '|' 135 | chosen_data = self.data[first_col].astype(str) + sep 136 | 137 | # add the rest columns one by one by choosing the max mutual information with existing columns 138 | while len(order) < self.col_num: 139 | max_muinfo = float('-inf') 140 | next_col = None 141 | for c in self.columns.keys(): 142 | if c in order: continue 143 | m = mutual_info_score(chosen_data, self.data[c]) 144 | if m > max_muinfo: 145 | next_col = c 146 | max_muinfo = m 147 | assert next_col is not None, (next_col, max_entropy) 148 | order.append(next_col) 149 | # concate new chosen columns 150 | chosen_data = chosen_data + sep + self.data[next_col].astype(str) 151 | 152 | return order, [self.data.columns.get_loc(c) for c in order] 153 | 154 | def get_muteinfo(self, 
digital_data=None): 155 | data = digital_data if digital_data is not None else self.digitalize() 156 | muteinfo_dict = {} 157 | for c1 in self.columns.keys(): 158 | muteinfo_dict[c1] = {} 159 | for c2 in self.columns.keys(): 160 | if c1 != c2 and c2 in muteinfo_dict: 161 | assert c1 in muteinfo_dict[c2], muteinfo_dict.keys() 162 | muteinfo_dict[c1][c2] = muteinfo_dict[c2][c1] 163 | else: 164 | muteinfo_dict[c1][c2] = mutual_info_score(data[c1], data[c2]) 165 | return pd.DataFrame().from_dict(muteinfo_dict) 166 | 167 | def dump_table(table: Table) -> None: 168 | with open(DATA_ROOT / table.dataset / f"{table.version}.table.pkl", 'wb') as f: 169 | pickle.dump(table, f, protocol=PKL_PROTO) 170 | 171 | def load_table(dataset: str, version: str, overwrite: bool=False) -> Table: 172 | table_path = DATA_ROOT / dataset / f"{version}.table.pkl" 173 | 174 | if not overwrite and table_path.is_file(): 175 | L.info("table exists, load...") 176 | with open(table_path, 'rb') as f: 177 | table = pickle.load(f) 178 | L.info(f"load finished: {table}") 179 | return table 180 | 181 | table = Table(dataset, version) 182 | L.info("dump table to disk...") 183 | dump_table(table) 184 | return table 185 | 186 | def dump_table_to_num(dataset: str, version: str) -> None: 187 | table = load_table(dataset, version) 188 | num_data = table.digitalize() 189 | csv_path = DATA_ROOT / dataset / f"{version}_num.csv" 190 | L.info(f"dump csv file to {csv_path}") 191 | num_data.to_csv(csv_path, index=False) 192 | 193 | 194 | if __name__ == '__main__': 195 | # table = load_table('forest') 196 | # print(table.get_max_muteinfo_order()) 197 | # 7 1 8 6 5 9 0 4 3 2 198 | 199 | # table = load_table('census') 200 | # print(table.get_max_muteinfo_order()) 201 | # 4 3 2 0 6 12 7 5 1 13 9 10 8 11 202 | 203 | table = Table('census', 'original') 204 | print(table) 205 | # print(table.get_max_muteinfo_order()) 206 | # 4 0 1 2 3 5 8 7 6 207 | -------------------------------------------------------------------------------- /lecarb/dataset/gen_dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | import numpy as np 4 | import pandas as pd 5 | from scipy.stats import truncnorm, truncexpon, genpareto 6 | from typing import Dict, Any 7 | 8 | from .dataset import load_table 9 | from ..constants import DATA_ROOT 10 | 11 | L = logging.getLogger(__name__) 12 | 13 | def get_truncated_normal(mean=0, sd=100, low=0, upp=1000): 14 | return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd) 15 | 16 | def get_truncated_expon(scale=100, low=0, upp=1000): 17 | return truncexpon(b=(upp-low)/scale, loc=low, scale=scale) 18 | 19 | def generate_dataset( 20 | seed: int, dataset: str, version: str, 21 | params: Dict[str, Any], overwrite: bool 22 | ) -> None: 23 | path = DATA_ROOT / dataset 24 | path.mkdir(exist_ok=True) 25 | csv_path = path / f"{version}.csv" 26 | pkl_path = path / f"{version}.pkl" 27 | if not overwrite and csv_path.is_file(): 28 | L.info(f"Dataset path exists, do not continue") 29 | return 30 | 31 | row_num = params['row_num'] 32 | col_num = params['col_num'] 33 | dom = params['dom'] 34 | corr = params['corr'] 35 | skew = params['skew'] 36 | 37 | if col_num != 2: 38 | L.info("For now only support col=2!") 39 | exit(0) 40 | 41 | L.info(f"Start generate dataset with {col_num} columns and {row_num} rows using seed {seed}") 42 | random.seed(seed) 43 | np.random.seed(seed) 44 | 45 | # generate the first column according to skew 46 | col0 = np.arange(dom) # 
make sure every domain value has at least 1 value 47 | tmp = genpareto.rvs(skew-1, size=row_num-len(col0)) # c = skew - 1, so we can have c >= 0 48 | tmp = ((tmp - tmp.min()) / (tmp.max() - tmp.min())) * dom # rescale generated data to the range of domain 49 | col0 = np.concatenate((col0, np.clip(tmp.astype(int), 0, dom-1))) 50 | 51 | # generate the second column according to the first 52 | col1 = [] 53 | for c0 in col0: 54 | col1.append(c0 if np.random.uniform(0, 1) <= corr else np.random.choice(dom)) 55 | 56 | df = pd.DataFrame(data={'col0': col0, 'col1': col1}) 57 | 58 | L.info(f"Dump dataset {dataset} as version {version} to disk") 59 | df.to_csv(csv_path, index=False) 60 | df.to_pickle(pkl_path) 61 | load_table(dataset, version) 62 | L.info(f"Finish!") 63 | 64 | -------------------------------------------------------------------------------- /lecarb/dataset/manipulate_dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | import pickle 4 | import numpy as np 5 | import math 6 | import pandas as pd 7 | from scipy.stats import truncnorm, truncexpon, genpareto 8 | from typing import Dict, Any, Tuple 9 | from copy import deepcopy 10 | 11 | from .dataset import load_table 12 | from ..constants import DATA_ROOT, PKL_PROTO 13 | 14 | L = logging.getLogger(__name__) 15 | 16 | # Independence data: Random by each column 17 | def get_random_data(dataset: str, version: str, overwrite=False) -> Tuple[pd.DataFrame, str]: 18 | rand_version = f"{version}_ind" 19 | random_file = DATA_ROOT / dataset / f"{rand_version}.pkl" 20 | if not overwrite and random_file.is_file(): 21 | L.info(f"Dataset path exists, using it") 22 | return pd.read_pickle(random_file), rand_version 23 | 24 | df = pd.read_pickle(DATA_ROOT / dataset / f"{version}.pkl") 25 | for col in df.columns: 26 | df[col] = df[col].sample(frac=1).reset_index(drop=True) 27 | pd.to_pickle(df, random_file, protocol=PKL_PROTO) 28 | return df, rand_version 29 | 30 | # Max Spearman correlation data: sort by each column 31 | def get_sorted_data(dataset: str, version: str, overwrite=False) -> Tuple[pd.DataFrame, str]: 32 | sort_version = f"{version}_cor" 33 | sorted_file = DATA_ROOT / dataset / f"{sort_version}.pkl" 34 | if not overwrite and sorted_file.is_file(): 35 | return pd.read_pickle(sorted_file), sort_version 36 | 37 | df = pd.read_pickle(DATA_ROOT / dataset / f"{version}.pkl") 38 | for col in df.columns: 39 | df[col] = df[col].sort_values().reset_index(drop=True) 40 | df = df.sample(frac=1).reset_index(drop=True) 41 | pd.to_pickle(df, sorted_file, protocol=PKL_PROTO) 42 | return df, sort_version 43 | 44 | # Get skew data by tuple level frequent rank. 
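# Each row is scored by the sum of its per-column value frequencies; the `sample_ratio`
# fraction of rows with the lowest scores (i.e. rows made up of rare values) is selected
# and replicated back up to the original table size, yielding a heavily skewed version
# of the data.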
45 | def get_skew_data(dataset: str = 'census', version: str = 'original', sample_ratio=0.0005, overwrite=False) -> Tuple[pd.DataFrame, str]: 46 | skew_version = f"{version}_skew" 47 | skew_file = DATA_ROOT / dataset / f"{skew_version}.pkl" 48 | if not overwrite and skew_file.is_file(): 49 | return pd.read_pickle(skew_file), skew_version 50 | 51 | df = pd.read_pickle(DATA_ROOT / dataset / f"{version}.pkl") 52 | 53 | 54 | rank_df = pd.DataFrame(0.0, index=range(len(df)), columns=['rank_sum']).astype(np.float32) 55 | for col in df.columns: 56 | rank_df['rank_sum'] += df[col].map(df[col].value_counts().div(len(rank_df))).astype(np.float32) 57 | print(f"{col} frequency calculation finished!") 58 | selected_id = rank_df.sort_values(by='rank_sum').head(round(len(df)*sample_ratio)).index 59 | sk_df = df.iloc[selected_id] 60 | sk_df = pd.concat([sk_df] * int(1/sample_ratio + 1), ignore_index=True).head(len(df)) 61 | pd.to_pickle(sk_df, skew_file, protocol=PKL_PROTO) 62 | return sk_df, skew_version 63 | 64 | 65 | 66 | def append_data(dataset: str, version_target: str, version_from: str, interval=0.2): 67 | df_target = pd.read_pickle(DATA_ROOT / dataset / f"{version_target}.pkl") 68 | df_from = pd.read_pickle(DATA_ROOT / dataset / f"{version_from}.pkl") 69 | 70 | row_num = len(df_from) 71 | l = 0 72 | r = l + interval 73 | if r <= 1: 74 | L.info(f"Start appending {version_target} with {version_from} in [{l}, {r}]") 75 | df_target = df_target.append(df_from[int(l*row_num): int(r*row_num)], ignore_index=True, sort=False) 76 | pd.to_pickle(df_target, DATA_ROOT / dataset / f"{version_target}+{version_from}_{r:.1f}.pkl") 77 | df_target.to_csv(DATA_ROOT / dataset / f"{version_target}+{version_from}_{r:.1f}.csv", index=False) 78 | load_table(dataset, f"{version_target}+{version_from}_{r:.1f}") 79 | else: 80 | L.info(f"Appending Fail! 
Batch size is too big!") 81 | 82 | 83 | 84 | def gen_appended_dataset( 85 | seed: int, dataset: str, version: str, 86 | params: Dict[str, Any], overwrite: bool 87 | ) -> None: 88 | random.seed(seed) 89 | np.random.seed(seed) 90 | update_type = params.get('type') 91 | batch_ratio = params.get('batch_ratio') 92 | L.info(f"Start generating appended data for {dataset}/{version}") 93 | 94 | if update_type == 'ind': 95 | _, rand_version = get_random_data(dataset, version, overwrite=overwrite) 96 | append_data(dataset, version, rand_version, interval=batch_ratio) 97 | elif update_type == 'cor': 98 | _, sort_version = get_sorted_data(dataset, version, overwrite=overwrite) 99 | append_data(dataset, version, sort_version, interval=batch_ratio) 100 | elif update_type == 'skew': 101 | _, skew_version = get_skew_data(dataset, version, 102 | sample_ratio=float(params['skew_size']), overwrite=overwrite) 103 | append_data(dataset, version, skew_version, interval=batch_ratio) 104 | else: 105 | raise NotImplementedError 106 | L.info("Finish updating data!") 107 | 108 | 109 | -------------------------------------------------------------------------------- /lecarb/dtypes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for auxiliary type detection functions 3 | """ 4 | 5 | from typing import Any 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | CATEGORICAL_NUMPY_DTYPES = [np.bool, np.object] 11 | CATEGORICAL_PANDAS_DTYPES = [pd.CategoricalDtype, pd.PeriodDtype] 12 | CATEGORICAL_DTYPES = CATEGORICAL_NUMPY_DTYPES + CATEGORICAL_PANDAS_DTYPES 13 | 14 | NUMERICAL_NUMPY_DTYPES = [np.number, np.datetime64] 15 | NUMERICAL_PANDAS_DTYPES = [pd.DatetimeTZDtype] 16 | NUMERICAL_DTYPES = NUMERICAL_NUMPY_DTYPES + NUMERICAL_PANDAS_DTYPES 17 | 18 | 19 | def is_categorical(dtype: Any) -> bool: 20 | """ 21 | Given a type, return if that type is a categorical type 22 | """ 23 | 24 | if is_numerical(dtype): 25 | return False 26 | 27 | if isinstance(dtype, np.dtype): 28 | dtype = dtype.type 29 | 30 | return any(issubclass(dtype, c) for c in CATEGORICAL_NUMPY_DTYPES) 31 | else: 32 | return any(isinstance(dtype, c) for c in CATEGORICAL_PANDAS_DTYPES) 33 | 34 | 35 | def is_numerical(dtype: Any) -> bool: 36 | """ 37 | Given a type, return if that type is a numerical type 38 | """ 39 | if isinstance(dtype, np.dtype): 40 | dtype = dtype.type 41 | return any(issubclass(dtype, c) for c in NUMERICAL_NUMPY_DTYPES) 42 | else: 43 | return any(isinstance(dtype, c) for c in NUMERICAL_PANDAS_DTYPES) 44 | 45 | def is_discrete(dtype: Any) -> bool: 46 | """ 47 | Given a type, return if that type is a discrete type (categorical or integer) 48 | """ 49 | if is_categorical(dtype): 50 | return True 51 | 52 | assert isinstance(dtype, np.dtype), dtype 53 | dtype = dtype.type 54 | return issubclass(dtype, np.integer) 55 | 56 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/README.md: -------------------------------------------------------------------------------- 1 | Paper: [DeepDB: Learn from Data, not from Queries!](http://www.vldb.org/pvldb/vol13/p992-hilprecht.pdf) 2 | Code Reference: [repo](https://github.com/DataManagementLab/deepdb-public) 3 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/aqp_spn/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/convert_conditions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from spn.structure.StatisticalTypes import MetaType 3 | 4 | 5 | def _convert_range(range, pos): 6 | if range[pos] == -np.inf or range[pos] == np.inf: 7 | minusInf = True 8 | condition = 0 9 | else: 10 | minusInf = False 11 | condition = range[pos] 12 | return minusInf, condition 13 | 14 | 15 | def _convert_real(idx, condition, inverted_features): 16 | # method_params += [f'bool inverse{i}', f'bool leftMinusInf{i}', f'float leftCondition{i}', 17 | # f'bool rightMinusInf{i}', f'float rightCondition{i}', f'bool leftIncluded{i}', 18 | # f'bool rightIncluded{i}', f'float nullValue{i}'] 19 | 20 | inverse = idx in inverted_features 21 | if condition is not None: 22 | leftMinusInf, leftCondition = _convert_range(condition.ranges[0], 0) 23 | rightMinusInf, rightCondition = _convert_range(condition.ranges[0], 1) 24 | return inverse, leftMinusInf, leftCondition, rightMinusInf, rightCondition, condition.inclusive_intervals[0][0], \ 25 | condition.inclusive_intervals[0][1], condition.null_value 26 | 27 | return inverse, False, 0, False, 0, False, False, 0 28 | 29 | 30 | def _convert_categorical(condition): 31 | # method_params += [f'vector possibleValues{i}', f'int nullValueIdx{i}'] 32 | 33 | if condition is not None: 34 | if condition.is_not_null_condition: 35 | return condition.possible_values, condition.null_value 36 | else: 37 | return condition.possible_values, -1 38 | 39 | # leaves will anyway not be evaluated 40 | return [0], 0 41 | 42 | 43 | def convert_range(relevant_scope, featureScope, meta_types, conditions, inverted_features): 44 | """ 45 | Translates conditions for an expectation method call into parameters that can be passed to generated SPN code. 46 | :param relevant_scope: relevant_scope from expectation method 47 | :param featureScope: feature_scope from expectation method 48 | :param meta_types: types of the columns of the SPN 49 | :param conditions: conditions to be translated 50 | :param inverted_features: list indicating which indexes are inverted features (1/x) 51 | :return: Boolean indicating whether inference is supported by generated SPN. Parameters that have to be passed. 
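One parameter group is appended per column: DISCRETE columns contribute the list of
possible value indexes plus a null-value index, while REAL columns contribute the
inverted-feature flag, the left/right bounds with their inclusion and +/-infinity
flags, and the null value, mirroring the signature of the generated C++ method.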
52 | """ 53 | parameters = (relevant_scope, featureScope) 54 | 55 | for idx, condition in enumerate(conditions): 56 | if meta_types[idx] == MetaType.DISCRETE: 57 | parameters += _convert_categorical(condition) 58 | elif meta_types[idx] == MetaType.REAL: 59 | # several conditions currently not supported 60 | if condition is not None and len(condition.ranges) > 1: 61 | return False, None 62 | # conditions on feature column currently not supported in C++ 63 | if featureScope[idx] is None: 64 | return False, None 65 | parameters += _convert_real(idx, condition, inverted_features) 66 | 67 | return True, parameters -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/generate_code.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from enum import Enum 3 | from time import perf_counter 4 | 5 | import numpy as np 6 | from spn.structure.Base import assign_ids, Product, get_number_of_nodes 7 | from spn.structure.StatisticalTypes import MetaType 8 | 9 | from aqp_spn.aqp_leaves import Categorical, IdentityNumericLeaf, Sum 10 | from ensemble_compilation.spn_ensemble import read_ensemble 11 | 12 | import os 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class TemplatePath(Enum): 18 | current_file_path = __file__ 19 | current_file_dir = os.path.dirname(__file__) 20 | MASTER = os.path.join(current_file_dir, 'templates/master.cpp') 21 | CATEGORICAL = os.path.join(current_file_dir, 'templates/categorical_leave.cpp') 22 | IDENTITY = os.path.join(current_file_dir, 'templates/identity_leave.cpp') 23 | PRODUCT = os.path.join(current_file_dir, 'templates/product_node.cpp') 24 | SUM = os.path.join(current_file_dir, 'templates/sum_node.cpp') 25 | METHOD_MASTER = os.path.join(current_file_dir, 'templates/method_master.cpp') 26 | REGISTRATION_MASTER = os.path.join(current_file_dir, 'templates/registration_master.cpp') 27 | 28 | 29 | def replace_template(template_path, value_dictionary, depth): 30 | with open(template_path.value, 'r') as ftemp: 31 | templateString = ftemp.read() 32 | 33 | code_string = templateString.format(**value_dictionary) 34 | padding = ''.join([' '] * depth) 35 | return ''.join([padding + line for line in code_string.splitlines(True)]) 36 | 37 | 38 | def comma_seperated_list(value_list): 39 | return ', '.join([str(v) for v in value_list]) 40 | 41 | 42 | def generate_scope_check(scope): 43 | return ' || '.join([f'relevantScope[{node_scope}]' for node_scope in scope]) 44 | 45 | 46 | def generate_categorical_node(node, root_node, floating_data_type, depth): 47 | value_dictionary = { 48 | 'node_id': node.id, 49 | 'node_scope': node.scope[0], 50 | 'node_p': comma_seperated_list(node.p), 51 | 'final_assert': f'resultValue = nodeIntermediateResult[{node.id}];' if root_node == node else '', 52 | 'floating_data_type': floating_data_type 53 | } 54 | return replace_template(TemplatePath.CATEGORICAL, value_dictionary, depth) 55 | 56 | 57 | def nan_replacement(value): 58 | if np.isnan(value): 59 | return 0 60 | else: 61 | return value 62 | 63 | 64 | def generate_identity_node(node, root_node, floating_data_type, depth): 65 | value_dictionary = { 66 | 'node_id': node.id, 67 | 'node_scope': node.scope[0], 68 | 'null_value_prob': node.null_value_prob, 69 | 'unique_values': comma_seperated_list(node.unique_vals), 70 | 'prob_sum': comma_seperated_list(node.prob_sum), 71 | 'mean': nan_replacement(node.mean * (1 - node.null_value_prob)), 72 | 'inverted_mean': 
nan_replacement(node.inverted_mean * (1 - node.null_value_prob)), 73 | 'floating_data_type': floating_data_type, 74 | 'final_assert': f'resultValue = nodeIntermediateResult[{node.id}];' if root_node == node else '' 75 | } 76 | return replace_template(TemplatePath.IDENTITY, value_dictionary, depth) 77 | 78 | 79 | def generate_product_node(node, root_node, floating_data_type, depth): 80 | # if ({scope_check}) {{ 81 | # {subtree_code} 82 | # nodeIntermediateResult[{node_id}] = 1.0 83 | # {result_calculation} 84 | # }} 85 | 86 | result_calculation_lines = [] 87 | for child in node.children: 88 | result_calculation_lines += [f'if ({generate_scope_check(child.scope)}) ' 89 | f'{{nodeIntermediateResult[{node.id}] *= nodeIntermediateResult[{child.id}];}}'] 90 | 91 | value_dictionary = { 92 | 'node_id': node.id, 93 | 'scope_check': generate_scope_check(node.scope), 94 | 'subtree_code': '\n'.join( 95 | [generate_method_body(child, root_node, floating_data_type, depth) for child in node.children]), 96 | 'result_calculation': '\n '.join(result_calculation_lines), 97 | 'final_assert': f'resultValue = nodeIntermediateResult[{node.id}];' if root_node == node else '' 98 | } 99 | return replace_template(TemplatePath.PRODUCT, value_dictionary, depth) 100 | 101 | 102 | def generate_sum_node(node, root_node, floating_data_type, depth): 103 | # if ({scope_check}) {{ 104 | # {subtree_code} 105 | # {result_calculation} 106 | # {final_assert} 107 | # }} 108 | 109 | result_calculation_lines = [] 110 | for i, child in enumerate(node.children): 111 | result_calculation_lines += [f'nodeIntermediateResult[{child.id}] * {node.weights[i]}'] 112 | 113 | value_dictionary = { 114 | 'scope_check': generate_scope_check(node.scope), 115 | 'subtree_code': '\n'.join( 116 | [generate_method_body(child, root_node, floating_data_type, depth) for child in node.children]), 117 | 'result_calculation': f'nodeIntermediateResult[{node.id}]=' + ' + '.join(result_calculation_lines) + ';', 118 | 'final_assert': f'resultValue = nodeIntermediateResult[{node.id}];' if root_node == node else '' 119 | } 120 | return replace_template(TemplatePath.SUM, value_dictionary, depth) 121 | 122 | 123 | def generate_method_body(node, root_node, floating_data_type, depth): 124 | if isinstance(node, Categorical): 125 | return generate_categorical_node(node, root_node, floating_data_type, depth + 1) 126 | elif isinstance(node, IdentityNumericLeaf): 127 | return generate_identity_node(node, root_node, floating_data_type, depth + 1) 128 | elif isinstance(node, Product): 129 | return generate_product_node(node, root_node, floating_data_type, depth + 1) 130 | elif isinstance(node, Sum): 131 | return generate_sum_node(node, root_node, floating_data_type, depth + 1) 132 | else: 133 | raise NotImplementedError 134 | 135 | 136 | def generate_code(spn_id, spn, meta_types, floating_data_type): 137 | """ 138 | Generates inference code for an SPN 139 | :param target_path: the path the generated C++ code is written to 140 | :param floating_data_type: data type floating numbers are represented in generated C++ code 141 | :param spn: root node of an SPN 142 | :return: code string 143 | """ 144 | 145 | # make sure we have ids 146 | assign_ids(spn) 147 | 148 | # fill method body according to SPN structure 149 | method_body = generate_method_body(spn, spn, floating_data_type, 0) 150 | 151 | # build parameters used in generated c++ function 152 | method_params = [] 153 | passed_params = [] 154 | for i, type in enumerate(meta_types): 155 | if type == MetaType.DISCRETE: 156 | 
method_params += [f'vector<int> possibleValues{i}', f'int nullValueIdx{i}'] 157 | passed_params += [f'py::arg("possibleValues{i}")', f'py::arg("nullValueIdx{i}")'] 158 | elif type == MetaType.REAL: 159 | method_params += [f'bool inverse{i}', f'bool leftMinusInf{i}', f'float leftCondition{i}', 160 | f'bool rightMinusInf{i}', f'float rightCondition{i}', f'bool leftIncluded{i}', 161 | f'bool rightIncluded{i}', f'float nullValue{i}'] 162 | passed_params += [f'py::arg("inverse{i}")', f'py::arg("leftMinusInf{i}")', f'py::arg("leftCondition{i}")', 163 | f'py::arg("rightMinusInf{i}")', f'py::arg("rightCondition{i}")', 164 | f'py::arg("leftIncluded{i}")', f'py::arg("rightIncluded{i}")', f'py::arg("nullValue{i}")'] 165 | 166 | value_dictionary = { 167 | 'spn_id': spn_id, 168 | 'method_body': method_body, 169 | 'method_params': ', '.join(method_params), 170 | 'node_count': get_number_of_nodes(spn), 171 | 'passed_params': ', '.join(passed_params), 172 | 'floating_data_type': floating_data_type 173 | } 174 | generated_method = replace_template(TemplatePath.METHOD_MASTER, value_dictionary, 0) 175 | registrate_method = replace_template(TemplatePath.REGISTRATION_MASTER, value_dictionary, 0) 176 | 177 | return generated_method, registrate_method 178 | 179 | 180 | def generate_ensemble_code(spn_ensemble, floating_data_type='float', ensemble_path=None): 181 | registrations = [] 182 | methods = [] 183 | logger.debug(f"Starting code generation") 184 | for i, spn in enumerate(spn_ensemble.spns): 185 | spn.id = i 186 | gen_start = perf_counter() 187 | generated_method, registrate_method = generate_code(i, spn.mspn, spn.meta_types, floating_data_type) 188 | registrations.append(registrate_method) 189 | methods.append(generated_method) 190 | gen_end = perf_counter() 191 | logger.debug(f"Generated code for SPN {i + 1}/{len(spn_ensemble.spns)} in {gen_end - gen_start:.2f}s.") 192 | 193 | value_dictionary = { 194 | 'methods': '\n\n'.join(methods), 195 | 'registration': '\n\t'.join(registrations) 196 | } 197 | generated_code = replace_template(TemplatePath.MASTER, value_dictionary, 0) 198 | 199 | if ensemble_path is not None: 200 | spn_ensemble.save(ensemble_path) 201 | 202 | with open('optimized_inference.cpp', 'w') as f: 203 | f.write(generated_code) 204 | 205 | logger.debug(f"Finished code generation.") 206 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/categorical_leave.cpp: -------------------------------------------------------------------------------- 1 | if (relevantScope[{node_scope}]) {{ 2 | // notNanPerNode[{node_id}] = true; 3 | {floating_data_type} probsNode{node_id}[] = {{ {node_p} }}; 4 | 5 | //not null condition 6 | if (nullValueIdx{node_scope} != -1) {{ 7 | nodeIntermediateResult[{node_id}] = 1 - probsNode{node_id}[nullValueIdx{node_scope}]; 8 | }} else {{ 9 | for (int &idx: possibleValues{node_scope}) {{ 10 | nodeIntermediateResult[{node_id}] += probsNode{node_id}[idx]; 11 | }} 12 | }} 13 | {final_assert} 14 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/identity_leave.cpp: -------------------------------------------------------------------------------- 1 | if (relevantScope[{node_scope}]) {{ 2 | if (featureScope[{node_scope}]) {{ 3 | if (inverse{node_scope}) {{ 4 | nodeIntermediateResult[{node_id}] = {inverted_mean}; 5 | }} else {{ 6 | nodeIntermediateResult[{node_id}] = {mean}; 7 | }} 8 | }} else {{ 9 | 10 |
vector<{floating_data_type}> uniqueVals{node_id}{{ {unique_values} }}; 11 | vector<{floating_data_type}> probSum{node_id}{{ {prob_sum} }}; 12 | 13 | // search right and left bounds via binary search 14 | int leftIdx{node_id} = 0; 15 | if (!leftMinusInf{node_scope}) {{ 16 | vector<{floating_data_type}>::iterator leftBoundIdx{node_id}; 17 | leftBoundIdx{node_id} = std::lower_bound(uniqueVals{node_id}.begin(), uniqueVals{node_id}.end(), leftCondition{node_scope}); 18 | leftIdx{node_id} = leftBoundIdx{node_id} - uniqueVals{node_id}.begin(); 19 | }} 20 | 21 | int rightIdx{node_id} = uniqueVals{node_id}.size(); 22 | if (!rightMinusInf{node_scope}) {{ 23 | vector<{floating_data_type}>::iterator rightBoundIdx{node_id}; 24 | rightBoundIdx{node_id} = std::upper_bound(uniqueVals{node_id}.begin(), uniqueVals{node_id}.end(), rightCondition{node_scope}); 25 | rightIdx{node_id} = rightBoundIdx{node_id} - uniqueVals{node_id}.begin(); 26 | }} 27 | 28 | nodeIntermediateResult[{node_id}] = probSum{node_id}[rightIdx{node_id}] - probSum{node_id}[leftIdx{node_id}]; 29 | 30 | // exclude null value if it was included before 31 | if (((leftMinusInf{node_scope} || leftCondition{node_scope} < nullValue{node_scope}) && (rightMinusInf{node_scope} || rightCondition{node_scope} > nullValue{node_scope})) || 32 | (!leftMinusInf{node_scope} && (nullValue{node_scope} == leftCondition{node_scope}) && leftIncluded{node_scope}) || 33 | (!rightMinusInf{node_scope} && (nullValue{node_scope} == rightCondition{node_scope}) && rightIncluded{node_scope})) {{ 34 | nodeIntermediateResult[{node_id}] -= {null_value_prob}; // null value prob 35 | }} 36 | 37 | // left value should not be included in interval 38 | if (!leftIncluded{node_scope} && !leftMinusInf{node_scope} && leftCondition{node_scope} == uniqueVals{node_id}[leftIdx{node_id}]) {{ 39 | nodeIntermediateResult[{node_id}] -= probSum{node_id}[leftIdx{node_id} + 1] - probSum{node_id}[leftIdx{node_id}]; 40 | }} 41 | 42 | // same for right 43 | if (!rightIncluded{node_scope} && !rightMinusInf{node_scope} && rightCondition{node_scope} == uniqueVals{node_id}[rightIdx{node_id} - 1] && leftCondition{node_scope} != rightCondition{node_scope}) {{ 44 | nodeIntermediateResult[{node_id}] -= probSum{node_id}[rightIdx{node_id}] - probSum{node_id}[rightIdx{node_id} - 1]; 45 | }} 46 | }} 47 | {final_assert} 48 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/master.cpp: -------------------------------------------------------------------------------- 1 | #include <pybind11/pybind11.h> 2 | #include <pybind11/stl.h> 3 | #include <algorithm> 4 | using namespace std; 5 | namespace py = pybind11; 6 | 7 | {methods} 8 | 9 | PYBIND11_MODULE(optimized_inference, m){{ 10 | m.doc() = "Generated RSPN ensemble code"; 11 | {registration} 12 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/method_master.cpp: -------------------------------------------------------------------------------- 1 | {floating_data_type} spn{spn_id}(vector<bool> relevantScope, vector<bool> featureScope, {method_params}){{ 2 | {floating_data_type} resultValue = 0.0; 3 | // bool notNanPerNode[{node_count}] = {{ false }}; 4 | {floating_data_type} nodeIntermediateResult[{node_count}] = {{ 0 }}; 5 | 6 | {method_body} 7 | 8 | return resultValue; 9 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/product_node.cpp:
-------------------------------------------------------------------------------- 1 | if ({scope_check}) {{ 2 | {subtree_code} 3 | nodeIntermediateResult[{node_id}] = 1.0; 4 | {result_calculation} 5 | {final_assert} 6 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/registration_master.cpp: -------------------------------------------------------------------------------- 1 | m.def("spn{spn_id}", &spn{spn_id}, "Generate expectation on SPN", py::arg("relevantScope"), py::arg("featureScope"), {passed_params}); -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/sum_node.cpp: -------------------------------------------------------------------------------- 1 | if ({scope_check}) {{ 2 | {subtree_code} 3 | {result_calculation} 4 | {final_assert} 5 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/custom_spflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/aqp_spn/custom_spflow/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/custom_spflow/custom_transform_structure.py: -------------------------------------------------------------------------------- 1 | from spn.structure.Base import get_nodes_by_type, Product, Leaf, assign_ids 2 | 3 | from aqp_spn.aqp_leaves import Sum 4 | from aqp_spn.custom_spflow.custom_validity import is_valid 5 | 6 | 7 | def Prune(node, light=False): 8 | """ 9 | Prunes the SPN. Ensures that nodes have at least one child and that the types of a node and its children differ. 10 | Adapts weights and optionally bloom filters accordingly.
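For example (illustrative values only): a Sum node with weights [0.6, 0.4] whose second child is itself a Sum node with weights [0.5, 0.5] is collapsed into a single Sum node; the grandchildren are spliced into the parent and their weights are rescaled by the removed child's weight, yielding [0.6, 0.2, 0.2].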
11 | :param node: 12 | :return: 13 | """ 14 | 15 | # v, err = is_valid(node) 16 | # assert v, err 17 | nodes = get_nodes_by_type(node, (Product, Sum)) 18 | 19 | while len(nodes) > 0: 20 | n = nodes.pop() 21 | 22 | n_type = type(n) 23 | is_sum = n_type == Sum 24 | is_product = n_type == Product 25 | 26 | i = 0 27 | while i < len(n.children): 28 | c = n.children[i] 29 | 30 | # if my child has only one node, we can get rid of it and link directly to that grandchild 31 | # in this case, no bloom filters can be lost since we do not split 32 | if not isinstance(c, Leaf) and len(c.children) == 1: 33 | n.children[i] = c.children[0] 34 | continue 35 | 36 | # if the type is similar to the type of the child 37 | if n_type == type(c): 38 | 39 | if is_sum and not light: 40 | old_len = len(n.cluster_centers) 41 | len_child_cluster = len(c.cluster_centers) 42 | del n.cluster_centers[i] 43 | n.cluster_centers.extend(c.cluster_centers) 44 | 45 | assert old_len - 1 + len_child_cluster == len( 46 | n.cluster_centers), "cluster_center length mismatch, node " + str(n) + " " + str(c) 47 | 48 | del n.children[i] 49 | n.children.extend(c.children) 50 | 51 | if is_sum: 52 | w = n.weights[i] 53 | del n.weights[i] 54 | 55 | n.weights.extend([cw * w for cw in c.weights]) 56 | 57 | if is_product: 58 | # hence, child type is also product and we should not lose bloom filters 59 | if hasattr(n, 'binary_bloom_filters'): 60 | n.binary_bloom_filters = {**n.binary_bloom_filters, **c.binary_bloom_filters} 61 | 62 | continue 63 | 64 | i += 1 65 | if is_sum and i > 0: 66 | n.weights[0] = 1.0 - sum(n.weights[1:]) 67 | 68 | if isinstance(node, (Product, Sum)) and len(node.children) == 1: 69 | node = node.children[0] 70 | 71 | assign_ids(node) 72 | v, err = is_valid(node, light=light) 73 | assert v, err 74 | 75 | return node 76 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/custom_spflow/custom_validity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on March 20, 2018 3 | 4 | @author: Alejandro Molina 5 | """ 6 | import logging 7 | 8 | import numpy as np 9 | from math import isclose 10 | from spn.structure.Base import get_nodes_by_type, Product 11 | 12 | from aqp_spn.aqp_leaves import Sum, IdentityNumericLeaf 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def is_consistent(node): 18 | """ 19 | all children of a product node have different scope 20 | """ 21 | 22 | assert node is not None 23 | 24 | allchildscope = set() 25 | for prod_node in reversed(get_nodes_by_type(node, Product)): 26 | nscope = set(prod_node.scope) 27 | 28 | if len(prod_node.children) == 0: 29 | return False, "Product node %s has no children" % prod_node.id 30 | 31 | allchildscope.clear() 32 | sum_features = 0 33 | for child in prod_node.children: 34 | sum_features += len(child.scope) 35 | allchildscope.update(child.scope) 36 | 37 | if allchildscope != nscope or sum_features != len(allchildscope): 38 | return False, "children of (prod) node %s do not have exclusive scope" % prod_node.id 39 | 40 | return True, None 41 | 42 | 43 | def is_complete(node): 44 | """ 45 | all children of a sum node have same scope as the parent 46 | """ 47 | 48 | assert node is not None 49 | 50 | for sum_node in reversed(get_nodes_by_type(node, Sum)): 51 | nscope = set(sum_node.scope) 52 | 53 | if len(sum_node.children) == 0: 54 | return False, "Sum node %s has no children" % sum_node.id 55 | 56 | for child in sum_node.children: 57 | if nscope !=
set(child.scope): 58 | return False, "children of (sum) node %s do not have the same scope as parent" % sum_node.id 59 | 60 | return True, None 61 | 62 | 63 | def is_valid_prob_sum(prob_sum, unique_vals, card): 64 | # return True, Null 65 | length = len(prob_sum) - 1 66 | 67 | if len(prob_sum) != len(unique_vals) + 1: 68 | return False, "len(prob_sum) != len(unique_vals) + 1" 69 | last_prob_sum = 0 70 | cards = [] 71 | 72 | sum_card = 0 73 | for i in range(0, len(prob_sum)): 74 | if prob_sum[i] > 1.0001: 75 | return False, "prob_sum[" + str(i) + "] must be <= 1.000, actual value at position " + str(i) + ":" + str( 76 | prob_sum[i]) + ", len:" + str(len(prob_sum)) 77 | if last_prob_sum - 0.0000001 > prob_sum[i]: 78 | return False, "prob_sum values must be non-decreasing (last_prob_sum:" + str(last_prob_sum) + ", prob_sum[" + str( 79 | i) + "]:" + str(prob_sum[i]) + ")" 80 | num = (prob_sum[i] - last_prob_sum) * card 81 | if False and not isclose(num, round(num), abs_tol=0.05): 82 | err_msg = "wrong probability value at idx " + str(i) + " (" + str( 83 | num) + ")- does not fit to an integer cardinality value for value " + str(unique_vals[i]) 84 | 85 | return False, err_msg 86 | last_prob_sum = prob_sum[i] 87 | sum_card += round(num) 88 | cards.append(round(num)) 89 | 90 | if not isclose(prob_sum[length], 1, abs_tol=0.05): 91 | return False, "Last value of prob_sum must be 1.0" 92 | if sum_card != card: 93 | return False, "Cardinality of the single values (" + str( 94 | sum_card) + ") does not match the overall cardinality (" + str(card) + ")" 95 | 96 | return True, None 97 | 98 | 99 | def is_valid(node, check_ids=True, check_prob_sum=False, light=False): 100 | # 101 | if check_ids: 102 | val, err = has_valid_ids(node) 103 | if not val: 104 | return val, err 105 | 106 | for n in get_nodes_by_type(node): 107 | if len(n.scope) == 0: 108 | return False, "node %s has no scope" % n.id 109 | is_sum = isinstance(n, Sum) 110 | is_prod = isinstance(n, Product) 111 | is_float = isinstance(n, IdentityNumericLeaf) 112 | 113 | if is_sum: 114 | if len(n.children) != len(n.weights): 115 | return False, "node %s has different children/weights" % n.id 116 | 117 | if not light: 118 | if len(n.children) != len(n.cluster_centers): 119 | return False, "node %s has different children/cluster_centers (#cluster_centers: %d, #childs: %d)" % ( 120 | n.id, len(n.cluster_centers), len(n.children)) 121 | 122 | weight_sum = np.sum(n.weights) 123 | 124 | if not isclose(weight_sum, 1, abs_tol=0.05): 125 | return False, "Sum of weights is not equal to 1.0 (instead: " + str(weight_sum) + ")" 126 | 127 | if is_sum or is_prod: 128 | if len(n.children) == 0: 129 | return False, "node %s has no children" % n.id 130 | 131 | if is_float: 132 | ok, err = is_valid_prob_sum(n.prob_sum, n.unique_vals, n.cardinality) 133 | if not ok: 134 | return False, err 135 | if check_prob_sum: 136 | assert (hasattr(n, 'prob_num')), str(n) + " has no property prob_num" 137 | assert hasattr(n, 'unique_vals') 138 | if len(n.prob_sum) - 1 != len(n.unique_vals): 139 | # 140 | return False, "size of prob_sum does not match unique_vals (required: prob_sum -1 == unique_vals) " 141 | 142 | a, err = is_consistent(node) 143 | if not a: 144 | return a, err 145 | 146 | b, err = is_complete(node) 147 | if not b: 148 | return b, err 149 | 150 | return True, None 151 | 152 | 153 | def has_valid_ids(node): 154 | ids = set() 155 | all_nodes = get_nodes_by_type(node) 156 | for n in all_nodes: 157 | ids.add(n.id) 158 | 159 | if len(ids) != len(all_nodes): 160 | return False, "Nodes are
missing ids or there are repeated ids" 161 | 162 | if min(ids) != 0: 163 | return False, "Node ids not starting at 0" 164 | 165 | if max(ids) != len(ids) - 1: 166 | return False, "Node ids not consecutive" 167 | 168 | return True, None 169 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/custom_spflow/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import numpy as np 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def default_slicer(data, cols, num_cond_cols=None): 10 | if num_cond_cols is None: 11 | if len(cols) == 1: 12 | return data[:, cols[0]].reshape((-1, 1)) 13 | 14 | return data[:, cols] 15 | else: 16 | return np.concatenate((data[:, cols], data[:, -num_cond_cols:]), axis=1) 17 | 18 | 19 | def compute_cartesian_product_completeness(col1, col2, ds_context, data, min_sample_size, max_sample_size, 20 | oversampling_cart_product=10, debug=False): 21 | """ 22 | Compute how many distinct value combinations appear for pair of columns in data. A low value is an indicator for 23 | functional dependency or some different form of dependency. 24 | :param col1: 25 | :param col2: 26 | :param ds_context: 27 | :param data: 28 | :param min_sample_size: 29 | :param max_sample_size: 30 | :param oversampling_cart_product: 31 | :param debug: 32 | :return: 33 | """ 34 | 35 | unique_tuples_start_t = time.perf_counter() 36 | len_cartesian_product = ds_context.no_unique_values[col1] * ds_context.no_unique_values[col2] 37 | sample_size = max(min(oversampling_cart_product * len_cartesian_product, max_sample_size), min_sample_size) 38 | 39 | sample_idx = np.random.randint(data.shape[0], size=sample_size) 40 | if sample_size < data.shape[0]: 41 | local_data_sample = data[sample_idx, :] 42 | else: 43 | local_data_sample = data 44 | value_combinations_sample = set( 45 | [(bin_data[0], bin_data[1],) for bin_data in 46 | default_slicer(local_data_sample, [col1, col2])]) 47 | cartesian_product_completeness = len(value_combinations_sample) / len_cartesian_product 48 | unique_tuples_end_t = time.perf_counter() 49 | if debug: 50 | logging.debug( 51 | f"Computed unique combination set for scope ({col1}, {col2}) in " 52 | f"{unique_tuples_end_t - unique_tuples_start_t} sec.") 53 | return cartesian_product_completeness, value_combinations_sample, len_cartesian_product 54 | 55 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/expectations.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import perf_counter 3 | 4 | import numpy as np 5 | from spn.algorithms.Inference import likelihood 6 | from spn.structure.Base import Product 7 | 8 | from aqp_spn.aqp_leaves import Sum 9 | from aqp_spn.code_generation.convert_conditions import convert_range 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def expectation(spn, feature_scope, inverted_features, ranges, node_expectation=None, node_likelihoods=None, 15 | use_generated_code=False, spn_id=None, meta_types=None, gen_code_stats=None): 16 | """Compute the Expectation: 17 | E[1_{conditions} * X_feature_scope] 18 | First factor is one if condition is fulfilled. For the second factor the variables in feature scope are 19 | multiplied. If inverted_features[i] is True, variable is taken to denominator. 
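For example (illustrative), with feature_scope = [j] and inverted_features = [False] this is E[1_{conditions} * X_j], while inverted_features = [True] gives E[1_{conditions} / X_j].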
20 | The conditional expectation would be E[1_{conditions} * X_feature_scope]/P(conditions) 21 | """ 22 | 23 | # evidence_scope = set([i for i, r in enumerate(ranges) if not np.isnan(r)]) 24 | evidence_scope = set([i for i, r in enumerate(ranges[0]) if r is not None]) 25 | evidence = ranges 26 | 27 | assert not (len(evidence_scope) > 0 and evidence is None) 28 | 29 | relevant_scope = set() 30 | relevant_scope.update(evidence_scope) 31 | relevant_scope.update(feature_scope) 32 | if len(relevant_scope) == 0: 33 | return np.ones((ranges.shape[0], 1)) 34 | 35 | if ranges.shape[0] == 1: 36 | 37 | applicable = True 38 | if use_generated_code: 39 | boolean_relevant_scope = [i in relevant_scope for i in range(len(meta_types))] 40 | boolean_feature_scope = [i in feature_scope for i in range(len(meta_types))] 41 | applicable, parameters = convert_range(boolean_relevant_scope, boolean_feature_scope, meta_types, ranges[0], 42 | inverted_features) 43 | 44 | # generated C++ code 45 | if use_generated_code and applicable: 46 | time_start = perf_counter() 47 | import optimized_inference 48 | 49 | spn_func = getattr(optimized_inference, f'spn{spn_id}') 50 | result = np.array([[spn_func(*parameters)]]) 51 | 52 | time_end = perf_counter() 53 | 54 | if gen_code_stats is not None: 55 | gen_code_stats.calls += 1 56 | gen_code_stats.total_time += (time_end - time_start) 57 | 58 | # logger.debug(f"\t\tGenerated Code Latency: {(time_end - time_start) * 1000:.3f}ms") 59 | return result 60 | 61 | # lightweight non-batch version 62 | else: 63 | return np.array( 64 | [[expectation_recursive(spn, feature_scope, inverted_features, relevant_scope, evidence, 65 | node_expectation, node_likelihoods)]]) 66 | # full batch version 67 | return expectation_recursive_batch(spn, feature_scope, inverted_features, relevant_scope, evidence, 68 | node_expectation, node_likelihoods) 69 | 70 | 71 | def expectation_recursive_batch(node, feature_scope, inverted_features, relevant_scope, evidence, node_expectation, 72 | node_likelihoods): 73 | if isinstance(node, Product): 74 | 75 | llchildren = np.concatenate( 76 | [expectation_recursive_batch(child, feature_scope, inverted_features, relevant_scope, evidence, 77 | node_expectation, node_likelihoods) 78 | for child in node.children if 79 | len(relevant_scope.intersection(child.scope)) > 0], axis=1) 80 | return np.nanprod(llchildren, axis=1).reshape(-1, 1) 81 | 82 | elif isinstance(node, Sum): 83 | if len(relevant_scope.intersection(node.scope)) == 0: 84 | return np.full((evidence.shape[0], 1), np.nan) 85 | 86 | llchildren = np.concatenate( 87 | [expectation_recursive_batch(child, feature_scope, inverted_features, relevant_scope, evidence, 88 | node_expectation, node_likelihoods) 89 | for child in node.children], axis=1) 90 | 91 | relevant_children_idx = np.where(np.isnan(llchildren[0]) == False)[0] 92 | if len(relevant_children_idx) == 0: 93 | return np.array([np.nan]) 94 | 95 | weights_normalizer = sum(node.weights[j] for j in relevant_children_idx) 96 | b = np.array(node.weights)[relevant_children_idx] / weights_normalizer 97 | 98 | return np.dot(llchildren[:, relevant_children_idx], b).reshape(-1, 1) 99 | 100 | else: 101 | if node.scope[0] in feature_scope: 102 | t_node = type(node) 103 | if t_node in node_expectation: 104 | exps = np.zeros((evidence.shape[0], 1)) 105 | 106 | feature_idx = feature_scope.index(node.scope[0]) 107 | inverted = inverted_features[feature_idx] 108 | 109 | exps[:] = node_expectation[t_node](node, evidence, inverted=inverted) 110 | return exps 111 | else: 112 
| raise Exception('Node type unknown: ' + str(t_node)) 113 | 114 | return likelihood(node, evidence, node_likelihood=node_likelihoods) 115 | 116 | 117 | def nanproduct(product, factor): 118 | if np.isnan(product): 119 | if not np.isnan(factor): 120 | return factor 121 | else: 122 | return np.nan 123 | else: 124 | if np.isnan(factor): 125 | return product 126 | else: 127 | return product * factor 128 | 129 | 130 | def expectation_recursive(node, feature_scope, inverted_features, relevant_scope, evidence, node_expectation, 131 | node_likelihoods): 132 | if isinstance(node, Product): 133 | 134 | product = np.nan 135 | for child in node.children: 136 | if len(relevant_scope.intersection(child.scope)) > 0: 137 | factor = expectation_recursive(child, feature_scope, inverted_features, relevant_scope, evidence, 138 | node_expectation, node_likelihoods) 139 | product = nanproduct(product, factor) 140 | return product 141 | 142 | elif isinstance(node, Sum): 143 | if len(relevant_scope.intersection(node.scope)) == 0: 144 | return np.nan 145 | 146 | llchildren = [expectation_recursive(child, feature_scope, inverted_features, relevant_scope, evidence, 147 | node_expectation, node_likelihoods) 148 | for child in node.children] 149 | 150 | relevant_children_idx = np.where(np.isnan(llchildren) == False)[0] 151 | 152 | if len(relevant_children_idx) == 0: 153 | return np.nan 154 | 155 | weights_normalizer = sum(node.weights[j] for j in relevant_children_idx) 156 | weighted_sum = sum(node.weights[j] * llchildren[j] for j in relevant_children_idx) 157 | 158 | return weighted_sum / weights_normalizer 159 | 160 | else: 161 | if node.scope[0] in feature_scope: 162 | t_node = type(node) 163 | if t_node in node_expectation: 164 | 165 | feature_idx = feature_scope.index(node.scope[0]) 166 | inverted = inverted_features[feature_idx] 167 | 168 | return node_expectation[t_node](node, evidence, inverted=inverted).item() 169 | else: 170 | raise Exception('Node type unknown: ' + str(t_node)) 171 | 172 | return node_likelihoods[type(node)](node, evidence).item() 173 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/group_by_combination.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | import numpy as np 5 | from spn.algorithms.Inference import likelihood 6 | from spn.structure.Base import get_nodes_by_type, Leaf, Product, eval_spn_bottom_up, assign_ids 7 | 8 | from aqp_spn.aqp_leaves import Sum 9 | from aqp_spn.custom_spflow.custom_transform_structure import Prune 10 | from aqp_spn.custom_spflow.custom_validity import is_valid 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def prod_group_by(node, children, data=None, dtype=np.float64): 16 | contains_probs = False 17 | contains_values = False 18 | contains_none_values = False 19 | contains_zero_prob = False 20 | group_by_scopes = [] 21 | # Check if only probabilities contained 22 | for child in children: 23 | # value 24 | if isinstance(child, tuple): 25 | contains_values = True 26 | 27 | scope, values = child 28 | group_by_scopes += scope 29 | if values is None: 30 | contains_none_values = True 31 | # probability 32 | else: 33 | contains_probs = True 34 | if (child == 0).any(): 35 | contains_zero_prob = True 36 | 37 | # Probability of subtree zero or no matching tuples 38 | if contains_zero_prob or contains_none_values: 39 | return [None], None 40 | # Cartesian product 41 | elif contains_values: 42 | result_values = 
None 43 | group_by_scopes.sort() 44 | for group_by_scope in group_by_scopes: 45 | matching_values = None 46 | matching_idx = None 47 | for child in children: 48 | if isinstance(child, tuple): 49 | scope, values = child 50 | if group_by_scope in scope: 51 | matching_values = values 52 | matching_idx = scope.index(group_by_scope) 53 | break 54 | assert matching_values is not None, "Matching values should not be None." 55 | if result_values is None: 56 | result_values = [(matching_value[matching_idx],) for matching_value in matching_values] 57 | else: 58 | result_values = [result_value + (matching_value[matching_idx],) for result_value in result_values for 59 | matching_value in matching_values] 60 | # assert len(result_values) <= len(group_by_scopes) 61 | old_len = len(result_values) 62 | if hasattr(node, 'binary_bloom_filters'): # , "For grouping product nodes must have bloom filters." 63 | for scope, bloom_filter in node.binary_bloom_filters.items(): 64 | if scope[0] in group_by_scopes and scope[1] in group_by_scopes: 65 | idx_left = group_by_scopes.index(scope[0]) 66 | idx_right = group_by_scopes.index(scope[1]) 67 | result_values = [result_value for result_value in result_values if 68 | (result_value[idx_left], result_value[idx_right],) in bloom_filter] 69 | if old_len > len(result_values): 70 | logger.debug( 71 | f"\t\tDue to bloom filters results were reduced by {(1 - len(result_values) / old_len) * 100}%") 72 | return group_by_scopes, set(result_values) 73 | # Only probabilities, normal inference 74 | elif contains_probs: 75 | llchildren = np.concatenate(children, axis=1) 76 | return np.nanprod(llchildren, axis=1).reshape(-1, 1) 77 | 78 | 79 | def sum_group_by(node, children, data=None, dtype=np.float64): 80 | """ 81 | Propagate expectations in sum node. 82 | 83 | :param node: sum node 84 | :param children: nodes below 85 | :param data: 86 | :param dtype: 87 | :return: 88 | """ 89 | 90 | # either all tuples or 91 | if isinstance(children[0], tuple): 92 | result_values = None 93 | group_by_scope = [None] 94 | for scope, values in children: 95 | if values is not None: 96 | group_by_scope = scope 97 | if result_values is None: 98 | result_values = values 99 | else: 100 | result_values = result_values.union(values) 101 | return group_by_scope, result_values 102 | 103 | # normal probability sum node code 104 | llchildren = np.concatenate(children, axis=1) 105 | relevant_children_idx = np.where(np.isnan(llchildren[0]) == False)[0] 106 | if len(relevant_children_idx) == 0: 107 | return np.array([np.nan]) 108 | 109 | assert llchildren.dtype == dtype 110 | 111 | weights_normalizer = sum(node.weights[j] for j in relevant_children_idx) 112 | b = np.array(node.weights, dtype=dtype)[relevant_children_idx] / weights_normalizer 113 | 114 | return np.dot(llchildren[:, relevant_children_idx], b).reshape(-1, 1) 115 | 116 | 117 | def group_by_combinations(spn, ds_context, feature_scope, ranges, node_distinct_vals=None, node_likelihoods=None): 118 | """ 119 | Computes the distinct value combinations for features given the range conditions. 
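For example (illustrative), feature_scope = [0, 2] yields the scope together with the set of distinct (column 0, column 2) value combinations that satisfy the given ranges.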
120 | """ 121 | evidence_scope = set([i for i, r in enumerate(ranges[0]) if r is not None]) 122 | evidence = ranges 123 | 124 | # make feature scope sorted 125 | feature_scope_unsorted = copy.copy(feature_scope) 126 | feature_scope.sort() 127 | # add range conditions to feature scope (makes checking with bloom filters easier) 128 | feature_scope = list(set(feature_scope) 129 | .union(evidence_scope.intersection(np.where(ds_context.no_unique_values <= 1200)[0]))) 130 | feature_scope.sort() 131 | inverted_order = [feature_scope.index(scope) for scope in feature_scope_unsorted] 132 | 133 | assert not (len(evidence_scope) > 0 and evidence is None) 134 | 135 | relevant_scope = set() 136 | relevant_scope.update(evidence_scope) 137 | relevant_scope.update(feature_scope) 138 | marg_spn = marginalize(spn, relevant_scope) 139 | 140 | def leaf_expectation(node, data, dtype=np.float64, **kwargs): 141 | 142 | if node.scope[0] in feature_scope: 143 | t_node = type(node) 144 | if t_node in node_distinct_vals: 145 | vals = node_distinct_vals[t_node](node, evidence) 146 | return vals 147 | else: 148 | raise Exception('Node type unknown: ' + str(t_node)) 149 | 150 | return likelihood(node, evidence, node_likelihood=node_likelihoods) 151 | 152 | node_expectations = {type(leaf): leaf_expectation for leaf in get_nodes_by_type(marg_spn, Leaf)} 153 | node_expectations.update({Sum: sum_group_by, Product: prod_group_by}) 154 | 155 | result = eval_spn_bottom_up(marg_spn, node_expectations, all_results={}, data=evidence, dtype=np.float64) 156 | if feature_scope_unsorted == feature_scope: 157 | return result 158 | scope, grouped_tuples = result 159 | return feature_scope_unsorted, set( 160 | [tuple(group_tuple[i] for i in inverted_order) for group_tuple in grouped_tuples]) 161 | 162 | 163 | def marginalize(node, keep, light=False): 164 | # keep must be a set of features that you want to keep 165 | # Loc.enter() 166 | keep = set(keep) 167 | 168 | # Loc.p('keep:', keep) 169 | 170 | def marg_recursive(node): 171 | # Loc.enter() 172 | new_node_scope = keep.intersection(set(node.scope)) 173 | # Loc.p("new_node_scope:", new_node_scope) 174 | if len(new_node_scope) == 0: 175 | # we are summing out this node 176 | # Loc.leave(None) 177 | return None 178 | 179 | if isinstance(node, Leaf): 180 | if len(node.scope) > 1: 181 | raise Exception("Leaf Node with |scope| > 1") 182 | # Loc.leave('Leaf.deepcopy()') 183 | if light: 184 | return node 185 | return copy.deepcopy(node) 186 | 187 | newNode = node.__class__() 188 | newNode.cardinality = node.cardinality 189 | 190 | if isinstance(node, Sum): 191 | newNode.weights.extend(node.weights) 192 | if not light: 193 | newNode.cluster_centers.extend(node.cluster_centers) 194 | if isinstance(node, Product): 195 | if hasattr(node, 'binary_bloom_filters'): 196 | newNode.binary_bloom_filters = node.binary_bloom_filters 197 | 198 | for c in node.children: 199 | new_c = marg_recursive(c) 200 | if new_c is None: 201 | continue 202 | newNode.children.append(new_c) 203 | 204 | newNode.scope.extend(new_node_scope) 205 | 206 | # Loc.leave() 207 | return newNode 208 | 209 | newNode = marg_recursive(node) 210 | 211 | if not light: 212 | assign_ids(newNode) 213 | newNode = Prune(newNode, light=light) 214 | 215 | valid, err = is_valid(newNode, light=light) 216 | assert valid, err 217 | # Loc.leave() 218 | return newNode 219 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/ranges.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class NominalRange: 5 | """ 6 | This class specifies the range for a nominal attribute. It contains a list of integers which 7 | represent the values which are in the range. 8 | 9 | e.g. possible_values = [5,2] 10 | """ 11 | 12 | def __init__(self, possible_values, null_value=None, is_not_null_condition=False): 13 | self.is_not_null_condition = is_not_null_condition 14 | self.possible_values = np.array(possible_values, dtype=np.int64) 15 | self.null_value = null_value 16 | 17 | def is_impossible(self): 18 | return len(self.possible_values) == 0 19 | 20 | def get_ranges(self): 21 | return self.possible_values 22 | 23 | 24 | class NumericRange: 25 | """ 26 | This class specifies the range for a numeric attribute. It contains a list of intervals which 27 | represents the values which are valid. Inclusive Intervals specifies whether upper and lower bound are included. 28 | 29 | e.g. ranges = [[10,15],[22,23]] if valid values are between 10 and 15 plus 22 and 23 (bounds inclusive) 30 | """ 31 | 32 | def __init__(self, ranges, inclusive_intervals=None, null_value=None, is_not_null_condition=False): 33 | self.is_not_null_condition = is_not_null_condition 34 | self.ranges = ranges 35 | self.null_value = null_value 36 | self.inclusive_intervals = inclusive_intervals 37 | if self.inclusive_intervals is None: 38 | self.inclusive_intervals = [] 39 | for interval in self.ranges: 40 | self.inclusive_intervals.append([True, True]) 41 | 42 | def is_impossible(self): 43 | return len(self.ranges) == 0 44 | 45 | def get_ranges(self): 46 | return self.ranges 47 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/util/Graphics.py: -------------------------------------------------------------------------------- 1 | from spn.io.Graphics import plot_spn 2 | 3 | def overwrite_plot_spn(spn, plotfile): 4 | import os 5 | try: 6 | os.remove(plotfile) 7 | except OSError as err: 8 | pass 9 | plot_spn(spn, plotfile) 10 | 11 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/aqp_spn/util/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/data_preparation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/data_preparation/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/ensemble_compilation/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/graph_representation.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from enum import Enum 3 | 4 | 5 | class Table: 6 | """Represents a table with foreign key 
and primary key relationships""" 7 | 8 | def __init__(self, table_name, primary_key=["id"], table_nn_attribute=None, table_size=1000, csv_file_location=None, 9 | attributes=None, irrelevant_attributes=None, keep_fk_attributes=None, sample_rate=1.0, fd_list=None, 10 | no_compression=None): 11 | 12 | self.table_name = table_name 13 | self.table_size = table_size 14 | self.primary_key = primary_key 15 | 16 | self.csv_file_location = csv_file_location 17 | self.attributes = attributes 18 | self.irrelevant_attributes = irrelevant_attributes 19 | if irrelevant_attributes is None: 20 | self.irrelevant_attributes = [] 21 | self.keep_fk_attributes = keep_fk_attributes 22 | if keep_fk_attributes is None: 23 | self.keep_fk_attributes = [] 24 | self.no_compression = no_compression 25 | if no_compression is None: 26 | self.no_compression = [] 27 | 28 | if fd_list is None: 29 | self.fd_list = [] 30 | else: 31 | self.fd_list = [(table_name + '.' + fd_source, table_name + '.' + fd_dest) for fd_source, fd_dest in 32 | fd_list] 33 | 34 | # additional attribute indicating whether tuple is NULL (can occur since we learn SPN on FULL OUTER JOIN) 35 | if table_nn_attribute is None: 36 | self.table_nn_attribute = self.table_name + '_nn' 37 | 38 | # FK references 39 | self.outgoing_relationships = [] 40 | 41 | # referenced as FK 42 | self.incoming_relationships = [] 43 | self.sample_rate = sample_rate 44 | 45 | def children_fd_attributes(self, attribute): 46 | return [fd_source for fd_source, fd_dest in self.fd_list if fd_dest == attribute] 47 | 48 | def parent_fd_attributes(self, attribute): 49 | return [fd_dest for fd_source, fd_dest in self.fd_list if fd_source == attribute] 50 | 51 | 52 | class Relationship: 53 | """Foreign key primary key relationship""" 54 | 55 | def __init__(self, start, end, start_attr, end_attr, multiplier_attribute_name): 56 | self.start = start.table_name 57 | self.start_attr = start_attr 58 | 59 | self.end = end.table_name 60 | self.end_attr = end_attr 61 | 62 | # matching tuples in FULL OUTER JOIN 63 | self.multiplier_attribute_name = multiplier_attribute_name 64 | 65 | # matching tuples (not NULL) 66 | self.multiplier_attribute_name_nn = multiplier_attribute_name + '_nn' 67 | 68 | self.identifier = self.start + '.' + self.start_attr + \ 69 | ' = ' + self.end + '.' + self.end_attr 70 | 71 | # for start table we are outgoing relationship 72 | start.outgoing_relationships.append(self) 73 | end.incoming_relationships.append(self) 74 | 75 | 76 | class SchemaGraph: 77 | """Holds all tables and relationships""" 78 | 79 | def __init__(self): 80 | self.tables = [] 81 | self.relationships = [] 82 | self.table_dictionary = {} 83 | self.relationship_dictionary = {} 84 | 85 | def add_table(self, table): 86 | self.tables.append(table) 87 | self.table_dictionary[table.table_name] = table 88 | 89 | def add_relationship(self, start_name, start_attr, end_name, end_attr, multiplier_attribute_name=None): 90 | if multiplier_attribute_name is None: 91 | multiplier_attribute_name = 'mul_' + start_name + '.' 
+ start_attr 92 | 93 | relationship = Relationship(self.table_dictionary[start_name], 94 | self.table_dictionary[end_name], 95 | start_attr, 96 | end_attr, 97 | multiplier_attribute_name) 98 | 99 | self.relationships.append(relationship) 100 | self.relationship_dictionary[relationship.identifier] = relationship 101 | 102 | return relationship.identifier 103 | 104 | 105 | class QueryType(Enum): 106 | AQP = 0 107 | CARDINALITY = 1 108 | 109 | 110 | class AggregationType(Enum): 111 | SUM = 0 112 | AVG = 1 113 | COUNT = 2 114 | 115 | 116 | class AggregationOperationType(Enum): 117 | PLUS = 0 118 | MINUS = 1 119 | AGGREGATION = 2 120 | 121 | 122 | class Query: 123 | """Represents query""" 124 | 125 | def __init__(self, schema_graph, query_type=QueryType.CARDINALITY, features=None): 126 | self.query_type = query_type 127 | self.schema_graph = schema_graph 128 | self.table_set = set() 129 | self.relationship_set = set() 130 | self.table_where_condition_dict = {} 131 | self.conditions = [] 132 | self.aggregation_operations = [] 133 | self.group_bys = [] 134 | 135 | def remove_conditions_for_attributes(self, table, attributes): 136 | def conflicting(condition): 137 | return any([condition.startswith(attribute + ' ') or condition.startswith(attribute + '<') or 138 | condition.startswith(attribute + '>') or condition.startswith(attribute + '=') for 139 | attribute in attributes]) 140 | 141 | if self.table_where_condition_dict.get(table) is not None: 142 | self.table_where_condition_dict[table] = [condition for condition in 143 | self.table_where_condition_dict[table] 144 | if not conflicting(condition)] 145 | self.conditions = [(cond_table, condition) for cond_table, condition in self.conditions 146 | if not (cond_table == table and conflicting(condition))] 147 | 148 | def copy_cardinality_query(self): 149 | query = Query(self.schema_graph) 150 | query.table_set = copy.copy(self.table_set) 151 | query.relationship_set = copy.copy(self.relationship_set) 152 | query.table_where_condition_dict = copy.copy(self.table_where_condition_dict) 153 | query.conditions = copy.copy(self.conditions) 154 | return query 155 | 156 | def add_group_by(self, table, attribute): 157 | self.group_bys.append((table, attribute)) 158 | 159 | def add_aggregation_operation(self, operation): 160 | """ 161 | Adds operation to AQP query. 
162 | :param operation: (AggregationOperationType.AGGREGATION, operation_type, operation_factors) or (AggregationOperationType.MINUS, None, None) 163 | :return: 164 | """ 165 | self.aggregation_operations.append(operation) 166 | 167 | def add_join_condition(self, relationship_identifier): 168 | 169 | relationship = self.schema_graph.relationship_dictionary[relationship_identifier] 170 | self.table_set.add(relationship.start) 171 | self.table_set.add(relationship.end) 172 | 173 | self.relationship_set.add(relationship_identifier) 174 | 175 | def add_where_condition(self, table, condition): 176 | if self.table_where_condition_dict.get(table) is None: 177 | self.table_where_condition_dict[table] = [condition] 178 | else: 179 | self.table_where_condition_dict[table].append(condition) 180 | self.conditions.append((table, condition)) 181 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/physical_db.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import pandas as pd 3 | 4 | from ensemble_compilation.utils import gen_full_join_query, print_conditions 5 | 6 | 7 | class DBConnection: 8 | 9 | def __init__(self, db_user="postgres", db_password="postgres", db_host="localhost", db_port="5432", db="shopdb"): 10 | self.db_user = db_user 11 | self.db_password = db_password 12 | self.db_host = db_host 13 | self.db_port = db_port 14 | self.db = db 15 | 16 | def vacuum(self): 17 | connection = psycopg2.connect(user=self.db_user, 18 | password=self.db_password, 19 | host=self.db_host, 20 | port=self.db_port, 21 | database=self.db) 22 | old_isolation_level = connection.isolation_level 23 | connection.set_isolation_level(0) 24 | query = "VACUUM" 25 | cursor = connection.cursor() 26 | cursor.execute(query) 27 | connection.commit() 28 | connection.set_isolation_level(old_isolation_level) 29 | 30 | def get_dataframe(self, sql): 31 | connection = psycopg2.connect(user=self.db_user, 32 | password=self.db_password, 33 | host=self.db_host, 34 | port=self.db_port, 35 | database=self.db) 36 | return pd.read_sql(sql, connection) 37 | 38 | def submit_query(self, sql): 39 | """Submits query and ignores result.""" 40 | 41 | connection = psycopg2.connect(user=self.db_user, 42 | password=self.db_password, 43 | host=self.db_host, 44 | port=self.db_port, 45 | database=self.db) 46 | cursor = connection.cursor() 47 | cursor.execute(sql) 48 | connection.commit() 49 | 50 | def get_result(self, sql): 51 | """Fetches exactly one row of result set.""" 52 | 53 | connection = psycopg2.connect(user=self.db_user, 54 | password=self.db_password, 55 | host=self.db_host, 56 | port=self.db_port, 57 | database=self.db) 58 | cursor = connection.cursor() 59 | 60 | cursor.execute(sql) 61 | record = cursor.fetchone() 62 | result = record[0] 63 | 64 | if connection: 65 | cursor.close() 66 | connection.close() 67 | 68 | return result 69 | 70 | def get_result_set(self, sql, return_columns=False): 71 | """Fetches all rows of result set.""" 72 | 73 | connection = psycopg2.connect(user=self.db_user, 74 | password=self.db_password, 75 | host=self.db_host, 76 | port=self.db_port, 77 | database=self.db) 78 | cursor = connection.cursor() 79 | 80 | cursor.execute(sql) 81 | rows = cursor.fetchall() 82 | columns = [desc[0] for desc in cursor.description] 83 | 84 | if connection: 85 | cursor.close() 86 | connection.close() 87 | 88 | if return_columns: 89 | return rows, columns 90 | 91 | return rows 92 | 93 | 94 | class 
TrueCardinalityEstimator: 95 | """Queries the database to return true cardinalities.""" 96 | 97 | def __init__(self, schema_graph, db_connection): 98 | self.schema_graph = schema_graph 99 | self.db_connection = db_connection 100 | 101 | def true_cardinality(self, query): 102 | full_join_query = gen_full_join_query(self.schema_graph, query.relationship_set, query.table_set, "JOIN") 103 | 104 | where_cond = print_conditions(query.conditions, seperator='AND') 105 | if where_cond != "": 106 | where_cond = "WHERE " + where_cond 107 | sql_query = full_join_query.format("COUNT(*)", where_cond) 108 | cardinality = self.db_connection.get_result(sql_query) 109 | return sql_query, cardinality 110 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/probabilistic_query.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from ensemble_compilation.utils import print_conditions 4 | 5 | 6 | class FactorType(Enum): 7 | INDICATOR_EXP = 0 8 | EXPECTATION = 1 9 | 10 | 11 | class IndicatorExpectation: 12 | """ 13 | Represents E[1_{conditions} * 1/ denominator_multipliers]. 14 | """ 15 | 16 | def __init__(self, denominator_multipliers, conditions, nominator_multipliers=None, spn=None, inverse=False, 17 | table_set=None): 18 | self.nominator_multipliers = nominator_multipliers 19 | if self.nominator_multipliers is None: 20 | self.nominator_multipliers = [] 21 | self.denominator_multipliers = denominator_multipliers 22 | self.conditions = conditions 23 | self.spn = spn 24 | self.min_val = 0 25 | self.inverse = inverse 26 | self.table_set = table_set 27 | if table_set is None: 28 | self.table_set = set() 29 | # if self.spn is not None: 30 | # self.min_val = 1 / self.spn.full_join_size 31 | 32 | def contains_groupby(self, group_bys): 33 | for table, attribute in group_bys: 34 | for cond_table, condition in self.conditions: 35 | if cond_table == table and condition.startswith(attribute): 36 | return True 37 | return False 38 | 39 | def matches(self, other_expectation, ignore_inverse=False, ignore_spn=False): 40 | if self.inverse != other_expectation.inverse and not ignore_inverse: 41 | return False 42 | if set(self.nominator_multipliers) != set(other_expectation.nominator_multipliers): 43 | return False 44 | if set(self.denominator_multipliers) != set(other_expectation.denominator_multipliers): 45 | return False 46 | if set(self.conditions) != set(other_expectation.conditions): 47 | return False 48 | if not ignore_spn and self.table_set != other_expectation.table_set: 49 | return False 50 | return True 51 | 52 | def __hash__(self): 53 | return hash((FactorType.INDICATOR_EXP, self.inverse, frozenset(self.nominator_multipliers), 54 | frozenset(self.denominator_multipliers), frozenset(self.conditions), frozenset(self.table_set))) 55 | 56 | def is_inverse(self, other_expectation): 57 | return self.inverse != other_expectation.inverse and self.matches(other_expectation, ignore_inverse=True) 58 | 59 | def __str__(self): 60 | """ 61 | Prints Expectation of multipliers for conditions. 62 | E(multipliers * 1_{c_1 Λ… Λc_n}) 63 | """ 64 | 65 | if self.inverse: 66 | formula = " / E(" 67 | else: 68 | formula = " * E(" 69 | 70 | for i, (table, normalizer) in enumerate(self.nominator_multipliers): 71 | formula += table + "." 
+ normalizer 72 | if i < len(self.nominator_multipliers) - 1: 73 | formula += "*" 74 | if len(self.nominator_multipliers) == 0: 75 | formula += "1" 76 | 77 | if len(self.denominator_multipliers) > 0: 78 | formula += "/(" 79 | 80 | # 1/multiplier 81 | for i, (table, normalizer) in enumerate(self.denominator_multipliers): 82 | formula += table + "." + normalizer 83 | if i < len(self.denominator_multipliers) - 1: 84 | formula += "*" 85 | formula += ")" 86 | 87 | # |c_1 Λ… Λc_n 88 | if len(self.conditions) > 0: 89 | formula += "* 1_{" 90 | formula += print_conditions(self.conditions) 91 | formula += "}" 92 | formula += ")" 93 | 94 | return formula 95 | 96 | def print_conditions(self, seperator='Λ'): 97 | return print_conditions(self.conditions, seperator=seperator) 98 | 99 | 100 | class Expectation: 101 | """ 102 | Represents conditional expectation of feature with normalizing multipliers. 103 | """ 104 | 105 | def __init__(self, features, normalizing_multipliers, conditions, spn=None): 106 | self.features = features 107 | self.normalizing_multipliers = normalizing_multipliers 108 | self.conditions = conditions 109 | self.spn = spn 110 | self.min_val = 1 111 | 112 | def matches(self, other_expectation, ignore_spn=False): 113 | if set(self.features) != set(other_expectation.features): 114 | return False 115 | if set(self.normalizing_multipliers) != set(other_expectation.normalizing_multipliers): 116 | return False 117 | if set(self.conditions) != set(other_expectation.conditions): 118 | return False 119 | if not ignore_spn and self.spn != other_expectation.spn: 120 | return False 121 | return True 122 | 123 | def __hash__(self): 124 | return hash((FactorType.EXPECTATION, frozenset(self.features), frozenset(self.normalizing_multipliers), 125 | frozenset(self.conditions), self.spn)) 126 | 127 | def __str__(self): 128 | """ 129 | Prints Expectation of feature for conditions. 130 | E(feature | c_1 Λ… Λc_n) (norm by multipliers). 131 | """ 132 | 133 | formula = " * E(" 134 | # features 135 | for i, (table, multiplier) in enumerate(self.features): 136 | formula += table + "." + multiplier 137 | if i < len(self.features) - 1: 138 | formula += "*" 139 | 140 | # /(multipliers) 141 | if len(self.normalizing_multipliers) > 0: 142 | formula += " /(" 143 | # 1/multiplier 144 | for i, (table, normalizer) in enumerate(self.normalizing_multipliers): 145 | formula += table + "." 
+ normalizer 146 | if i < len(self.normalizing_multipliers) - 1: 147 | formula += "*" 148 | formula += ")" 149 | 150 | # |c_1 Λ… Λc_n 151 | if len(self.conditions) > 0: 152 | formula += "| " 153 | formula += print_conditions(self.conditions) 154 | 155 | formula += ")" 156 | 157 | return formula 158 | 159 | def print_conditions(self, seperator='Λ'): 160 | return print_conditions(self.conditions, seperator=seperator) 161 | 162 | 163 | class Probability: 164 | 165 | def __init__(self, conditions): 166 | self.conditions = conditions 167 | 168 | def matches(self, other_probability): 169 | if set(self.conditions) != set(other_probability.conditions): 170 | return False 171 | return True 172 | 173 | def __str__(self): 174 | """ 175 | Prints Probability of conditions 176 | """ 177 | 178 | formula = "" 179 | if len(self.conditions) > 0: 180 | formula += " * P(" 181 | formula += print_conditions(self.conditions) 182 | formula += ")" 183 | 184 | return formula 185 | 186 | def print_conditions(self, seperator='Λ'): 187 | return print_conditions(self.conditions, seperator=seperator) 188 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | 4 | def print_conditions(conditions, seperator='Λ'): 5 | """Pretty prints a set of conditions with a custom seperator.""" 6 | 7 | formula = "" 8 | for i, (table, condition) in enumerate(conditions): 9 | formula += table + "." + condition 10 | if i < len(conditions) - 1: 11 | formula += ' ' + seperator + ' ' 12 | 13 | return formula 14 | 15 | 16 | def gen_full_join_query(schema_graph, relationship_set, table_set, join_type): 17 | """ 18 | Creates the full outer join to for a relationship set for join_type FULL OUTER JOIN or JOIN 19 | """ 20 | 21 | from_clause = "" 22 | if len(relationship_set) == 0: 23 | assert(len(table_set) == 1) 24 | 25 | from_clause = list(table_set)[0] 26 | 27 | else: 28 | included_tables = set() 29 | relationships = copy.copy(relationship_set) 30 | 31 | while relationships: 32 | # first relation to be included 33 | if len(included_tables) == 0: 34 | relationship = relationships.pop() 35 | relationship_obj = schema_graph.relationship_dictionary[relationship] 36 | included_tables.add(relationship_obj.start) 37 | included_tables.add(relationship_obj.end) 38 | from_clause += relationship_obj.start + " " + join_type + " " + relationship_obj.end + " ON " + relationship 39 | else: 40 | # search in suitable relations 41 | relationship_to_add = None 42 | for relationship in relationships: 43 | relationship_obj = schema_graph.relationship_dictionary[relationship] 44 | if (relationship_obj.start in included_tables and relationship_obj.end not in included_tables) or \ 45 | (relationship_obj.end in included_tables and relationship_obj.start not in included_tables): 46 | relationship_to_add = relationship 47 | if relationship_to_add is None: 48 | raise ValueError("Query not a tree") 49 | # add it to where formula 50 | relationship_obj = schema_graph.relationship_dictionary[relationship_to_add] 51 | if (relationship_obj.start in included_tables and relationship_obj.end not in included_tables): 52 | from_clause += " " + join_type + " " + relationship_obj.end + " ON " + relationship_to_add 53 | included_tables.add(relationship_obj.end) 54 | relationships.remove(relationship_to_add) 55 | elif (relationship_obj.end in included_tables and relationship_obj.start not in 
included_tables): 56 | from_clause += " " + join_type + " " + relationship_obj.start + " ON " + relationship_to_add 57 | included_tables.add(relationship_obj.start) 58 | relationships.remove(relationship_to_add) 59 | 60 | return "SELECT {} FROM " + from_clause + " {}" -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_creation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/ensemble_creation/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_creation/naive.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from aqp_spn.aqp_spn import AQPSPN 4 | from data_preparation.join_data_preparation import JoinDataPreparator 5 | from ensemble_compilation.spn_ensemble import SPNEnsemble 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | RATIO_MIN_INSTANCE_SLICE = 1 / 100 10 | 11 | 12 | def create_naive_all_split_ensemble(schema, hdf_path, sample_size, ensemble_path, dataset, bloom_filters, 13 | rdc_threshold, max_table_data, post_sampling_factor, incremental_learning_rate): 14 | meta_data_path = hdf_path + '/meta_data.pkl' 15 | prep = JoinDataPreparator(meta_data_path, schema, max_table_data=max_table_data) 16 | spn_ensemble = SPNEnsemble(schema) 17 | 18 | logger.info(f"Creating naive ensemble.") 19 | 20 | for table_obj in schema.tables: 21 | logger.info(f"Learning SPN for {table_obj.table_name}.") 22 | if incremental_learning_rate > 0: 23 | df_samples, df_inc_samples, meta_types, null_values, full_join_est = prep.generate_n_samples_with_incremental_part( 24 | sample_size, 25 | single_table=table_obj.table_name, 26 | post_sampling_factor=post_sampling_factor, 27 | incremental_learning_rate=incremental_learning_rate) 28 | logger.debug(f"Requested {sample_size} samples and got {len(df_samples)} + {len(df_inc_samples)} " 29 | f"(for incremental learning)") 30 | else: 31 | df_samples, meta_types, null_values, full_join_est = prep.generate_n_samples(sample_size, 32 | single_table=table_obj.table_name, 33 | post_sampling_factor=post_sampling_factor) 34 | 35 | # learn spn 36 | aqp_spn = AQPSPN(meta_types, null_values, full_join_est, schema, None, full_sample_size=len(df_samples), 37 | table_set={table_obj.table_name}, column_names=list(df_samples.columns), 38 | table_meta_data=prep.table_meta_data) 39 | min_instance_slice = RATIO_MIN_INSTANCE_SLICE * min(sample_size, len(df_samples)) 40 | logger.debug(f"Using min_instance_slice parameter {min_instance_slice}.") 41 | logger.info(f"SPN training phase with {len(df_samples)} samples") 42 | aqp_spn.learn(df_samples.values, min_instances_slice=min_instance_slice, bloom_filters=bloom_filters, 43 | rdc_threshold=rdc_threshold) 44 | if incremental_learning_rate > 0: 45 | logger.info(f"additional incremental SPN training phase with {len(df_inc_samples)} samples " 46 | f"({incremental_learning_rate}%)") 47 | aqp_spn.learn_incremental(df_inc_samples.values) 48 | spn_ensemble.add_spn(aqp_spn) 49 | 50 | ensemble_path += '/ensemble_single_' + dataset + '_' + str(sample_size) + '.pkl' 51 | logger.info(f"Saving ensemble to {ensemble_path}") 52 | spn_ensemble.save(ensemble_path) 53 | 54 | 55 | def naive_every_relationship_ensemble(schema, hdf_path, sample_size, ensemble_path, dataset, bloom_filters, 56 | 
rdc_threshold, max_table_data, post_sampling_factor, 57 | incremental_learning_rate=0): 58 | meta_data_path = hdf_path + '/meta_data.pkl' 59 | prep = JoinDataPreparator(meta_data_path, schema, max_table_data=max_table_data) 60 | spn_ensemble = SPNEnsemble(schema) 61 | 62 | logger.info(f"Creating naive ensemble for every relationship.") 63 | for relationship_obj in schema.relationships: 64 | logger.info(f"Learning SPN for {relationship_obj.identifier}.") 65 | 66 | if incremental_learning_rate > 0: 67 | df_samples, df_inc_samples, meta_types, null_values, full_join_est = prep.generate_n_samples_with_incremental_part( 68 | sample_size, relationship_list=[relationship_obj.identifier], post_sampling_factor=post_sampling_factor, 69 | incremental_learning_rate=incremental_learning_rate) 70 | else: 71 | df_samples, meta_types, null_values, full_join_est = prep.generate_n_samples( 72 | sample_size, relationship_list=[relationship_obj.identifier], post_sampling_factor=post_sampling_factor) 73 | logger.debug(f"Requested {sample_size} samples and got {len(df_samples)}") 74 | 75 | # learn spn 76 | aqp_spn = AQPSPN(meta_types, null_values, full_join_est, schema, 77 | [relationship_obj.identifier], full_sample_size=len(df_samples), 78 | column_names=list(df_samples.columns), table_meta_data=prep.table_meta_data) 79 | min_instance_slice = RATIO_MIN_INSTANCE_SLICE * min(sample_size, len(df_samples)) 80 | logger.debug(f"Using min_instance_slice parameter {min_instance_slice}.") 81 | logger.info(f"SPN training phase with {len(df_samples)} samples") 82 | aqp_spn.learn(df_samples.values, min_instances_slice=min_instance_slice, bloom_filters=bloom_filters, 83 | rdc_threshold=rdc_threshold) 84 | if incremental_learning_rate > 0: 85 | logger.info(f"additional incremental SPN training phase with {len(df_inc_samples)} samples " 86 | f"({incremental_learning_rate}%)") 87 | aqp_spn.learn_incremental(df_inc_samples) 88 | spn_ensemble.add_spn(aqp_spn) 89 | 90 | ensemble_path += '/ensemble_relationships_' + dataset + '_' + str(sample_size) + '.pkl' 91 | logger.info(f"Saving ensemble to {ensemble_path}") 92 | spn_ensemble.save(ensemble_path) 93 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_creation/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def create_random_join(schema, no_relationships): 5 | assert no_relationships >= 0, "No_relationships must be greater equal 0" 6 | 7 | start_tables = list(schema.tables) 8 | random.shuffle(start_tables) 9 | start_table_obj = start_tables[0] 10 | 11 | merged_tables = {start_table_obj.table_name} 12 | relationships = set() 13 | 14 | for i in range(no_relationships): 15 | 16 | possible_next_relationships = list() 17 | 18 | for relationship_obj in schema.relationships: 19 | # already in random relationships 20 | if relationship_obj.identifier in relationships: 21 | continue 22 | 23 | if relationship_obj.start in merged_tables and \ 24 | relationship_obj.end not in merged_tables: 25 | possible_next_relationships.append((relationship_obj.identifier, relationship_obj.end)) 26 | 27 | elif relationship_obj.end in merged_tables and \ 28 | relationship_obj.start not in merged_tables: 29 | possible_next_relationships.append((relationship_obj.identifier, relationship_obj.start)) 30 | 31 | random.shuffle(possible_next_relationships) 32 | if len(possible_next_relationships) == 0: 33 | return list(relationships), merged_tables 34 | 35 | relationship, 
table = possible_next_relationships[0] 36 | merged_tables.add(table) 37 | relationships.add(relationship) 38 | 39 | return list(relationships), merged_tables 40 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/evaluation/cardinality_evaluation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import perf_counter 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from ensemble_compilation.graph_representation import QueryType 8 | from ensemble_compilation.physical_db import DBConnection, TrueCardinalityEstimator 9 | from ensemble_compilation.spn_ensemble import read_ensemble 10 | from evaluation.utils import parse_query, save_csv 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def compute_ground_truth(query_filename, target_path, physical_db_name): 16 | """ 17 | Queries database for each query and stores result rows in csv file. 18 | :param query_filename: where to take queries from 19 | :param target_path: where to store dictionary 20 | :param physical_db_name: name of the database 21 | :return: 22 | """ 23 | 24 | db_connection = DBConnection(db=physical_db_name) 25 | 26 | # read all queries 27 | with open(query_filename) as f: 28 | queries = f.readlines() 29 | 30 | csv_rows = [] 31 | for query_no, query_str in enumerate(queries): 32 | logger.debug(f"Computing ground truth for cardinality query {query_no}: {query_str}") 33 | query_str = query_str.strip() 34 | cardinality_true = db_connection.get_result(query_str) 35 | 36 | csv_rows.append({'query_no': query_no, 37 | 'query': query_str, 38 | 'cardinality_true': cardinality_true}) 39 | 40 | save_csv(csv_rows, target_path) 41 | 42 | 43 | class GenCodeStats: 44 | 45 | def __init__(self): 46 | self.calls = 0 47 | self.total_time = 0.0 48 | 49 | 50 | def evaluate_cardinalities(ensemble_location, physical_db_name, query_filename, target_csv_path, schema, 51 | rdc_spn_selection, pairwise_rdc_path, use_generated_code=False, 52 | true_cardinalities_path='./benchmarks/job-light/sql/job_light_true_cardinalities.csv', 53 | max_variants=1, merge_indicator_exp=False, exploit_overlapping=False, min_sample_ratio=0): 54 | """ 55 | Loads ensemble and evaluates cardinality for every query in query_filename 56 | :param exploit_overlapping: 57 | :param min_sample_ratio: 58 | :param max_variants: 59 | :param merge_indicator_exp: 60 | :param target_csv_path: 61 | :param query_filename: 62 | :param true_cardinalities_path: 63 | :param ensemble_location: 64 | :param physical_db_name: 65 | :param schema: 66 | :return: 67 | """ 68 | if true_cardinalities_path is not None: 69 | df_true_card = pd.read_csv(true_cardinalities_path) 70 | else: 71 | # True cardinality via DB 72 | db_connection = DBConnection(db=physical_db_name) 73 | true_estimator = TrueCardinalityEstimator(schema, db_connection) 74 | 75 | # load ensemble 76 | spn_ensemble = read_ensemble(ensemble_location, build_reverse_dict=True) 77 | 78 | csv_rows = [] 79 | q_errors = [] 80 | 81 | # read all queries 82 | with open(query_filename) as f: 83 | queries = f.readlines() 84 | 85 | if use_generated_code: 86 | spn_ensemble.use_generated_code() 87 | 88 | latencies = [] 89 | for query_no, query_str in enumerate(queries): 90 | 91 | query_str = query_str.strip() 92 | logger.debug(f"Predicting cardinality for query {query_no}: {query_str}") 93 | 94 | query = parse_query(query_str.strip(), schema) 95 | assert query.query_type == QueryType.CARDINALITY 96 | 97 | if 
df_true_card is None: 98 | assert true_estimator is not None 99 | _, cardinality_true = true_estimator.true_cardinality(query) 100 | else: 101 | cardinality_true = df_true_card.loc[df_true_card['query_no'] == query_no, ['cardinality_true']].values[0][0] 102 | 103 | # only relevant for generated code 104 | gen_code_stats = GenCodeStats() 105 | 106 | card_start_t = perf_counter() 107 | _, factors, cardinality_predict, factor_values = spn_ensemble \ 108 | .cardinality(query, rdc_spn_selection=rdc_spn_selection, pairwise_rdc_path=pairwise_rdc_path, 109 | merge_indicator_exp=merge_indicator_exp, max_variants=max_variants, 110 | exploit_overlapping=exploit_overlapping, return_factor_values=True, 111 | gen_code_stats=gen_code_stats) 112 | card_end_t = perf_counter() 113 | latency_ms = (card_end_t - card_start_t) * 1000 114 | 115 | logger.debug(f"\t\tLatency: {latency_ms:.2f}ms") 116 | logger.debug(f"\t\tTrue: {cardinality_true}") 117 | logger.debug(f"\t\tPredicted: {cardinality_predict}") 118 | 119 | q_error = max(cardinality_predict / cardinality_true, cardinality_true / cardinality_predict) 120 | if cardinality_predict == 0 and cardinality_true == 0: 121 | q_error = 1.0 122 | 123 | logger.debug(f"Q-Error was: {q_error}") 124 | q_errors.append(q_error) 125 | csv_rows.append({'query_no': query_no, 126 | 'query': query_str, 127 | 'cardinality_predict': cardinality_predict, 128 | 'cardinality_true': cardinality_true, 129 | 'latency_ms': latency_ms, 130 | 'generated_spn_calls': gen_code_stats.calls, 131 | 'latency_generated_code': gen_code_stats.total_time * 1000}) 132 | latencies.append(latency_ms) 133 | 134 | # print percentiles of published JOB-light 135 | q_errors = np.array(q_errors) 136 | q_errors.sort() 137 | logger.info(f"{q_errors[-10:]}") 138 | # https://arxiv.org/pdf/1809.00677.pdf 139 | ibjs_vals = [1.59, 150, 3198, 14309, 590] 140 | mcsn_vals = [3.82, 78.4, 362, 927, 57.9] 141 | for i, percentile in enumerate([50, 90, 95, 99]): 142 | logger.info(f"Q-Error {percentile}%-Percentile: {np.percentile(q_errors, percentile)} (vs. " 143 | f"MCSN: {mcsn_vals[i]} and IBJS: {ibjs_vals[i]})") 144 | 145 | logger.info(f"Q-Mean wo inf {np.mean(q_errors[np.isfinite(q_errors)])} (vs. 
" 146 | f"MCSN: {mcsn_vals[-1]} and IBJS: {ibjs_vals[-1]})") 147 | logger.info(f"Latency avg: {np.mean(latencies):.2f}ms") 148 | 149 | # write to csv 150 | save_csv(csv_rows, target_csv_path) 151 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/evaluation/confidence_interval_evaluation.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import pickle 4 | from time import perf_counter 5 | 6 | import math 7 | import scipy 8 | 9 | from ensemble_compilation.graph_representation import AggregationType 10 | from ensemble_compilation.spn_ensemble import read_ensemble, logger 11 | from evaluation.utils import parse_query, all_operations_of_type, save_csv 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def evaluate_confidence_intervals(ensemble_location, query_filename, target_path, schema, ground_truth_path, 17 | confidence_sample_size, rdc_spn_selection, pairwise_rdc_path, 18 | max_variants=5, merge_indicator_exp=False, 19 | exploit_overlapping=False, min_sample_ratio=0, sample_size=10000000, 20 | true_result_upsampling_factor=300): # 100 21 | """ 22 | Loads ensemble and computes metrics for confidence interval evaluation 23 | :param ensemble_location: 24 | :param query_filename: 25 | :param target_csv_path: 26 | :param schema: 27 | :param max_variants: 28 | :param merge_indicator_exp: 29 | :param exploit_overlapping: 30 | :param min_sample_ratio: 31 | :return: 32 | """ 33 | 34 | spn_ensemble = read_ensemble(ensemble_location, build_reverse_dict=True) 35 | csv_rows = [] 36 | 37 | # read all queries 38 | with open(query_filename) as f: 39 | queries = f.readlines() 40 | # read ground truth 41 | with open(ground_truth_path, 'rb') as handle: 42 | ground_truth = pickle.load(handle) 43 | 44 | for query_no, query_str in enumerate(queries): 45 | 46 | query_str = query_str.strip() 47 | logger.info(f"Evaluating the confidence intervals for query {query_no}: {query_str}") 48 | 49 | query = parse_query(query_str.strip(), schema) 50 | aqp_start_t = perf_counter() 51 | confidence_intervals, aqp_result = spn_ensemble.evaluate_query(query, rdc_spn_selection=rdc_spn_selection, 52 | pairwise_rdc_path=pairwise_rdc_path, 53 | merge_indicator_exp=merge_indicator_exp, 54 | max_variants=max_variants, 55 | exploit_overlapping=exploit_overlapping, 56 | debug=False, 57 | confidence_intervals=True, 58 | confidence_sample_size=confidence_sample_size) 59 | aqp_end_t = perf_counter() 60 | latency = aqp_end_t - aqp_start_t 61 | logger.info(f"\t\t{'total_time:':<32}{latency} secs") 62 | 63 | true_result = ground_truth[query_no] 64 | 65 | type_all_ops = None 66 | if all_operations_of_type(AggregationType.SUM, query): 67 | type_all_ops = AggregationType.SUM 68 | elif all_operations_of_type(AggregationType.AVG, query): 69 | type_all_ops = AggregationType.AVG 70 | elif all_operations_of_type(AggregationType.COUNT, query): 71 | type_all_ops = AggregationType.COUNT 72 | 73 | if isinstance(aqp_result, list): 74 | for result_row in true_result: 75 | group_by_attributes = result_row[:-3] 76 | matching_aqp_rows = [(matching_idx, aqp_row) for matching_idx, aqp_row in enumerate(aqp_result) 77 | if aqp_row[:-1] == group_by_attributes] 78 | assert len(matching_aqp_rows) <= 1, "Multiple possible group by attributes found." 
79 | if len(matching_aqp_rows) == 1: 80 | matching_idx, matching_aqp_row = matching_aqp_rows[0] 81 | true_aggregate, std, count = result_row[-3:] 82 | 83 | if count <= 1: 84 | # std is not defined in this case 85 | continue 86 | 87 | interval = confidence_intervals[matching_idx] 88 | aqp_std, true_std, relative_confidence_interval_error, true_result, aqp_aggregate = evaluate_stds( 89 | matching_aqp_row[-1], 90 | interval, count, 91 | sample_size, std, 92 | true_aggregate, type_all_ops, 93 | true_result_upsampling_factor) 94 | 95 | logger.debug(f"\t\taqp_std: {aqp_std}") 96 | logger.debug(f"\t\ttrue_std: {true_std}") 97 | 98 | csv_rows.append({'query_no': query_no, 99 | 'latency': latency, 100 | 'aqp_std': aqp_std, 101 | 'aqp_aggregate': aqp_aggregate, 102 | 'true_std': true_std, 103 | 'true_aggregate': true_result, 104 | 'count': count, 105 | 'relative_confidence_interval_error': relative_confidence_interval_error 106 | }) 107 | else: 108 | true_aggregate, std, count = true_result[0][-3:] 109 | 110 | aqp_std, true_std, relative_confidence_interval_error, true_result, aqp_aggregate = evaluate_stds( 111 | aqp_result, confidence_intervals, 112 | count, sample_size, std, 113 | true_aggregate, 114 | type_all_ops, 115 | true_result_upsampling_factor) 116 | logger.debug(f"\t\taqp_std: {aqp_std}") 117 | logger.debug(f"\t\ttrue_std: {true_std}") 118 | 119 | csv_rows.append({'query_no': query_no, 120 | 'latency': latency, 121 | 'aqp_std': aqp_std, 122 | 'aqp_aggregate': aqp_aggregate, 123 | 'true_std': true_std, 124 | 'true_aggregate': true_result, 125 | 'count': count, 126 | 'relative_confidence_interval_error': relative_confidence_interval_error 127 | }) 128 | 129 | save_csv(csv_rows, target_path) 130 | 131 | 132 | def evaluate_stds(aqp_result, confidence_intervals, count, sample_size, std, true_result, type_all_ops, 133 | true_result_upsampling_factor): 134 | std = float(std) 135 | count = float(count) 136 | true_result = float(true_result) 137 | confidence_upper_bound = confidence_intervals[1] 138 | ci_length = confidence_upper_bound - aqp_result 139 | aqp_std = ci_length # / scipy.stats.norm.ppf(0.95) 140 | if type_all_ops == AggregationType.AVG: 141 | # for normal random variable std/sqrt(n) 142 | true_std = std / math.sqrt(count) 143 | 144 | elif type_all_ops == AggregationType.COUNT: 145 | # for bernoulli: sqrt(n*p*(1-p)) 146 | 147 | bernoulli_p = count / sample_size 148 | true_std = math.sqrt(sample_size * bernoulli_p * (1 - bernoulli_p)) * true_result_upsampling_factor 149 | true_result *= true_result_upsampling_factor 150 | 151 | elif type_all_ops == AggregationType.SUM: 152 | # model sum as product of 1_c * X 153 | 154 | bernoulli_p = count / sample_size 155 | bernoulli_std = math.sqrt(sample_size * bernoulli_p * (1 - bernoulli_p)) 156 | 157 | rv_exp = true_result / count 158 | rv_std = std / math.sqrt(count) 159 | 160 | true_std = math.sqrt((bernoulli_std ** 2 + bernoulli_p ** 2) * (rv_std ** 2 + rv_exp ** 2) - 161 | bernoulli_p ** 2 * rv_exp ** 2) * true_result_upsampling_factor 162 | true_result *= true_result_upsampling_factor 163 | 164 | true_std *= scipy.stats.norm.ppf(0.95) 165 | relative_confidence_interval_error = abs(aqp_std - true_std) / true_result 166 | return aqp_std, true_std, relative_confidence_interval_error, true_result, aqp_result 167 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/evaluation/spn_statistics.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 
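# --- Editor's note (added commentary, not part of the original file) ---
# This module summarizes learned SPN ensembles: it walks every file under spn_path whose
# name starts with "ensemble" (skipping .zip archives), records per-SPN statistics
# (learn time, sample size, min_instances_slice, rdc_threshold, join/table/column counts,
# and the node count against a rough upper bound of about 200 nodes per column), appends
# the HDF preprocessing time read from build_time_path, and writes all rows to a CSV at
# target_csv_path.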
import logging 3 | import os 4 | 5 | from spn.structure.Base import Node, get_nodes_by_type 6 | 7 | from ensemble_compilation.spn_ensemble import read_ensemble 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def evaluate_spn_statistics(spn_path, target_csv_path, build_time_path): 13 | csv_list = [] 14 | 15 | # SPN learn times 16 | for filename in os.listdir(spn_path): 17 | logger.debug(f'Reading {filename}') 18 | if not filename.startswith("ensemble") or filename.endswith('.zip'): 19 | continue 20 | 21 | spn_ensemble = read_ensemble(os.path.join(spn_path, filename)) 22 | for spn in spn_ensemble.spns: 23 | num_nodes = len(get_nodes_by_type(spn.mspn, Node)) 24 | upper_bound = 200 * len(spn.column_names) - 1 25 | # assert num_nodes <= upper_bound, "Num of nodes upper bound is wrong" 26 | csv_list.append((filename, spn.learn_time, spn.full_sample_size, spn.min_instances_slice, spn.rdc_threshold, 27 | len(spn.relationship_set), len(spn.table_set), 28 | " - ".join([table for table in spn.table_set]), 29 | len(spn.column_names), 30 | num_nodes, 31 | upper_bound)) 32 | 33 | # HDF create times 34 | with open(build_time_path) as f: 35 | hdf_preprocessing_time = int(f.readlines()[0]) 36 | csv_list += [('generate_hdf', hdf_preprocessing_time, 0, 0, 0, 0, 0, "")] 37 | 38 | with open(target_csv_path, 'w', newline='') as f: 39 | writer = csv.writer(f) 40 | writer.writerow( 41 | ['filename', 'learn_time', 'full_sample_size', 'min_instances_slice', 'rdc_threshold', 'no_joins', 42 | 'no_tables', 'tables', 'no_columns', 'structure_stats', 'upper_bound']) 43 | writer.writerows(csv_list) 44 | -------------------------------------------------------------------------------- /lecarb/estimator/estimator.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | import numpy as np 4 | from typing import Tuple, Any 5 | from ..workload.workload import Query, query_2_triple 6 | from ..dataset.dataset import Table 7 | 8 | L = logging.getLogger(__name__) 9 | 10 | class Estimator(object): 11 | """Base class for a cardinality estimator.""" 12 | def __init__(self, table: Table, **kwargs: Any) -> None: 13 | self.table = table 14 | self.params = dict(kwargs) 15 | 16 | def __repr__(self) -> str: 17 | pstr = ';'.join([f"{p}={v}" for p, v in self.params.items()]) 18 | return f"{self.__class__.__name__.lower()}-{pstr}" 19 | 20 | def query(self, query: Query) -> Tuple[float, float]: 21 | """return est_card, dur_ms""" 22 | raise NotImplementedError 23 | 24 | def in_between(data: Any, val: Tuple[Any, Any]) -> bool: 25 | assert len(val) == 2 26 | lrange, rrange = val 27 | return np.greater_equal(data, lrange) & np.less_equal(data, rrange) 28 | 29 | OPS = { 30 | '>': np.greater, 31 | '<': np.less, 32 | '>=': np.greater_equal, 33 | '<=': np.less_equal, 34 | '=': np.equal, 35 | '[]': in_between 36 | } 37 | 38 | class Oracle(Estimator): 39 | def __init__(self, table): 40 | super(Oracle, self).__init__(table=table) 41 | 42 | def query(self, query): 43 | columns, operators, values = query_2_triple(query, with_none=False, split_range=False) 44 | start_stmp = time.time() 45 | bitmap = np.ones(self.table.row_num, dtype=bool) 46 | for c, o, v in zip(columns, operators, values): 47 | bitmap &= OPS[o](self.table.data[c], v) 48 | card = bitmap.sum() 49 | dur_ms = (time.time() - start_stmp) * 1e3 50 | return card, dur_ms 51 | 52 | # from pandasql import sqldf <- too slow 53 | # def query(self, query): 54 | # sql = query_2_sql(query, self.table) 55 | # data = 
self.table.data 56 | # start_stmp = time.time() 57 | # df = sqldf(sql, locals()) 58 | # card = df.iloc[0, 0] 59 | # dur_ms = (time.time() - start_stmp) * 1e3 60 | # return card, dur_ms 61 | -------------------------------------------------------------------------------- /lecarb/estimator/feedback_kde.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | from typing import Any, Dict 4 | import psycopg2 5 | 6 | from .estimator import Estimator 7 | from .utils import run_test 8 | from ..workload.workload import query_2_kde_sql, load_queryset 9 | from ..dataset.dataset import load_table 10 | from ..constants import KDE_DATABASE_URL 11 | 12 | L = logging.getLogger(__name__) 13 | 14 | class FeedbackKDE(Estimator): 15 | def __init__(self, table, ratio, train_num, seed): 16 | super(FeedbackKDE, self).__init__(table=table, version=table.version, ratio=ratio, train_num=train_num, seed=seed) 17 | self.sample_num = int(table.row_num * ratio) 18 | L.info(f"Going to collect {self.sample_num} samples") 19 | 20 | self.conn = psycopg2.connect(KDE_DATABASE_URL) 21 | self.conn.set_session('read uncommitted', autocommit=True) 22 | self.cursor = self.conn.cursor() 23 | 24 | # Make sure that debug mode is deactivated and that all model traces are removed (unless we want to reuse the model): 25 | self.cursor.execute(f"SELECT setseed({1/seed});") 26 | # self.cursor.execute("SET kde_debug TO true;") 27 | self.cursor.execute("SET kde_debug TO false;") 28 | self.cursor.execute("SET ocl_use_gpu TO true;") 29 | self.cursor.execute("SET kde_error_metric TO Quadratic;") 30 | 31 | # Remove all existing model traces if we don't reuse the model. 32 | self.cursor.execute("DELETE FROM pg_kdemodels;") 33 | self.cursor.execute("DELETE FROM pg_kdefeedback;") 34 | self.cursor.execute("SELECT pg_stat_reset();") 35 | 36 | # KDE-specific parameters. 37 | self.cursor.execute(f"SET kde_samplesize TO {self.sample_num};") 38 | self.cursor.execute("SET kde_enable TO true;") 39 | self.cursor.execute("SET kde_collect_feedback TO true;") 40 | 41 | def train_batch(self, queries): 42 | for i, query in enumerate(queries): 43 | self.cursor.execute(query_2_kde_sql(query, self.table)) 44 | if (i + 1) % 100 == 0: 45 | L.info(f"{i+1} queries done") 46 | L.info("Finishing running all training queries") 47 | 48 | self.cursor.execute("SET kde_collect_feedback TO false;") # We don't need further feedback collection. 
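# --- Editor's note (added commentary, not part of the original file) ---
# At this point every training query has been executed once with kde_collect_feedback
# enabled, so the selectivity feedback now lives inside the (modified) PostgreSQL server.
# The statements below switch Feedback-KDE into optimization mode: bandwidth tuning is
# enabled over a feedback window spanning all training queries, the per-column statistics
# target is set, and ANALYZE is run so the server (presumably) builds its KDE model over
# the configured sample; the sample is then dumped to a CSV via kde_dump_sample.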
49 | self.cursor.execute("SET kde_enable_bandwidth_optimization TO true;") 50 | self.cursor.execute(f"SET kde_optimization_feedback_window TO {len(queries)};") 51 | 52 | stat_cnt = 100 53 | for c in self.table.columns.values(): 54 | self.cursor.execute(f"alter table \"{self.table.name}\" alter column {c.name} set statistics {stat_cnt};") 55 | 56 | self.cursor.execute(f"analyze \"{self.table.name}\"({','.join(self.table.columns.keys())});") 57 | 58 | sample_file = f"/tmp/sample_{self.table.name}.csv" 59 | self.cursor.execute(f"SELECT kde_dump_sample('{self.table.name}', '{sample_file}');") 60 | 61 | def query(self, query): 62 | sql = f"explain(format json) {query_2_kde_sql(query, self.table)}" 63 | 64 | start_stmp = time.time() 65 | self.cursor.execute(sql) 66 | dur_ms = (time.time() - start_stmp) * 1e3 67 | res = self.cursor.fetchall() 68 | card = res[0][0][0]['Plan']['Plan Rows'] 69 | # L.info(card) 70 | return card, dur_ms 71 | 72 | def test_kde(seed: int, dataset: str, version: str, workload:str, params: Dict[str, Any], overwrite: bool): 73 | """ 74 | params: 75 | version: the version of table that postgres construct statistics, might not be the same with the one we test on 76 | ratio: ratio of the sample size 77 | train_num: number of queries use to train 78 | """ 79 | # prioriy: params['version'] (build statistics from another dataset) > version (build statistics on the same dataset) 80 | table = load_table(dataset, params.get('version') or version) 81 | train_num = params['train_num'] 82 | 83 | L.info("load training workload...") 84 | queries = load_queryset(dataset, workload)['train'][:train_num] 85 | 86 | L.info("construct postgres estimator...") 87 | estimator = FeedbackKDE(table, ratio=params['ratio'], train_num=train_num, seed=seed) 88 | 89 | L.info(f"start training with {train_num} queries...") 90 | start_stmp = time.time() 91 | estimator.train_batch(queries) 92 | dur_min = (time.time() - start_stmp) / 60 93 | L.info(f"built kde estimator: {estimator}, using {dur_min:1f} minutes") 94 | 95 | run_test(dataset, version, workload, estimator, overwrite) 96 | 97 | 98 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/README.md: -------------------------------------------------------------------------------- 1 | Implementation of paper [Selectivity Estimation for Range Predicates using Lightweight Models](http://www.vldb.org/pvldb/vol12/p1044-dutt.pdf) 2 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import logging 4 | 5 | from ..postgres import Postgres 6 | from ...workload.workload import load_queryset, load_labels, query_2_sqls, query_2_vector 7 | from ...constants import DATA_ROOT, PKL_PROTO 8 | 9 | L = logging.getLogger(__name__) 10 | 11 | # selectivity_list (np.array): selectivity for each attribute 12 | def AVI(sel_list): 13 | return np.prod(sel_list) if len(sel_list) > 0 else 1.0 14 | 15 | def EBO(sel_list): 16 | s = 1.0 17 | sorted_slist = np.sort(sel_list) 18 | for i in range(min(4, sel_list.size)): 19 | s = s * np.power(sorted_slist[i], 1 / (i+1)) 20 | return s 21 | 22 | def MinSel(sel_list): 23 | return sel_list.min() if len(sel_list) > 0 else 1.0 24 | 25 | def encode_query(table, query, pg_est): 26 | range_features = query_2_vector(query, table, upper=1000) 27 | sqls = query_2_sqls(query, table) 28 | sel_list = [] 29 | for sql in 
sqls: 30 | pred, _ = pg_est.query_sql(sql) 31 | sel_list.append(pred / table.row_num) 32 | sel_list = np.array(sel_list) 33 | ce_features = np.round(np.array([AVI(sel_list), EBO(sel_list), MinSel(sel_list)]) * table.row_num) 34 | 35 | return np.concatenate([range_features, encode_label(ce_features)]) 36 | 37 | def encode_label(label): 38 | # +1 before log2 to deal with ground truth = 0 scenario 39 | return np.log2(label + 1) 40 | 41 | def decode_label(label): 42 | return np.power(2, label) - 1 43 | 44 | def encode_queries(table, queryset, labels, pg_est): 45 | X = [] 46 | y = [] 47 | gt = [] 48 | 49 | for query, label in zip(queryset, labels): 50 | features = encode_query(table, query, pg_est) 51 | log2l = encode_label(label.cardinality) 52 | X.append(features) 53 | y.append(log2l) 54 | gt.append(label.cardinality) 55 | 56 | return np.array(X), np.array(y), np.array(gt) 57 | 58 | def load_lw_dataset(table, workload, seed, bins): 59 | query_path = DATA_ROOT / table.dataset / "lw" 60 | query_path.mkdir(exist_ok=True) 61 | 62 | file_path = query_path / f"{table.version}_{workload}_{bins}_{seed}.pkl" 63 | if file_path.is_file(): 64 | L.info(f"features already built in file {file_path}") 65 | with open(file_path, 'rb') as f: 66 | return pickle.load(f) 67 | 68 | pg_est = Postgres(table, bins, seed) 69 | L.info(f"Start loading queryset:{workload} and labels for version {table.version} of dataset {table.dataset}...") 70 | queryset = load_queryset(table.dataset, workload) 71 | labels = load_labels(table.dataset, table.version, workload) 72 | 73 | lw_dataset = {} 74 | for group in queryset.keys(): 75 | L.info(f"Start encode group: {group} with {len(labels[group])} queries...") 76 | lw_dataset[group] = encode_queries(table, queryset[group], labels[group], pg_est) 77 | 78 | with open(file_path, 'wb') as f: 79 | pickle.dump(lw_dataset, f, protocol=PKL_PROTO) 80 | 81 | return lw_dataset 82 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/lw_nn.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | from typing import Dict, Any, Tuple 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | from torch.utils.data import DataLoader, Dataset 9 | 10 | from .model import LWNNModel 11 | from .common import load_lw_dataset, encode_query, decode_label 12 | from ..postgres import Postgres 13 | from ..estimator import Estimator 14 | from ..utils import report_model, evaluate, run_test 15 | from ...dataset.dataset import load_table 16 | from ...workload.workload import Query 17 | from ...constants import DEVICE, MODEL_ROOT, NUM_THREADS 18 | 19 | L = logging.getLogger(__name__) 20 | 21 | class Args: 22 | def __init__(self, **kwargs): 23 | self.bs = 32 24 | self.epochs = 500 25 | self.lr = 0.001 # default value in both pytorch and keras 26 | self.hid_units = '128_64_32' 27 | self.bins = 200 28 | self.train_num = 10000 29 | 30 | # overwrite parameters from user 31 | self.__dict__.update(kwargs) 32 | 33 | class LWQueryDataset(Dataset): 34 | def __init__(self, X, y, gt): 35 | super(LWQueryDataset, self).__init__() 36 | self.X = X 37 | self.y = y 38 | self.gt = gt 39 | def __len__(self): 40 | return len(self.y) 41 | def __getitem__(self, idx): 42 | return self.X[idx], self.y[idx], self.gt[idx] 43 | 44 | def make_dataset(dataset, num=-1): 45 | X, y, gt = dataset 46 | L.info(f"{X.shape}, {y.shape}, {gt.shape}") 47 | if num <= 0: 48 | return LWQueryDataset(X, y, gt) 49 | else: 50 | 
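# --- Editor's note (added commentary, not part of the original file) ---
# A positive `num` keeps only the first `num` encoded queries as the training/validation
# budget. Each vector was produced by encode_query() in common.py: the range features from
# query_2_vector (presumably two normalized bounds per column, which would match
# fea_num = table.col_num*2+3 used below) followed by the three log-scaled combiner
# estimates AVI, EBO and MinSel derived from per-attribute Postgres selectivity estimates.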
return LWQueryDataset(X[:num], y[:num], gt[:num]) 51 | 52 | def train_lw_nn(seed, dataset, version, workload, params, sizelimit): 53 | # uniform thread number 54 | torch.set_num_threads(NUM_THREADS) 55 | assert NUM_THREADS == torch.get_num_threads(), torch.get_num_threads() 56 | L.info(f"torch threads: {torch.get_num_threads()}") 57 | 58 | torch.manual_seed(seed) 59 | np.random.seed(seed) 60 | 61 | # convert parameter dict of lw(nn) 62 | L.info(f"params: {params}") 63 | args = Args(**params) 64 | 65 | table = load_table(dataset, version) 66 | 67 | # create model 68 | fea_num = table.col_num*2+3 69 | model = LWNNModel(fea_num, args.hid_units).to(DEVICE) 70 | model_size = report_model(model) 71 | 72 | # check size limit 73 | if sizelimit > 0 and model_size > (sizelimit * table.data_size_mb): 74 | L.info(f"Exceeds size limit {model_size:.2f}MB > {sizelimit} x {table.data_size_mb}, do not conintue training!") 75 | return 76 | L.info(f'Overall LWNN model size = {model_size:.2f}MB') 77 | 78 | # load dataset 79 | dataset = load_lw_dataset(table, workload, seed, args.bins) 80 | train_dataset = make_dataset(dataset['train'], num=args.train_num) 81 | valid_dataset = make_dataset(dataset['valid'], num=args.train_num//10) 82 | 83 | L.info(f"Number of training samples: {len(train_dataset)}") 84 | L.info(f"Number of validation samples: {len(valid_dataset)}") 85 | train_loader = DataLoader(train_dataset, batch_size=args.bs) 86 | valid_loader = DataLoader(valid_dataset, batch_size=args.bs) 87 | 88 | # Train model 89 | state = { 90 | 'seed': seed, 91 | 'args': args, 92 | 'device': DEVICE, 93 | 'threads': torch.get_num_threads(), 94 | 'dataset': table.dataset, 95 | 'version': table.version, 96 | 'workload': workload, 97 | 'model_size': model_size, 98 | 'fea_num': fea_num, 99 | } 100 | model_path = MODEL_ROOT / table.dataset 101 | model_path.mkdir(parents=True, exist_ok=True) 102 | model_file = model_path / f"{table.version}_{workload}-{model.name()}_bin{args.bins}_ep{args.epochs}_bs{args.bs}_{args.train_num//1000}k-{seed}.pt" 103 | 104 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 105 | mse_loss = nn.MSELoss(reduction='none') 106 | best_valid_loss = float('inf') 107 | 108 | start_stmp = time.time() 109 | valid_time = 0 110 | for epoch in range(args.epochs): 111 | train_loss = torch.tensor([]) 112 | model.train() 113 | for _, data in enumerate(train_loader): 114 | inputs, labels, _ = data 115 | inputs = inputs.to(DEVICE).float() 116 | labels = labels.to(DEVICE).float() 117 | 118 | optimizer.zero_grad() 119 | preds = model(inputs).reshape(-1) 120 | 121 | loss = mse_loss(preds, labels) 122 | loss.mean().backward() 123 | optimizer.step() 124 | train_loss = torch.cat([train_loss, loss.cpu()]) 125 | dur_min = (time.time() - start_stmp) / 60 126 | L.info(f"Epoch {epoch+1}, loss: {train_loss.mean()}, time since start: {dur_min:.1f} mins") 127 | 128 | L.info(f"Test on valid set...") 129 | valid_stmp = time.time() 130 | valid_loss = torch.tensor([]) 131 | valid_preds = torch.tensor([]) 132 | valid_gts = torch.tensor([]) 133 | model.eval() 134 | for _, data in enumerate(valid_loader): 135 | inputs, labels, gts = data 136 | inputs = inputs.to(DEVICE).float() 137 | labels = labels.to(DEVICE).float() 138 | 139 | with torch.no_grad(): 140 | preds = model(inputs).reshape(-1) 141 | valid_preds = torch.cat([valid_preds, preds.cpu()]) 142 | valid_gts = torch.cat([valid_gts, gts.float()]) 143 | 144 | loss = mse_loss(preds, labels) 145 | valid_loss = torch.cat([valid_loss, loss.cpu()]) 146 | 147 | valid_loss = 
valid_loss.mean() 148 | L.info(f'Valid loss is {valid_loss:.4f}') 149 | valid_preds = np.maximum(np.round(decode_label(valid_preds)), 0.0) 150 | L.info("Q-Error on validation set:") 151 | _, metrics = evaluate(valid_preds, valid_gts) 152 | 153 | if valid_loss < best_valid_loss: 154 | L.info('best valid loss for now!') 155 | best_valid_loss = valid_loss 156 | state['model_state_dict'] = model.state_dict() 157 | state['optimizer_state_dict'] = optimizer.state_dict() 158 | state['valid_error'] = {workload: metrics} 159 | state['train_time'] = (valid_stmp-start_stmp-valid_time) / 60 160 | state['current_epoch'] = epoch 161 | torch.save(state, model_file) 162 | 163 | valid_time += time.time() - valid_stmp 164 | 165 | L.info(f"Training finished! Time spent since start: {(time.time()-start_stmp)/60:.2f} mins") 166 | L.info(f"Model saved to {model_file}, best valid: {state['valid_error']}") 167 | 168 | class LWNN(Estimator): 169 | def __init__(self, model, model_name, pg_est, table): 170 | super(LWNN, self).__init__(table=table, model=model_name) 171 | self.model = model.to(DEVICE) 172 | self.model.eval() 173 | self.pg_est = pg_est 174 | 175 | def query(self, query): 176 | if isinstance(query, Query): 177 | query = encode_query(self.table, query, self.pg_est) 178 | return self.query_vector(query) 179 | 180 | def query_vector(self, vec): 181 | start_stmp = time.time() 182 | with torch.no_grad(): 183 | pred = self.model(torch.FloatTensor(vec).to(DEVICE)).cpu().item() 184 | dur_ms = (time.time() - start_stmp) * 1e3 185 | return np.maximum(np.round(decode_label(pred)), 0.0), dur_ms 186 | 187 | def load_lw_nn(dataset: str, model_name: str) -> Tuple[Estimator, Dict[str, Any]]: 188 | model_file = MODEL_ROOT / dataset / f"{model_name}.pt" 189 | L.info(f"load model from {model_file} ...") 190 | state = torch.load(model_file, map_location=DEVICE) 191 | args = state['args'] 192 | 193 | table = load_table(dataset, state['version']) 194 | # load model 195 | model = LWNNModel(state['fea_num'], args.hid_units).to(DEVICE) 196 | report_model(model) 197 | L.info(f"Overall LWNN model size = {state['model_size']:.2f}MB") 198 | model.load_state_dict(state['model_state_dict']) 199 | pg_est = Postgres(table, args.bins, state['seed']) 200 | 201 | estimator = LWNN(model, model_name, pg_est, table) 202 | return estimator, state 203 | 204 | def test_lw_nn(dataset: str, version: str, workload: str, params: Dict[str, Any], overwrite: bool) -> None: 205 | """ 206 | params: 207 | model: model file name 208 | use_cache: load processed vectors directly instead of build from queries 209 | """ 210 | # uniform thread number 211 | torch.set_num_threads(NUM_THREADS) 212 | assert NUM_THREADS == torch.get_num_threads(), torch.get_num_threads() 213 | L.info(f"Torch threads: {torch.get_num_threads()}") 214 | 215 | model_file = MODEL_ROOT / dataset / f"{params['model']}.pt" 216 | L.info(f"Load model from {model_file} ...") 217 | state = torch.load(model_file, map_location=DEVICE) 218 | args = state['args'] 219 | 220 | # load corresonding version of table 221 | table = load_table(dataset, state['version']) 222 | 223 | # load model 224 | model = LWNNModel(state['fea_num'], args.hid_units).to(DEVICE) 225 | report_model(model) 226 | L.info(f"Overall LWNN model size = {state['model_size']:.2f}MB") 227 | model.load_state_dict(state['model_state_dict']) 228 | 229 | if params['use_cache']: 230 | # do not need to connect postgres in this case 231 | estimator = LWNN(model, params['model'], None, table) 232 | L.info(f"Load and build lw(nn) estimator: 
{estimator}") 233 | 234 | # test table might has different version with train 235 | test_table = load_table(dataset, version) 236 | lw_dataset = load_lw_dataset(test_table, workload, state['seed'], args.bins) 237 | X, _, gt = lw_dataset['test'] 238 | run_test(dataset, version, workload, estimator, overwrite, lw_vec=(X, gt)) 239 | else: 240 | pg_est = Postgres(table, args.bins, state['seed']) 241 | estimator = LWNN(model, params['model'], pg_est, table) 242 | L.info(f"Load and build lw(nn) estimator: {estimator}") 243 | 244 | run_test(dataset, version, workload, estimator, overwrite) 245 | 246 | 247 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/lw_tree.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | from typing import Dict, Any, Tuple 4 | import pickle 5 | 6 | import numpy as np 7 | import xgboost as xgb 8 | 9 | from .common import load_lw_dataset, encode_query, decode_label 10 | from ..postgres import Postgres 11 | from ..estimator import Estimator 12 | from ..utils import evaluate, run_test 13 | from ...dataset.dataset import load_table 14 | from ...workload.workload import Query 15 | from ...constants import MODEL_ROOT, NUM_THREADS, PKL_PROTO 16 | 17 | L = logging.getLogger(__name__) 18 | 19 | class Args: 20 | def __init__(self, **kwargs): 21 | self.trees = 16 22 | self.bins = 200 23 | self.train_num = 10000 24 | 25 | # overwrite parameters from user 26 | self.__dict__.update(kwargs) 27 | 28 | def train_lw_tree(seed, dataset, version, workload, params, sizelimit): 29 | np.random.seed(seed) 30 | 31 | # convert parameter dict of lw(nn) 32 | L.info(f"params: {params}") 33 | args = Args(**params) 34 | valid_num = args.train_num // 10 35 | 36 | table = load_table(dataset, version) 37 | dataset = load_lw_dataset(table, workload, seed, args.bins) 38 | train_X, train_y, _ = dataset['train'] 39 | valid_X, valid_y, valid_gt = dataset['valid'] 40 | 41 | # Train model 42 | model_path = MODEL_ROOT / table.dataset 43 | model_path.mkdir(parents=True, exist_ok=True) 44 | model_file = model_path / f"{table.version}_{workload}-lwxgb_tr{args.trees}_bin{args.bins}_{args.train_num//1000}k-{seed}.pkl" 45 | 46 | L.info(f"Start training...") 47 | start_stmp = time.time() 48 | model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=args.trees, random_state=seed, n_jobs=NUM_THREADS) 49 | model.fit(train_X[:args.train_num], train_y[:args.train_num], eval_set=[(valid_X[:valid_num], valid_y[:valid_num])]) 50 | dur_min = (time.time() - start_stmp) / 60 51 | L.info(f"Finish training, time since start: {dur_min:.4f} mins") 52 | 53 | L.info(f"Run on valid set...") 54 | preds = np.maximum(np.round(decode_label(model.predict(valid_X[:valid_num]))), 0.0) 55 | gts = valid_gt[:valid_num] 56 | L.info("Q-Error on validation set:") 57 | _, metrics = evaluate(preds, gts) 58 | 59 | state = { 60 | 'seed': seed, 61 | 'args': args, 62 | 'device': 'cpu', 63 | 'threads': NUM_THREADS, 64 | 'dataset': table.dataset, 65 | 'version': table.version, 66 | 'workload': workload, 67 | 'model': model, 68 | 'train_time': dur_min, 69 | 'valid_error': {workload: metrics} 70 | # 'model_size': model_size, 71 | } 72 | with open(model_file, 'wb') as f: 73 | pickle.dump(state, f, protocol=PKL_PROTO) 74 | 75 | L.info(f'All finished! 
Time spent since training start: {(time.time()-start_stmp)/60:.2f} mins') 76 | L.info(f"Model saved to {model_file}") 77 | 78 | class LWTree(Estimator): 79 | def __init__(self, model, model_name, pg_est, table): 80 | super(LWTree, self).__init__(table=table, model=model_name) 81 | self.model = model 82 | self.pg_est = pg_est 83 | 84 | def query(self, query): 85 | if isinstance(query, Query): 86 | query = encode_query(self.table, query, self.pg_est) 87 | return self.query_vector(np.expand_dims(query, axis=0)) 88 | 89 | def query_vector(self, vec): 90 | start_stmp = time.time() 91 | pred = self.model.predict(vec).item() 92 | dur_ms = (time.time() - start_stmp) * 1e3 93 | return np.maximum(np.round(decode_label(pred)), 0.0), dur_ms 94 | 95 | 96 | def load_lw_tree(dataset: str, model_name: str) -> Tuple[Estimator, Dict[str, Any]]: 97 | model_file = MODEL_ROOT / dataset / f"{model_name}.pkl" 98 | L.info(f"load model from {model_file} ...") 99 | with open(model_file, 'rb') as f: 100 | state = pickle.load(f) 101 | 102 | # load model 103 | args = state['args'] 104 | model = state['model'] 105 | table = load_table(dataset, state['version']) 106 | pg_est = Postgres(table, args.bins, state['seed']) 107 | 108 | estimator = LWTree(model, model_name, pg_est, table) 109 | return estimator, state 110 | 111 | def test_lw_tree(dataset: str, version: str, workload: str, params: Dict[str, Any], overwrite: bool) -> None: 112 | """ 113 | params: 114 | model: model file name 115 | use_cache: load processed vectors directly instead of build from queries 116 | """ 117 | # uniform thread number 118 | model_file = MODEL_ROOT / dataset / f"{params['model']}.pkl" 119 | L.info(f"Load model from {model_file} ...") 120 | with open(model_file, 'rb') as f: 121 | state = pickle.load(f) 122 | 123 | # load corresonding version of table 124 | table = load_table(dataset, state['version']) 125 | 126 | # load model 127 | args = state['args'] 128 | model = state['model'] 129 | pg_est = Postgres(table, args.bins, state['seed']) 130 | estimator = LWTree(model, params['model'], pg_est, table) 131 | 132 | L.info(f"Load and built lw(tree) estimator: {estimator}") 133 | if params['use_cache']: 134 | # test table might has different version with train 135 | test_table = load_table(dataset, version) 136 | lw_dataset = load_lw_dataset(test_table, workload, state['seed'], args.bins) 137 | X, _, gt = lw_dataset['test'] 138 | run_test(dataset, version, workload, estimator, overwrite, lw_vec=(X, gt)) 139 | else: 140 | run_test(dataset, version, workload, estimator, overwrite) 141 | 142 | 143 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class LWNNLayer(nn.Module): 4 | def __init__(self, input_len, output_len): 5 | super().__init__() 6 | self.layer = nn.Sequential( 7 | nn.Linear(input_len, output_len), 8 | nn.ReLU(inplace=True), 9 | ) 10 | 11 | def forward(self, X): 12 | return self.layer(X) 13 | 14 | class LWNNModel(nn.Module): 15 | def __init__(self, input_len, hid_units): 16 | super().__init__() 17 | self.hid_units = hid_units 18 | 19 | self.hid_layers = nn.Sequential() 20 | for l, output_len in enumerate([int(u) for u in hid_units.split('_')]): 21 | self.hid_layers.add_module('layer_{}'.format(l), LWNNLayer(input_len, output_len)) 22 | input_len = output_len 23 | 24 | self.final = nn.Linear(input_len, 1) 25 | 26 | def forward(self, X): 27 | mid_out = 
self.hid_layers(X) 28 | pred = self.final(mid_out) 29 | 30 | return pred 31 | 32 | def name(self): 33 | return f"lwnn_hid{self.hid_units}" 34 | -------------------------------------------------------------------------------- /lecarb/estimator/mscn/README.md: -------------------------------------------------------------------------------- 1 | Paper: [Learned Cardinalities: Estimating Correlated Joins with Deep Learning](https://arxiv.org/pdf/1809.00677.pdf) 2 | Code Reference: [repo](https://github.com/andreaskipf/learnedcardinalities) 3 | -------------------------------------------------------------------------------- /lecarb/estimator/mscn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # Define model architecture 6 | # removed all join related components since we only do cardinality estimation on single table 7 | 8 | class SetConv(nn.Module): 9 | def __init__(self, sample_feats, predicate_feats, hid_units): 10 | super(SetConv, self).__init__() 11 | self.sample_feats = sample_feats 12 | self.hid_units = hid_units 13 | 14 | self.sample_mlp1 = nn.Linear(sample_feats, hid_units) 15 | self.sample_mlp2 = nn.Linear(hid_units, hid_units) 16 | self.predicate_mlp1 = nn.Linear(predicate_feats, hid_units) 17 | self.predicate_mlp2 = nn.Linear(hid_units, hid_units) 18 | self.out_mlp1 = nn.Linear(hid_units * 2, hid_units) 19 | self.out_mlp2 = nn.Linear(hid_units, 1) 20 | 21 | def forward(self, samples, predicates, sample_mask, predicate_mask): 22 | # samples has shape [batch_size x num_joins+1 x sample_feats] 23 | # predicates has shape [batch_size x num_predicates x predicate_feats] 24 | # joins has shape [batch_size x num_joins x join_feats] 25 | 26 | hid_sample = F.relu(self.sample_mlp1(samples)) 27 | hid_sample = F.relu(self.sample_mlp2(hid_sample)) 28 | hid_sample = hid_sample * sample_mask # Mask 29 | hid_sample = torch.sum(hid_sample, dim=1, keepdim=False) 30 | sample_norm = sample_mask.sum(1, keepdim=False) 31 | hid_sample = hid_sample / sample_norm # Calculate average only over non-masked parts 32 | 33 | hid_predicate = F.relu(self.predicate_mlp1(predicates)) 34 | hid_predicate = F.relu(self.predicate_mlp2(hid_predicate)) 35 | hid_predicate = hid_predicate * predicate_mask 36 | hid_predicate = torch.sum(hid_predicate, dim=1, keepdim=False) 37 | predicate_norm = predicate_mask.sum(1, keepdim=False) 38 | hid_predicate = hid_predicate / predicate_norm 39 | 40 | hid = torch.cat((hid_sample, hid_predicate), 1) 41 | hid = F.relu(self.out_mlp1(hid)) 42 | out = torch.sigmoid(self.out_mlp2(hid)) 43 | return out 44 | 45 | def name(self): 46 | return f"mscn_hid{self.hid_units}_sample{self.sample_feats}" 47 | -------------------------------------------------------------------------------- /lecarb/estimator/mysql.py: -------------------------------------------------------------------------------- 1 | import time 2 | import mysql.connector 3 | import logging 4 | from typing import Any, Dict 5 | import numpy as np 6 | 7 | from .estimator import Estimator 8 | from .utils import run_test 9 | from ..workload.workload import query_2_sql 10 | from ..dataset.dataset import load_table 11 | from ..constants import MYSQL_HOST, MYSQL_PORT, MYSQL_DB, MYSQL_USER, MYSQL_PSWD 12 | 13 | L = logging.getLogger(__name__) 14 | 15 | class MySQL(Estimator): 16 | def __init__(self, table, bucket, seed): 17 | super(MySQL, self).__init__(table=table, version=table.version, bucket=bucket, seed=seed) 18 | 
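# --- Editor's note (added commentary, not part of the original file) ---
# The constructor connects with the MYSQL_* constants imported above and then builds
# histograms on every column via ANALYZE TABLE ... UPDATE HISTOGRAM ... WITH <bucket> BUCKETS.
# At estimation time (query() below), EXPLAIN is run on the translated predicate and the
# cardinality is derived as round(0.01 * res[0][10] * row_num), where res[0][10] is
# presumably the "filtered" percentage column of MySQL's EXPLAIN output.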
19 | self.conn = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PSWD, host=MYSQL_HOST, port=MYSQL_PORT, database=MYSQL_DB) 20 | self.conn.autocommit = True 21 | self.cursor = self.conn.cursor() 22 | 23 | # construct statistics 24 | start_stmp = time.time() 25 | self.cursor.execute(f"analyze table `{self.table.name}` update histogram on " 26 | f"{','.join([c.name for c in table.columns.values()])} " 27 | f"with {bucket} buckets;") 28 | rows = self.cursor.fetchall() 29 | L.info(f"{rows}") 30 | dur_min = (time.time() - start_stmp) / 60 31 | 32 | L.info(f"construct statistics finished, using {dur_min:.4f} minutes") 33 | 34 | def query(self, query): 35 | sql = 'explain {}'.format(query_2_sql(query, self.table, aggregate=False, dbms='mysql')) 36 | # L.info('sql: {}'.format(sql)) 37 | 38 | start_stmp = time.time() 39 | self.cursor.execute(sql) 40 | dur_ms = (time.time() - start_stmp) * 1e3 41 | res = self.cursor.fetchall() 42 | assert len(res) == 1, res 43 | # test 1 44 | card = np.round(0.01 * res[0][10] * self.table.row_num) 45 | # test 2 46 | # card = np.round(0.01 * res[0][10] * res[0][9]) 47 | # L.info(card) 48 | return card, dur_ms 49 | 50 | def test_mysql(seed: int, dataset: str, version: str, workload:str, params: Dict[str, Any], overwrite: bool): 51 | """ 52 | params: 53 | version: the version of table that mysql construct statistics, might not be the same with the one we test on 54 | bucket: number of bucket for each histogram 55 | """ 56 | # prioriy: params['version'] (build statistics from another dataset) > version (build statistics on the same dataset) 57 | table = load_table(dataset, params.get('version') or version) 58 | 59 | L.info("construct mysql estimator...") 60 | estimator = MySQL(table, params['bucket'], seed=seed) 61 | L.info(f"built mysql estimator: {estimator}") 62 | 63 | run_test(dataset, version, workload, estimator, overwrite) 64 | 65 | 66 | -------------------------------------------------------------------------------- /lecarb/estimator/naru/README.md: -------------------------------------------------------------------------------- 1 | Paper: [Deep Unsupervised Cardinality Estimation](http://www.vldb.org/pvldb/vol13/p279-yang.pdf) 2 | Code Reference: [repo](https://github.com/naru-project/naru) 3 | -------------------------------------------------------------------------------- /lecarb/estimator/postgres.py: -------------------------------------------------------------------------------- 1 | import time 2 | import psycopg2 3 | import logging 4 | from typing import Any, Dict 5 | 6 | from .estimator import Estimator 7 | from .utils import run_test 8 | from ..workload.workload import query_2_sql 9 | from ..dataset.dataset import load_table 10 | from ..constants import DATABASE_URL 11 | 12 | L = logging.getLogger(__name__) 13 | 14 | class Postgres(Estimator): 15 | def __init__(self, table, stat_target, seed): 16 | super(Postgres, self).__init__(table=table, version=table.version, stat=stat_target, seed=seed) 17 | 18 | self.conn = psycopg2.connect(DATABASE_URL) 19 | self.conn.autocommit = True 20 | self.cursor = self.conn.cursor() 21 | 22 | # construct statistics 23 | start_stmp = time.time() 24 | self.cursor.execute('select setseed({});'.format(1 / seed)) 25 | for c in table.columns.values(): 26 | self.cursor.execute('alter table \"{}\" alter column {} set statistics {};'.format( 27 | table.name, c.name, stat_target)) 28 | self.cursor.execute('analyze \"{}\";'.format(self.table.name)) 29 | self.conn.commit() 30 | dur_min = (time.time() - start_stmp) / 60 31 | 
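# --- Editor's note (added commentary, not part of the original file) ---
# The loop above sets each column's statistics target to `stat_target` (PostgreSQL's
# default_statistics_target is 100, so larger values yield larger MCV lists and histograms)
# and ANALYZE rebuilds pg_stats accordingly; the block below measures how much space those
# statistics occupy by summing pg_column_size over the table's pg_stats rows.
#
# A minimal usage sketch (hypothetical dataset/version names, not from the original code):
#   table = load_table('census13', 'original')
#   est = Postgres(table, stat_target=10000, seed=123)
#   card, dur_ms = est.query(q)   # q is a lecarb workload Query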
32 | # get size 33 | self.cursor.execute('select sum(pg_column_size(pg_stats)) from pg_stats where tablename=\'{}\''.format(self.table.name)) 34 | size = self.cursor.fetchall()[0][0] 35 | # self.cursor.execute('select sum(pg_column_size(pg_stats_ext)) from pg_stats_ext where tablename=\'{}\''.format(self.table.name)) 36 | # res = self.cursor.fetchall()[0][0] 37 | # might not have content in ext table 38 | # if res is not None: 39 | # size += res 40 | size = size / 1024 / 1024 # MB 41 | 42 | L.info(f"construct statistics finished, using {dur_min:.4f} minutes, All statistics consumes {size:.2f} MBs") 43 | 44 | def query(self, query): 45 | sql = 'explain(format json) {}'.format(query_2_sql(query, self.table, aggregate=False)) 46 | # L.info('sql: {}'.format(sql)) 47 | 48 | start_stmp = time.time() 49 | self.cursor.execute(sql) 50 | dur_ms = (time.time() - start_stmp) * 1e3 51 | res = self.cursor.fetchall() 52 | card = res[0][0][0]['Plan']['Plan Rows'] 53 | # L.info(card) 54 | return card, dur_ms 55 | 56 | def query_sql(self, sql): 57 | sql = 'explain(format json) {}'.format(sql) 58 | # L.info('sql: {}'.format(sql)) 59 | 60 | start_stmp = time.time() 61 | self.cursor.execute(sql) 62 | res = self.cursor.fetchall() 63 | card = res[0][0][0]['Plan']['Plan Rows'] 64 | # L.info(card) 65 | dur_ms = (time.time() - start_stmp) * 1e3 66 | return card, dur_ms 67 | 68 | def test_postgres(seed: int, dataset: str, version: str, workload:str, params: Dict[str, Any], overwrite: bool): 69 | """ 70 | params: 71 | version: the version of table that postgres construct statistics, might not be the same with the one we test on 72 | stat_target: size of the statistics limit 73 | """ 74 | # prioriy: params['version'] (build statistics from another dataset) > version (build statistics on the same dataset) 75 | table = load_table(dataset, params.get('version') or version) 76 | 77 | L.info("construct postgres estimator...") 78 | estimator = Postgres(table, stat_target=params['stat_target'], seed=seed) 79 | L.info(f"built postgres estimator: {estimator}") 80 | 81 | run_test(dataset, version, workload, estimator, overwrite) 82 | 83 | 84 | -------------------------------------------------------------------------------- /lecarb/estimator/sample.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | import logging 4 | from typing import Any, Dict 5 | import numpy as np 6 | from .estimator import Estimator, OPS 7 | from .utils import run_test 8 | from ..workload.workload import query_2_triple 9 | from ..dataset.dataset import load_table 10 | 11 | L = logging.getLogger(__name__) 12 | 13 | class Sampling(Estimator): 14 | def __init__(self, table, ratio, seed): 15 | super(Sampling, self).__init__(table=table, version=table.version, ratio=ratio, seed=seed) 16 | self.sample = table.data.sample(frac=ratio, random_state=seed) 17 | self.sample_num = len(self.sample) 18 | 19 | def query(self, query): 20 | columns, operators, values = query_2_triple(query, with_none=False, split_range=False) 21 | start_stmp = time.time() 22 | bitmap = np.ones(self.sample_num, dtype=bool) 23 | for c, o, v in zip(columns, operators, values): 24 | bitmap &= OPS[o](self.sample[c], v) 25 | card = np.round((self.table.row_num / self.sample_num) * bitmap.sum()) 26 | dur_ms = (time.time() - start_stmp) * 1e3 27 | return card, dur_ms 28 | 29 | def test_sample(seed: int, dataset: str, version: str, workload: str, params: Dict[str, Any], overwrite: bool) -> None: 30 | """ 31 | params: 32 | version: the version 
of table that the sample draw from, might not be the same with the one we test on 33 | ratio: the ratio of the sample 34 | """ 35 | # prioriy: params['version'] (draw sample from another dataset) > version (draw and test on the same dataset) 36 | table = load_table(dataset, params.get('version') or version) 37 | 38 | L.info("construct sampling estimator...") 39 | estimator = Sampling(table, ratio=params['ratio'] or 0.01, seed=seed) 40 | L.info(f"built sampling estimator: {estimator}") 41 | 42 | run_test(dataset, version, workload, estimator, overwrite) 43 | 44 | 45 | -------------------------------------------------------------------------------- /lecarb/estimator/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import ray 3 | import logging 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from scipy.stats.mstats import gmean 8 | 9 | # from .lw.lw_nn import LWNN 10 | # from .lw.lw_tree import LWTree 11 | from .estimator import Estimator 12 | from ..constants import NUM_THREADS, RESULT_ROOT 13 | from ..workload.workload import load_queryset, load_labels 14 | from ..dataset.dataset import load_table 15 | 16 | L = logging.getLogger(__name__) 17 | 18 | def report_model(model, blacklist=None): 19 | ps = [] 20 | for name, p in model.named_parameters(): 21 | if blacklist is None or blacklist not in name: 22 | ps.append(np.prod(p.size())) 23 | num_params = sum(ps) 24 | mb = num_params * 4 / 1024 / 1024 25 | L.info(f'Number of model parameters: {num_params} (~= {mb:.2f}MB)') 26 | L.info(model) 27 | return mb 28 | 29 | def qerror(est_card, card): 30 | if est_card == 0 and card == 0: 31 | return 1.0 32 | if est_card == 0: 33 | return card 34 | if card == 0: 35 | return est_card 36 | if est_card > card: 37 | return est_card / card 38 | else: 39 | return card / est_card 40 | 41 | def rmserror(preds, labels, total_rows): 42 | return np.sqrt(np.mean(np.square(preds/total_rows-labels/total_rows))) 43 | 44 | def evaluate(preds, labels, total_rows=-1): 45 | errors = [] 46 | for i in range(len(preds)): 47 | errors.append(qerror(float(preds[i]), float(labels[i]))) 48 | 49 | metrics = { 50 | 'max': np.max(errors), 51 | '99th': np.percentile(errors, 99), 52 | '95th': np.percentile(errors, 95), 53 | '90th': np.percentile(errors, 90), 54 | 'median': np.median(errors), 55 | 'mean': np.mean(errors), 56 | 'gmean': gmean(errors) 57 | } 58 | 59 | if total_rows > 0: 60 | metrics['rms'] = rmserror(preds, labels, total_rows) 61 | L.info(f"{metrics}") 62 | return np.array(errors), metrics 63 | 64 | def evaluate_errors(errors): 65 | metrics = { 66 | 'max': np.max(errors), 67 | '99th': np.percentile(errors, 99), 68 | '95th': np.percentile(errors, 95), 69 | '90th': np.percentile(errors, 90), 70 | 'median': np.median(errors), 71 | 'mean': np.mean(errors), 72 | 'gmean': gmean(errors) 73 | } 74 | L.info(f"{metrics}") 75 | return metrics 76 | 77 | def report_errors(dataset, result_file): 78 | df = pd.read_csv(RESULT_ROOT / dataset / result_file) 79 | evaluate_errors(df['error']) 80 | 81 | def report_dynamic_errors(dataset, old_new_file, new_new_file, max_t, current_t): 82 | ''' 83 | max_t: Time limit for update 84 | current_t: Model's update time. 
85 | old_new_path: Result file of applying stale model on new workload 86 | new_new_path: Result file of applying updated model on new workload 87 | ''' 88 | old_new_path = RESULT_ROOT / dataset / old_new_file 89 | new_new_path = RESULT_ROOT / dataset / new_new_file 90 | if max_t > current_t: 91 | try: 92 | o_n = pd.read_csv(old_new_path) 93 | n_n = pd.read_csv(new_new_path) 94 | assert len(o_n) == len(n_n), "In current version, the workload test size should be same." 95 | o_n_s = o_n.sample(frac = current_t / max_t) 96 | n_n_s = n_n.sample(frac = 1 - current_t / max_t) 97 | mixed_df = pd.concat([o_n_s, n_n_s], ignore_index=True, sort=False) 98 | return evaluate_errors(mixed_df['error']) 99 | except OSError: 100 | print('Cannot open file.') 101 | return -1 102 | 103 | def lazy_derive(origin_result_file, result_file, r, labels): 104 | L.info("Already have the original result, directly derive the new prediction!") 105 | df = pd.read_csv(origin_result_file) 106 | with open(result_file, 'w') as f: 107 | writer = csv.writer(f) 108 | writer.writerow(['id', 'error', 'predict', 'label', 'dur_ms']) 109 | for index, row in df.iterrows(): 110 | p = np.round(row['predict'] * r) 111 | l = labels[index].cardinality 112 | writer.writerow([int(row['id']), qerror(p, l), p, l, row['dur_ms']]) 113 | L.info("Done infering all predictions from previous result") 114 | 115 | def run_test(dataset: str, version: str, workload: str, estimator: Estimator, overwrite: bool, lazy: bool=True, lw_vec=None, query_async=False) -> None: 116 | # for inference speed. 117 | torch.backends.cudnn.deterministic = False 118 | torch.backends.cudnn.benchmark = True 119 | 120 | # uniform thread number 121 | torch.set_num_threads(NUM_THREADS) 122 | assert NUM_THREADS == torch.get_num_threads(), torch.get_num_threads() 123 | L.info(f"torch threads: {torch.get_num_threads()}") 124 | 125 | L.info(f"Start loading queryset:{workload} and labels for version {version} of dataset {dataset}...") 126 | # only keep test queries 127 | queries = load_queryset(dataset, workload)['test'] 128 | labels = load_labels(dataset, version, workload)['test'] 129 | 130 | if lw_vec is not None: 131 | X, gt = lw_vec 132 | # assert isinstance(estimator, LWNN) or isinstance(estimator, LWTree), estimator 133 | assert len(X) == len(queries), len(X) 134 | assert np.array_equal(np.array([l.cardinality for l in labels]), gt) 135 | L.info("Hack for LW's method, use processed vector instead of raw query") 136 | queries = X 137 | 138 | # prepare file path, do not proceed if result already exists 139 | result_path = RESULT_ROOT / f"{dataset}" 140 | result_path.mkdir(parents=True, exist_ok=True) 141 | result_file = result_path / f"{version}-{workload}-{estimator}.csv" 142 | if not overwrite and result_file.is_file(): 143 | L.info(f"Already have the result {result_file}, do not run again!") 144 | exit(0) 145 | 146 | r = 1.0 147 | if version != estimator.table.version: 148 | test_row = load_table(dataset, version).row_num 149 | r = test_row / estimator.table.row_num 150 | L.info(f"Testing on a different data version, need to adjust the prediction according to the row number ratio {r} = {test_row} / {estimator.table.row_num}!") 151 | 152 | origin_result_file = RESULT_ROOT / dataset / f"{estimator.table.version}-{workload}-{estimator}.csv" 153 | if lazy and origin_result_file.is_file(): 154 | return lazy_derive(origin_result_file, result_file, r, labels) 155 | 156 | if query_async: 157 | L.info("Start test estimator asynchronously...") 158 | for i, query in enumerate(queries): 
159 | estimator.query_async(query, i) 160 | 161 | L.info('Waiting for queries to finish...') 162 | stats = ray.get([w.get_stats.remote() for w in estimator.workers]) 163 | 164 | errors = [] 165 | latencys = [] 166 | with open(result_file, 'w') as f: 167 | writer = csv.writer(f) 168 | writer.writerow(['id', 'error', 'predict', 'label', 'dur_ms']) 169 | for i, label in enumerate(labels): 170 | r = stats[i%estimator.num_workers][i//estimator.num_workers] 171 | assert i == r.i, r 172 | error = qerror(r.est_card, label.cardinality) 173 | errors.append(error) 174 | latencys.append(r.dur_ms) 175 | writer.writerow([i, error, r.est_card, label.cardinality, r.dur_ms]) 176 | 177 | L.info(f"Test finished, {np.mean(latencys)} ms/query in average") 178 | evaluate_errors(errors) 179 | return 180 | 181 | L.info("Start test estimator on test queries...") 182 | errors = [] 183 | latencys = [] 184 | with open(result_file, 'w') as f: 185 | writer = csv.writer(f) 186 | writer.writerow(['id', 'error', 'predict', 'label', 'dur_ms']) 187 | for i, data in enumerate(zip(queries, labels)): 188 | query, label = data 189 | est_card, dur_ms = estimator.query(query) 190 | est_card = np.round(r * est_card) 191 | error = qerror(est_card, label.cardinality) 192 | errors.append(error) 193 | latencys.append(dur_ms) 194 | writer.writerow([i, error, est_card, label.cardinality, dur_ms]) 195 | if (i+1) % 1000 == 0: 196 | L.info(f"{i+1} queries finished") 197 | L.info(f"Test finished, {np.mean(latencys)} ms/query in average") 198 | evaluate_errors(errors) 199 | 200 | 201 | -------------------------------------------------------------------------------- /lecarb/workload/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/workload/__init__.py -------------------------------------------------------------------------------- /lecarb/workload/dump_quicksel.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | from pathlib import Path 4 | from typing import Dict, Any 5 | import numpy as np 6 | 7 | from .workload import load_queryset, load_labels, query_2_quicksel_vector, new_query 8 | from ..dtypes import is_discrete, is_categorical 9 | from ..dataset.dataset import load_table 10 | from ..estimator.estimator import Oracle 11 | from ..constants import DATA_ROOT 12 | 13 | L = logging.getLogger(__name__) 14 | 15 | def dump_quicksel_query_files(dataset: str, version: str, workload: str, overwrite: bool) -> None: 16 | result_path = DATA_ROOT / dataset / "quicksel" 17 | result_path.mkdir(exist_ok=True) 18 | if not overwrite and Path(result_path / f"{workload}-{version}-train.csv").is_file() and Path(result_path / f"{workload}-{version}-test.csv").is_file(): 19 | L.info("Already has quicksel workload file dumped, do not continue") 20 | return 21 | 22 | table = load_table(dataset, version) 23 | queryset = load_queryset(dataset, workload) 24 | labels = load_labels(dataset, version, workload) 25 | 26 | discrete_cols = set() 27 | for col_name, col in table.columns.items(): 28 | # hard code for power dataset since all these columns are actually integers 29 | if dataset[:5] == 'power' and col_name in ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']: 30 | discrete_cols.add(col_name) 31 | continue 32 | if is_discrete(col.dtype): 33 | discrete_cols.add(col_name) 34 | L.info(f"Detect discrete columns: {discrete_cols}") 35 | 36 | for 
group in ('train', 'test'): 37 | L.info(f"Start dump {workload} for {dataset}-{version}") 38 | result_file = result_path / f"{workload}-{version}-{group}.csv" 39 | with open(result_file, 'w') as f: 40 | writer = csv.writer(f) 41 | for query, label in zip(queryset[group], labels[group]): 42 | vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 43 | vec.append(label.selectivity) 44 | writer.writerow(vec) 45 | L.info(f"File dumped to {result_file}") 46 | 47 | def generate_quicksel_permanent_assertions(dataset: str, version: str, params: Dict[str, Dict[str, Any]], overwrite: bool) -> None: 48 | result_path = DATA_ROOT / dataset / "quicksel" 49 | result_path.mkdir(exist_ok=True) 50 | result_file = result_path / f"{version}-permanent.csv" 51 | if not overwrite and result_file.is_file(): 52 | L.info("Already has permanent assertions generated, do not continue") 53 | return 54 | 55 | count = params['count']+1 56 | 57 | table = load_table(dataset, version) 58 | oracle = Oracle(table) 59 | 60 | discrete_cols = set() 61 | for col_name, col in table.columns.items(): 62 | # hard code for power dataset since all these columns are actually integers 63 | if dataset[:5] == 'power' and col_name in ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']: 64 | discrete_cols.add(col_name) 65 | continue 66 | if is_discrete(col.dtype): 67 | discrete_cols.add(col_name) 68 | L.info(f"Detect discrete columns: {discrete_cols}") 69 | 70 | with open(result_file, 'w') as f: 71 | writer = csv.writer(f) 72 | writer.writerow([0.0, 1.0] * table.col_num + [1.0]) 73 | for col_id, col in enumerate(table.columns.values()): 74 | L.info(f"Start generate permanent queries on column {col.name}") 75 | # hard code for power dataset since all these columns are actually integers 76 | if is_discrete(col.dtype) or (dataset[:5] == 'power' and col.name in ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']): 77 | if is_categorical(col.dtype): 78 | L.info("Categorical column") 79 | if col.vocab_size <= count: 80 | for i in range(col.vocab_size): 81 | query = new_query(table, ncols=1) 82 | query.predicates[col.name] = ('=', col.vocab[i]) 83 | card, _ = oracle.query(query) 84 | # vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 85 | # vec.append(card/table.row_num) 86 | vec = [0.0, 1.0] * table.col_num 87 | vec.append(card/table.row_num) 88 | vec[col_id*2] = i/col.vocab_size 89 | vec[col_id*2+1] = (i+1)/col.vocab_size 90 | writer.writerow(vec) 91 | L.info(f"# {i}: {query.predicates[col.name]}, card={card}\n\t{vec}") 92 | else: 93 | minval = 0 94 | maxval = col.vocab_size 95 | norm_range = np.linspace(0.0, 1.0, count, dtype=np.float32) 96 | prange = minval + (maxval - minval) * norm_range 97 | for i in range(len(prange)-1): 98 | val0 = col.vocab[np.ceil(prange[i]).astype(int)] 99 | val1 = col.vocab[np.ceil(prange[i+1]).astype(int)-1] 100 | assert np.greater_equal(np.array(val1).astype(object), val0), (val1, val0) 101 | query = new_query(table, ncols=1) 102 | query.predicates[col.name] = ('[]', (val0, val1)) 103 | card, _ = oracle.query(query) 104 | # vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 105 | # vec.append(card/table.row_num) 106 | 107 | vec = [0.0, 1.0] * table.col_num 108 | vec.append(card/table.row_num) 109 | vec[col_id*2] = norm_range[i] 110 | vec[col_id*2+1] = norm_range[i+1] 111 | writer.writerow(vec) 112 | L.info(f"# {i}: {query.predicates[col.name]}, card={card}\n\t{vec}") 113 | else: 114 | L.info("Integer column") 115 | minval = col.minval 116 | maxval = col.maxval + 
1 117 | norm_range = np.linspace(0.0, 1.0, count, dtype=np.float32) 118 | prange = minval + (maxval - minval) * norm_range 119 | for i in range(len(prange)-1): 120 | val0 = np.ceil(prange[i]) 121 | val1 = np.ceil(prange[i+1])-1 122 | assert val1 >= val0, (val0, val1) 123 | query = new_query(table, ncols=1) 124 | query.predicates[col.name] = ('[]', (val0, val1)) 125 | card, _ = oracle.query(query) 126 | # vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 127 | # vec.append(card/table.row_num) 128 | 129 | vec = [0.0, 1.0] * table.col_num 130 | vec.append(card/table.row_num) 131 | vec[col_id*2] = norm_range[i] 132 | vec[col_id*2+1] = norm_range[i+1] 133 | writer.writerow(vec) 134 | L.info(f"# {i}: {query.predicates[col.name]}, card={card}\n\t{vec}") 135 | else: 136 | L.info("Real-value column") 137 | norm_range = np.linspace(0.0, 1.0, count, dtype=np.float32) 138 | prange = col.minval + (col.maxval - col.minval) * norm_range 139 | for i in range(len(prange)-1): 140 | query = new_query(table, ncols=1) 141 | query.predicates[col.name] = ('[]', (prange[i], prange[i+1])) 142 | card, _ = oracle.query(query) 143 | # vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 144 | # vec.append(card/table.row_num) 145 | vec = [0.0, 1.0] * table.col_num 146 | vec.append(card/table.row_num) 147 | vec[col_id*2] = norm_range[i] 148 | vec[col_id*2+1] = norm_range[i+1] 149 | writer.writerow(vec) 150 | L.info(f"# {i}: {query.predicates[col.name]}, card={card}\n\t{vec}") 151 | -------------------------------------------------------------------------------- /lecarb/workload/gen_label.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Dict 3 | 4 | from .workload import Label, Query, load_queryset, dump_labels 5 | from ..estimator.estimator import Oracle 6 | from ..estimator.sample import Sampling 7 | from ..dataset.dataset import Table, load_table 8 | 9 | L = logging.getLogger(__name__) 10 | 11 | def generate_labels_for_queries(table: Table, queryset: Dict[str, List[Query]]) -> Dict[str, List[Label]]: 12 | oracle = Oracle(table) 13 | labels = {} 14 | for group, queries in queryset.items(): 15 | l = [] 16 | for i, q in enumerate(queries): 17 | card, _ = oracle.query(q) 18 | l.append(Label(cardinality=card, selectivity=card/table.row_num)) 19 | if (i+1) % 1000 == 0: 20 | L.info(f"{i+1} labels generated for {group}") 21 | labels[group] = l 22 | 23 | return labels 24 | 25 | def generate_labels(dataset: str, version: str, workload: str) -> None: 26 | 27 | L.info("Load table...") 28 | table = load_table(dataset, version) 29 | 30 | L.info("Load queryset from disk...") 31 | queryset = load_queryset(dataset, workload) 32 | 33 | L.info("Start generate ground truth labels for the workload...") 34 | labels = generate_labels_for_queries(table, queryset) 35 | 36 | L.info("Dump labels to disk...") 37 | dump_labels(dataset, version, workload, labels) 38 | 39 | def update_labels_for_queries(table: Table, queryset: Dict[str, List[Query]], seed: int, sampling_ratio: float=0.05) -> Dict[str, List[Label]]: 40 | sample_ester = Sampling(table, sampling_ratio, seed) 41 | labels = {} 42 | for group, queries in queryset.items(): 43 | l = [] 44 | for i, q in enumerate(queries): 45 | card, _ = sample_ester.query(q) 46 | l.append(Label(cardinality=card, selectivity=card/table.row_num)) 47 | if (i+1) % 1000 == 0: 48 | L.info(f"{i+1} labels generated for {group}") 49 | labels[group] = l 50 | return labels 51 | 52 | def 
update_labels(seed: int, dataset: str, version: str, workload: str, sampling_ratio: float=0.05) -> None: 53 | 54 | L.info("Load table...") 55 | table = load_table(dataset, version) 56 | 57 | L.info("Load queryset from disk...") 58 | queryset = load_queryset(dataset, workload) 59 | 60 | L.info("Updating ground truth labels for the workload, with sample size {}...".format(sampling_ratio)) 61 | labels = update_labels_for_queries(table, queryset, seed, sampling_ratio) 62 | 63 | L.info("Dump labels to disk...") 64 | dump_labels(dataset, version, workload, labels) 65 | 66 | -------------------------------------------------------------------------------- /lecarb/workload/gen_workload.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | import numpy as np 4 | from typing import Dict, Any 5 | import copy 6 | 7 | from . import generator 8 | from .generator import QueryGenerator 9 | from .gen_label import generate_labels_for_queries 10 | from .workload import dump_queryset, dump_labels 11 | from ..dataset.dataset import load_table 12 | 13 | L = logging.getLogger(__name__) 14 | 15 | def get_focused_table(table, ref_table, win_ratio): 16 | focused_table = copy.deepcopy(table) 17 | win_size = int(win_ratio * len(ref_table.data)) 18 | focused_table.data = focused_table.data.tail(win_size).reset_index(drop=True) 19 | focused_table.parse_columns() 20 | return focused_table 21 | 22 | def generate_workload( 23 | seed: int, dataset: str, version: str, 24 | name: str, no_label: bool, old_version: str, win_ratio: str, 25 | params: Dict[str, Dict[str, Any]] 26 | ) -> None: 27 | 28 | random.seed(seed) 29 | np.random.seed(seed) 30 | 31 | attr_funcs = {getattr(generator, f"asf_{a}"): v for a, v in params['attr'].items()} 32 | center_funcs = {getattr(generator, f"csf_{c}"): v for c, v in params['center'].items()} 33 | width_funcs = {getattr(generator, f"wsf_{w}"): v for w, v in params['width'].items()} 34 | 35 | L.info("Load table...") 36 | table = load_table(dataset, version) 37 | if old_version and win_ratio: 38 | L.info(f"According to {old_version}, generate queries for updated data in {version}...") 39 | win_ratio = float(win_ratio) 40 | assert 0 List[str]: ... 19 | 20 | def asf_pred_number(table: Table, params: Dict[str, Any]) -> List[str]: 21 | if 'whitelist' in params: 22 | attr_domain = params['whitelist'] 23 | else: 24 | blacklist = params.get('blacklist') or [] 25 | attr_domain = [c for c in list(table.data.columns) if c not in blacklist] 26 | nums = params.get('nums') 27 | nums = nums or range(1, len(attr_domain)+1) 28 | num_pred = np.random.choice(nums) 29 | assert num_pred <= len(attr_domain) 30 | return np.random.choice(attr_domain, size=num_pred, replace=False) 31 | 32 | def asf_comb(table: Table, params: Dict[str, Any]) -> List[str]: 33 | assert 'comb' in params and type(params['comb']) == list, params 34 | for c in params['comb']: 35 | assert c in table.columns, c 36 | return params['comb'] 37 | 38 | def asf_naru(table: Table, params: Dict[str, Any]) -> List[str]: 39 | num_filters = np.random.randint(5, 12) 40 | return np.random.choice(table.data.columns, size=num_filters, replace=False) 41 | 42 | """====== Center Selection Functions ======""" 43 | 44 | class CenterSelFunc(Protocol): 45 | def __call__(self, table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: ... 
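# --- Illustrative sketch (not part of the original module) --------------------------
# Attribute, center, and width selectors are plain callables matching the protocols in
# this file; gen_workload.py resolves them by name using the asf_/csf_/wsf_ prefixes
# (e.g. getattr(generator, f"csf_{name}")). A hypothetical custom center selector that
# always centers predicates on each (numeric) column's minimum value could look like:
#
#     def csf_minval_example(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]:
#         return [table.columns[a].minval for a in attrs]
#
# and would then be referenced from the workload params as center: {'minval_example': 1.0}.
# -------------------------------------------------------------------------------------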
46 | 47 | DOMAIN_CACHE = {} 48 | # This domain version makes sure that query's cardinality > 0 49 | def csf_domain(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 50 | global DOMAIN_CACHE 51 | key = tuple(sorted(attrs)) 52 | if key not in DOMAIN_CACHE: 53 | data_from = params.get('data_from') or 0 54 | DOMAIN_CACHE[key] = table.data[data_from:][attrs].drop_duplicates().index 55 | assert len(DOMAIN_CACHE[key]) > 0, key 56 | # L.debug(f'Cache size: {len(DOMAIN_CACHE)}') 57 | row_id = np.random.choice(DOMAIN_CACHE[key]) 58 | return [table.data.at[row_id, a] for a in attrs] 59 | 60 | ROW_CACHE = None 61 | GLOBAL_COUNTER = 1000 62 | def csf_distribution(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 63 | global GLOBAL_COUNTER 64 | global ROW_CACHE 65 | if GLOBAL_COUNTER >= 1000: 66 | data_from = params.get('data_from') or 0 67 | ROW_CACHE = np.random.choice(range(data_from, len(table.data)), size=1000) 68 | GLOBAL_COUNTER = 0 69 | row_id = ROW_CACHE[GLOBAL_COUNTER] 70 | GLOBAL_COUNTER += 1 71 | # data_from = params.get('data_from') or 0 72 | # row_id = np.random.choice(range(data_from, len(table.data))) 73 | return [table.data.at[row_id, a] for a in attrs] 74 | 75 | def csf_ood(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 76 | row_ids = np.random.choice(len(table.data), len(attrs)) 77 | return [table.data.at[i, a] for i, a in zip(row_ids, attrs)] 78 | 79 | def csf_vocab_ood(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 80 | centers = [] 81 | for a in attrs: 82 | col = table.columns[a] 83 | centers.append(np.random.choice(col.vocab)) 84 | return centers 85 | 86 | def csf_domain_ood(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 87 | centers = [] 88 | for a in attrs: 89 | col = table.columns[a] 90 | if is_categorical(col.dtype): # randomly pick one point from domain for categorical 91 | centers.append(np.random.choice(col.vocab)) 92 | else: # uniformly pick one point from domain for numerical 93 | centers.append(random.uniform(col.minval, col.maxval)) 94 | return centers 95 | 96 | def csf_naru(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 97 | row_id = np.random.randint(0, len(table.data)) 98 | return [table.data.at[row_id, a] for a in attrs] 99 | 100 | def csf_naru_ood(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 101 | row_ids = np.random.choice(len(table.data), len(attrs)) 102 | return [table.data.at[i, a] for i, a in zip(row_ids, attrs)] 103 | 104 | """====== Width Selection Functions ======""" 105 | 106 | class WidthSelFunc(Protocol): 107 | def __call__(self, table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: ... 
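# Note on parse_range below (illustrative, assuming a numeric column with minval=0 and
# maxval=100): a center/width pair that spills past the domain boundary is clamped to a
# one-sided predicate, e.g. parse_range(col, -10, 30) -> ('<=', 30) and
# parse_range(col, 50, 120) -> ('>=', 50), while a range fully inside the domain stays
# two-sided: parse_range(col, 20, 80) -> ('[]', (20, 80)). Because the None shortcut is
# commented out, a range covering the whole domain currently collapses to ('<=', right).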
108 | 109 | def parse_range(col: Column, left: Any, right: Any) -> Optional[Tuple[str, Any]]: 110 | # if left <= col.minval and right >= col.maxval: 111 | # return None 112 | # if left == right: 113 | # return ('=', left) 114 | if left <= col.minval: 115 | return ('<=', right) 116 | if right >= col.maxval: 117 | return ('>=', left) 118 | return ('[]', (left, right)) 119 | 120 | def wsf_uniform(table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: 121 | query = new_query(table, ncols=len(attrs)) 122 | for a, c in zip(attrs, centers): 123 | # NaN/NaT literal can only be assigned to = operator 124 | if pd.isnull(c) or is_categorical(table.columns[a].dtype): 125 | query.predicates[a] = ('=', c) 126 | continue 127 | col = table.columns[a] 128 | width = random.uniform(0, col.maxval-col.minval) 129 | query.predicates[a] = parse_range(col, c-width/2, c+width/2) 130 | return query 131 | 132 | def wsf_exponential(table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: 133 | query = new_query(table, ncols=len(attrs)) 134 | for a, c in zip(attrs, centers): 135 | # NaN/NaT literal can only be assigned to = operator 136 | if pd.isnull(c) or is_categorical(table.columns[a].dtype): 137 | query.predicates[a] = ('=', c) 138 | continue 139 | col = table.columns[a] 140 | lmd = 1 / ((col.maxval - col.minval) / 10) 141 | width = random.expovariate(lmd) 142 | query.predicates[a] = parse_range(col, c-width/2, c+width/2) 143 | return query 144 | 145 | def wsf_naru(table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: 146 | query = new_query(table, ncols=len(attrs)) 147 | ops = np.random.choice(['>=', '<=', '='], size=len(attrs)) 148 | for a, c, o in zip(attrs, centers, ops): 149 | if table.columns[a].vocab_size >= 10: 150 | query.predicates[a] = (o, c) 151 | else: 152 | query.predicates[a] = ('=', c) 153 | return query 154 | 155 | def wsf_equal(table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: 156 | query = new_query(table, ncols=len(attrs)) 157 | for a, c in zip(attrs, centers): 158 | query.predicates[a] = ('=', c) 159 | return query 160 | 161 | class QueryGenerator(object): 162 | table: Table 163 | attr: Dict[AttributeSelFunc, float] 164 | center: Dict[CenterSelFunc, float] 165 | width: Dict[WidthSelFunc, float] 166 | attr_params: Dict[str, Any] 167 | center_params: Dict[str, Any] 168 | width_params: Dict[str, Any] 169 | 170 | def __init__( 171 | self, table: Table, 172 | attr: Dict[AttributeSelFunc, float], 173 | center: Dict[CenterSelFunc, float], 174 | width: Dict[WidthSelFunc, float], 175 | attr_params: Dict[str, Any], 176 | center_params: Dict[str, Any], 177 | width_params: Dict[str, Any] 178 | ) -> None: 179 | self.table = table 180 | self.attr = attr 181 | self.center = center 182 | self.width = width 183 | self.attr_params = attr_params 184 | self.center_params = center_params 185 | self.width_params = width_params 186 | 187 | def generate(self) -> Query: 188 | attr_func = np.random.choice(list(self.attr.keys()), p=list(self.attr.values())) 189 | # L.info(f'start generate attr {attr_func.__name__}') 190 | attr_lst = attr_func(self.table, self.attr_params) 191 | 192 | center_func = np.random.choice(list(self.center.keys()), p=list(self.center.values())) 193 | # L.info(f'start generate center points {center_func.__name__}') 194 | center_lst = center_func(self.table, attr_lst, self.center_params) 195 | 196 | width_func = np.random.choice(list(self.width.keys()), 
p=list(self.width.values())) 197 | # L.info(f'start generate widths {width_func.__name__}') 198 | return width_func(self.table, attr_lst, center_lst, self.width_params) 199 | -------------------------------------------------------------------------------- /lecarb/workload/merge_workload.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .workload import load_queryset, load_labels, dump_queryset, dump_labels 3 | 4 | L = logging.getLogger(__name__) 5 | 6 | def merge_workload(dataset: str, version: str, workload: str, count: int=10) -> None: 7 | queryset = {'train': [], 'valid': [], 'test': []} 8 | labels = {'train': [], 'valid': [], 'test': []} 9 | 10 | for i in range(count): 11 | L.info(f"Merge queryset {workload}_{i}...") 12 | qs = load_queryset(dataset, f"{workload}_{i}") 13 | ls = load_labels(dataset, version, f"{workload}_{i}") 14 | for k in queryset.keys(): 15 | # print(f"{k}: {ls[k][0]}") 16 | queryset[k] += qs[k] 17 | labels[k] += ls[k] 18 | 19 | for k in queryset.keys(): 20 | L.info(f"Final queryset has {len(queryset[k])} queries with {len(labels[k])} labels") 21 | 22 | L.info("Dump queryset and labels...") 23 | dump_queryset(dataset, workload, queryset) 24 | dump_labels(dataset, version, workload, labels) 25 | L.info(f"Done, run: rm data/{dataset}/workload/{workload}_[0-9]* to remove temporary files") 26 | -------------------------------------------------------------------------------- /lecarb/workload/workload.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from collections import OrderedDict 3 | from typing import Dict, NamedTuple, Optional, Tuple, List, Any 4 | import pickle 5 | import numpy as np 6 | 7 | from ..dtypes import is_categorical 8 | from ..constants import DATA_ROOT, PKL_PROTO 9 | from ..dataset.dataset import Table, load_table 10 | 11 | class Query(NamedTuple): 12 | """predicates on each attribute are conjunctive""" 13 | predicates: Dict[str, Optional[Tuple[str, Any]]] 14 | ncols: int 15 | 16 | class Label(NamedTuple): 17 | cardinality: int 18 | selectivity: float 19 | 20 | def new_query(table: Table, ncols) -> Query: 21 | return Query(predicates=OrderedDict.fromkeys(table.data.columns, None), 22 | ncols=ncols) 23 | 24 | def query_2_triple(query: Query, with_none: bool=True, split_range: bool=False 25 | ) -> Tuple[List[str], List[str], List[Any]]: 26 | """return 3 lists with the same length: cols (column names), ops (predicate operators), vals (predicate literals)""" 27 | cols = [] 28 | ops = [] 29 | vals = [] 30 | for c, p in query.predicates.items(): 31 | if p is not None: 32 | if split_range is True and p[0] == '[]': 33 | cols.append(c) 34 | ops.append('>=') 35 | vals.append(p[1][0]) 36 | cols.append(c) 37 | ops.append('<=') 38 | vals.append(p[1][1]) 39 | else: 40 | cols.append(c) 41 | ops.append(p[0]) 42 | vals.append(p[1]) 43 | elif with_none: 44 | cols.append(c) 45 | ops.append(None) 46 | vals.append(None) 47 | return cols, ops, vals 48 | 49 | def query_2_sql(query: Query, table: Table, aggregate=True, split=False, dbms='postgres'): 50 | preds = [] 51 | for col, pred in query.predicates.items(): 52 | if pred is None: 53 | continue 54 | op, val = pred 55 | if is_categorical(table.data[col].dtype): 56 | val = f"\'{val}\'" if not isinstance(val, tuple) else tuple(f"\'{v}\'" for v in val) 57 | if op == '[]': 58 | if split: 59 | preds.append(f"{col} >= {val[0]}") 60 | preds.append(f"{col} <= {val[1]}") 61 | else: 62 | preds.append(f"({col} between {val[0]}
and {val[1]})") 63 | else: 64 | preds.append(f"{col} {op} {val}") 65 | 66 | if dbms == 'mysql': 67 | return f"SELECT {'COUNT(*)' if aggregate else '*'} FROM `{table.name}` WHERE {' AND '.join(preds)}" 68 | return f"SELECT {'COUNT(*)' if aggregate else '*'} FROM \"{table.name}\" WHERE {' AND '.join(preds)}" 69 | 70 | def query_2_kde_sql(query: Query, table: Table): 71 | preds = [] 72 | for col, pred in query.predicates.items(): 73 | if pred is None: 74 | continue 75 | op, val = pred 76 | if is_categorical(table.data[col].dtype): 77 | assert op =='=' and not isinstance(val, tuple), val 78 | val = table.columns[col].discretize(val).item() 79 | if op == '[]': 80 | preds.append(f"{col} >= {val[0]}") 81 | preds.append(f"{col} <= {val[1]}") 82 | else: 83 | preds.append(f"{col} {op} {val}") 84 | 85 | return f"SELECT * FROM \"{table.name}\" WHERE {' AND '.join(preds)}" 86 | 87 | def query_2_deepdb_sql(query: Query, table: Table, aggregate=True, split=False): 88 | preds = [] 89 | for col, pred in query.predicates.items(): 90 | if pred is None: 91 | continue 92 | op, val = pred 93 | if op == '[]': 94 | val = table.columns[col].normalize(list(val)) 95 | assert len(val) == 2, val 96 | if split: 97 | preds.append(f"{col} >= {val[0]}") 98 | preds.append(f"{col} <= {val[1]}") 99 | else: 100 | preds.append(f"({col} between {val[0]} and {val[1]})") 101 | else: 102 | val = table.columns[col].normalize(val).item() 103 | preds.append(f"{col} {op} {val}") 104 | 105 | return f"SELECT {'COUNT(*)' if aggregate else '*'} FROM \"{table.name}\" WHERE {' AND '.join(preds)}" 106 | 107 | def query_2_sqls(query: Query, table: Table): 108 | sqls = [] 109 | for col, pred in query.predicates.items(): 110 | if pred is None: 111 | continue 112 | op, val = pred 113 | if is_categorical(table.data[col].dtype): 114 | val = f"\'{val}\'" if not isinstance(val, tuple) else tuple(f"\'{v}\'" for v in val) 115 | 116 | if op == '[]': 117 | sqls.append(f"SELECT * FROM \"{table.name}\" WHERE {col} between {val[0]} and {val[1]}") 118 | else: 119 | sqls.append(f"SELECT * FROM \"{table.name}\" WHERE {col} {op} {val}") 120 | return sqls 121 | 122 | 123 | def query_2_vector(query: Query, table: Table, upper: int=1): 124 | vec = [] 125 | for col, pred in query.predicates.items(): 126 | if pred is None: 127 | vec.extend([0.0, 1.0]) 128 | continue 129 | op, val = pred 130 | if op == '[]': 131 | vec.extend([table.columns[col].normalize(val[0]).item(), table.columns[col].normalize(val[1]).item()]) 132 | elif op == '>=': 133 | vec.extend([table.columns[col].normalize(val).item(), 1.0]) 134 | elif op == '<=': 135 | vec.extend([0.0, table.columns[col].normalize(val).item()]) 136 | elif op == '=': 137 | vec.extend([table.columns[col].normalize(val).item()] * 2) 138 | else: 139 | raise NotImplementedError 140 | return np.array(vec) * upper 141 | 142 | def query_2_quicksel_vector(query: Query, table: Table, discrete_cols=set()): 143 | vec = [] 144 | for col_name, pred in query.predicates.items(): 145 | if pred is None: 146 | vec.extend([0.0, 1.0]) 147 | continue 148 | op, val = pred 149 | col = table.columns[col_name] 150 | 151 | # adjust predicate to a proper range for discrete columns 152 | if col_name in discrete_cols: 153 | if is_categorical(col.dtype): 154 | val = col.discretize(val) 155 | minval = 0 156 | maxval = col.vocab_size 157 | vocab = np.arange(col.vocab_size) 158 | else: # integer values 159 | minval = col.minval 160 | maxval = col.maxval + 1 161 | vocab = col.vocab 162 | 163 | if op == '=': 164 | val = (val, val) 165 | elif op == '>=': 
166 | val = (val, maxval) 167 | elif op == '<=': 168 | val = (minval, val) 169 | else: 170 | assert op == '[]' 171 | 172 | vocab = np.append(vocab, maxval) 173 | # argmax return 0 if no value in array satisfies 174 | val0 = vocab[np.argmax(vocab >= val[0])] if val[0] < maxval else maxval 175 | val1 = vocab[np.argmax(vocab > val[1])] if val[1] < maxval else maxval 176 | assert val0 <= val1, (val0, val1) 177 | assert val0 >= minval and val0 <= maxval, (val0, minval, maxval) 178 | assert val1 >= minval and val1 <= maxval, (val1, minval, maxval) 179 | # normalize to [0, 1] 180 | vec.extend([(val0-minval)/(maxval-minval), (val1-minval)/(maxval-minval)]) 181 | 182 | # directly normalize continous columns 183 | else: 184 | if op == '>=': 185 | vec.extend([col.normalize(val).item(), 1.0]) 186 | elif op == '<=': 187 | vec.extend([0.0, col.normalize(val).item()]) 188 | elif op == '[]': 189 | vec.extend([col.normalize(val[0]).item(), col.normalize(val[1]).item()]) 190 | else: 191 | raise NotImplementedError 192 | return np.array(vec) 193 | 194 | 195 | def dump_queryset(dataset: str, name: str, queryset: Dict[str, List[Query]]) -> None: 196 | query_path = DATA_ROOT / dataset / "workload" 197 | query_path.mkdir(exist_ok=True) 198 | with open(query_path / f"{name}.pkl", 'wb') as f: 199 | pickle.dump(queryset, f, protocol=PKL_PROTO) 200 | 201 | def load_queryset(dataset: str, name: str) -> Dict[str, List[Query]]: 202 | query_path = DATA_ROOT / dataset / "workload" 203 | with open(query_path / f"{name}.pkl", 'rb') as f: 204 | return pickle.load(f) 205 | 206 | def dump_labels(dataset: str, version: str, name: str, labels: Dict[str, List[Label]]) -> None: 207 | label_path = DATA_ROOT / dataset / "workload" 208 | with open(label_path / f"{name}-{version}-label.pkl", 'wb') as f: 209 | pickle.dump(labels, f, protocol=PKL_PROTO) 210 | 211 | def load_labels(dataset: str, version: str, name: str) -> Dict[str, List[Label]]: 212 | label_path = DATA_ROOT / dataset / "workload" 213 | with open(label_path / f"{name}-{version}-label.pkl", 'rb') as f: 214 | return pickle.load(f) 215 | 216 | def dump_sqls(dataset: str, version: str, workload: str, group: str='test'): 217 | table = load_table(dataset, version) 218 | queryset = load_queryset(dataset, workload) 219 | labels = load_labels(dataset, version, workload) 220 | 221 | with open('test.csv', 'w') as f: 222 | writer = csv.writer(f) 223 | for query, label in zip(queryset[group], labels[group]): 224 | sql = query_2_sql(query, table, aggregate=False, dbms='sqlserver') 225 | writer.writerow([sql, label.cardinality]) 226 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "lecarb" 3 | version = "0.1.0" 4 | description = "Are We Ready For Learned Cardinality Estimation?" 
5 | authors = ["Weiyuan Wu ", "Xiaoying Wang ", "Changbo Qu "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.7" 9 | contexttimer = "0.3" 10 | pandas = "1.0.3" 11 | numpy = "1.18.1" 12 | docopt = "0.6" 13 | psycopg2-binary = "2.8.4" 14 | scikit-learn = "0.23.0" 15 | torch = "1.5.0" 16 | sqlparse = "0.3.1" 17 | dask = "1.2.0" 18 | toolz = "0.9.0" 19 | cloudpickle = "1.2.1" 20 | tables = "3.5.1" 21 | spflow = "0.0.34" 22 | bloom-filter = "1.3" 23 | python-dotenv = "0.13.0" 24 | jupyter = "1.0.0" 25 | jupyterlab = "2.1.4" 26 | seaborn = "0.11.0" 27 | scipy = "1.4.1" 28 | xgboost = "1.1.1" 29 | ray = "^0.8.7" 30 | Cython = "^0.29.21" 31 | pomegranate = "^0.13.4" 32 | pyodbc = "^4.0.30" 33 | mysql-connector-python = "^8.0.21" 34 | PyQt5 = "5.15.1" 35 | 36 | [tool.poetry.dev-dependencies] 37 | mypy = "0.770" 38 | black = "19.10b0" 39 | pylint = "2.4.4" 40 | ipython = "7.13.0" 41 | 42 | [build-system] 43 | requires = ["poetry>=0.12"] 44 | build-backend = "poetry.masonry.api" 45 | --------------------------------------------------------------------------------
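A minimal, self-contained sketch (illustrative only, not shipped in the repository) of the q-error metric that lecarb/estimator/utils.py uses to score every estimator: it is the larger of the over- and under-estimation ratios, so a perfect estimate scores 1.0.

def qerror(est_card, card):
    # mirrors the definition in lecarb/estimator/utils.py
    if est_card == 0 and card == 0:
        return 1.0
    if est_card == 0:
        return card
    if card == 0:
        return est_card
    return est_card / card if est_card > card else card / est_card

# e.g. estimating 1500 rows when the true cardinality is 300 gives a q-error of 5.0,
# and over- and under-estimation by the same factor are penalized equally:
assert qerror(1500, 300) == qerror(300, 1500) == 5.0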