├── workload-simulator ├── run.sh └── preprocessing.py ├── anonymizer ├── run-sampler.sh ├── run.sh ├── data-sampler.py └── log-anonymizer.py ├── clusterer ├── run_sensitivity.sh ├── run_generate_cluster_coverage.sh ├── logical_clustering_utility │ ├── schemaParser.py │ └── buildVectors.py ├── generate-cluster-coverage.py ├── online_logical_clustering.py └── online_clustering.py ├── forecaster ├── calc_mse.py ├── run_sample.sh ├── run_logical.sh ├── run_sensitivity.sh ├── run.sh ├── Utilities.py ├── models │ ├── FNN_Model.py │ ├── PSRNN_Model.py │ └── RNN_Model.py ├── generate_ensemble_hybrid.py ├── spectral │ └── Two_Stage_Regression.py ├── plot-sensitivity.py ├── plot-prediction-median-error.py └── exp_multi_online_continuous.py ├── run.sh ├── planner-simulator ├── schemaParser.py └── planner_simulator.py ├── README.md └── pre-processor ├── csv-combiner.py └── templatizer.py /workload-simulator/run.sh: -------------------------------------------------------------------------------- 1 | time python3 workload-simulator.py --project admission --workload admission-out.log --rows 50000 --num_queries 25000000 2 | #time python3 workload-simulator.py --project admission --workload admission-out.log --rows 50000 --num_queries 25000000 --logical 3 | #time python3 workload-simulator.py --project admission --workload admission-out.log --rows 50000 --num_queries 25000000 --static_suggest 4 | 5 | time python3 workload-simulator.py --project tiramisu --workload tiramisu-out.log --rows 500000 --num_queries 25000000 6 | #time python3 workload-simulator.py --project tiramisu --workload tiramisu-out.log --rows 500000 --num_queries 25000000 --logical 7 | #time python3 workload-simulator.py --project tiramisu --workload tiramisu-out.log --rows 500000 --num_queries 25000000 --static_suggest 8 | -------------------------------------------------------------------------------- /anonymizer/run-sampler.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: run.sh 3 | # Author: Lin Ma 4 | # mail: malin1993ml@gmail.com 5 | # Created Time: 03/04/17 6 | ######################################################################### 7 | #!/bin/bash 8 | 9 | trap onexit 1 2 3 15 10 | function onexit() { 11 | local exit_status=${1:-$?} 12 | pkill -f rnn.tag 13 | exit $exit_status 14 | } 15 | 16 | USAGE='usage: run.sh input_folder output_folder' 17 | 18 | if [ "$#" -ne 2 ]; then 19 | echo $USAGE 20 | exit 21 | fi 22 | 23 | mkdir -p $2 24 | 25 | for file in `find $1 -type f` 26 | do 27 | filename=`basename $file` 28 | if [[ $filename == *"schema"* ]]; then 29 | command="cp $file $2/" 30 | else 31 | if [ -f $2/$filename.anonymized.gz ]; then 32 | continue 33 | fi 34 | command="./data-sampler.py $file | gzip --best > $2/$filename.anonymized.sample.gz" 35 | fi 36 | 37 | if [ ! 
-f "$2/$filename.anonymized.sample.gz" ]; then 38 | echo $command 39 | eval $command & 40 | fi 41 | done 42 | 43 | wait 44 | 45 | -------------------------------------------------------------------------------- /anonymizer/run.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: run.sh 3 | # Author: Lin Ma 4 | # mail: malin1993ml@gmail.com 5 | # Created Time: 03/04/17 6 | ######################################################################### 7 | #!/bin/bash 8 | 9 | trap onexit 1 2 3 15 10 | function onexit() { 11 | local exit_status=${1:-$?} 12 | pkill -f rnn.tag 13 | exit $exit_status 14 | } 15 | 16 | USAGE='usage: run.sh input_folder output_folder' 17 | 18 | if [ "$#" -ne 2 ]; then 19 | echo $USAGE 20 | exit 21 | fi 22 | 23 | mkdir -p $2 24 | 25 | for file in `find $1 -type f` 26 | do 27 | filename=`basename $file` 28 | if [[ $filename == *"schema"* ]]; then 29 | command="cp $file $2/" 30 | else 31 | if [ -f $2/$filename.anonymized.gz ]; then 32 | continue 33 | fi 34 | command="./log-anonymizer.py --type mysql --version 5.5 $file | gzip --best > $2/$filename.anonymized.gz" 35 | fi 36 | 37 | if [ ! -f "$2/$filename.anonymized.gz" ]; then 38 | echo $command 39 | eval $command & 40 | fi 41 | done 42 | 43 | wait 44 | 45 | -------------------------------------------------------------------------------- /clusterer/run_sensitivity.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: run.sh 3 | # Author: Lin Ma 4 | # mail: malin1993ml@gmail.com 5 | # Created Time: 10/08/17 6 | ######################################################################### 7 | #!/bin/bash 8 | 9 | trap onexit 1 2 3 15 10 | function onexit() { 11 | local exit_status=${1:-$?} 12 | pkill -f hstore.tag 13 | exit $exit_status 14 | } 15 | 16 | # --------------------------------------------------------------------- 17 | 18 | # remove the log file 19 | if [ -f run.log ] ; then 20 | rm run.log 21 | fi 22 | 23 | PROJECT_ARRAY=( "tiramisu:tiramisu-combined-results" 24 | "oli:oli-combined-results" 25 | "admission:admission-combined-results" ) 26 | 27 | for PAIR in "${PROJECT_ARRAY[@]}"; do 28 | PROJECT="${PAIR%%:*}" 29 | DATA_PATH="${PAIR##*:}" 30 | for RHO in '0.55' '0.65' '0.75' '0.85' '0.95'; do 31 | cmd="time python3.5 online_clustering.py --project $PROJECT --dir $DATA_PATH 32 | --rho $RHO" 33 | 34 | echo $cmd 35 | echo $cmd >> run.log 36 | START=$(date +%s) 37 | 38 | eval $cmd & 39 | 40 | END=$(date +%s) 41 | DIFF=$(( $END - $START )) 42 | echo "Execution time: $DIFF seconds" 43 | echo -e "Execution time: $DIFF seconds\n" >> run.log 44 | 45 | done # RHO 46 | done # PROJECT 47 | 48 | -------------------------------------------------------------------------------- /clusterer/run_generate_cluster_coverage.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: run.sh 3 | # Author: Lin Ma 4 | # mail: malin1993ml@gmail.com 5 | # Created Time: 10/08/17 6 | ######################################################################### 7 | #!/bin/bash 8 | 9 | trap onexit 1 2 3 15 10 | function onexit() { 11 | local exit_status=${1:-$?} 12 | pkill -f hstore.tag 13 | exit $exit_status 14 | } 15 | 16 | # 
--------------------------------------------------------------------- 17 | 18 | # remove the log file 19 | if [ -f run.log ] ; then 20 | rm run.log 21 | fi 22 | 23 | for PROJECT in 'admission'; do 24 | for RHO in '0.1' '0.2' '0.3' '0.4'; do 25 | cmd="time python3.5 generate-cluster-coverage.py 26 | --project $PROJECT 27 | --assignment online-logical-clustering-results/$PROJECT-$RHO-assignments.pickle 28 | --output_csv_dir online-clusters-logical/$PROJECT/$RHO/ 29 | --output_dir cluster-coverage-logical/$PROJECT/$RHO/" 30 | 31 | echo $cmd 32 | echo $cmd >> run.log 33 | START=$(date +%s) 34 | 35 | eval $cmd & 36 | 37 | END=$(date +%s) 38 | DIFF=$(( $END - $START )) 39 | echo "Execution time: $DIFF seconds" 40 | echo -e "Execution time: $DIFF seconds\n" >> run.log 41 | 42 | done # RHO 43 | done # PROJECT 44 | 45 | -------------------------------------------------------------------------------- /forecaster/calc_mse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import fnmatch 4 | import csv 5 | from datetime import datetime, timedelta 6 | import sys 7 | import os 8 | import numpy as np 9 | 10 | DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" # Strip milliseconds ".%f" 11 | 12 | def GetMSE(input_path): 13 | dates = [] 14 | actual = [] 15 | predict = [] 16 | with open(input_path) as input_file: 17 | reader = csv.reader(input_file) 18 | for line in reader: 19 | #dates.append(datetime.strptime(line[0], DATETIME_FORMAT)) 20 | actual.append(float(line[1])) 21 | predict.append(float(line[2])) 22 | 23 | y = np.array(actual) 24 | y_hat = np.array(predict) 25 | 26 | data_min = 2 - np.min([np.min(y), np.min(y_hat)]) 27 | se = (np.log(y + data_min) - np.log(y_hat + data_min)) ** 2 28 | print("MSE of %s: %s" % (input_path, np.mean(se))) 29 | 30 | return se 31 | 32 | # ============================================== 33 | # main 34 | # ============================================== 35 | if __name__ == '__main__': 36 | losses = np.array([]) 37 | for root, dirnames, filenames in os.walk(sys.argv[1]): 38 | for filename in sorted(fnmatch.filter(filenames, '*.csv')): 39 | print(filename) 40 | file_path = os.path.join(root, filename) 41 | losses = np.append(losses, GetMSE(file_path)) 42 | 43 | print(np.mean(losses)) 44 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copy and decompress the sample data file 4 | fileid="1imVPNXk8mGU0v9OOhdp0d9wFDuYqARwZ" 5 | filename="tiramisu-sample.tar.gz" 6 | html=`curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}"` 7 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&`echo ${html}|grep -Po '(confirm=[a-zA-Z0-9\-_]+)'`&id=${fileid}" -o ${filename} 8 | tar -xvzf ${filename} 9 | 10 | # Generate and combine query templates 11 | ./pre-processor/templatizer.py tiramisu --dir tiramisu-sample/ --output templates 12 | ./pre-processor/csv-combiner.py --input_dir templates/ --output_dir tiramisu-combined-csv 13 | 14 | # Run through clustering algorithm 15 | ./clusterer/online_clustering.py --dir tiramisu-combined-csv/ --rho 0.8 16 | ./clusterer/generate-cluster-coverage.py --project tiramisu --assignment online-clustering-results/None-0.8-assignments.pickle --output_csv_dir online-clusters/ --output_dir cluster-coverage/ 17 | 18 | # Run forecasting models 19 | ./forecaster/run_sample.sh 20 | 21 | # Generate ENSEMBLE and HYBRID results 
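# The first call below builds the ENSEMBLE result by averaging the ar and noencoder-rnn
# predictions in log space (last argument False); the second builds the HYBRID result by
# letting the kr prediction override the ensemble on large predicted spikes (last argument True).
# See forecaster/generate_ensemble_hybrid.py for the exact combination rule.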
22 | ./forecaster/generate_ensemble_hybrid.py prediction-results/agg-60/horizon-4320/ar/ prediction-results/agg-60/horizon-4320/noencoder-rnn/ prediction-results/agg-60/horizon-4320/ensemble False 23 | ./forecaster/generate_ensemble_hybrid.py prediction-results/agg-60/horizon-4320/ensemble prediction-results/agg-60/horizon-4320/kr prediction-results/agg-60/horizon-4320/hybrid True 24 | -------------------------------------------------------------------------------- /forecaster/run_sample.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: run.sh 3 | # Author: Lin Ma 4 | # mail: malin1993ml@gmail.com 5 | # Created Time: 07/09/17 6 | ######################################################################### 7 | #!/bin/bash 8 | 9 | trap onexit 1 2 3 15 10 | function onexit() { 11 | local exit_status=${1:-$?} 12 | pkill -f hstore.tag 13 | exit $exit_status 14 | } 15 | 16 | # --------------------------------------------------------------------- 17 | 18 | log_name="run.log" 19 | 20 | # remove the log file 21 | if [ -f $log_name ] ; then 22 | rm $log_name 23 | fi 24 | 25 | for AGGREGATE in '60'; do 26 | for HORIZON in '4320'; do 27 | for PROJECT in 'tiramisu'; do 28 | for METHOD in 'ar' 'kr' 'rnn'; do 29 | cmd="time python3.5 forecaster/exp_multi_online_continuous.py $PROJECT 30 | --method $METHOD 31 | --aggregate $AGGREGATE 32 | --horizon $HORIZON 33 | --input_dir online-clusters/ 34 | --cluster_path cluster-coverage/coverage.pickle 35 | --output_dir prediction-results/" 36 | 37 | echo $cmd 38 | echo $cmd >> $log_name 39 | START=$(date +%s) 40 | 41 | eval $cmd 42 | 43 | END=$(date +%s) 44 | DIFF=$(( $END - $START )) 45 | echo "Execution time: $DIFF seconds" 46 | echo -e "Execution time: $DIFF seconds\n" >> $log_name 47 | 48 | done # METHOD 49 | done # PROJECT 50 | done # HORIZON 51 | done # AGGREGATE 52 | 53 | -------------------------------------------------------------------------------- /anonymizer/data-sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.5 2 | 3 | import sys 4 | import glob 5 | import collections 6 | import time 7 | import csv 8 | import os 9 | import datetime 10 | import gzip 11 | import re 12 | import argparse 13 | from multiprocessing import Process 14 | 15 | csv.field_size_limit(sys.maxsize) 16 | 17 | SAMPLE_STEP = 50 18 | 19 | OUTPUT = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL) 20 | 21 | def ProcessData(path, num_logs): 22 | data = [] 23 | processed_queries = 0 24 | templated_workload = dict() 25 | 26 | min_timestamp = datetime.datetime.max 27 | max_timestamp = datetime.datetime.min 28 | 29 | #try: 30 | f = gzip.open(path, mode='rt') 31 | reader = csv.reader(f, delimiter=',') 32 | 33 | for i, query_info in enumerate(reader): 34 | processed_queries += 1 35 | 36 | if (not num_logs is None) and processed_queries > num_logs: 37 | break 38 | 39 | if i % SAMPLE_STEP == 0: 40 | OUTPUT.writerow(query_info) 41 | 42 | # ============================================== 43 | # main 44 | # ============================================== 45 | if __name__ == '__main__': 46 | aparser = argparse.ArgumentParser(description='Templatize SQL Queries') 47 | aparser.add_argument('input', help='Input file') 48 | aparser.add_argument('--max_log', type=int, help='Maximum number of logs to process in a' 49 | 'data file. 
Process the whole file if not provided') 50 | args = vars(aparser.parse_args()) 51 | 52 | ProcessData(args['input'], args['max_log']) 53 | 54 | 55 | -------------------------------------------------------------------------------- /forecaster/run_logical.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: run.sh 3 | # Author: Lin Ma 4 | # mail: malin1993ml@gmail.com 5 | # Created Time: 07/09/17 6 | ######################################################################### 7 | #!/bin/bash 8 | 9 | trap onexit 1 2 3 15 10 | function onexit() { 11 | local exit_status=${1:-$?} 12 | pkill -f hstore.tag 13 | exit $exit_status 14 | } 15 | 16 | # --------------------------------------------------------------------- 17 | 18 | # remove the log file 19 | if [ -f run.log ] ; then 20 | rm run.log 21 | fi 22 | 23 | 24 | for RHO in '0.1' '0.2' '0.3' '0.4'; do 25 | for HORIZON in '60' '1440' '10080'; do 26 | for PROJECT in 'admission'; do 27 | for METHOD in 'ar'; do 28 | cmd="time python3.5 exp_multi_online_continuous.py $PROJECT --method $METHOD --aggregate 60 29 | --horizon $HORIZON 30 | --input_dir ~/peloton-tf/time-series-clustering/online-clusters-logical/$PROJECT/$RHO/ 31 | --cluster_path ~/peloton-tf/time-series-clustering/cluster-coverage-logical/$PROJECT/$RHO/coverage.pickle 32 | --output_dir ../prediction-logical-result/$PROJECT/$RHO/" 33 | 34 | echo $cmd 35 | echo $cmd >> run.log 36 | START=$(date +%s) 37 | 38 | eval $cmd & 39 | 40 | END=$(date +%s) 41 | DIFF=$(( $END - $START )) 42 | echo "Execution time: $DIFF seconds" 43 | echo -e "Execution time: $DIFF seconds\n" >> run.log 44 | 45 | done # METHOD 46 | done # PROJECT 47 | done # HORIZON 48 | done # RHO 49 | 50 | -------------------------------------------------------------------------------- /forecaster/run_sensitivity.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: run.sh 3 | # Author: Lin Ma 4 | # mail: malin1993ml@gmail.com 5 | # Created Time: 07/09/17 6 | ######################################################################### 7 | #!/bin/bash 8 | 9 | trap onexit 1 2 3 15 10 | function onexit() { 11 | local exit_status=${1:-$?} 12 | pkill -f hstore.tag 13 | exit $exit_status 14 | } 15 | 16 | # --------------------------------------------------------------------- 17 | 18 | # remove the log file 19 | if [ -f run.log ] ; then 20 | rm run.log 21 | fi 22 | 23 | 24 | for RHO in '0.55' '0.65' '0.75' '0.85' '0.95'; do 25 | for HORIZON in '60' '1440'; do 26 | for PROJECT in 'tiramisu' 'oli' 'admission'; do 27 | for METHOD in 'ar'; do 28 | cmd="time python3.5 exp_multi_online_continuous.py $PROJECT --method $METHOD --aggregate 60 29 | --horizon $HORIZON 30 | --input_dir ~/peloton-tf/time-series-clustering/online-clusters-sensitivity/$PROJECT/$RHO/ 31 | --cluster_path ~/peloton-tf/time-series-clustering/cluster-coverage-sensitivity/$PROJECT/$RHO/coverage.pickle 32 | --output_dir ../prediction-sensitivity-result/$PROJECT/$RHO/" 33 | 34 | echo $cmd 35 | echo $cmd >> run.log 36 | START=$(date +%s) 37 | 38 | eval $cmd & 39 | 40 | END=$(date +%s) 41 | DIFF=$(( $END - $START )) 42 | echo "Execution time: $DIFF seconds" 43 | echo -e "Execution time: $DIFF seconds\n" >> run.log 44 | 45 | done # METHOD 46 | done # PROJECT 47 | done # HORIZON 48 | done # RHO 49 | 50 | 
-------------------------------------------------------------------------------- /forecaster/run.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: run.sh 3 | # Author: Lin Ma 4 | # mail: malin1993ml@gmail.com 5 | # Created Time: 07/09/17 6 | ######################################################################### 7 | #!/bin/bash 8 | 9 | trap onexit 1 2 3 15 10 | function onexit() { 11 | local exit_status=${1:-$?} 12 | pkill -f hstore.tag 13 | exit $exit_status 14 | } 15 | 16 | # --------------------------------------------------------------------- 17 | 18 | log_name="run.log" 19 | 20 | # remove the log file 21 | if [ -f $log_name ] ; then 22 | rm $log_name 23 | fi 24 | 25 | #for AGGREGATE in '10' '20' '30' '60' '120'; do 26 | for AGGREGATE in '60'; do 27 | #for AGGREGATE in '1' '5' '10' '30' '60' '120'; do 28 | #for HORIZON in '720'; do 29 | #for HORIZON in '60' '2880' '4320' '10080'; do 30 | for HORIZON in '720' '1440' '7200'; do 31 | #for HORIZON in '60' '720' '1440' '2880' '4320' '7200' '10080'; do 32 | for PROJECT in 'admission'; do 33 | #for PROJECT in 'tiramisu' 'oli' 'admission'; do 34 | #for METHOD in 'kr'; do 35 | for METHOD in 'arma' 'ar' 'kr' 'fnn' 'rnn' 'psrnn'; do 36 | cmd="time python3.5 exp_multi_online_continuous.py $PROJECT 37 | --method $METHOD 38 | --aggregate $AGGREGATE 39 | --horizon $HORIZON" 40 | 41 | echo $cmd 42 | echo $cmd >> $log_name 43 | START=$(date +%s) 44 | 45 | eval $cmd 46 | 47 | END=$(date +%s) 48 | DIFF=$(( $END - $START )) 49 | echo "Execution time: $DIFF seconds" 50 | echo -e "Execution time: $DIFF seconds\n" >> $log_name 51 | 52 | done # METHOD 53 | done # PROJECT 54 | done # HORIZON 55 | done # AGGREGATE 56 | 57 | -------------------------------------------------------------------------------- /planner-simulator/schemaParser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | import collections 6 | import numpy as np 7 | np.set_printoptions(threshold=np.inf) 8 | import re 9 | 10 | KEYWORDS = ["KEY", "PRIMARY", "UNIQUE", "CONSTRAINT"] 11 | 12 | def extract_tables_and_columns(sql_dump): 13 | """ Extracts the tables and columns from the sql_dump and 14 | puts them into the semantics dictionary passed in 15 | 16 | Args: 17 | semantics_dict (dict): dictionary to populate 18 | sql_dump (iterable): iterable containing the sql_dump 19 | 20 | Returns: 21 | altered semantics dictionary 22 | 23 | """ 24 | semantics_dict = collections.OrderedDict() 25 | table = None 26 | for line in sql_dump: 27 | table_name = re.match("CREATE TABLE (\`*.+\`*)\(.*", line) 28 | if table_name is not None: 29 | table = (table_name.group(1)).replace("(", "").replace("`", "").strip() 30 | semantics_dict[table] = collections.OrderedDict() 31 | semantics_dict[table]["num_of_accesses"] = 0 32 | continue 33 | elif table is not None: 34 | line = line.strip() 35 | # Note: this is table not table_name, like above 36 | column = re.match("\`*(\w+)\`*", line) 37 | if column is not None: 38 | key_2 = column.group(0).replace("`", "").strip() 39 | 40 | # Handle case of PRIMARY KEY, KEY, and UNIQUE KEY 41 | is_key = False 42 | for key in KEYWORDS: 43 | if key in key_2: # 44 | is_key = True 45 | if is_key: 46 | continue 47 | 48 | (semantics_dict[table])[key_2] = 0 49 | else: # inside a table but nothing is declared? 
Exit 50 | end_paren = re.match("\)", line) 51 | if end_paren is not None: 52 | table = None 53 | else: 54 | continue 55 | print("Populated schema dict...") 56 | return semantics_dict 57 | -------------------------------------------------------------------------------- /clusterer/logical_clustering_utility/schemaParser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | import collections 6 | import numpy as np 7 | np.set_printoptions(threshold=np.inf) 8 | import re 9 | 10 | KEYWORDS = ["KEY", "PRIMARY", "UNIQUE", "CONSTRAINT"] 11 | 12 | def extract_tables_and_columns(sql_dump): 13 | """ Extracts the tables and columns from the sql_dump and 14 | puts them into the semantics dictionary passed in 15 | 16 | Args: 17 | semantics_dict (dict): dictionary to populate 18 | sql_dump (iterable): iterable containing the sql_dump 19 | 20 | Returns: 21 | altered semantics dictionary 22 | 23 | """ 24 | semantics_dict = collections.OrderedDict() 25 | table = None 26 | for line in sql_dump: 27 | table_name = re.match("CREATE TABLE (\`*.+\`*)\(.*", line) 28 | if table_name is not None: 29 | table = (table_name.group(1)).replace("(", "").replace("`", "").strip() 30 | semantics_dict[table] = collections.OrderedDict() 31 | semantics_dict[table]["num_of_accesses"] = 0 32 | continue 33 | elif table is not None: 34 | line = line.strip() 35 | # Note: this is table not table_name, like above 36 | column = re.match("\`*(\w+)\`*", line) 37 | if column is not None: 38 | key_2 = column.group(0).replace("`", "").strip() 39 | 40 | # Handle case of PRIMARY KEY, KEY, and UNIQUE KEY 41 | is_key = False 42 | for key in KEYWORDS: 43 | if key in key_2: # 44 | is_key = True 45 | if is_key: 46 | continue 47 | 48 | (semantics_dict[table])[key_2] = 0 49 | else: # inside a table but nothing is declared? 
Exit 50 | end_paren = re.match("\)", line) 51 | if end_paren is not None: 52 | table = None 53 | else: 54 | continue 55 | print("Populated schema dict...") 56 | return semantics_dict 57 | -------------------------------------------------------------------------------- /forecaster/Utilities.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.autograd import Variable 3 | import math 4 | import torch 5 | import matplotlib.pyplot as plt 6 | import os 7 | 8 | def onehot(X, dim): 9 | Xind = np.zeros(dim) 10 | Xind[X, np.arange(dim[1])] = 1 11 | return Xind 12 | 13 | def flat_prod(X,Y): 14 | XY = np.zeros((X.shape[0]*Y.shape[0], X.shape[1])) 15 | for i in range(X.shape[1]): 16 | XY[:,i] = np.kron(X[:,i], Y[:,i].T).reshape(X.shape[0]*Y.shape[0]) 17 | return XY 18 | 19 | def repackage_hidden(h): 20 | """Wraps hidden states in new Variables, to detach them from their history.""" 21 | if isinstance(h, tuple) or isinstance(h, list): 22 | return tuple(repackage_hidden(v) for v in h) 23 | else: 24 | return h.detach() 25 | 26 | def get_batch(source, i, bptt, evaluation=False): 27 | seq_len = min(bptt, source.shape[0] - 1 - i) 28 | data = source[i:i+seq_len] 29 | target = source[i+1:i+1+seq_len] 30 | return data, target 31 | 32 | def get_batch(source, i, bptt, evaluation=False, horizon=1): 33 | seq_len = min(bptt, source.shape[0] - horizon - i) 34 | data = source[i:i+seq_len] 35 | target = source[i+horizon:i+horizon+seq_len] 36 | return data, target 37 | 38 | def prettyPrint(description, loss): 39 | print('=' * 89) 40 | print('|| ',description, ' || loss {:5.3f}'.format(loss)) 41 | print('=' * 89) 42 | 43 | def my_plot(x_tst, y, i_plt,j_plt): 44 | plt.plot(x_tst[:,i_plt,j_plt]) 45 | plt.plot(y[:,i_plt,j_plt]) 46 | plt.show() 47 | 48 | def save_plot(x_tst, y, i_plt): 49 | x_tst = x_tst.transpose(1, 0, 2) 50 | y = y.transpose(1, 0, 2) 51 | plt.figure(figsize = (120, 2.5)) 52 | plt.plot(x_tst[:, :, i_plt].flatten(), linewidth = 0.5) 53 | plt.plot(y[:, :, i_plt].flatten(), linewidth = 0.5) 54 | #plt.ylim([0, 8000]) 55 | plot_dir = "../plot/regressed-admission-psrnn-lr1-log" 56 | #plot_dir = "../plot/regressed-admission-rnn-lr1-log" 57 | if not os.path.exists(plot_dir): 58 | os.makedirs(plot_dir) 59 | plt.savefig("%s/%d.pdf" % (plot_dir, i_plt)) 60 | plt.close() 61 | 62 | def plot_weights(W): 63 | plt.set_cmap('jet') 64 | plt.imshow(W) 65 | plt.show() 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /forecaster/models/FNN_Model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import math 3 | from torch.autograd import Variable 4 | 5 | 6 | class FNN_Model(nn.Module): 7 | """Container module with an encoder, a recurrent module, and a decoder.""" 8 | 9 | def __init__(self, rnn_type, ntoken, regress_dim, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): 10 | super(FNN_Model, self).__init__() 11 | 12 | nlayers = 1 13 | ninp = 10 14 | self.encoder = nn.Linear(ntoken * regress_dim, ninp) 15 | 16 | # First layer 17 | self.layers = [nn.Linear(ninp, nhid)] 18 | for i in range(nlayers - 1): 19 | layer = nn.Linear(nhid, nhid) 20 | self.layers.append(layer) 21 | 22 | self.layers = nn.ModuleList(self.layers) 23 | 24 | self.relu = nn.ReLU() 25 | self.dropout = nn.Dropout(dropout) 26 | 27 | self.decoder = nn.Linear(nhid, ntoken) 28 | 29 | # Optionally tie weights as in: 30 | # "Using the Output Embedding to Improve Language Models" 
(Press & Wolf 2016) 31 | # https://arxiv.org/abs/1608.05859 32 | # and 33 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016) 34 | # https://arxiv.org/abs/1611.01462 35 | if tie_weights: 36 | self.decoder.weight = self.encoder.weight 37 | 38 | self.init_weights() 39 | 40 | self.rnn_type = rnn_type 41 | self.nhid = nhid 42 | self.nlayers = nlayers 43 | 44 | def init_weights(self): 45 | initrange = 1 46 | 47 | encoder = self.encoder 48 | encoder.bias.data.fill_(0) 49 | encoder.weight.data.normal_(0, math.sqrt(2. / (encoder.in_features + encoder.out_features))) 50 | #encoder.weight.data.uniform_(-initrange, initrange) 51 | 52 | for layer in self.layers: 53 | layer.bias.data.fill_(0) 54 | layer.weight.data.normal_(0, math.sqrt(2. / (layer.in_features + layer.out_features))) 55 | #layer.weight.data.uniform_(-initrange, initrange) 56 | 57 | decoder = self.decoder 58 | decoder.bias.data.fill_(0) 59 | decoder.weight.data.normal_(0, math.sqrt(2. / (decoder.in_features + decoder.out_features))) 60 | #decoder.weight.data.uniform_(-initrange, initrange) 61 | 62 | def forward(self, input, hidden): 63 | 64 | bptt = input.size(0) 65 | 66 | emb = self.encoder(input) 67 | 68 | output = emb 69 | for layer in self.layers: 70 | output = self.relu(layer(output)) 71 | output = self.dropout(output) 72 | 73 | decoded = self.decoder(output) 74 | return decoded.view(1, bptt, -1), hidden 75 | 76 | def init_hidden(self, bsz): 77 | return None 78 | 79 | -------------------------------------------------------------------------------- /forecaster/models/PSRNN_Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | import torch.nn.functional as Functional 6 | 7 | class PSRNN_Model(nn.Module): 8 | """Container module with an encoder, a recurrent module, and a decoder.""" 9 | 10 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False, 11 | cuda=False): 12 | super(PSRNN_Model, self).__init__() 13 | 14 | self.RBF = nn.Linear(ntoken, ninp) 15 | self.embedding = nn.Linear(ninp, nhid) 16 | self.W_FE_F = nn.Linear(nhid, nhid*nhid) 17 | self.decoder = nn.Linear(nhid, ntoken) 18 | 19 | self.ninp = ninp 20 | self.nhid = nhid 21 | self.nlayers = nlayers 22 | self.ntoken = ntoken 23 | 24 | self.x_1 = np.ones((1,self.nhid)) 25 | 26 | self.init_weights_random() 27 | 28 | self.use_cuda = cuda 29 | 30 | def init_weights_random(self): 31 | initrange = 0.1 32 | self.RBF.weight.data.uniform_(-initrange, initrange) 33 | self.RBF.bias.data.fill_(0) 34 | self.embedding.weight.data.uniform_(-initrange, initrange) 35 | self.embedding.bias.data.fill_(0) 36 | self.W_FE_F.weight.data.uniform_(-initrange, initrange) 37 | self.W_FE_F.bias.data.fill_(0) 38 | self.decoder.weight.data.uniform_(-initrange, initrange) 39 | self.decoder.bias.data.fill_(0) 40 | 41 | def init_weights_psr(self, RBF_weight, RBF_bias, embedding_weight, W_FE_F_weight, W_FE_F_bias, 42 | decoder_weight, decoder_bias, x_1): 43 | self.RBF.weight.data = torch.Tensor(RBF_weight) 44 | self.RBF.bias.data = torch.Tensor(RBF_bias) 45 | self.embedding.weight.data = torch.Tensor(embedding_weight) 46 | self.embedding.bias.data.fill_(0) 47 | self.W_FE_F.weight.data = torch.Tensor(W_FE_F_weight) 48 | self.W_FE_F.bias.data = torch.Tensor(W_FE_F_bias) 49 | self.decoder.weight.data = torch.Tensor(decoder_weight) 50 | self.decoder.bias.data = torch.Tensor(decoder_bias) 
51 | self.x_1 = x_1 52 | 53 | def init_hidden(self, bsz): 54 | hidden = Variable(torch.Tensor(np.ones((bsz, 1)).dot(self.x_1))) 55 | if self.use_cuda: 56 | hidden = hidden.cuda() 57 | 58 | return hidden 59 | 60 | def forward(self, input, b): 61 | 62 | bptt = input.size(0) 63 | bsz = input.size(1) 64 | 65 | # encode observation 66 | input = input.view(bptt*bsz, input.size(2)) 67 | encoded = self.RBF(input).cos()*np.sqrt(2.)/np.sqrt(self.ninp) 68 | obs = self.embedding(encoded) 69 | obs = obs.view(bptt, bsz, self.nhid) 70 | 71 | # update state 72 | output = [0]*bptt 73 | for i in range(bptt): 74 | W = self.W_FE_F(b) 75 | W = W.view(self.nhid*bsz, self.nhid) 76 | b = [0]*bsz 77 | for j in range(bsz): 78 | obs_ij = obs[i,j,:].view(1,self.nhid) 79 | b[j] = W[j*self.nhid:(j+1)*self.nhid,:].mm(obs_ij.t()).t() 80 | ones = Variable(torch.ones(1,b[j].size()[1])) 81 | if self.use_cuda: 82 | ones = ones.cuda() 83 | b[j] = b[j].div((b[j].mm(b[j].t())).sqrt().mm(ones)) 84 | b = torch.cat(b) 85 | output[i] = self.decoder(b).view(1,b.size(0),-1) 86 | output = torch.cat(output) 87 | 88 | 89 | return output, b 90 | -------------------------------------------------------------------------------- /forecaster/models/RNN_Model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import math 3 | from torch.autograd import Variable 4 | 5 | 6 | class RNN_Model(nn.Module): 7 | """Container module with an encoder, a recurrent module, and a decoder.""" 8 | 9 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): 10 | super(RNN_Model, self).__init__() 11 | self.encoder = nn.Linear(ntoken, ninp) 12 | if rnn_type in ['LSTM', 'GRU']: 13 | self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout = dropout) 14 | else: 15 | try: 16 | nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] 17 | except KeyError: 18 | raise ValueError( """An invalid option for `--model` was supplied, 19 | options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") 20 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity) 21 | self.decoder = nn.Linear(nhid, ntoken) 22 | 23 | # Optionally tie weights as in: 24 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 25 | # https://arxiv.org/abs/1608.05859 26 | # and 27 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016) 28 | # https://arxiv.org/abs/1611.01462 29 | if tie_weights: 30 | self.decoder.weight = self.encoder.weight 31 | 32 | self.rnn_type = rnn_type 33 | self.nhid = nhid 34 | self.ninp = ninp 35 | self.nlayers = nlayers 36 | 37 | self.init_weights() 38 | 39 | 40 | def init_weights(self): 41 | initrange = 1 42 | encoder = self.encoder 43 | self.encoder.bias.data.fill_(0) 44 | #self.encoder.weight.data.uniform_(-initrange, initrange) 45 | encoder.weight.data.normal_(0, math.sqrt(2. / (encoder.in_features + encoder.out_features))) 46 | 47 | decoder = self.decoder 48 | self.decoder.bias.data.fill_(0) 49 | #self.decoder.weight.data.uniform_(-initrange, initrange) 50 | decoder.weight.data.normal_(0, math.sqrt(2. / (decoder.in_features + decoder.out_features))) 51 | 52 | self.rnn.weight_ih_l0.data.normal_(0, math.sqrt(2. / (self.ninp + self.nhid))) 53 | self.rnn.weight_ih_l0.data.normal_(0, math.sqrt(2. / (self.ninp + self.nhid))) 54 | self.rnn.bias_ih_l0.data.fill_(0) 55 | self.rnn.bias_hh_l0.data.fill_(0) 56 | 57 | self.rnn.weight_ih_l1.data.normal_(0, math.sqrt(2. 
/ (self.ninp + self.nhid))) 58 | self.rnn.weight_ih_l1.data.normal_(0, math.sqrt(2. / (self.ninp + self.nhid))) 59 | self.rnn.bias_ih_l1.data.fill_(0) 60 | self.rnn.bias_hh_l1.data.fill_(0) 61 | 62 | 63 | 64 | def forward(self, input, hidden): 65 | 66 | bptt = input.size(0) 67 | bsz = input.size(1) 68 | 69 | input = input.view(bptt*bsz, -1) 70 | emb = self.encoder(input) 71 | emb = emb.view(bptt, bsz, -1) 72 | 73 | output, hidden = self.rnn(emb, hidden) 74 | decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2))) 75 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden 76 | 77 | def init_hidden(self, bsz): 78 | weight = next(self.parameters()).data 79 | if self.rnn_type == 'LSTM': 80 | return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()), 81 | Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())) 82 | else: 83 | return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()) 84 | 85 | -------------------------------------------------------------------------------- /forecaster/generate_ensemble_hybrid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import copy 4 | import fnmatch 5 | import csv 6 | from datetime import datetime, timedelta 7 | import sys 8 | import os 9 | import numpy as np 10 | 11 | DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" # Strip milliseconds ".%f" 12 | 13 | SPIKE = "True" 14 | PERCENT = 150 15 | 16 | def GenerateData(data_dicts): 17 | 18 | r = [] 19 | r_hat = [] 20 | 21 | for data_dict in data_dicts: 22 | r.append(data_dict[0][0]) 23 | r_hat.append(data_dict[0][1]) 24 | 25 | data_actual = np.mean(np.array(r), axis=0) 26 | 27 | if SPIKE == "True": 28 | data_ensemble = [] 29 | for i in range(len(r_hat[1])): 30 | if (r_hat[1][i] > 1000) and abs(r_hat[0][i] - r_hat[1][i]) / r_hat[0][i] > PERCENT / 100: 31 | data_ensemble.append(r_hat[1][i]) 32 | else: 33 | data_ensemble.append(r_hat[0][i]) 34 | 35 | data_ensemble = np.array(data_ensemble) 36 | else: 37 | r_hat = np.array(r_hat) 38 | data_min = 2 - np.min(r_hat) 39 | avg = np.log(r_hat + data_min) 40 | data_ensemble = np.exp(np.mean(avg, axis=0)) - data_min 41 | 42 | return (data_actual, data_ensemble) 43 | 44 | 45 | def GetMSE(input_path): 46 | dates = [] 47 | actual = [] 48 | predict = [] 49 | with open(input_path) as input_file: 50 | reader = csv.reader(input_file) 51 | for line in reader: 52 | dates.append(datetime.strptime(line[0], DATETIME_FORMAT)) 53 | actual.append(max(0, float(line[1]))) 54 | predict.append(max(0, float(line[2]))) 55 | 56 | y = np.array(actual) 57 | y_hat = np.array(predict) 58 | 59 | return (y, y_hat), dates 60 | 61 | def GetDataDict(input_dir): 62 | data_dict = {} 63 | 64 | losses = np.array([]) 65 | for root, dirnames, filenames in os.walk(input_dir): 66 | for filename in sorted(fnmatch.filter(filenames, '*.csv')): 67 | file_path = os.path.join(root, filename) 68 | print(filename, file_path) 69 | 70 | data_dict[filename] = GetMSE(file_path) 71 | 72 | return data_dict 73 | 74 | 75 | def WriteResult(path, dates, actual, predict): 76 | with open(path, "w") as csvfile: 77 | writer = csv.writer(csvfile, quoting = csv.QUOTE_ALL) 78 | for x in range(len(dates)): 79 | writer.writerow([dates[x], actual[x], predict[x]]) 80 | 81 | 82 | 83 | def Main(input_dir1, input_dir2, output_dir, spike): 84 | global SPIKE 85 | if spike != None: 86 | SPIKE = spike 87 | delimiter = "/" 88 | output_dir += delimiter 89 | print("output_dir: " + output_dir) 90 | if not 
os.path.exists(output_dir): 91 | os.makedirs(output_dir) 92 | 93 | data_dict1 = GetDataDict(input_dir1) 94 | data_dict2 = GetDataDict(input_dir2) 95 | 96 | for file_name in data_dict1: 97 | data = GenerateData([data_dict1[file_name], data_dict2[file_name]]) 98 | if SPIKE: 99 | tag = "" 100 | else: 101 | tag = "" 102 | WriteResult(output_dir + tag + file_name, data_dict1[file_name][1], data[0], data[1]) 103 | 104 | 105 | 106 | # ============================================== 107 | # main 108 | # ============================================== 109 | if __name__ == '__main__': 110 | """ 111 | Generate the output for ENSEMBLE (or HYBRID) model with the resutls from 112 | individual models. 113 | 114 | Args: 115 | arg1 : directory for the prediction result of the first (ensemble) model 116 | arg2 : directory for the prediction result of the second (kr) model 117 | arg3 : output directory of ensemble (hybrid) 118 | arg4 : whether we're generating hybrid method (True) or ensemble (False) 119 | """ 120 | Main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # QueryBot 5000 2 | **QueryBot 5000 (QB5000)** is a robust forecasting framework that allows a DBMS to predict the expected arrival rate of queries 3 | in the future based on historical data. This is the source code for our 4 | [SIGMOD paper](http://www.cs.cmu.edu/~malin199/publications/2018.forecasting.sigmod.pdf): **_Query-based Workload Forecasting for Self-Driving Database Management Systems_**. 5 | 6 | ## Run forecasting on a sample of BusTracker workload: 7 | ./run.sh 8 | We provide an example of the workload forecasting for a sample subset of **BusTracker** workload. The prediction specified in the script is on 1 hour interval and 3 day horizon. The predicted arrival rates with different models for each cluster are in the _prediction-results_ folder. All the query templates of the workload can be found at _templates.txt_. 9 | 10 | The default experimental setting is to run under CPU. If you have a GPU, you can change [this parameter](https://github.com/malin1993ml/QueryBot5000/blob/master/forecaster/exp_multi_online_continuous.py#L101) to _True_ to enable GPU training. 11 | 12 | ### Dependencies 13 | python>=3.5 14 | scikit-learn>=0.18.1 15 | sortedcontainers>=1.5.7 16 | statsmodels>=0.8.0 17 | scipy>=0.19.0 18 | numpy>=1.14.2 19 | matplotlib>=2.0.2 20 | pytorch>=0.2.0_1 (you need to install the GPU version if you want to use GPU) 21 | 22 | ## Framework Pipeline: 23 | 24 | ### Anonymization 25 | We first anonymize all the queries from the real-world traces used in our experiments for privacy purposes. The components below use the anonymization results from this step as their input. 26 | 27 | cd anonymizer 28 | ./log-anonymizer.py --help 29 | 30 | ### Pre-processor 31 | This component extracts the **template**s from the anonymized queries and records the arrival rate history for each template. 32 | 33 | cd pre-processor 34 | ./templatizer.py --help 35 | 36 | ### Clusterer 37 | This component groups query templates with similar arrival rate patterns into **cluster**s. 38 | 39 | cd clusterer 40 | ./online_clustering.py --help 41 | _generate-cluster-coverage.py_ generates the time series for the largest _MAX_CLUSTER_NUM_ clusters on each day, which are used in the forecasting evaluation. 
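For intuition, the assignment rule that the _rho_ threshold controls can be sketched as follows: a template's arrival-rate history joins the most similar existing cluster if the similarity clears _rho_, and otherwise seeds a new cluster. The snippet below is only an illustrative sketch (it assumes cosine similarity and a naive running-mean center update, and the names assign_template and centers are made up for the example); the real incremental algorithm, including how cluster centers are maintained over time, lives in _online_clustering.py_.

    import numpy as np

    def assign_template(arrival_rates, centers, rho):
        """Attach one template's arrival-rate vector to the most similar
        cluster if the similarity clears rho; otherwise start a new cluster."""
        best_id, best_sim = None, -1.0
        for cid, center in centers.items():
            denom = np.linalg.norm(arrival_rates) * np.linalg.norm(center)
            sim = float(np.dot(arrival_rates, center) / denom) if denom > 0 else 0.0
            if sim > best_sim:
                best_id, best_sim = cid, sim
        if best_id is not None and best_sim >= rho:
            centers[best_id] = (centers[best_id] + arrival_rates) / 2.0  # crude center update
            return best_id
        new_id = len(centers)
        centers[new_id] = np.asarray(arrival_rates, dtype=float)
        return new_id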
42 | 43 | ### Forecaster 44 | This component uses a combination of linear regression, recurrent neural network, and kernel regression to predict the arrival rate pattern of each query cluster on different prediction **horizon**s and **interval**s. 45 | 46 | cd forecaster 47 | ./exp_multi_online_continuous.py --help 48 | 49 | ### Workload Simulator 50 | This simulator populates a synthetic database with a given schema file, removes all the secondary indexes, replays the query trace of the workload, and builds appropriate indexes with the real-time workload forecasting results. 51 | 52 | cd workload-simulator 53 | ./workload-simulator.py --help 54 | 55 | ## Inquiry about Data 56 | Due to legal and privacy constraints, unfortunately we cannot publish the full datasets that we used in the experiments for the publication (especially for the two student-related **Admissions** and **MOOC** workloads). To the best of our effort, we managed to publish a subset (2% random sampling) of the **BusTracker** [workload trace](https://drive.google.com/file/d/1imVPNXk8mGU0v9OOhdp0d9wFDuYqARwZ/view?usp=sharing) and the [schema file](https://drive.google.com/file/d/1d4z3SAwIOmv_PJTlsUfPCNHxZu2r-g_O/view?usp=sharing). 57 | 58 | We use [this script](https://github.com/malin1993ml/QueryBot5000/blob/master/anonymizer/run-sampler.sh) to generate the sample subset of the original workload trace. 59 | 60 | ## NOTE 61 | This repo does not have an end-to-end running framework. We build different components separately and pass the results through a workload simulator that connects to MySQL/PostgreSQL for experimental purposes. We are integrating the full framework into [Peloton](http://pelotondb.io/) Self-Driving DBMS. Please check out our [source code](https://github.com/cmu-db/peloton/tree/master/src/include/brain) there for more reference. 62 | 63 | ## License 64 | Copyright 2018, Carnegie Mellon University 65 | 66 | Licensed under the Apache License, Version 2.0 (the "License"); 67 | you may not use this file except in compliance with the License. 68 | You may obtain a copy of the License at 69 | 70 | http://www.apache.org/licenses/LICENSE-2.0 71 | 72 | Unless required by applicable law or agreed to in writing, software 73 | distributed under the License is distributed on an "AS IS" BASIS, 74 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 75 | See the License for the specific language governing permissions and 76 | limitations under the License. 
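As a footnote to the Forecaster section above: the ENSEMBLE and HYBRID outputs produced at the end of _run.sh_ come from _forecaster/generate_ensemble_hybrid.py_, which merges the per-interval predictions of two models. The sketch below restates that combination rule in isolation, using the same 1000-query spike threshold and 150% deviation constants as the script; the function name combine is made up, and the spike test is written division-free, so treat it as an illustration rather than the script itself.

    import numpy as np

    def combine(y_hat_a, y_hat_b, hybrid=False, threshold=1000, percent=150):
        # ENSEMBLE (hybrid=False): average the two predictions in log space.
        # HYBRID (hybrid=True): keep the first model's prediction unless the second
        # one (kernel regression in run.sh) predicts a large spike that deviates
        # from the first by more than `percent` percent.
        y_hat_a = np.asarray(y_hat_a, dtype=float)
        y_hat_b = np.asarray(y_hat_b, dtype=float)
        if hybrid:
            spike = (y_hat_b > threshold) & (np.abs(y_hat_a - y_hat_b) > (percent / 100) * y_hat_a)
            return np.where(spike, y_hat_b, y_hat_a)
        shift = 2 - min(y_hat_a.min(), y_hat_b.min())  # keep every value >= 2 before taking logs
        return np.exp((np.log(y_hat_a + shift) + np.log(y_hat_b + shift)) / 2) - shift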
77 | -------------------------------------------------------------------------------- /workload-simulator/preprocessing.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import gzip 4 | import datetime 5 | import re 6 | import hashlib 7 | 8 | # ============================================== 9 | # PROJECT CONFIGURATIONS 10 | # ============================================== 11 | 12 | PROJECTS = { 13 | "tiramisu": { 14 | "name": "tiramisu", 15 | "files": "dbp*postgresql-*.anonymized.gz", 16 | "mysql": False, 17 | "query_index": 3, 18 | "param_index": 4, 19 | "conn_pos": 1, 20 | "time_stamp_format": "%Y-%m-%d %H:%M:%S", 21 | "schema": "../mysql/combinedTiramisuSchema.sql", 22 | }, 23 | "admission": { 24 | "name": "admission", 25 | "files": "magneto.log.*.anonymized.gz", 26 | "mysql": True, 27 | "type_index": 3, 28 | "query_index": 4, 29 | "conn_pos": 2, 30 | "time_stamp_format": "%Y-%m-%d %H:%M:%S", 31 | 'schema': "../mysql/gradAdmissions2009New.sql", 32 | }, 33 | "oli": { 34 | "name": "oli", 35 | "files": "db*logfile*.anonymized.gz", 36 | "mysql": True, 37 | "type_index": 2, 38 | "query_index": 3, 39 | "conn_pos": 1, 40 | "time_stamp_format": "%y%m%d %H:%M:%S", 41 | 'schema': "../mysql/olischema.sql", 42 | } 43 | } 44 | 45 | STATEMENTS = ['select', 'SELECT', 'INSERT', 'insert', 'UPDATE', 'update', 'delete', 'DELETE'] 46 | SALT = "I fucking hate anonymizing queries" 47 | SALT = SALT.encode('utf-8') 48 | 49 | 50 | def GetEnumDict(schema_file): 51 | enum_dict = dict() 52 | sql_schema = open(schema_file, 'r') 53 | for line in sql_schema: 54 | enum_match = re.search("(ENUM|enum)\((.*)\)", line) 55 | #print(line) 56 | if enum_match is not None: 57 | enums = enum_match.group(2) 58 | enums = re.split(", | |,", enums) 59 | 60 | for enum in enums: 61 | data_length = str(len(enum) - 2) 62 | cleaned = hashlib.md5(SALT + 63 | enum.encode("utf-8")).hexdigest() 64 | clean_enum = "'" + (data_length + "\\" + str(cleaned)) + "'" 65 | enum_dict[clean_enum] = enum 66 | 67 | return enum_dict 68 | 69 | 70 | def preprocess(config, path, num_logs = None): 71 | # input: string of path to csv file 72 | # output: prints lines consisting of timestamp and query (comma separated) 73 | 74 | enum_dict = GetEnumDict(config['schema']) 75 | 76 | processed_queries = 0 77 | 78 | f = gzip.open(path, mode='rt') 79 | reader = csv.reader(f, delimiter=',') 80 | 81 | for query_info in reader: 82 | processed_queries += 1 83 | 84 | if (not num_logs is None) and processed_queries > num_logs: 85 | break 86 | 87 | if config['name'] == 'tiramisu': 88 | time_stamp = query_info[0] 89 | time_stamp = time_stamp[: -8] # remove milliseconds and the time zone 90 | 91 | else: 92 | if query_info[config['type_index']] != 'Query': # skip if not a query 93 | continue 94 | 95 | # create timestamp 96 | if config['name'] == 'admission': 97 | day = query_info[0] 98 | time = query_info[1].split(".")[0] # removes the milliseconds 99 | time_stamp = day + " " + time 100 | 101 | if config['name'] == 'oli': 102 | time_stamp = query_info[0] 103 | if time_stamp[7] == ' ': 104 | time_stamp = time_stamp[0: 7] + '0' + time_stamp[8: -1] 105 | #IF 106 | 107 | time_stamp = datetime.datetime.strptime( 108 | time_stamp, config['time_stamp_format']) 109 | time_stamp = time_stamp.replace(second=0) # accurate to the minute 110 | # Format query 111 | query = query_info[config['query_index']] 112 | 113 | for stmt in STATEMENTS: 114 | idx = query.find(stmt) 115 | if idx >= 0: 116 | break 117 | 118 | if idx < 
0: 119 | continue 120 | 121 | # put back all the params for unnamed prepared statements... 122 | # this is nasty... 123 | if (not config['mysql']) and "execute" in query: 124 | params = query_info[config['param_index']] 125 | params = re.findall("'.+?'", params) 126 | for i, param in reversed(list(enumerate(params))): 127 | query = query.replace("${}".format(i + 1), param) 128 | 129 | query = query[idx:] 130 | if query[-1] != ";": 131 | query += ";" 132 | 133 | for clean_enum in enum_dict: 134 | query = query.replace(clean_enum, enum_dict[clean_enum]) 135 | 136 | print(str(time_stamp) + ',' + query_info[config['conn_pos']] + ',' + query) 137 | 138 | if __name__ == '__main__': 139 | preprocess(PROJECTS[sys.argv[1]], sys.argv[2], 1100000000) 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /pre-processor/csv-combiner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import glob 5 | import collections 6 | import time 7 | import csv 8 | import os 9 | import datetime 10 | import gzip 11 | import re 12 | import argparse 13 | from multiprocessing import Process 14 | 15 | csv.field_size_limit(sys.maxsize) 16 | 17 | STATEMENTS = ['select', 'SELECT', 'INSERT', 'insert', 'UPDATE', 'update', 'delete', 'DELETE'] 18 | DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 19 | TIME_STAMP_STEP = datetime.timedelta(minutes=1) 20 | 21 | def MakeCSVFiles(workload_dict, min_timestamp, max_timestamp, output_dir): 22 | print("Generating CSV files...") 23 | print(output_dir) 24 | 25 | # Create the result folder if not exists 26 | if not os.path.exists(output_dir): 27 | os.makedirs(output_dir) 28 | 29 | # delete any old existing files 30 | for old_file in os.listdir(output_dir): 31 | os.remove(output_dir + old_file) 32 | 33 | template_count = 0 34 | for template in workload_dict: 35 | template_timestamps = workload_dict[ 36 | template] # time stamps for ith cluster 37 | num_queries_for_template = sum(template_timestamps.values()) 38 | 39 | # write to csv file 40 | with open(output_dir + 'template' + str(template_count) + 41 | ".csv", 'w') as csvfile: 42 | template_writer = csv.writer(csvfile, dialect='excel') 43 | template_writer.writerow([num_queries_for_template, template]) 44 | for entry in sorted(template_timestamps): 45 | template_writer.writerow([entry, template_timestamps[entry]]) 46 | csvfile.close() 47 | template_count += 1 48 | 49 | print("Template count: " + str(template_count)) 50 | 51 | def AddEntry(template, reader, min_timestamp, max_timestamp, templated_workload): 52 | 53 | # Finer process the template a bit to reduce the total template numbers 54 | template = re.sub(r"&&&", r"#", template) 55 | template = re.sub(r"@@@", r"#", template) 56 | template = re.sub(r"[nN]ull", r"#", template) 57 | template = re.sub(r"NULL", r"#", template) 58 | template = re.sub(r"\s+", r" ", template) 59 | template = re.sub(r"\( ", r"(", template) 60 | template = re.sub(r" \)", r")", template) 61 | template = re.sub(r"([^ ])\(", r"\1 (", template) 62 | template = re.sub(r"\)([^ ])", r") \1", template) 63 | template = re.sub(r" IN \([^\(]*?\)", r" IN ()", template) 64 | template = re.sub(r" in \([^\(]*?\)", r" IN ()", template) 65 | template = re.sub(r"([=<>,!\?])([^ ])", r"\1 \2", template) 66 | template = re.sub(r"([^ ])=", r"\1 =", template) 67 | 68 | #if (template.find("gradAdmissions2#Test") > 0 and template.find("INSERT") >= 0 and 69 | if (template.find("INSERT") >= 0 and 70 | 
template.find("VALUES") > 0): 71 | template = template[: template.find("VALUES") + 6] 72 | 73 | for line in reader: 74 | time_stamp = datetime.datetime.strptime(line[0], DATETIME_FORMAT) 75 | count = int(line[1]) 76 | 77 | if not template in templated_workload: 78 | # add template 79 | templated_workload[template] = dict() 80 | 81 | if time_stamp in templated_workload[template]: 82 | templated_workload[template][time_stamp] += count 83 | else: 84 | templated_workload[template][time_stamp] = count 85 | 86 | min_timestamp = min(min_timestamp, time_stamp) 87 | max_timestamp = max(max_timestamp, time_stamp) 88 | 89 | return (templated_workload, min_timestamp, max_timestamp) 90 | 91 | 92 | def Combine(input_dir, output_dir): 93 | 94 | templated_workload = dict() 95 | 96 | min_timestamp = datetime.datetime.max 97 | max_timestamp = datetime.datetime.min 98 | 99 | target = os.path.join(input_dir, "*/*template*.csv") 100 | print(target) 101 | files = sorted([ x for x in glob.glob(target) ]) 102 | cnt = 0 103 | for x in files: 104 | print(x) 105 | with open(x, 'r') as f: 106 | reader = csv.reader(f) 107 | queries, template = next(reader) 108 | #statement = template.split(' ',1)[0] 109 | #if not statement in STATEMENTS: 110 | # continue 111 | 112 | templated_workload, min_timestamp, max_timestamp = AddEntry(template, reader, 113 | min_timestamp, max_timestamp, templated_workload) 114 | 115 | cnt += 1 116 | #if cnt == 1000: 117 | # break 118 | 119 | print(min_timestamp) 120 | print(max_timestamp) 121 | with open('templates.txt', 'w') as template_file: 122 | [ template_file.write(t + "\n") for t in sorted(templated_workload.keys()) ] 123 | 124 | MakeCSVFiles(templated_workload, min_timestamp, max_timestamp, output_dir) 125 | 126 | 127 | 128 | 129 | # ============================================== 130 | # main 131 | # ============================================== 132 | if __name__ == '__main__': 133 | aparser = argparse.ArgumentParser(description='Templated query csv combiner') 134 | aparser.add_argument('--input_dir', help='Input Data Directory') 135 | aparser.add_argument('--output_dir', help='Output Data Directory') 136 | args = vars(aparser.parse_args()) 137 | 138 | Combine(args['input_dir'], args['output_dir'] + '/') 139 | -------------------------------------------------------------------------------- /clusterer/logical_clustering_utility/buildVectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import re 5 | import string 6 | import numpy as np 7 | 8 | 9 | # Create the logical vectors for templates 10 | def create_vectors(templates, semantics_dict): 11 | 12 | ################################################################## 13 | # Variable initialization, and defining constants for use in our # 14 | # feature vector creation # 15 | ################################################################## 16 | data = [] 17 | # Special SQL-keywords 18 | query_type = ["SELECT", "INSERT", "UPDATE", "DELETE"] 19 | # INSERT, UPDATE and DELETE are closer to one another than to SELECT. 
20 | keywords = ["GROUP", "ORDER", "HAVING", "LIMIT"] 21 | # keywords are wrapped up into SELECT 22 | 23 | # Don't change the next few lines 24 | tables = list(semantics_dict) 25 | columns = extract_columns(semantics_dict) 26 | num_tables = len(tables) 27 | num_columns = len(columns) 28 | 29 | # Result vector dictionary 30 | vector_dict = dict() 31 | 32 | ################################################################### 33 | # Vector Format # 34 | # [query type, table_dist, table_1, table_2, table_3..., table_n, # 35 | # column_1, column_2, ... , column_m] # 36 | ################################################################### 37 | for template in templates: 38 | 39 | vector_length = 2 + num_tables + num_columns 40 | vector = np.zeros(vector_length) 41 | 42 | #print(template) 43 | 44 | query = re.split('\s+', template) # split query by whitespace 45 | for sem in query: 46 | sem = str(sem) 47 | #print(sem) 48 | 49 | if sem in string.whitespace: # Verified correct 50 | continue 51 | 52 | if sem.upper() in query_type: # Verified correct 53 | sem = sem.upper() 54 | # This enforces the distance between query_type 55 | index = query_type.index(sem) 56 | #print("type?", index) 57 | if index != 0: 58 | vector[0] = index + 5 59 | continue 60 | else: 61 | vector[0] = index # 0 62 | continue 63 | 64 | elif sem.upper() in keywords: 65 | sem = sem.upper() 66 | # since we know that the keywords are a byproduct 67 | # of SELECT, since SELECT is 0, we modify on the first 68 | # index 69 | vector[0] += keywords.index(sem) 70 | continue 71 | 72 | elif sem in columns: 73 | #print("col1: ", sem) 74 | vector[2 + num_tables + columns.index(sem)] += 1 75 | continue 76 | 77 | else: # Handles cases of tables + useless stuff 78 | try: 79 | # Since iterating through log, will come across 80 | # words that are irrelevant. This allows us to remove 81 | # them in constant time 82 | semantics_dict[sem] 83 | table_index = tables.index(sem) 84 | #print(template) 85 | #print("table: ", sem) 86 | vector[2 + table_index] += 1 87 | continue 88 | except: 89 | # Meaning that it's not a table, but it could be a table.col 90 | # format. 
Yeah, fuck you too sql 91 | try: 92 | (table, col) = re.split("[.,]", sem)[:2] 93 | except: 94 | continue 95 | if table in tables: 96 | table_index = tables.index(table) 97 | else: 98 | table_index = -1 99 | if col in columns: 100 | col_index = columns.index(col) 101 | #print("col2: ", col) 102 | else: 103 | col_index = -1 104 | 105 | if table_index != -1: 106 | vector[2 + table_index] += 1 107 | if col_index != -1: 108 | vector[2 + num_tables + col_index] += 1 109 | continue 110 | 111 | vector_dict[template] = vector 112 | #print(vector) 113 | 114 | print("Built Feature Vectors...") 115 | return vector_dict 116 | 117 | 118 | def delete_zero_cols(arr): 119 | """ Deletes columns that are entirely zero (dimension reduction) 120 | Args: 121 | arr (numpy array): feature vectors 122 | Returns: 123 | (numpy array): reduced feature vectors 124 | """ 125 | 126 | zero_cols = np.nonzero(arr.sum(axis=0) == 0) 127 | arr = np.delete(arr, zero_cols, axis=1) 128 | return arr 129 | 130 | 131 | def extract_columns(semantics_dict): 132 | """Extracts the columns for each table (for the inner dictionary) 133 | from the semantic dictionary 134 | 135 | Args: 136 | semantics_dict (dict): dict of dicts 137 | 138 | Returns: 139 | (list): union of all columns from all tables 140 | 141 | """ 142 | columns = set() 143 | for table in semantics_dict: 144 | column_dict = semantics_dict[table] 145 | columns = columns.union(set(column_dict)) 146 | return list(columns) 147 | -------------------------------------------------------------------------------- /forecaster/spectral/Two_Stage_Regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | from sklearn.kernel_approximation import RBFSampler 4 | import scipy as sp 5 | import Utilities 6 | 7 | ############################################################################### 8 | # Project into RKHS 9 | ############################################################################### 10 | 11 | def ridgeRegression(X, Y, lr): 12 | 13 | A = X.dot(Y.T) 14 | B = Y.dot(Y.T) + lr*np.eye(Y.dot(Y.T).shape[0]) 15 | X, _, _, _ = np.linalg.lstsq(B.T, A.T) 16 | return X.T 17 | 18 | def batchRidgeRegression(XY_T, Y, lr): 19 | A = XY_T 20 | B = Y.dot(Y.T) + lr*np.eye(Y.dot(Y.T).shape[0]) 21 | Z, _, _, _ = np.linalg.lstsq(B.T, A.T) 22 | return Z.T 23 | 24 | def doubleBatchRidgeRegression(XY_T, YY_T, lr): 25 | A = XY_T 26 | B = YY_T + lr*np.eye(YY_T.shape[0]) 27 | Z, _, _, _ = np.linalg.lstsq(B.T, A.T) 28 | return Z.T 29 | 30 | def batch_transform(data, dataFun, rbf_sampler, U, matrix_batch_size): 31 | nSvd = U.shape[0] 32 | nData = data.shape[1] 33 | 34 | X_rbf = np.zeros((nSvd, nData)) 35 | for i in range(0,data.shape[1], matrix_batch_size): 36 | if i % (math.ceil(data.shape[1]/(matrix_batch_size*10))*matrix_batch_size) == 0: 37 | print(i,'/',nData) 38 | end = min(i+matrix_batch_size, nData) 39 | y = dataFun(data[:,i:end]) 40 | X_rbf[:,i:end] = U.dot(rbf_sampler.transform(y.T).T) 41 | return X_rbf 42 | 43 | def batch_svd(data, dataFun, rbf_sampler, args): 44 | C = np.zeros((args.nRFF,args.nRFF)) 45 | for i in range(0, data.shape[1], args.matrix_batch_size): 46 | if i % (math.ceil(data.shape[1]/(args.matrix_batch_size*10))*args.matrix_batch_size) == 0: 47 | print(i,'/', data.shape[1]) 48 | end = min(i+args.matrix_batch_size, data.shape[1]) 49 | x = dataFun(data[:,i:end]) 50 | x = rbf_sampler.transform(x.T).T 51 | C += x.dot(x.T) 52 | U, S, V = sp.sparse.linalg.svds(C, args.nhid) 53 | return U.T 54 | 55 | #def 
two_stage_regression2(corpus, batch_size, kernel_width, seed, nRFF, nSvd, reg_rate): 56 | #onehot(x[i:end],[input_dim,end-i]) 57 | 58 | def two_stage_regression(data, obsFun, pastFun, futureFun, shiftedFutureFun, outputFun, args): 59 | 60 | # create RBF projection 61 | obs_rbf_sampler = RBFSampler(gamma=args.kernel_width, random_state=args.seed*5, n_components=args.nRFF) 62 | obs_rbf_sampler.fit(np.zeros((1, obsFun(data[:,0].reshape(-1,1)).shape[0]))) 63 | past_rbf_sampler = RBFSampler(gamma=args.kernel_width, random_state=args.seed*7, n_components=args.nRFF) 64 | past_rbf_sampler.fit(np.zeros((1, pastFun(data[:,0].reshape(-1,1)).shape[0]))) 65 | future_rbf_sampler = RBFSampler(gamma=args.kernel_width, random_state=args.seed*9, n_components=args.nRFF) 66 | future_rbf_sampler.fit(np.zeros((1, futureFun(data[:,0].reshape(-1,1)).shape[0]))) 67 | 68 | # Calculate linear projection of RFF 69 | U_obs = batch_svd(data, obsFun, obs_rbf_sampler, args) 70 | U_past = batch_svd(data, pastFun, past_rbf_sampler, args) 71 | U_future = batch_svd(data, futureFun, future_rbf_sampler, args) 72 | 73 | # Project data using RBF then U 74 | Obs_U = batch_transform(data, obsFun, obs_rbf_sampler, U_obs, args.matrix_batch_size) 75 | P_U = batch_transform(data, pastFun, past_rbf_sampler, U_past, args.matrix_batch_size) 76 | F_U = batch_transform(data, futureFun, future_rbf_sampler, U_future, args.matrix_batch_size) 77 | FS_U = batch_transform(data, shiftedFutureFun, future_rbf_sampler, U_future, args.matrix_batch_size) 78 | 79 | data = data 80 | 81 | 82 | # stage 1 regression 83 | W_F_P = ridgeRegression(F_U, P_U, args.reg_rate) 84 | FE_P = np.zeros((F_U.shape[0]*Obs_U.shape[0], F_U.shape[0])) 85 | for i in range(0,F_U.shape[1], args.matrix_batch_size): 86 | if i % (math.ceil(F_U.shape[1]/(args.matrix_batch_size*10))*args.matrix_batch_size) == 0: 87 | print(i,'/',F_U.shape[1]) 88 | 89 | end = min(i+args.matrix_batch_size, F_U.shape[1]) 90 | FE_U_batch = Utilities.flat_prod(FS_U[:,i:end],Obs_U[:,i:end]) 91 | P_U_batch = P_U[:,i:end] 92 | FE_P += FE_U_batch.dot(P_U_batch.T) 93 | W_FE_P = batchRidgeRegression(FE_P, P_U, args.reg_rate) 94 | 95 | # apply stage 1 regression to data to generate input for stage2 regression 96 | E_F = W_F_P.dot(P_U) 97 | E_FE_F = np.zeros((W_FE_P.shape[0], F_U.shape[0])) 98 | for i in range(0,F_U.shape[1], args.matrix_batch_size): 99 | if i % (math.ceil(F_U.shape[1]/(args.matrix_batch_size*10))*args.matrix_batch_size) == 0: 100 | print(i,'/',F_U.shape[1]) 101 | 102 | end = min(i+args.matrix_batch_size, F_U.shape[1]) 103 | E_FE_batch = W_FE_P.dot(P_U[:,i:end]) 104 | E_F_batch = W_F_P.dot(P_U[:,i:end]) 105 | E_FE_F += E_FE_batch.dot(E_F_batch.T) 106 | 107 | # stage 2 regression 108 | W_FE_F = batchRidgeRegression(E_FE_F, E_F, args.reg_rate) 109 | 110 | # calculate initial state 111 | x_1 = np.mean(F_U,1).reshape(1,-1) 112 | 113 | # regress from state to predictions 114 | F_FU = np.zeros((outputFun(data[:,0].reshape(-1,1)).shape[0], args.nhid)) 115 | for i in range(0, data.shape[1], args.matrix_batch_size): 116 | if i % (math.ceil(data.shape[1]/(args.matrix_batch_size*10))*args.matrix_batch_size) == 0: 117 | print(i,'/',data.shape[1]) 118 | 119 | end = min(i+args.matrix_batch_size, data.shape[1]) 120 | output_batch = outputFun(data[:,i:end]) 121 | FU_batch = F_U[:,i:end] 122 | F_FU += output_batch.dot(FU_batch.T) 123 | 124 | W_pred = batchRidgeRegression(F_FU, F_U, args.reg_rate) 125 | 126 | return obs_rbf_sampler, U_obs, W_FE_F, np.zeros(W_FE_F.shape[0]), W_pred, np.zeros(W_pred.shape[0]), x_1 
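# --- Editorial note (not part of the original file) -----------------------
# The ridge-regression helpers above (ridgeRegression, batchRidgeRegression,
# doubleBatchRidgeRegression) all evaluate the same closed form
# W = X Y^T (Y Y^T + lr*I)^{-1}, computed via a least-squares solve on the
# transposed normal equations. Below is a minimal, self-contained sanity
# check of that equivalence; the shapes, seed, and helper name are
# illustrative assumptions, not values used anywhere in this repository.
import numpy as np

def _ridge_closed_form_check(lr=0.1, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(4, 50)                      # targets: one column per sample
    Y = rng.randn(6, 50)                      # inputs:  one column per sample
    A = X.dot(Y.T)
    B = Y.dot(Y.T) + lr * np.eye(Y.shape[0])
    W_direct = A.dot(np.linalg.inv(B))        # closed-form ridge solution
    W_lstsq, _, _, _ = np.linalg.lstsq(B.T, A.T, rcond=None)
    assert np.allclose(W_direct, W_lstsq.T)   # matches ridgeRegression(X, Y, lr)
# ---------------------------------------------------------------------------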
127 | -------------------------------------------------------------------------------- /pre-processor/templatizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.5 2 | 3 | import sys 4 | import glob 5 | import collections 6 | import time 7 | import csv 8 | import os 9 | import datetime 10 | import gzip 11 | import re 12 | import argparse 13 | from multiprocessing import Process 14 | 15 | csv.field_size_limit(sys.maxsize) 16 | 17 | TIME_STAMP_STEP = datetime.timedelta(minutes=1) 18 | STATEMENTS = ['select', 'SELECT', 'INSERT', 'insert', 'UPDATE', 'update', 'delete', 'DELETE'] 19 | 20 | # ============================================== 21 | # PROJECT CONFIGURATIONS 22 | # ============================================== 23 | 24 | PROJECTS = { 25 | "tiramisu": { 26 | "name": "tiramisu", 27 | "files": "dbp*postgresql-*.anonymized.sample.gz", 28 | "mysql": False, 29 | "query_index": 3, 30 | "time_stamp_format": "%Y-%m-%d %H:%M:%S" 31 | }, 32 | "admissions": { 33 | "name": "admissions", 34 | "files": "magneto.log.*.anonymized.gz", 35 | "mysql": True, 36 | "type_index": 3, 37 | "query_index": 4, 38 | "time_stamp_format": "%Y-%m-%d %H:%M:%S" 39 | }, 40 | "oli": { 41 | "name": "oli", 42 | "files": "db*logfile*.anonymized.gz", 43 | "mysql": True, 44 | "type_index": 2, 45 | "query_index": 3, 46 | "time_stamp_format": "%y%m%d %H:%M:%S" 47 | } 48 | } 49 | 50 | 51 | def ProcessData(path, output_dir, num_logs, config): 52 | # input: string of path to csv file 53 | # output: array of tuples 54 | # tuple setup: (time_stamp, query) 55 | # type: (datetime object, string) 56 | 57 | # Define time tracker 58 | #over_all_start = time.time() 59 | #start = time.time() 60 | 61 | print("Start processing: " + path) 62 | 63 | data = [] 64 | processed_queries = 0 65 | templated_workload = dict() 66 | 67 | min_timestamp = datetime.datetime.max 68 | max_timestamp = datetime.datetime.min 69 | 70 | try: 71 | f = gzip.open(path, mode='rt') 72 | reader = csv.reader(f, delimiter=',') 73 | 74 | for query_info in reader: 75 | processed_queries += 1 76 | 77 | if (not num_logs is None) and processed_queries > num_logs: 78 | break 79 | 80 | if config['name'] == 'tiramisu': 81 | time_stamp = query_info[0] 82 | time_stamp = time_stamp[: -8] # remove milliseconds and the time zone 83 | 84 | else: 85 | if query_info[config['type_index']] != 'Query': # skip if not a query 86 | continue 87 | 88 | # create timestamp 89 | if config['name'] == 'admissions': 90 | day = query_info[0] 91 | time = query_info[1].split(".")[0] # removes the milliseconds 92 | time_stamp = day + " " + time 93 | 94 | if config['name'] == 'oli': 95 | time_stamp = query_info[0] 96 | if time_stamp[7] == ' ': 97 | time_stamp = time_stamp[0: 7] + '0' + time_stamp[8: -1] 98 | #IF 99 | 100 | time_stamp = datetime.datetime.strptime( 101 | time_stamp, config['time_stamp_format']) 102 | time_stamp = time_stamp.replace(second=0) # accurate to the minute 103 | # Format query 104 | query = query_info[config['query_index']] 105 | 106 | for stmt in STATEMENTS: 107 | idx = query.find(stmt) 108 | if idx >= 0: 109 | break 110 | 111 | if idx < 0: 112 | continue 113 | 114 | min_timestamp = min(min_timestamp, time_stamp) 115 | max_timestamp = max(max_timestamp, time_stamp) 116 | 117 | # Update query templates 118 | GetTemplate(query[idx:], time_stamp, templated_workload) 119 | 120 | except Exception as e: 121 | print("It might be an incomplete file. 
But we continue anyway.") 122 | print(e) 123 | 124 | 125 | MakeCSVFiles(templated_workload, min_timestamp, max_timestamp, output_dir + '/' + 126 | path.split('/')[-1].split('.gz')[0] + '/') 127 | 128 | #end = time.time() 129 | #print("Preprocess and template extraction time for %s: %s" % (path, str(end - start))) 130 | 131 | def GetTemplate(query, time_stamp, templated_workload): 132 | # CHANGE: Returns a dictionary, where keys are templates, and they map to 133 | # a map of timestamps map to query counts with that timestamp 134 | 135 | STRING_REGEX = r'([^\\])\'((\')|(.*?([^\\])\'))' 136 | DOUBLE_QUOTE_STRING_REGEX = r'([^\\])"((")|(.*?([^\\])"))' 137 | 138 | INT_REGEX = r'([^a-zA-Z])-?\d+(\.\d+)?' # To prevent us from capturing table name like "a1" 139 | 140 | HASH_REGEX = r'(\'\d+\\.*?\')' 141 | 142 | template = re.sub(HASH_REGEX, r"@@@", query) 143 | template = re.sub(STRING_REGEX, r"\1&&&", template) 144 | template = re.sub(DOUBLE_QUOTE_STRING_REGEX, r"\1&&&", template) 145 | template = re.sub(INT_REGEX, r"\1#", template) 146 | 147 | if template in templated_workload: 148 | # add timestamp 149 | if time_stamp in templated_workload[template]: 150 | templated_workload[template][time_stamp] += 1 151 | else: 152 | templated_workload[template][time_stamp] = 1 153 | else: 154 | templated_workload[template] = dict() 155 | templated_workload[template][time_stamp] = 1 156 | 157 | return templated_workload 158 | 159 | 160 | def MakeCSVFiles(workload_dict, min_timestamp, max_timestamp, output_dir): 161 | print("Generating CSV files...") 162 | print(output_dir) 163 | 164 | # Create the result folder if not exists 165 | if not os.path.exists(output_dir): 166 | os.makedirs(output_dir) 167 | 168 | # delete any old existing files 169 | for old_file in os.listdir(output_dir): 170 | os.remove(output_dir + old_file) 171 | 172 | template_count = 0 173 | for template in workload_dict: 174 | #print(template) 175 | template_timestamps = workload_dict[ 176 | template] # time stamps for ith cluster 177 | #time_stamp_dict = collections.OrderedDict() 178 | num_queries_for_template = sum(template_timestamps.values()) 179 | 180 | # loops over timestamps stepping by TIME_STAMP_STEP 181 | #for i in range( 182 | # int((max_timestamp - min_timestamp) / TIME_STAMP_STEP) + 1): 183 | # time_stamp = min_timestamp + (i * TIME_STAMP_STEP) 184 | # if time_stamp in template_timestamps: 185 | # count = template_timestamps[time_stamp] 186 | # else: 187 | # count = 0 188 | 189 | # time_stamp_dict[time_stamp] = count 190 | 191 | # write to csv file 192 | with open(output_dir + 'template' + str(template_count) + 193 | ".csv", 'w') as csvfile: 194 | template_writer = csv.writer(csvfile, dialect='excel') 195 | template_writer.writerow([num_queries_for_template, template]) 196 | for entry in sorted(template_timestamps.keys()): 197 | template_writer.writerow([entry, template_timestamps[entry]]) 198 | #for entry in time_stamp_dict: 199 | # template_writer.writerow([entry, time_stamp_dict[entry]]) 200 | csvfile.close() 201 | template_count += 1 202 | 203 | print("Template count: " + str(template_count)) 204 | 205 | def ProcessAnonymizedLogs(input_dir, output_dir, max_log, config): 206 | target = os.path.join(input_dir, config['files']) 207 | files = sorted([ x for x in glob.glob(target) ]) 208 | 209 | proc = [] 210 | for i, log_file in enumerate(files): 211 | #if i < 45: 212 | # continue 213 | print(i, log_file) 214 | 215 | #continue 216 | 217 | # Process log 218 | p = Process(target = ProcessData, args = (log_file, output_dir, 
max_log, config)) 219 | p.start() 220 | proc.append(p) 221 | 222 | for p in proc: 223 | p.join() 224 | 225 | 226 | # ============================================== 227 | # main 228 | # ============================================== 229 | if __name__ == '__main__': 230 | aparser = argparse.ArgumentParser(description='Templatize SQL Queries') 231 | aparser.add_argument('project', choices=PROJECTS.keys(), help='Data source type') 232 | aparser.add_argument('--dir', help='Input Data Directory') 233 | aparser.add_argument('--output', help='Output data directory') 234 | aparser.add_argument('--max_log', type=int, help='Maximum number of logs to process in a' 235 | 'data file. Process the whole file if not provided') 236 | args = vars(aparser.parse_args()) 237 | 238 | ProcessAnonymizedLogs(args['dir'], args['output'], args['max_log'], PROJECTS[args['project']]) 239 | 240 | 241 | -------------------------------------------------------------------------------- /clusterer/generate-cluster-coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import fnmatch 3 | import csv 4 | from datetime import datetime, timedelta 5 | import datetime as dt 6 | import sys 7 | import os 8 | import pickle 9 | import numpy as np 10 | import shutil 11 | import argparse 12 | 13 | import matplotlib.pyplot as plt 14 | import matplotlib.ticker as plticker 15 | import matplotlib.dates as mpdates 16 | import matplotlib as mpl 17 | 18 | from sortedcontainers import SortedDict 19 | 20 | DATA_DICT = { 21 | #'admission': "../synthetic_workload/noise/", 22 | 'admission': "../clustering/timeseries/admissions/admission-combined-results-full/", 23 | 'oli': "oli-combined-results/", 24 | 'tiramisu': 'tiramisu-combined-csv/', 25 | } 26 | 27 | # Only looks at the csv files for the first 10 templates for testing purpose 28 | TESTING = False 29 | 30 | # The number of the largest clusters to consider for coverage evaluation and 31 | # forecasting 32 | MAX_CLUSTER_NUM = 3 33 | 34 | # If it's the full trace used for kernel regression, always aggregate the data 35 | # into 10 minutes intervals 36 | FULL = True 37 | 38 | # If it's the noisy data evaluation, use a smaller time gap to calculate the 39 | # total volume of the largest clusters. In the future we should automatically 40 | # adjust this to the point where the worklaod has shifted after we detect that a 41 | # shift happened (i.e., the majority of the workload comes from unseen queries). 42 | # And of course a long horizon prediction is hard to work if the shift only 43 | # happened for a short period. 44 | NOISE = False 45 | 46 | if FULL: 47 | AGGREGATE = 10 48 | else: 49 | AGGREGATE = 1 50 | 51 | if NOISE: 52 | LAST_TOTAL_TIME_GAP = 1200 # seconds 53 | else: 54 | LAST_TOTAL_TIME_GAP = 86400 # seconds 55 | 56 | DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" # Strip milliseconds ".%f" 57 | 58 | 59 | def LoadData(input_path): 60 | total_queries = dict() 61 | templates = [] 62 | min_date = datetime.max 63 | max_date = datetime.min 64 | data = dict() 65 | data_accu = dict() 66 | 67 | cnt = 0 68 | for csv_file in sorted(os.listdir(input_path)): 69 | print(csv_file) 70 | with open(input_path + "/" + csv_file, 'r') as f: 71 | reader = csv.reader(f) 72 | queries, template = next(reader) 73 | 74 | # To make the matplotlib work... 
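            # (editorial note: presumably because '$' in a template would
            #  otherwise be interpreted as a TeX/mathtext math delimiter when
            #  templates are rendered in matplotlib labels)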
75 | template = template.replace('$', '') 76 | 77 | # Assume we already filtered out other types of queries when combining template csvs 78 | #statement = template.split(' ',1)[0] 79 | #if not statement in STATEMENTS: 80 | # continue 81 | 82 | #print queries, template 83 | total_queries[template] = int(queries) 84 | #print queries 85 | 86 | templates.append(template) 87 | 88 | # add template 89 | data[template] = SortedDict() 90 | data_accu[template] = SortedDict() 91 | 92 | total = 0 93 | 94 | for line in reader: 95 | time_stamp = datetime.strptime(line[0], DATETIME_FORMAT) 96 | count = int(line[1]) 97 | 98 | data[template][time_stamp] = count 99 | 100 | total += count 101 | data_accu[template][time_stamp] = total 102 | 103 | min_date = min(min_date, time_stamp) 104 | max_date = max(max_date, time_stamp) 105 | #break 106 | 107 | cnt += 1 108 | 109 | if TESTING: 110 | if cnt == 10: 111 | break 112 | 113 | templates = sorted(templates) 114 | 115 | return min_date, max_date, data, data_accu, total_queries, templates 116 | 117 | def GenerateData(min_date, max_date, data, data_accu, templates, assignment_dict, total_queries, 118 | num_clusters, output_csv_dir): 119 | plotted_total = 0 120 | plotted_cnt = 0 121 | totals = [] 122 | 123 | coverage_lists = [[] for i in range(MAX_CLUSTER_NUM)] 124 | 125 | top_clusters = [] 126 | 127 | online_clusters = dict() 128 | 129 | last_date = min_date 130 | if FULL: 131 | # Normal full evaluation 132 | assignment_dict = assignment_dict[0:] 133 | # used for the micro evaluation only for the spike patterns 134 | #assignment_dict = assignment_dict[365:] 135 | for current_date, assignments in assignment_dict: 136 | cluster_totals = dict() 137 | date_total = 0 138 | 139 | for template, cluster in assignments.items(): 140 | if cluster == -1: 141 | continue 142 | 143 | last_total_date = next(data_accu[template].irange(maximum = current_date, reverse = 144 | True)) 145 | if (current_date - last_total_date).seconds < LAST_TOTAL_TIME_GAP: 146 | template_total = data_accu[template][last_total_date] 147 | else: 148 | template_total = 0 149 | date_total += template_total 150 | 151 | if not cluster in cluster_totals: 152 | cluster_totals[cluster] = template_total 153 | else: 154 | cluster_totals[cluster] += template_total 155 | 156 | if len(cluster_totals) == 0: 157 | last_date = current_date 158 | continue 159 | 160 | sorted_clusters = sorted(cluster_totals.items(), key = lambda x: x[1], reverse = True) 161 | 162 | sorted_names, sorted_totals = zip(*sorted_clusters) 163 | 164 | current_top_clusters = sorted_clusters[:MAX_CLUSTER_NUM] 165 | print(current_date, current_top_clusters) 166 | 167 | if FULL: 168 | record_ahead_time = timedelta(days = 30) 169 | else: 170 | record_ahead_time = timedelta(days = 8) 171 | 172 | for c, v in current_top_clusters: 173 | if not c in online_clusters: 174 | online_clusters[c] = SortedDict() 175 | for template, cluster in assignments.items(): 176 | if cluster != c: 177 | continue 178 | 179 | if FULL: 180 | start_date = min_date 181 | else: 182 | start_date = max(min_date, last_date - dt.timedelta(weeks = 4)) 183 | for d in data[template].irange(start_date, last_date + record_ahead_time, (True, False)): 184 | if not d in online_clusters[cluster]: 185 | online_clusters[cluster][d] = data[template][d] 186 | else: 187 | online_clusters[cluster][d] += data[template][d] 188 | 189 | 190 | current_top_cluster_names = next(zip(*current_top_clusters)) 191 | for template, cluster in assignments.items(): 192 | if not cluster in 
current_top_cluster_names: 193 | continue 194 | 195 | for d in data[template].irange(last_date + record_ahead_time, current_date + 196 | record_ahead_time, (True, False)): 197 | if not d in online_clusters[cluster]: 198 | online_clusters[cluster][d] = data[template][d] 199 | else: 200 | online_clusters[cluster][d] += data[template][d] 201 | 202 | top_clusters.append((current_date, current_top_clusters)) 203 | 204 | for i in range(MAX_CLUSTER_NUM): 205 | coverage_lists[i].append(sum(sorted_totals[:i + 1]) / date_total) 206 | 207 | last_date = current_date 208 | 209 | coverage = [ sum(l) / len(l) for l in coverage_lists] 210 | 211 | for c in online_clusters: 212 | if (len(online_clusters[c]) < 2): 213 | continue 214 | l = online_clusters[c].keys()[0] 215 | r = online_clusters[c].keys()[-1] 216 | 217 | n = (r - l).seconds // 60 + (r - l).days * 1440 + 1 218 | dates = [l + dt.timedelta(minutes = i) for i in range(n)] 219 | v = 0 220 | #for d, v in online_clusters[c].items(): 221 | for d in dates: 222 | if d in online_clusters[c]: 223 | v += online_clusters[c][d] 224 | if d.minute % AGGREGATE == 0: 225 | WriteResult(output_csv_dir + "/" + str(c) + ".csv", d, v) 226 | v = 0 227 | 228 | return top_clusters, coverage 229 | 230 | def WriteResult(path, date, data): 231 | with open(path, "a") as csvfile: 232 | writer = csv.writer(csvfile, quoting = csv.QUOTE_ALL) 233 | writer.writerow([date, data]) 234 | 235 | def Main(project, assignment_path, output_csv_dir, output_dir): 236 | with open(assignment_path, 'rb') as f: 237 | num_clusters, assignment_dict, _ = pickle.load(f) 238 | 239 | min_date, max_date, data, data_accu, total_queries, templates = LoadData(DATA_DICT[project]) 240 | 241 | if not os.path.exists(output_dir): 242 | os.makedirs(output_dir) 243 | 244 | if os.path.exists(output_csv_dir): 245 | shutil.rmtree(output_csv_dir) 246 | os.makedirs(output_csv_dir) 247 | 248 | top_clusters, coverage = GenerateData(min_date, max_date, data, data_accu, templates, assignment_dict, 249 | total_queries, num_clusters, output_csv_dir) 250 | 251 | print(assignment_path, coverage) 252 | 253 | with open(output_dir + "coverage.pickle", 'wb') as f: # Python 3: open(..., 'wb') 254 | pickle.dump((top_clusters, coverage), f) 255 | 256 | 257 | # ============================================== 258 | # main 259 | # ============================================== 260 | if __name__ == '__main__': 261 | aparser = argparse.ArgumentParser(description='Generate Cluster Coverage') 262 | aparser.add_argument('--project', help='The name of the workload') 263 | aparser.add_argument('--assignment', help='The pickle file to store the clustering assignment') 264 | aparser.add_argument('--output_csv_dir', help='The directory to put the output csvs') 265 | aparser.add_argument('--output_dir', help='Where to put the output coverage files') 266 | args = vars(aparser.parse_args()) 267 | 268 | Main(args['project'], args['assignment'], args['output_csv_dir'], args['output_dir']) 269 | 270 | -------------------------------------------------------------------------------- /clusterer/online_logical_clustering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | from datetime import datetime 6 | import datetime as dt 7 | import argparse 8 | import csv 9 | import numpy as np 10 | import time 11 | import itertools 12 | import random 13 | import pickle 14 | import re 15 | import math 16 | 17 | import matplotlib.pyplot as plt 18 | import 
matplotlib.ticker as plticker 19 | import matplotlib.dates as mpdates 20 | import matplotlib as mpl 21 | 22 | from sortedcontainers import SortedDict 23 | 24 | from sklearn.preprocessing import normalize 25 | from sklearn.neighbors import NearestNeighbors 26 | 27 | from logical_clustering_utility.schemaParser import extract_tables_and_columns 28 | from logical_clustering_utility.buildVectors import create_vectors 29 | 30 | csv.field_size_limit(sys.maxsize) 31 | 32 | 33 | OUTPUT_DIR = 'online-logical-clustering-results/' 34 | 35 | YLABEL = r"# Queries / min" 36 | STATEMENTS = ['select', 'SELECT', 'INSERT', 'insert', 'UPDATE', 'update', 'delete', 'DELETE'] 37 | 38 | # "2016-10-31","17:50:21.344030" 39 | DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" # Strip milliseconds ".%f" 40 | 41 | TESTING = False 42 | 43 | USE_KNN = True 44 | KNN_ALG = "kd_tree" 45 | 46 | def LoadData(input_path): 47 | total_queries = dict() 48 | templates = [] 49 | min_date = datetime.max 50 | max_date = datetime.min 51 | data = dict() 52 | 53 | cnt = 0 54 | for csv_file in sorted(os.listdir(input_path)): 55 | print(csv_file) 56 | with open(input_path + "/" + csv_file, 'r') as f: 57 | reader = csv.reader(f) 58 | queries, template = next(reader) 59 | 60 | # To make the matplotlib work... 61 | template = template.replace('$', '') 62 | 63 | # Assume we already filtered out other types of queries when combining template csvs 64 | #statement = template.split(' ',1)[0] 65 | #if not statement in STATEMENTS: 66 | # continue 67 | 68 | #print queries, template 69 | total_queries[template] = int(queries) 70 | #print queries 71 | 72 | templates.append(template) 73 | 74 | # add template 75 | data[template] = SortedDict() 76 | 77 | for line in reader: 78 | time_stamp = datetime.strptime(line[0], DATETIME_FORMAT) 79 | count = int(line[1]) 80 | 81 | data[template][time_stamp] = count 82 | 83 | min_date = min(min_date, time_stamp) 84 | max_date = max(max_date, time_stamp) 85 | 86 | cnt += 1 87 | 88 | if TESTING: 89 | if cnt == 10: 90 | break 91 | 92 | templates = sorted(templates) 93 | 94 | return min_date, max_date, data, total_queries, templates 95 | 96 | def Similarity(x, y): 97 | return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y) + 1e-6) 98 | 99 | def AdjustCluster(min_date, current_date, next_date, data, last_ass, next_cluster, centers, 100 | cluster_totals, total_queries, cluster_sizes, rho, vector_dict): 101 | new_ass = last_ass.copy() 102 | 103 | print("Building kdtree for single point assignment") 104 | clusters = sorted(centers.keys()) 105 | 106 | samples = list() 107 | 108 | for cluster in clusters: 109 | sample = centers[cluster] 110 | samples.append(sample) 111 | 112 | if len(samples) == 0: 113 | nbrs = None 114 | else: 115 | normalized_samples = normalize(np.array(samples), copy = False) 116 | nbrs = NearestNeighbors(n_neighbors=1, algorithm=KNN_ALG, metric='l2') 117 | nbrs.fit(normalized_samples) 118 | 119 | print("Finish building kdtree for single point assignment") 120 | 121 | 122 | cnt = 0 123 | for t in sorted(data.keys()): 124 | cnt += 1 125 | # Test whether this template still belongs to the original cluster 126 | if new_ass[t] != -1: 127 | center = centers[new_ass[t]] 128 | #print(cnt, new_ass[t], Similarity(data[t], center, index)) 129 | if cluster_sizes[new_ass[t]] == 1 or Similarity(vector_dict[t], center) > rho: 130 | continue 131 | 132 | # the template is eliminated from the original cluster 133 | if new_ass[t] != -1: 134 | cluster = new_ass[t] 135 | cluster_sizes[cluster] -= 1 136 | centers[cluster] -= 
vector_dict[t] 137 | cluster_totals[cluster] -= total_queries[t] 138 | print("%s: template %s quit from cluster %d with total %d" % (next_date, cnt, cluster, 139 | total_queries[t])) 140 | 141 | 142 | # Whether this template has "arrived" yet? 143 | if new_ass[t] == -1 and len(list(data[t].irange(current_date, next_date))) == 0: 144 | continue 145 | 146 | new_cluster = None 147 | if nbrs != None: 148 | # whether this template is similar to the center of an existing cluster 149 | nbr = nbrs.kneighbors(normalize([vector_dict[t]]), return_distance = False)[0][0] 150 | if Similarity(vector_dict[t], centers[clusters[nbr]]) > rho: 151 | new_cluster = clusters[nbr] 152 | 153 | if new_cluster != None: 154 | if new_ass[t] == -1: 155 | print("%s: template %s joined cluster %d with total %d" % (next_date, cnt, 156 | new_cluster, total_queries[t])) 157 | else: 158 | print("%s: template %s reassigned to cluster %d with total %d" % (next_date, 159 | cnt, new_cluster, total_queries[t])) 160 | 161 | new_ass[t] = new_cluster 162 | centers[cluster] += vector_dict[t] 163 | cluster_totals[cluster] += total_queries[t] 164 | cluster_sizes[new_cluster] += 1 165 | continue 166 | 167 | if new_ass[t] == -1: 168 | print("%s: template %s created cluster as %d with total %d" % (next_date, cnt, 169 | next_cluster, total_queries[t])) 170 | else: 171 | print("%s: template %s recreated cluster as %d with total %d" % (next_date, cnt, 172 | next_cluster, total_queries[t])) 173 | 174 | new_ass[t] = next_cluster 175 | centers[next_cluster] = vector_dict[t] 176 | cluster_sizes[next_cluster] = 1 177 | cluster_totals[next_cluster] = total_queries[t] 178 | 179 | next_cluster += 1 180 | 181 | clusters = list(centers.keys()) 182 | # a union-find set to track the root cluster for clusters that have been merged 183 | root = [-1] * len(clusters) 184 | 185 | print("Building kdtree for cluster merging") 186 | samples = list() 187 | 188 | for cluster in clusters: 189 | sample = centers[cluster] 190 | samples.append(sample) 191 | 192 | if len(samples) == 0: 193 | nbrs = None 194 | else: 195 | normalized_samples = normalize(np.array(samples), copy = False) 196 | nbrs = NearestNeighbors(n_neighbors=2, algorithm=KNN_ALG, metric='l2') 197 | nbrs.fit(normalized_samples) 198 | print("Finish building kdtree for cluster merging") 199 | 200 | for i in range(len(clusters)): 201 | c1 = clusters[i] 202 | c = None 203 | 204 | if nbrs != None: 205 | nbr = nbrs.kneighbors([centers[c1]], return_distance = False)[0] 206 | 207 | if clusters[nbr[0]] == c1: 208 | nbr = nbr[1] 209 | else: 210 | nbr = nbr[0] 211 | 212 | while root[nbr] != -1: 213 | nbr = root[nbr] 214 | 215 | if c1 != clusters[nbr] and Similarity(centers[c1], centers[clusters[nbr]]) > rho: 216 | c = clusters[nbr] 217 | 218 | if c != None: 219 | centers[c] += centers[c1] 220 | cluster_sizes[c] += cluster_sizes[c1] 221 | 222 | del centers[c1] 223 | del cluster_sizes[c1] 224 | 225 | if nbrs != None: 226 | root[i] = nbr 227 | 228 | for t in data.keys(): 229 | if new_ass[t] == c1: 230 | new_ass[t] = c 231 | print("%d assigned to %d with total %d" % (c1, c, total_queries[t])) 232 | 233 | print("%s: cluster %d merged into cluster %d" % (next_date, c1, c)) 234 | 235 | return new_ass, next_cluster 236 | 237 | 238 | def OnlineClustering(min_date, max_date, data, total_queries, rho, vector_dict): 239 | print(rho) 240 | cluster_gap = 1440 241 | 242 | n = (max_date - min_date).seconds // 60 + (max_date - min_date).days * 1440 + 1 243 | num_gaps = n // cluster_gap 244 | 245 | centers = dict() 246 | 
cluster_totals = dict() 247 | cluster_sizes = dict() 248 | 249 | assignments = [] 250 | ass = dict() 251 | for t in data.keys(): 252 | ass[t] = -1 253 | assignments.append((min_date, ass)) 254 | 255 | current_date = min_date 256 | next_cluster = 0 257 | for i in range(num_gaps): 258 | next_date = current_date + dt.timedelta(minutes = cluster_gap) 259 | assign, next_cluster = AdjustCluster(min_date, current_date, next_date, data, assignments[-1][1], 260 | next_cluster, centers, cluster_totals, total_queries, cluster_sizes, rho, 261 | vector_dict) 262 | assignments.append((next_date, assign)) 263 | 264 | current_date = next_date 265 | 266 | 267 | return next_cluster, assignments, cluster_totals 268 | 269 | 270 | # ============================================== 271 | # main 272 | # ============================================== 273 | if __name__ == '__main__': 274 | aparser = argparse.ArgumentParser(description='Logical clusreting') 275 | aparser.add_argument('--dir', default="combined-results", help='The directory that contains the time series' 276 | 'csv files') 277 | aparser.add_argument('--schema_path', help='The path of the schema file') 278 | aparser.add_argument('--project', help='The name of the workload') 279 | aparser.add_argument('--rho', default=0.8, help='The threshold to determine' 280 | 'whether a query template belongs to a cluster') 281 | args = vars(aparser.parse_args()) 282 | 283 | if not os.path.exists(OUTPUT_DIR): 284 | os.makedirs(OUTPUT_DIR) 285 | 286 | min_date, max_date, data, total_queries, templates = LoadData(args['dir']) 287 | 288 | # Pre-processing: template extraction + schema parsing + preprocessing 289 | schema_file = open(args['schema_path'], 'r') 290 | schema_dict = extract_tables_and_columns(schema_file) 291 | 292 | # Get logical vectors for query templates 293 | vector_dict = create_vectors(templates, schema_dict) 294 | 295 | num_clusters, assignment_dict, cluster_totals = OnlineClustering(min_date, max_date, data, 296 | total_queries, float(args['rho']), vector_dict) 297 | 298 | with open(OUTPUT_DIR + "{}-{}-assignments.pickle".format(args['project'], args['rho']), 299 | 'wb') as f: # Python 3: open(..., 'wb') 300 | pickle.dump((num_clusters, assignment_dict, cluster_totals), f) 301 | 302 | print(num_clusters) 303 | print(cluster_totals) 304 | print(sum(cluster_totals.values())) 305 | print(sum(total_queries.values())) 306 | -------------------------------------------------------------------------------- /forecaster/plot-sensitivity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import fnmatch 4 | import csv 5 | import sys 6 | import os 7 | import pickle 8 | import numpy as np 9 | 10 | import matplotlib.pyplot as plt 11 | import matplotlib.ticker as plticker 12 | import matplotlib.dates as mpdates 13 | import matplotlib as mpl 14 | 15 | from datetime import datetime, timedelta 16 | 17 | import seaborn as sns 18 | from sortedcontainers import SortedDict 19 | 20 | DATA_DICT = {'admission': "~/peloton-tf/time-series-clustering/admission-combined-results/", 21 | 'oli': "~/peloton-tf/time-series-clustering/oli-combined-results/", 22 | 'tiramisu': '~/peloton-tf/time-series-clustering/tiramisu-combined-results/', 23 | } 24 | 25 | INPUT_DIR = "../prediction-sensitivity-result/" 26 | GRAPH_DIR = "../plot/sensitivity/" 27 | ASSIGNMENT_DIR = "~/peloton-tf/time-series-clustering/online-clustering-results/" 28 | HORIZON = "60" 29 | AGGREGATE = 60 30 | METHOD = "ar" 31 | PROJECTS = ['admission', 
'tiramisu', 'oli'] 32 | RHOS = ['0.5', '0.6', '0.7', '0.8', '0.9'] 33 | #RHOS = ['0.5', '0.55', '0.6', '0.65', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95'] 34 | 35 | DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" # Strip milliseconds ".%f" 36 | 37 | 38 | def SetupMplParams(): 39 | color = sns.color_palette("hls", 4) 40 | mpl.rcParams.update(mpl.rcParamsDefault) 41 | 42 | mpl.rcParams['ps.useafm'] = True 43 | mpl.rcParams['pdf.use14corefonts'] = True 44 | mpl.rcParams['text.usetex'] = True 45 | mpl.rcParams['text.latex.preamble'] = [ 46 | #r'\usepackage{siunitx}', # i need upright \micro symbols, but you need... 47 | #r'\sisetup{detect-all}', # ...this to force siunitx to actually use your fonts 48 | r'\usepackage{helvet}', # set the normal font here 49 | r'\usepackage{sansmath}', # load up the sansmath so that math -> helvet 50 | r'\sansmath' # <- tricky! -- # gotta actually # tell tex to use! 51 | ] 52 | mpl.rcParams['xtick.labelsize'] = 20 53 | mpl.rcParams['ytick.labelsize'] = 20 54 | 55 | return color 56 | 57 | def PlotLineChart(xaxis, data, name, xlabel, ylabel, color): 58 | fig = plt.figure() 59 | fig.set_size_inches(12, 2) 60 | ax = fig.add_subplot(111) 61 | loc = plticker.MultipleLocator(1) # this locator puts ticks at regular intervals 62 | ax.xaxis.set_major_locator(loc) 63 | loc = plticker.MaxNLocator(5) # this locator puts ticks at regular intervals 64 | ax.yaxis.set_major_locator(loc) 65 | ax.grid() 66 | 67 | n = len(xaxis) 68 | ax.set_ylabel(ylabel,fontsize=18,weight='bold') 69 | ax.set_xlabel(xlabel,fontsize=18,weight='bold') 70 | #ax.set_xlim([xaxis[0], xaxis[-1]]) 71 | #ax.set_ylim([0.6, 1.0]) 72 | 73 | ax.set_xticks(range(n)) 74 | ax.set_xticklabels(xaxis) 75 | 76 | ax.plot(range(n), data[0][:n], marker = 'o', color = color[1], label = 'Admissions', linewidth = 3) 77 | ax.plot(range(n), data[1][:n], marker = '^', color = color[3], label = 'BusTracker', linewidth = 3) 78 | ax.plot(range(n), data[2][:n], marker = 's', color = color[0], label = 'MOOC', linewidth = 3) 79 | #ax.legend(bbox_to_anchor = [1, 0.3], loc = 'lower right') 80 | #ax.legend(bbox_to_anchor = [0.5, 1], loc = 'lower center', ncol = 6) 81 | plt.savefig("%s%s.pdf" % (GRAPH_DIR, name), bbox_inches='tight') 82 | 83 | # produce a legend for the objects in the other figure 84 | figLegend = plt.figure(figsize = (4.5,0.4)) 85 | plt.figlegend(*ax.get_legend_handles_labels(), loc = 'center', ncol=6, fontsize=20) 86 | figLegend.savefig(GRAPH_DIR + "legend.pdf", bbox_inches='tight') 87 | plt.close(figLegend) 88 | 89 | plt.close(fig) 90 | 91 | 92 | def LoadData(input_path): 93 | total_queries = dict() 94 | templates = [] 95 | min_date = datetime.max 96 | max_date = datetime.min 97 | data = dict() 98 | data_accu = dict() 99 | 100 | for csv_file in sorted(os.listdir(os.path.expanduser(input_path))): 101 | print(csv_file) 102 | with open(os.path.expanduser(input_path) + "/" + csv_file, 'r') as f: 103 | reader = csv.reader(f) 104 | queries, template = next(reader) 105 | 106 | # To make the matplotlib work... 
107 | template = template.replace('$', '') 108 | 109 | # Assume we already filtered out other types of queries when combining template csvs 110 | #statement = template.split(' ',1)[0] 111 | #if not statement in STATEMENTS: 112 | # continue 113 | 114 | #print queries, template 115 | total_queries[template] = int(queries) 116 | #print queries 117 | 118 | templates.append(template) 119 | 120 | # add template 121 | data[template] = SortedDict() 122 | data_accu[template] = SortedDict() 123 | 124 | total = 0 125 | 126 | for line in reader: 127 | ts = datetime.strptime(line[0], DATETIME_FORMAT) 128 | time_stamp = datetime(ts.year, ts.month, ts.day, ts.hour, 0, 0) 129 | count = int(line[1]) 130 | total += count 131 | if not time_stamp in data[template]: 132 | data[template][time_stamp] = 0 133 | data[template][time_stamp] += count 134 | data_accu[template][time_stamp] = total 135 | 136 | min_date = min(min_date, time_stamp) 137 | max_date = max(max_date, time_stamp) 138 | 139 | templates = sorted(templates) 140 | 141 | return min_date, max_date, data, data_accu, total_queries, templates 142 | 143 | def GetMSE(input_path): 144 | dates = [] 145 | actual = [] 146 | predict = [] 147 | predict_dict = SortedDict() 148 | with open(input_path) as input_file: 149 | reader = csv.reader(input_file) 150 | for line in reader: 151 | dates.append(datetime.strptime(line[0], DATETIME_FORMAT)) 152 | actual.append(max(0, float(line[1]))) 153 | if line[2] == "inf": 154 | line[2] = 0 155 | predict.append(max(0, float(line[2]))) 156 | predict_dict[dates[-1]] = predict[-1] 157 | 158 | y = np.array(actual) 159 | y_hat = np.array(predict) 160 | 161 | return predict_dict, dates[0], dates[-1] 162 | 163 | def GetDataDict(input_dir): 164 | data_dict = {} 165 | 166 | losses = np.array([]) 167 | for root, dirnames, filenames in os.walk(input_dir): 168 | for filename in sorted(fnmatch.filter(filenames, '*.csv')): 169 | print(filename) 170 | file_path = os.path.join(root, filename) 171 | 172 | data = GetMSE(file_path) 173 | data_dict[file_path] = data 174 | 175 | return data_dict 176 | 177 | def EvaluateAccuracy(data_dict, actual_data, project, assignment_dict, total_queries, data_accu, 178 | rho): 179 | print("start couting total_queries_per_cluster") 180 | total_queries_per_cluster = dict() 181 | 182 | for date, assignments in assignment_dict: 183 | cluster_dict = dict() 184 | total_queries_per_cluster[date] = cluster_dict 185 | for template, cluster in assignments.items(): 186 | if cluster == -1: 187 | continue 188 | if cluster not in cluster_dict: 189 | cluster_dict[cluster] = 0 190 | 191 | template_total = data_accu[template][next(data_accu[template].irange(maximum 192 | = date, reverse = True))] 193 | cluster_dict[cluster] += template_total 194 | 195 | print("finish couting total_queries_per_cluster") 196 | 197 | accuracy_list = [] 198 | for tp in data_dict: 199 | if (tp.find(project) >= 0 and tp.find("/" + rho + "/") >=0 and tp.find("horizon-" + HORIZON) 200 | >= 0 and tp.find("/{}/".format(METHOD)) >= 0): 201 | #print(project, rho, HORIZON) 202 | #print(tp) 203 | data = data_dict[tp] 204 | predict_cluster = int(os.path.splitext(os.path.basename(tp))[0]) 205 | 206 | 207 | for current_date, assignments in assignment_dict: 208 | if current_date < data[1]: 209 | continue 210 | if current_date > data[2] - timedelta(hours = 24): 211 | break 212 | 213 | #print(current_date) 214 | cluster_total = total_queries_per_cluster[current_date] 215 | 216 | for template, cluster in assignments.items(): 217 | #print(cluster, 
predict_cluster) 218 | if cluster != predict_cluster: 219 | continue 220 | 221 | dates = [current_date + timedelta(hours = i) for i in range(24)] 222 | for i, date in enumerate(dates): 223 | template_total = data_accu[template][next(data_accu[template].irange(maximum 224 | = current_date, reverse = True))] 225 | 226 | predict_value = data[0][next(data[0].irange(maximum = date, reverse = True))] 227 | expected_value = (predict_value * template_total / 228 | total_queries_per_cluster[current_date][cluster]) 229 | 230 | actual_value = actual_data[template][next(actual_data[template].irange(maximum = date, reverse = True))] 231 | #print(rho, cluster, date, actual_value, expected_value, predict_value) 232 | #accuracy_list.append((max(np.log(expected_value), 0) - 233 | # max(np.log(actual_value), 0)) ** 2) 234 | accuracy_list.append((expected_value - actual_value) ** 2) 235 | 236 | if len(accuracy_list) > 0: 237 | return np.sum(np.log(accuracy_list)) / len(assignment_dict) / 3000 238 | #return np.sum(accuracy_list) / len(assignment_dict) / 3000 239 | else: 240 | return 0 241 | 242 | 243 | def GetAccuracyData(data_dict): 244 | accuracies = [] 245 | for project in PROJECTS: 246 | _, _, actual_data, data_accu, total_queries, _ = LoadData(DATA_DICT[project]) 247 | accuracy_list = [] 248 | for rho in RHOS: 249 | with open(os.path.expanduser(ASSIGNMENT_DIR) + "{}-{}-assignments.pickle".format(project, rho), 'rb') as f: 250 | num_clusters, assignment_dict, cluster_totals = pickle.load(f) 251 | 252 | accuracy = EvaluateAccuracy(data_dict, actual_data, project, assignment_dict, 253 | total_queries, data_accu, rho) 254 | accuracy_list.append(accuracy) 255 | 256 | accuracies.append(accuracy_list) 257 | print(accuracy_list) 258 | 259 | return accuracies 260 | 261 | def Main(): 262 | color = SetupMplParams() 263 | 264 | data_dict = GetDataDict(INPUT_DIR) 265 | 266 | accuracy_data = GetAccuracyData(data_dict) 267 | 268 | PlotLineChart(RHOS, accuracy_data, "accuracy-sensitivity-{}-horizon{}".format(METHOD, HORIZON), 269 | "Similarity Threshold($\\rho$)", "MSE (log space)", color) 270 | 271 | 272 | # ============================================== 273 | # main 274 | # ============================================== 275 | if __name__ == '__main__': 276 | """ 277 | Generate MSE result plots for the sensitivity analysis of rho 278 | """ 279 | Main() 280 | -------------------------------------------------------------------------------- /clusterer/online_clustering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | from datetime import datetime 6 | import datetime as dt 7 | import argparse 8 | import csv 9 | import numpy as np 10 | import time 11 | import itertools 12 | import random 13 | import pickle 14 | import re 15 | import math 16 | 17 | import matplotlib.pyplot as plt 18 | import matplotlib.ticker as plticker 19 | import matplotlib.dates as mpdates 20 | import matplotlib as mpl 21 | 22 | from sortedcontainers import SortedDict 23 | 24 | from sklearn.preprocessing import normalize 25 | from sklearn.neighbors import NearestNeighbors 26 | 27 | csv.field_size_limit(sys.maxsize) 28 | 29 | 30 | # Only looks at the csv files for the first 10 templates for testing purpose 31 | TESTING = False 32 | 33 | # Whether use the KNN module from sklearn to accelerate finding the closest center 34 | USE_KNN = True 35 | # Which high-dimentional indexing algorithm to use 36 | KNN_ALG = "kd_tree" 37 | 38 | 39 | OUTPUT_DIR = 
'online-clustering-results/' 40 | STATEMENTS = ['select', 'SELECT', 'INSERT', 'insert', 'UPDATE', 'update', 'delete', 'DELETE'] 41 | # "2016-10-31","17:50:21.344030" 42 | DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" # Strip milliseconds ".%f" 43 | 44 | 45 | def LoadData(input_path): 46 | total_queries = dict() 47 | templates = [] 48 | min_date = datetime.max 49 | max_date = datetime.min 50 | data = dict() 51 | 52 | cnt = 0 53 | for csv_file in sorted(os.listdir(input_path)): 54 | print(csv_file) 55 | with open(input_path + "/" + csv_file, 'r') as f: 56 | reader = csv.reader(f) 57 | queries, template = next(reader) 58 | 59 | # To make the matplotlib work... 60 | template = template.replace('$', '') 61 | 62 | # Assume we already filtered out other types of queries when combining template csvs 63 | #statement = template.split(' ',1)[0] 64 | #if not statement in STATEMENTS: 65 | # continue 66 | 67 | #print queries, template 68 | total_queries[template] = int(queries) 69 | #print queries 70 | 71 | templates.append(template) 72 | 73 | # add template 74 | data[template] = SortedDict() 75 | 76 | for line in reader: 77 | time_stamp = datetime.strptime(line[0], DATETIME_FORMAT) 78 | count = int(line[1]) 79 | 80 | data[template][time_stamp] = count 81 | 82 | min_date = min(min_date, time_stamp) 83 | max_date = max(max_date, time_stamp) 84 | 85 | cnt += 1 86 | 87 | if TESTING: 88 | if cnt == 10: 89 | break 90 | 91 | templates = sorted(templates) 92 | 93 | return min_date, max_date, data, total_queries, templates 94 | 95 | def Similarity(x, y, index): 96 | sumxx, sumxy, sumyy = 0, 0, 0 97 | for i in index: 98 | xi = x[i] if i in x else 0 99 | yi = y[i] if i in y else 0 100 | 101 | sumxx += xi * xi 102 | sumyy += yi * yi 103 | sumxy += xi * yi 104 | 105 | return sumxy / (math.sqrt(sumxx * sumyy) + 1e-6) 106 | 107 | def ExtractSample(x, index): 108 | v = [] 109 | for i in index: 110 | if i in x: 111 | v.append(x[i]) 112 | else: 113 | v.append(0) 114 | 115 | return np.array(v) 116 | 117 | def AddToCenter(center, lower_date, upper_date, data, positive = True): 118 | total = 0 119 | for d in data.irange(lower_date, upper_date, (True, False)): 120 | total += data[d] 121 | 122 | if d in center: 123 | if positive: 124 | center[d] += data[d] 125 | else: 126 | center[d] -= data[d] 127 | else: 128 | center[d] = data[d] 129 | 130 | return total 131 | 132 | def AdjustCluster(min_date, current_date, next_date, data, last_ass, next_cluster, centers, 133 | cluster_totals, total_queries, cluster_sizes, rho): 134 | n = (next_date - min_date).seconds // 60 + (next_date - min_date).days * 1440 + 1 135 | num_sample = 10000 136 | if n > num_sample: 137 | index = random.sample(range(0, n), num_sample) 138 | else: 139 | index = range(0, n) 140 | index = [ min_date + dt.timedelta(minutes = i) for i in index] 141 | 142 | new_ass = last_ass.copy() 143 | 144 | # Update cluster centers with new data in the last gap 145 | for cluster in centers.keys(): 146 | for template in last_ass: 147 | if last_ass[template] == cluster: 148 | cluster_totals[cluster] += AddToCenter(centers[cluster], current_date, next_date, data[template]) 149 | 150 | if USE_KNN: 151 | print("Building kdtree for single point assignment") 152 | clusters = sorted(centers.keys()) 153 | 154 | samples = list() 155 | 156 | for cluster in clusters: 157 | sample = ExtractSample(centers[cluster], index) 158 | samples.append(sample) 159 | 160 | if len(samples) == 0: 161 | nbrs = None 162 | else: 163 | normalized_samples = normalize(np.array(samples), copy = False) 164 | nbrs = 
NearestNeighbors(n_neighbors=1, algorithm=KNN_ALG, metric='l2') 165 | nbrs.fit(normalized_samples) 166 | 167 | print("Finish building kdtree for single point assignment") 168 | 169 | 170 | cnt = 0 171 | for t in sorted(data.keys()): 172 | cnt += 1 173 | # Test whether this template still belongs to the original cluster 174 | if new_ass[t] != -1: 175 | center = centers[new_ass[t]] 176 | #print(cnt, new_ass[t], Similarity(data[t], center, index)) 177 | if cluster_sizes[new_ass[t]] == 1 or Similarity(data[t], center, index) > rho: 178 | continue 179 | 180 | # the template is eliminated from the original cluster 181 | if new_ass[t] != -1: 182 | cluster = new_ass[t] 183 | #print(centers[new_ass[t]]) 184 | #print([ (d, data[t][d]) for d in data[t].irange(min_date, next_date, (True, False))]) 185 | cluster_sizes[cluster] -= 1 186 | AddToCenter(centers[cluster], min_date, next_date, data[t], False) 187 | print("%s: template %s quit from cluster %d with total %d" % (next_date, cnt, cluster, 188 | total_queries[t])) 189 | 190 | 191 | # Whether this template has "arrived" yet? 192 | if new_ass[t] == -1 and len(list(data[t].irange(current_date, next_date))) == 0: 193 | continue 194 | 195 | # whether this template is similar to the center of an existing cluster 196 | new_cluster = None 197 | if USE_KNN == False or nbrs == None: 198 | for cluster in centers.keys(): 199 | center = centers[cluster] 200 | if Similarity(data[t], center, index) > rho: 201 | new_cluster = cluster 202 | break 203 | else: 204 | nbr = nbrs.kneighbors(normalize([ExtractSample(data[t], index)]), return_distance = False)[0][0] 205 | if Similarity(data[t], centers[clusters[nbr]], index) > rho: 206 | new_cluster = clusters[nbr] 207 | 208 | if new_cluster != None: 209 | if new_ass[t] == -1: 210 | print("%s: template %s joined cluster %d with total %d" % (next_date, cnt, 211 | new_cluster, total_queries[t])) 212 | else: 213 | print("%s: template %s reassigned to cluster %d with total %d" % (next_date, 214 | cnt, new_cluster, total_queries[t])) 215 | 216 | new_ass[t] = new_cluster 217 | AddToCenter(centers[new_cluster], min_date, next_date, data[t]) 218 | cluster_sizes[new_cluster] += 1 219 | continue 220 | 221 | if new_ass[t] == -1: 222 | print("%s: template %s created cluster as %d with total %d" % (next_date, cnt, 223 | next_cluster, total_queries[t])) 224 | else: 225 | print("%s: template %s recreated cluster as %d with total %d" % (next_date, cnt, 226 | next_cluster, total_queries[t])) 227 | 228 | new_ass[t] = next_cluster 229 | centers[next_cluster] = SortedDict() 230 | AddToCenter(centers[next_cluster], min_date, next_date, data[t]) 231 | cluster_sizes[next_cluster] = 1 232 | cluster_totals[next_cluster] = 0 233 | 234 | next_cluster += 1 235 | 236 | clusters = list(centers.keys()) 237 | # a union-find set to track the root cluster for clusters that have been merged 238 | root = [-1] * len(clusters) 239 | 240 | if USE_KNN: 241 | print("Building kdtree for cluster merging") 242 | 243 | samples = list() 244 | 245 | for cluster in clusters: 246 | sample = ExtractSample(centers[cluster], index) 247 | samples.append(sample) 248 | 249 | if len(samples) == 0: 250 | nbrs = None 251 | else: 252 | normalized_samples = normalize(np.array(samples), copy = False) 253 | nbrs = NearestNeighbors(n_neighbors=2, algorithm=KNN_ALG, metric='l2') 254 | nbrs.fit(normalized_samples) 255 | 256 | print("Finish building kdtree for cluster merging") 257 | 258 | for i in range(len(clusters)): 259 | c1 = clusters[i] 260 | c = None 261 | 262 | if USE_KNN == 
False or nbrs == None: 263 | for j in range(i + 1, len(clusters)): 264 | c2 = clusters[j] 265 | if Similarity(centers[c1], centers[c2], index) > rho: 266 | c = c2 267 | break 268 | else: 269 | nbr = nbrs.kneighbors([ExtractSample(centers[c1], index)], return_distance = False)[0] 270 | 271 | if clusters[nbr[0]] == c1: 272 | nbr = nbr[1] 273 | else: 274 | nbr = nbr[0] 275 | 276 | while root[nbr] != -1: 277 | nbr = root[nbr] 278 | 279 | if c1 != clusters[nbr] and Similarity(centers[c1], centers[clusters[nbr]], index) > rho: 280 | c = clusters[nbr] 281 | 282 | if c != None: 283 | AddToCenter(centers[c], min_date, next_date, centers[c1]) 284 | cluster_sizes[c] += cluster_sizes[c1] 285 | 286 | del centers[c1] 287 | del cluster_sizes[c1] 288 | 289 | if USE_KNN == True and nbrs != None: 290 | root[i] = nbr 291 | 292 | for t in data.keys(): 293 | if new_ass[t] == c1: 294 | new_ass[t] = c 295 | print("%d assigned to %d with total %d" % (c1, c, total_queries[t])) 296 | 297 | print("%s: cluster %d merged into cluster %d" % (next_date, c1, c)) 298 | 299 | return new_ass, next_cluster 300 | 301 | 302 | def OnlineClustering(min_date, max_date, data, total_queries, rho): 303 | print(rho) 304 | cluster_gap = 1440 305 | 306 | n = (max_date - min_date).seconds // 60 + (max_date - min_date).days * 1440 + 1 307 | num_gaps = n // cluster_gap 308 | 309 | centers = dict() 310 | cluster_totals = dict() 311 | cluster_sizes = dict() 312 | 313 | assignments = [] 314 | ass = dict() 315 | for t in data.keys(): 316 | ass[t] = -1 317 | assignments.append((min_date, ass)) 318 | 319 | current_date = min_date 320 | next_cluster = 0 321 | for i in range(num_gaps): 322 | next_date = current_date + dt.timedelta(minutes = cluster_gap) 323 | # Calculate similarities based on arrival rates up to the past month 324 | month_min_date = max(min_date, next_date - dt.timedelta(days = 30)) 325 | assign, next_cluster = AdjustCluster(month_min_date, current_date, next_date, data, assignments[-1][1], 326 | next_cluster, centers, cluster_totals, total_queries, cluster_sizes, rho) 327 | assignments.append((next_date, assign)) 328 | 329 | current_date = next_date 330 | 331 | 332 | return next_cluster, assignments, cluster_totals 333 | 334 | 335 | # ============================================== 336 | # main 337 | # ============================================== 338 | if __name__ == '__main__': 339 | aparser = argparse.ArgumentParser(description='Time series clusreting') 340 | aparser.add_argument('--dir', default="combined-results", help='The directory that contains the time series' 341 | 'csv files') 342 | aparser.add_argument('--project', help='The name of the workload') 343 | aparser.add_argument('--rho', default=0.8, help='The threshold to determine' 344 | 'whether a query template belongs to a cluster') 345 | args = vars(aparser.parse_args()) 346 | 347 | if not os.path.exists(OUTPUT_DIR): 348 | os.makedirs(OUTPUT_DIR) 349 | 350 | min_date, max_date, data, total_queries, templates = LoadData(args['dir']) 351 | 352 | num_clusters, assignment_dict, cluster_totals = OnlineClustering(min_date, max_date, data, 353 | total_queries, float(args['rho'])) 354 | 355 | with open(OUTPUT_DIR + "{}-{}-assignments.pickle".format(args['project'], args['rho']), 356 | 'wb') as f: # Python 3: open(..., 'wb') 357 | pickle.dump((num_clusters, assignment_dict, cluster_totals), f) 358 | 359 | print(num_clusters) 360 | print(cluster_totals) 361 | print(sum(cluster_totals.values())) 362 | print(sum(total_queries.values())) 363 | 
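# --- Editorial note (not part of the original file) ------------------------
# AdjustCluster()/OnlineClustering() above implement a threshold-based online
# clustering rule: at each gap a template joins the most similar existing
# center if the cosine similarity exceeds rho, and otherwise seeds a new
# cluster whose center accumulates the arrival rates of its members. The
# sketch below distills that rule using dense toy vectors in place of the
# SortedDict arrival-rate series; all names and values are illustrative
# assumptions, not code used by the repository.
import numpy as np

def assign_online(vectors, rho=0.8):
    centers, labels = [], []
    for v in vectors:
        sims = [np.dot(v, c) / (np.linalg.norm(v) * np.linalg.norm(c) + 1e-6)
                for c in centers]
        best = int(np.argmax(sims)) if sims else -1
        if best >= 0 and sims[best] > rho:
            centers[best] = centers[best] + v      # fold the template into the center
            labels.append(best)
        else:
            centers.append(v.astype(float))        # start a new cluster
            labels.append(len(centers) - 1)
    return labels

# Example: assign_online([np.array([1., 0.]), np.array([0.9, 0.1]), np.array([0., 1.])])
# returns [0, 0, 1] -- the first two templates share a cluster, the third does not.
# ----------------------------------------------------------------------------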
-------------------------------------------------------------------------------- /planner-simulator/planner_simulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import csv 5 | import fnmatch 6 | import re 7 | import sys 8 | import pickle 9 | import argparse 10 | import sqlparse 11 | import shutil 12 | import datetime as dt 13 | from datetime import datetime, timedelta 14 | 15 | from sortedcontainers import SortedDict 16 | 17 | from schemaParser import extract_tables_and_columns 18 | 19 | class Simulator: 20 | """Index suggestion algorithm that simulates a basic "what-if" API 21 | 22 | We recommend indexes based on the workload forecasting results at the 23 | current time stamp. We only recommend single-column indexes here. 24 | The expected arrival rate of each query template is calculated by the 25 | predicted arrival rate of the cluster it belongs to and the ratio between 26 | the volume of this template and the total volume of the cluster. 27 | The benefit of the index is estimated also by the cardinality of the column 28 | and whether the query can already use another index. 29 | """ 30 | def __init__(self, schema_file, original_path, predicted_path, assignment_path, 31 | top_cluster_path, max_cluster_num, aggregate, column_card, static_suggest): 32 | # params 33 | self.aggregate = aggregate 34 | self.max_cluster_num = max_cluster_num 35 | self.column_card = column_card 36 | self.static_suggest = static_suggest 37 | 38 | with open(assignment_path, 'rb') as f: 39 | _, self.assignment_dict, _ = pickle.load(f) 40 | 41 | with open(top_cluster_path, 'rb') as f: 42 | self.top_cluster, _ = pickle.load(f) 43 | 44 | # Pre-processing: template extraction + schema parsing + preprocessing 45 | sql_schema = open(schema_file, 'r') 46 | self.schema_dict = extract_tables_and_columns(sql_schema) 47 | 48 | self.data, self.total_queries, self.template_map = LoadOriginalData(original_path) 49 | 50 | self.predicted_data = LoadMultiplePredictedData(predicted_path) 51 | 52 | tables = self.schema_dict.keys() 53 | 54 | self.templates_dict = GetAccessDict(sorted(self.template_map.values()), tables, self.schema_dict) 55 | 56 | self.last_date = None 57 | # Clear the total_queries and calculate it online 58 | for template in self.total_queries: 59 | self.total_queries[template] = 0 60 | 61 | 62 | def SuggestIndex(self, start_date, duration, index_set): 63 | predicted_dict = BuildDictionary(self.schema_dict) 64 | predicted_data = self.predicted_data 65 | total_queries = self.total_queries 66 | aggregate = self.aggregate 67 | column_card = self.column_card 68 | template_map = self.template_map 69 | static_suggest = self.static_suggest 70 | data = self.data 71 | 72 | print(list(total_queries.values())[:30]) 73 | 74 | if static_suggest is True: 75 | # Suggest index based on all the workload trace until now if static_suggest is True 76 | 77 | if self.last_date is None: 78 | for template in total_queries: 79 | for date in data[template].irange(None, start_date): 80 | total_queries[template] += data[template][date] 81 | self.last_date = start_date 82 | 83 | for template in total_queries: 84 | if self.total_queries[template] < 100: 85 | continue 86 | 87 | template_dict = self.templates_dict[template_map[template]] 88 | weight = 10000 89 | for pair in template_dict: 90 | if pair in index_set: 91 | weight = 1 92 | 93 | for pair in template_dict: 94 | predicted_dict[pair] += total_queries[template] * column_card[pair] * weight 95 | 
else: 96 | if self.last_date is None: 97 | for template in total_queries: 98 | for date in data[template].irange(start_date - timedelta(weeks = 2), start_date): 99 | total_queries[template] += data[template][date] 100 | else: 101 | for template in total_queries: 102 | for date in data[template].irange(self.last_date, start_date): 103 | total_queries[template] += data[template][date] 104 | for template in total_queries: 105 | for date in data[template].irange(self.last_date - timedelta(weeks = 2), start_date - 106 | timedelta(weeks = 2)): 107 | total_queries[template] -= data[template][date] 108 | 109 | self.last_date = start_date 110 | 111 | 112 | for date, ass in self.assignment_dict: 113 | if date > start_date: 114 | print(date) 115 | break 116 | 117 | assignments = ass 118 | 119 | for date, cluster in self.top_cluster: 120 | if date > start_date: 121 | print(date) 122 | break 123 | 124 | cluster_list = cluster 125 | 126 | print("Clusters: ", cluster_list) 127 | clusters = next(zip(*cluster_list))[:self.max_cluster_num] 128 | print("Clusters: ", clusters, type(clusters[0])) 129 | 130 | total_queries_per_cluster = dict() 131 | for cluster in clusters: 132 | total_queries_per_cluster[cluster] = 0 133 | 134 | for template, cluster in assignments.items(): 135 | if cluster not in clusters: 136 | continue 137 | total_queries_per_cluster[cluster] += self.total_queries[template] 138 | 139 | cnt = 0 140 | cnt2 = 0 141 | for template, cluster in assignments.items(): 142 | template_dict = self.templates_dict[template_map[template]] 143 | 144 | cnt2 += 1 145 | if self.total_queries[template] < 100 or cluster not in clusters: 146 | continue 147 | cnt += 1 148 | print(type(cluster), cluster, total_queries[template], template[:50]) 149 | 150 | weight = 10000 151 | for pair in template_dict: 152 | if pair in index_set: 153 | weight = 1 154 | 155 | print(template) 156 | print(weight, "\n") 157 | 158 | 159 | for j in range(0, 60, aggregate): 160 | if j >= duration: 161 | break 162 | 163 | for pair in template_dict: 164 | predict_date = next(self.predicted_data[0][cluster].irange(maximum = start_date + 165 | timedelta(minutes = j), reverse = True)) 166 | 167 | predicted_dict[pair] += (predicted_data[0][cluster][predict_date] * 168 | total_queries[template] / total_queries_per_cluster[cluster] * 100 * 169 | column_card[pair] * weight) 170 | 171 | 172 | for j in range(60, 1440, aggregate): 173 | if j >= duration: 174 | break 175 | 176 | for pair in template_dict: 177 | predict_date = next(self.predicted_data[1][cluster].irange(maximum = start_date + 178 | timedelta(minutes = j), reverse = True)) 179 | 180 | predicted_dict[pair] += (predicted_data[1][cluster][predict_date] * 181 | total_queries[template] / total_queries_per_cluster[cluster] * 10 * 182 | column_card[pair] * weight) 183 | 184 | for j in range(1440, 10080, aggregate): 185 | if j >= duration: 186 | break 187 | 188 | for pair in template_dict: 189 | predict_date = next(self.predicted_data[2][cluster].irange(maximum = start_date + 190 | timedelta(minutes = j), reverse = True)) 191 | 192 | predicted_dict[pair] += (predicted_data[2][cluster][predict_date] * 193 | total_queries[template] / total_queries_per_cluster[cluster] * 1 * 194 | column_card[pair] * weight) 195 | 196 | print("Valid queries: ", cnt, cnt2) 197 | print(clusters) 198 | 199 | predicted_sorted_columns = sorted(predicted_dict.items(), key=lambda x: x[1], reverse = True) 200 | 201 | for pair in predicted_sorted_columns: 202 | print(pair) 203 | #if pair[1] == 0: 204 | # break 205 | if 
not pair[0] in index_set: 206 | return pair[0] 207 | 208 | return None 209 | 210 | def LoadOriginalData(input_path): 211 | datetime_format = "%Y-%m-%d %H:%M:%S" # Strip milliseconds ".%f" 212 | 213 | total_queries = dict() 214 | min_date = datetime.max 215 | max_date = datetime.min 216 | data = dict() 217 | 218 | # This is to keep track of our modification to the template. We have to keep the original 219 | # templates to restore the order in the clustering assignments. 220 | modified_template_map = dict() 221 | 222 | for csv_file in sorted(os.listdir(input_path)): 223 | print(csv_file) 224 | with open(input_path + "/" + csv_file, 'r') as f: 225 | reader = csv.reader(f) 226 | queries, template = next(reader) 227 | 228 | # To make the matplotlib work... 229 | template = template.replace('$', '') 230 | 231 | modified_template = template 232 | # replace '#' with 'param_holder' for sql parsing 233 | modified_template = modified_template.replace('#', "param_holder") 234 | # convert to lower case for matching convenience 235 | #modified_template = modified_template.lower() 236 | modified_template_map[template] = modified_template 237 | 238 | #print queries, template 239 | total_queries[template] = int(queries) 240 | 241 | # add template 242 | data[template] = SortedDict() 243 | 244 | #continue 245 | 246 | for line in reader: 247 | time_stamp = datetime.strptime(line[0], datetime_format) 248 | count = int(line[1]) 249 | 250 | data[template][time_stamp] = count 251 | 252 | min_date = min(min_date, time_stamp) 253 | max_date = max(max_date, time_stamp) 254 | 255 | return data, total_queries, modified_template_map 256 | 257 | 258 | def LoadData(file_path, aggregate): 259 | trajs = dict() 260 | 261 | datetime_format = "%Y-%m-%d %H:%M:%S" # Strip milliseconds ".%f" 262 | for csv_file in sorted(os.listdir(file_path)): 263 | print(csv_file) 264 | 265 | cluster = int(os.path.splitext(csv_file)[0]) 266 | trajs[cluster] = SortedDict() 267 | 268 | with open(file_path + "/" + csv_file, 'r') as f: 269 | reader = csv.reader(f) 270 | 271 | traj = list() 272 | date = list() 273 | cnt = 0 274 | tot = 0 275 | 276 | for line in reader: 277 | cnt += 1 278 | tot += float(line[1]) 279 | 280 | if cnt % aggregate == 0: 281 | time_stamp = datetime.strptime(line[0], datetime_format) 282 | trajs[cluster][time_stamp] = tot 283 | 284 | tot = 0 285 | 286 | return trajs 287 | 288 | def LoadMultiplePredictedData(paths): 289 | predicted_data = [] 290 | 291 | for path in paths: 292 | predicted_data.append(LoadData(path, 1)) 293 | 294 | return predicted_data 295 | 296 | 297 | def BuildDictionary(schema_dict): 298 | d = dict() 299 | 300 | for table, column_dict in schema_dict.items(): 301 | for column in column_dict: 302 | d[(table, column)] = 0 303 | 304 | return d 305 | 306 | def GetAccessDict(templates, tables, schema_dict): 307 | templates_dict = dict() 308 | 309 | for template in templates: 310 | #print("processing template: %s" % template) 311 | sql = sqlparse.parse(template)[0] 312 | token_list = [str(x) for x in sql.flatten()] 313 | 314 | token_set = set() 315 | 316 | table_map = dict() 317 | before_from = True 318 | #print() 319 | #print(template) 320 | 321 | # Find the alias for table names 322 | keywords = ['upper', 'lower', 'left', 'right', 'join', 'as', ',', 'group', 'order', 'set'] 323 | word_regex = re.compile('[\w]+') 324 | for i, token in enumerate(token_list): 325 | # Only look between from and where 326 | if token.lower() in ["from", "update"]: 327 | before_from = False 328 | if before_from: 329 | continue 330 | 
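# --- Editor's note (illustrative, not part of the original file) ---------------
# For a hypothetical template such as
#     SELECT name FROM student AS s WHERE s.gpa > param_holder
# this alias loop is expected to build table_map = {'s': 'student',
# 'student': 'student'}, and the WHERE-clause loop further below then emits the
# (table, column) pair ('student', 'gpa') into token_set, assuming a 'student'
# table with a 'gpa' column appears in the parsed schema. 'param_holder' tokens
# (the rewritten '#' placeholders) are skipped explicitly.
# -------------------------------------------------------------------------------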
if token.lower() in ["set", 'where']: 331 | before_from = True 332 | 333 | table_name = None 334 | if token in tables: 335 | table_name = token 336 | if i > 1: 337 | combined_token = token_list[i - 2] + '.' + token 338 | if combined_token in tables: 339 | table_name = combined_token 340 | 341 | if table_name != None: 342 | if i < len(token_list) - 2: 343 | if token_list[i + 1] == ' ' and token_list[i + 2].lower() not in keywords: 344 | if word_regex.match(token): 345 | table_map[token_list[i + 2]] = table_name 346 | 347 | if i < len(token_list) - 4: 348 | if token_list[i + 2].lower() == 'as': 349 | table_map[token_list[i + 4]] = table_name 350 | 351 | table_map[table_name] = table_name 352 | 353 | #print("table_map: ", table_map) 354 | 355 | current_table = None 356 | within_where_clause = False 357 | for token in token_list: 358 | # The table does not exist in the schema file... 359 | # That happends with the test table 360 | if len(table_map) == 0: 361 | break 362 | 363 | # only consider columns within where clause 364 | if token.lower() in ['select', 'order', 'group', 'returning']: 365 | within_where_clause = False 366 | 367 | if within_where_clause: 368 | 369 | if token == 'param_holder': 370 | continue 371 | 372 | if word_regex.match(token): 373 | if token in table_map: 374 | current_table = table_map[token] 375 | continue 376 | 377 | if current_table == None: 378 | current_table = next(iter(table_map.values())) 379 | 380 | if token in schema_dict[current_table]: 381 | token_set.add((current_table, token)) 382 | current_table = None 383 | 384 | # only consider columns within where clause 385 | if token == 'where' or token == 'WHERE': 386 | within_where_clause = True 387 | 388 | templates_dict[template] = token_set 389 | #print(token_set) 390 | 391 | return templates_dict 392 | 393 | 394 | # ============================================== 395 | # main 396 | # ============================================== 397 | if __name__ == '__main__': 398 | SimulatorObject = Simulator("../workload-simulator/simulatorFiles/combined-results", 399 | ["../workload-simulator/simulatorFiles/online-prediction/ar-60","../workload-simulator/simulatorFiles/online-prediction/ar-1440","../workload-simulator/simulatorFiles/online-prediction/ar-10080"], 400 | "../workload-simulator/simulatorFiles/cluster-coverage/admission-assignments.pickle", 401 | "../workload-simulator/simulatorFiles/cluster-coverage/admission-coverage.pickle", 402 | 3, 403 | 60 404 | ) 405 | index = SimulatorObject.SuggestIndex(datetime(2017, 1, 1), 300, []) 406 | print(index) 407 | 408 | 409 | -------------------------------------------------------------------------------- /forecaster/plot-prediction-median-error.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib.pyplot as plt 4 | import matplotlib.ticker as plticker 5 | import matplotlib.dates as mpdates 6 | import matplotlib.patches as mpatches 7 | import matplotlib as mpl 8 | import copy 9 | import fnmatch 10 | import csv 11 | from datetime import datetime, timedelta 12 | import sys 13 | import os 14 | import numpy as np 15 | 16 | import seaborn as sns 17 | 18 | PROJECT = "admission-full" 19 | #PROJECT = "admission" 20 | #PROJECT = "oli" 21 | #PROJECT = "tiramisu" 22 | 23 | GRAPH_DIR = "../plot/compare-avg-result/" 24 | 25 | INPUT_DIM = 1 26 | HORIZON = 0 27 | INTERVAL = 12 28 | TRAIN_NUM = 48 29 | 30 | def SetupMplParams(): 31 | #color = sns.color_palette("deep", 7) 32 | color = 
sns.color_palette("Set1", n_colors=8, desat=.7) 33 | mpl.rcParams.update(mpl.rcParamsDefault) 34 | 35 | mpl.rcParams['ps.useafm'] = True 36 | mpl.rcParams['pdf.use14corefonts'] = True 37 | mpl.rcParams['text.usetex'] = True 38 | mpl.rcParams['text.latex.preamble'] = [ 39 | #r'\usepackage{siunitx}', # i need upright \micro symbols, but you need... 40 | #r'\sisetup{detect-all}', # ...this to force siunitx to actually use your fonts 41 | r'\usepackage{helvet}', # set the normal font here 42 | r'\usepackage{sansmath}', # load up the sansmath so that math -> helvet 43 | r'\sansmath' # <- tricky! -- # gotta actually # tell tex to use! 44 | ] 45 | mpl.rcParams['xtick.labelsize'] = 20 46 | mpl.rcParams['ytick.labelsize'] = 20 47 | 48 | return color 49 | 50 | def Plot(data_dict, name, xlabel, ylabel, color): 51 | fig_length = 12 52 | 53 | fig = plt.figure() 54 | fig.set_size_inches(fig_length, 2) 55 | ax = fig.add_subplot(111) 56 | loc = plticker.MaxNLocator(3) # this locator puts ticks at regular intervals 57 | ax.yaxis.set_major_locator(loc) 58 | ax.grid() 59 | 60 | ax.set_ylabel(ylabel, fontsize=18, weight='bold') 61 | ax.set_xlabel(xlabel, fontsize=18, weight='bold') 62 | 63 | models = ["ar", "arma", "fnn", 'rnn', 'noencoder-rnn', 'psrnn', 'psrnn-h5'] 64 | labels = ["AR", "arma", "FNN", "RNN", "NE-RNN", "PSRNN", "PSRNN-H5"] 65 | horizons = ['60', '720', '1440', '2880', '4320', '7200', '10080'] 66 | xlabels = ["1 Hour", "12 Hour", "1 Day", "2 Days", "3 Days", "5 Days", "1 Week"] 67 | 68 | result = [] 69 | 70 | for i, m in enumerate(models): 71 | res = [] 72 | for h in horizons: 73 | print(m, h) 74 | r = np.array([]) 75 | r_hat = np.array([]) 76 | for tp in data_dict: 77 | if tp.find(PROJECT) >= 0 and tp.find("/" + m + "/") >=0 and tp.find("horizon-" + h) >= 0: 78 | r = np.append(r, data_dict[tp][0]) 79 | r_hat = np.append(r_hat, data_dict[tp][1]) 80 | 81 | res.append(np.mean((r - r_hat) ** 2)) 82 | print(np.mean(r - r_hat) ** 2) 83 | 84 | #result.append(np.log(res)) 85 | result.append(res) 86 | 87 | x = [x * 8 for x in range(7)] 88 | ax.bar([i - 1 for i in x], result[0], width = 1, label = labels[0], hatch='\\',color = color[0]) 89 | ax.bar([i for i in x], result[1], width = 1, label = labels[1], hatch='-',color = color[1]) 90 | ax.bar([i + 1 for i in x], result[2], width = 1, label = labels[2], hatch='|',color = color[2]) 91 | ax.bar([i + 2 for i in x], result[3], width = 1, label = labels[3], hatch='-',color = color[3]) 92 | ax.bar([i + 3 for i in x], result[4], width = 1, label = labels[4], hatch='\\',color = color[4]) 93 | ax.bar([i + 4 for i in x], result[5], width = 1, label = labels[5], hatch='|',color = color[5]) 94 | ax.bar([i + 5 for i in x], result[6], width = 1, label = labels[6], hatch='-',color = color[6]) 95 | 96 | ax.set_xticks([i + 1 for i in x]) 97 | ax.set_xlim([-2, x[-1] + 7]) 98 | 99 | #ax.set_ylim([0, 4.5]) 100 | 101 | ax.set_xticklabels(xlabels, fontsize = 15) 102 | ax.legend(bbox_to_anchor = [0.5, 1], loc = 'lower center', ncol = 6) 103 | plt.savefig("%s%s-predict-error.pdf" % (GRAPH_DIR, name), bbox_inches='tight') 104 | 105 | plt.close(fig) 106 | 107 | def RegressionOnModels(res, res_hat, h): 108 | data = np.array(res_hat).transpose() 109 | reg_y = np.array([]) 110 | reg_y_hat = np.array([]) 111 | for i in range(TRAIN_NUM + INPUT_DIM + h - 1, len(data) - INTERVAL, INTERVAL): 112 | train_data = data[i - TRAIN_NUM - INPUT_DIM:i] 113 | x, _ = Model.GeneratePair(train_data, HORIZON, INPUT_DIM) 114 | y = res[i - TRAIN_NUM - h + HORIZON:i - h + 1].reshape((-1, 1)) 115 | 116 | 
xx = Model.GetMatrix(x) 117 | 118 | params = Model.Training(xx, y) 119 | #print(params) 120 | 121 | test_data = data[i - INPUT_DIM + 1 - HORIZON:i + INTERVAL] 122 | x, _ = Model.GeneratePair(test_data, HORIZON, INPUT_DIM) 123 | y = res[i:i + INTERVAL].reshape((-1, 1)) 124 | y_hat = Model.Testing(params, x) 125 | reg_y = np.append(reg_y, y.flatten()) 126 | reg_y_hat = np.append(reg_y_hat, y_hat.flatten()) 127 | 128 | return reg_y, reg_y_hat 129 | 130 | def PlotCompareAvg(data_dict, name, xlabel, ylabel, color): 131 | fig_length = 12 132 | 133 | fig = plt.figure() 134 | fig.set_size_inches(fig_length, 2) 135 | ax = fig.add_subplot(111) 136 | loc = plticker.MaxNLocator(3) # this locator puts ticks at regular intervals 137 | ax.yaxis.set_major_locator(loc) 138 | ax.grid() 139 | 140 | ax.set_ylabel(ylabel, fontsize=18, weight='bold') 141 | ax.set_xlabel(xlabel, fontsize=18, weight='bold') 142 | 143 | models = ["ar", 'noencoder-rnn', 'psrnn-h5'] 144 | labels = ["AR", "RNN-AVG", "PSRNN-AVG", "AVG", "RNN-REG", "PSRNN-REG", "REG", "LAST", "CONSTANT"] 145 | horizons = ['60', '720', '1440', '2880', '4320', '7200', '10080'] 146 | xlabels = ["1 Hour", "12 Hour", "1 Day", "2 Days", "3 Days", "5 Days", "1 Week"] 147 | 148 | result = [] 149 | 150 | for i, m in enumerate(models): 151 | res = [] 152 | for h in horizons: 153 | print(m, h) 154 | r = np.array([]) 155 | r_hat = np.array([]) 156 | for tp in data_dict: 157 | if tp.find(PROJECT) >= 0 and tp.find("/" + m + "/") >=0 and tp.find("horizon-" + h) >= 0: 158 | print(tp) 159 | r = np.append(r, data_dict[tp][0]) 160 | r_hat = np.append(r_hat, data_dict[tp][1]) 161 | 162 | res.append(np.mean((r - r_hat) ** 2)) 163 | 164 | result.append(res) 165 | 166 | avg_res = [] 167 | rnn_avg_res = [] 168 | psrnn_avg_res = [] 169 | rnn_reg_res = [] 170 | psrnn_reg_res = [] 171 | reg_res = [] 172 | last_res = [] 173 | constant_res = [] 174 | for h in horizons: 175 | res = [] 176 | res_hat = [] 177 | for m in models: 178 | print(m, h) 179 | r = np.array([]) 180 | r_hat = np.array([]) 181 | for tp in sorted(data_dict.keys()): 182 | if tp.find(PROJECT) >= 0 and tp.find("/" + m + "/") >=0 and tp.find("horizon-" + h) >= 0: 183 | print(tp) 184 | r = np.append(r, data_dict[tp][0]) 185 | r_hat = np.append(r_hat, data_dict[tp][1]) 186 | 187 | res = r 188 | res_hat.append(r_hat) 189 | 190 | res_rnn = np.mean(np.array([res_hat[0], res_hat[1]]), axis = 0) 191 | res_psrnn = np.mean(np.array([res_hat[0], res_hat[2]]), axis = 0) 192 | res_avg = np.mean(np.array(res_hat), axis = 0) 193 | 194 | hour_h = int(h) // 60 195 | 196 | rnn_reg_y, rnn_reg_y_hat = RegressionOnModels(res, [res_hat[0], res_hat[1]], hour_h) 197 | psrnn_reg_y, psrnn_reg_y_hat = RegressionOnModels(res, [res_hat[0], res_hat[2]], hour_h) 198 | reg_y, reg_y_hat = RegressionOnModels(res, res_hat, hour_h) 199 | 200 | last_res.append(np.mean((res[hour_h:] - res[:-hour_h]) ** 2)) 201 | constant_res.append(np.mean((res - np.mean(res)) ** 2)) 202 | avg_res.append(np.mean((res - res_avg) ** 2)) 203 | rnn_avg_res.append(np.mean((res - res_rnn) ** 2)) 204 | psrnn_avg_res.append(np.mean((res - res_psrnn) ** 2)) 205 | rnn_reg_res.append(np.mean((rnn_reg_y - rnn_reg_y_hat) ** 2)) 206 | psrnn_reg_res.append(np.mean((psrnn_reg_y - psrnn_reg_y_hat) ** 2)) 207 | reg_res.append(np.mean((reg_y - reg_y_hat) ** 2)) 208 | print(h, len(res), len(reg_y)) 209 | 210 | result = result[0:1] 211 | result.append(rnn_avg_res) 212 | result.append(psrnn_avg_res) 213 | result.append(avg_res) 214 | result.append(rnn_reg_res) 215 | 
result.append(psrnn_reg_res) 216 | result.append(reg_res) 217 | result.append(last_res) 218 | result.append(constant_res) 219 | 220 | x = [x * 10 for x in range(7)] 221 | ax.bar([i - 1 for i in x], result[0], width = 1, label = labels[0], hatch='\\',color = color[0]) 222 | ax.bar([i for i in x], result[1], width = 1, label = labels[1], hatch='-',color = color[1]) 223 | ax.bar([i + 1 for i in x], result[2], width = 1, label = labels[2], hatch='|',color = color[2]) 224 | ax.bar([i + 2 for i in x], result[3], width = 1, label = labels[3], hatch='\\',color = color[3]) 225 | ax.bar([i + 3 for i in x], result[4], width = 1, label = labels[4], hatch='-',color = color[4]) 226 | ax.bar([i + 4 for i in x], result[5], width = 1, label = labels[5], hatch='|',color = color[5]) 227 | ax.bar([i + 5 for i in x], result[6], width = 1, label = labels[6], hatch='\\',color = color[6]) 228 | ax.bar([i + 6 for i in x], result[7], width = 1, label = labels[7], hatch='-',color = color[7]) 229 | ax.bar([i + 7 for i in x], result[8], width = 1, label = labels[8], hatch='|',color = color[8]) 230 | 231 | ax.set_xticks([i + 1 for i in x]) 232 | ax.set_xlim([-2, x[-1] + 9]) 233 | 234 | #ax.set_ylim([0, 4.5]) 235 | 236 | ax.set_xticklabels(xlabels, fontsize = 15) 237 | ax.legend(bbox_to_anchor = [0.5, 1], loc = 'lower center', ncol = 5) 238 | plt.savefig("%s%s-regression-predict-error-avg.pdf" % (GRAPH_DIR, name), bbox_inches='tight') 239 | 240 | plt.close(fig) 241 | 242 | def autolabel(ax, rects): 243 | # attach some text labels 244 | for rect in rects: 245 | height = rect.get_height() 246 | ax.text(rect.get_x() + rect.get_width()/2., height + 0.1, 247 | '%.1f' % height, size=10, 248 | ha='center', va='bottom') 249 | 250 | 251 | def PlotPaperGraph(data_dict, name, xlabel, ylabel, color): 252 | fig_length = 20 253 | 254 | fig = plt.figure() 255 | fig.set_size_inches(fig_length, 2) 256 | ax = fig.add_subplot(111) 257 | loc = plticker.MaxNLocator(3) # this locator puts ticks at regular intervals 258 | ax.yaxis.set_major_locator(loc) 259 | ax.grid() 260 | 261 | ax.set_ylabel(ylabel, fontsize=18, weight='bold') 262 | ax.set_xlabel(xlabel, fontsize=18, weight='bold') 263 | 264 | models = ["ar", 'kr', 'arma', 'fnn', 'noencoder-rnn', 'psrnn-h5'] 265 | labels = ["LR", "KR", "ARMA", "FNN", "RNN", "PSRNN", "ENSEMBLE (LR+RNN)", "HYBRID"] 266 | horizons = ['60', '720', '1440', '2880', '4320', '7200', '10080'] 267 | xlabels = ["1 Hour", "12 Hour", "1 Day", "2 Days", "3 Days", "5 Days", "1 Week"] 268 | 269 | result = [] 270 | hist_result = [] 271 | 272 | for i, m in enumerate(models): 273 | res = [] 274 | hist_res = [] 275 | for h in horizons: 276 | print(m, h) 277 | r = np.array([]) 278 | r_hat = np.array([]) 279 | for tp in data_dict: 280 | if tp.find(PROJECT) >= 0 and tp.find("/" + m + "/") >=0 and tp.find("horizon-" + h + 281 | "/") >= 0: 282 | print(tp) 283 | r = np.append(r, data_dict[tp][0]) 284 | r_hat = np.append(r_hat, data_dict[tp][1]) 285 | 286 | res.append(np.mean((r - r_hat) ** 2)) 287 | hist_res += (r - r_hat).tolist() 288 | 289 | result.append(res) 290 | hist_result.append(hist_res) 291 | 292 | avg_res = [] 293 | rnn_avg_res = [] 294 | psrnn_avg_res = [] 295 | mixture_res = [] 296 | hist_res = [] 297 | for h in horizons: 298 | res = [] 299 | res_hat = [] 300 | for m in models: 301 | print(m, h) 302 | r = np.array([]) 303 | r_hat = np.array([]) 304 | for tp in sorted(data_dict.keys()): 305 | if tp.find(PROJECT) >= 0 and tp.find("/" + m + "/") >=0 and tp.find("horizon-" + h + 306 | "/") >= 0: 307 | #print(tp) 308 | r = 
np.append(r, data_dict[tp][0]) 309 | r_hat = np.append(r_hat, data_dict[tp][1]) 310 | 311 | res = r 312 | print(h, m, r_hat.shape) 313 | res_hat.append(r_hat) 314 | 315 | #res_rnn = np.mean(np.array([res_hat[0], res_hat[1]]), axis = 0) 316 | res_psrnn = np.mean(np.array([res_hat[0], res_hat[4]]), axis = 0) 317 | #res_avg = np.mean(np.array(res_hat), axis = 0) 318 | 319 | 320 | data_mixture = [] 321 | for i in range(res_hat[1].shape[0]): 322 | if res_hat[1][i] > 7 and abs(res_hat[1][i] - res_psrnn[i]) > 1: 323 | data_mixture.append(res_hat[1][i]) 324 | else: 325 | data_mixture.append(res_psrnn[i]) 326 | data_mixture = np.array(data_mixture) 327 | 328 | hour_h = int(h) // 60 329 | 330 | #avg_res.append(np.mean((res - res_avg) ** 2)) 331 | #rnn_avg_res.append(np.mean((res - res_rnn) ** 2)) 332 | psrnn_avg_res.append(np.mean((res - res_psrnn) ** 2)) 333 | mixture_res.append(np.mean((res - data_mixture) ** 2)) 334 | hist_res += (res - res_psrnn).tolist() 335 | 336 | #result.append(rnn_avg_res) 337 | result.append(psrnn_avg_res) 338 | result.append(mixture_res) 339 | hist_result.append(hist_res) 340 | #result.append(avg_res) 341 | 342 | print(result) 343 | #result[1] = [0,0,0,0,0,0,0] 344 | 345 | x = [x * 9 for x in range(7)] 346 | ax.bar([i - 1 for i in x], result[0], width = 1, label = labels[0], hatch='\\',color = color[0]) 347 | ax.bar([i for i in x], result[1], width = 1, label = labels[1], hatch='-',color = color[1]) 348 | ax.bar([i + 1 for i in x], result[2], width = 1, label = labels[2], hatch='|',color = color[2]) 349 | ax.bar([i + 2 for i in x], result[3], width = 1, label = labels[3], hatch='/',color = color[3]) 350 | ax.bar([i + 3 for i in x], result[4], width = 1, label = labels[4], hatch='.',color = color[4]) 351 | ax.bar([i + 4 for i in x], result[5], width = 1, label = labels[5], hatch='+',color = color[7]) 352 | ax.bar([i + 5 for i in x], result[6], width = 1, label = labels[6], hatch='o',color = color[6]) 353 | ax.bar([i + 6 for i in x], result[7], width = 1, label = labels[7], hatch='*',color = color[5]) 354 | 355 | rects = ax.patches 356 | autolabel(ax, rects) 357 | 358 | ax.set_xticks([i + 1 for i in x]) 359 | ax.set_xlim([-2, x[-1] + 8]) 360 | 361 | if PROJECT == "tiramisu": 362 | ax.set_ylim([0, 5.4]) 363 | if PROJECT == "oli": 364 | ax.set_ylim([0, 17]) 365 | #if PROJECT == "admission-full": 366 | # ax.set_ylim([0, 10]) 367 | 368 | ax.set_xticklabels(xlabels, fontsize = 15) 369 | #ax.legend(bbox_to_anchor = [0.5, 1], loc = 'lower center', ncol = 8) 370 | plt.savefig("%s%s-predict-paper.pdf" % (GRAPH_DIR, name), bbox_inches='tight') 371 | 372 | plt.close(fig) 373 | 374 | WriteResult("%s%s-predict-paper.csv" % (GRAPH_DIR, name), xlabels, labels, result) 375 | 376 | PlotHistgram(name, hist_result) 377 | 378 | 379 | def PlotHistgram(name, hist_result): 380 | indexes = [0, 4, 6] 381 | models = ['lr', 'rnn', 'ensemble'] 382 | for i, index in enumerate(indexes): 383 | fig, ax = plt.subplots(figsize=(12,4)) 384 | #if PROJECT == "admission": 385 | # ax.set_ylim([0, 30000]) 386 | if PROJECT == "oli": 387 | ax.set_ylim([0, 0.25]) 388 | #ax.set_ylim([0, 0.7]) 389 | hist_range = (-15, 15) 390 | ax.hist(hist_result[index], 20, hist_range, normed=1, facecolor='green', alpha=0.75) 391 | plt.savefig("%s%s-%s-histogram.pdf" % (GRAPH_DIR, name, models[i]), bbox_inches='tight') 392 | 393 | 394 | def WriteResult(path, xlabels, labels, result): 395 | with open(path, "w") as csvfile: 396 | writer = csv.writer(csvfile) 397 | writer.writerow([""] + xlabels) 398 | for x in range(len(labels)): 399 
| writer.writerow([labels[x]] + result[x]) 400 | 401 | 402 | def GetMSE(input_path): 403 | dates = [] 404 | actual = [] 405 | predict = [] 406 | with open(input_path) as input_file: 407 | reader = csv.reader(input_file) 408 | for line in reader: 409 | #dates.append(datetime.strptime(line[0], DATETIME_FORMAT)) 410 | actual.append(float(line[1])) 411 | predict.append(float(line[2])) 412 | 413 | y = np.array(actual) 414 | y_hat = np.array(predict) 415 | 416 | data_min = 2 - np.min([np.min(y), np.min(y_hat)]) 417 | se = (np.log(y + data_min) - np.log(y_hat + data_min)) ** 2 418 | print("MSE of %s: %s" % (input_path, np.mean(se))) 419 | 420 | return (np.log(y + data_min), np.log(y_hat + data_min)) 421 | 422 | def GetDataDict(input_dir): 423 | data_dict = {} 424 | 425 | losses = np.array([]) 426 | for root, dirnames, filenames in os.walk(input_dir): 427 | for filename in sorted(fnmatch.filter(filenames, '*.csv')): 428 | print(filename) 429 | file_path = os.path.join(root, filename) 430 | 431 | data = GetMSE(file_path) 432 | data_dict[file_path] = data 433 | 434 | return data_dict 435 | 436 | def Main(input_dir): 437 | if not os.path.exists(GRAPH_DIR): 438 | os.makedirs(GRAPH_DIR) 439 | 440 | color = SetupMplParams() 441 | 442 | data_dict = GetDataDict(input_dir) 443 | 444 | PlotPaperGraph(data_dict, PROJECT, "Prediction Horizon", "MSE (log space)", color) 445 | 446 | # ============================================== 447 | # main 448 | # ============================================== 449 | if __name__ == '__main__': 450 | """ 451 | Generate MSE result plots 452 | 453 | Args: 454 | arg1 : the result dir 455 | """ 456 | Main(sys.argv[1]) 457 | -------------------------------------------------------------------------------- /anonymizer/log-anonymizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.5 2 | 3 | import sys 4 | import re 5 | import gzip 6 | import csv 7 | import sqlparse 8 | import hashlib 9 | import string 10 | import logging 11 | import argparse 12 | import zipfile 13 | 14 | from pprint import pprint 15 | global ANONYMIZE 16 | 17 | # ============================================== 18 | # LOGGING CONFIGURATION 19 | # ============================================== 20 | 21 | LOG = logging.getLogger(__name__) 22 | LOG_handler = logging.StreamHandler() 23 | LOG_formatter = logging.Formatter( 24 | fmt='%(asctime)s[%(funcName)s:%(lineno)03d]%(levelname)-5s:%(message)s', 25 | datefmt='%m-%d-%Y %H:%M:%S') 26 | LOG_handler.setFormatter(LOG_formatter) 27 | LOG.addHandler(LOG_handler) 28 | LOG.setLevel(logging.INFO) 29 | 30 | CMD_TYPES = [ 31 | "Connect", 32 | "Quit", 33 | "Init DB", 34 | "Query", 35 | "Field List", 36 | "Statistics" 37 | ] 38 | # SQL commands that we just want to simply ignore 39 | IGNORED_CMDS = [] 40 | 41 | CLEAN_CMDS = [ 42 | re.compile(r"(WHERE|ON)[\s]{2,}", re.IGNORECASE) 43 | ] 44 | 45 | # T