├── .gitignore
├── figures
├── ambient_temp_ours.pdf
├── ambient_temp_ours.png
├── detailed_architecture.pdf
└── detailed_architecture.png
├── codes
├── __pycache__
│ ├── base.cpython-36.pyc
│ ├── utils.cpython-36.pyc
│ ├── models.cpython-36.pyc
│ ├── trainers.cpython-36.pyc
│ └── data_loader.cpython-36.pyc
├── .ipynb_checkpoints
│ └── NAB-anomaly-detection-checkpoint.ipynb
├── NAB_config.json
├── train.py
├── utils.py
├── data_loader.py
├── trainers.py
├── base.py
└── models.py
├── datasets
├── NAB-known-anomaly
│ ├── nyc_taxi.npz
│ ├── ambient_temp.npz
│ ├── ec2_request.npz
│ ├── machine_temp.npz
│ └── cpu_utilization.npz
└── NAB-dataset-preprocessing.ipynb
├── requirements.txt
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | experiments/
2 | anomaly-env/
3 | codes/__*
4 | .idea/
5 | codes/.ipynb_checkpoints/
6 | datasets/.ipynb_checkpoints/
7 |
--------------------------------------------------------------------------------
/figures/ambient_temp_ours.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/figures/ambient_temp_ours.pdf
--------------------------------------------------------------------------------
/figures/ambient_temp_ours.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/figures/ambient_temp_ours.png
--------------------------------------------------------------------------------
/figures/detailed_architecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/figures/detailed_architecture.pdf
--------------------------------------------------------------------------------
/figures/detailed_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/figures/detailed_architecture.png
--------------------------------------------------------------------------------
/codes/__pycache__/base.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/codes/__pycache__/base.cpython-36.pyc
--------------------------------------------------------------------------------
/codes/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/codes/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/codes/__pycache__/models.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/codes/__pycache__/models.cpython-36.pyc
--------------------------------------------------------------------------------
/codes/__pycache__/trainers.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/codes/__pycache__/trainers.cpython-36.pyc
--------------------------------------------------------------------------------
/datasets/NAB-known-anomaly/nyc_taxi.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/datasets/NAB-known-anomaly/nyc_taxi.npz
--------------------------------------------------------------------------------
/codes/.ipynb_checkpoints/NAB-anomaly-detection-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/datasets/NAB-known-anomaly/ambient_temp.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/datasets/NAB-known-anomaly/ambient_temp.npz
--------------------------------------------------------------------------------
/datasets/NAB-known-anomaly/ec2_request.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/datasets/NAB-known-anomaly/ec2_request.npz
--------------------------------------------------------------------------------
/datasets/NAB-known-anomaly/machine_temp.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/datasets/NAB-known-anomaly/machine_temp.npz
--------------------------------------------------------------------------------
/codes/__pycache__/data_loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/codes/__pycache__/data_loader.cpython-36.pyc
--------------------------------------------------------------------------------
/datasets/NAB-known-anomaly/cpu_utilization.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/HEAD/datasets/NAB-known-anomaly/cpu_utilization.npz
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 | ###### Requirements without Version Specifiers ######
3 | matplotlib
4 | sklearn
5 | scipy
6 | tqdm
7 |
8 | ###### Requirements with Version Specifiers ######
9 | # See https://www.python.org/dev/peps/pep-0440/#version-specifiers
10 | numpy == 1.16.0
11 | pandas == 0.25.3
12 | GPy == 1.9.9
13 | munch == 2.3.2
14 | opencv-contrib-python==4.1.0.25
15 | tensorflow-gpu == 1.15.2 # Version Matching. Must be version 1.15.2
16 | tensorflow_probability == 0.7.0 # Version Matching. Must be version 0.7.0
17 |
18 |
19 |
--------------------------------------------------------------------------------
/codes/NAB_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "exp_name": "NAB",
3 | "dataset": "machine_temp",
4 | "y_scale": 5,
5 | "one_image": 0,
6 | "l_seq": 12,
7 | "l_win": 48,
8 | "n_channel": 1,
9 | "TRAIN_VAE": 1,
10 | "TRAIN_LSTM": 1,
11 | "TRAIN_sigma": 0,
12 | "batch_size": 32,
13 | "batch_size_lstm": 32,
14 | "load_model": 1,
15 | "load_dir": "default",
16 | "num_epochs_vae": 0,
17 | "num_epochs_lstm": 20,
18 | "learning_rate_vae": 0.0004,
19 | "learning_rate_lstm": 0.0002,
20 | "code_size": 6,
21 | "sigma": 0.1,
22 | "sigma2_offset": 0.01,
23 | "num_hidden_units": 512,
24 | "num_hidden_units_lstm": 64
25 | }
--------------------------------------------------------------------------------
/codes/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | from data_loader import DataGenerator
4 | from models import VAEmodel, lstmKerasModel
5 | from trainers import vaeTrainer
6 | from utils import process_config, create_dirs, get_args, save_config
7 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152
8 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
9 |
10 |
def main():
    """Train the VAE and/or the LSTM according to the JSON config given on the command line.

    Steps, in order:
      1. parse ``--config`` and expand it with ``process_config``;
      2. create the result/checkpoint directories and save a copy of the config;
      3. build the data generator, the VAE model and its trainer, and try to
         restore a VAE checkpoint;
      4. train the VAE when ``TRAIN_VAE`` is set and ``num_epochs_vae > 0``;
      5. when ``TRAIN_LSTM`` is set: embed the sequences with the VAE, build the
         Keras LSTM, optionally load its weights, train it when
         ``num_epochs_lstm > 0``, predict on the test set and plot up to the
         first 10 test sequences.

    Exits with status 1 if the config cannot be read or processed.
    """
    import sys  # local import: only needed for the failure exit below

    # capture the config path from the run arguments
    # then process the json configuration file
    try:
        args = get_args()
        config = process_config(args.config)
    except Exception as err:
        # `Exception` (not a bare except) so that argparse's own SystemExit,
        # e.g. for --help, is not reported as an invalid-argument failure.
        print("missing or invalid arguments")
        print("reason: {}".format(err))
        sys.exit(1)

    # create the experiments dirs
    create_dirs([config['result_dir'], config['checkpoint_dir'], config['checkpoint_dir_lstm']])
    # save the config in a txt file
    save_config(config)
    # create tensorflow session
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    # create your data generator
    data = DataGenerator(config)
    # create the VAE model
    model_vae = VAEmodel(config)
    # create a trainer for VAE model
    trainer_vae = vaeTrainer(sess, model_vae, data, config)
    model_vae.load(sess)
    # here you train your model
    if config['TRAIN_VAE']:
        if config['num_epochs_vae'] > 0:
            trainer_vae.train()

    if config['TRAIN_LSTM']:
        # create a lstm model class instance
        lstm_model = lstmKerasModel(data)

        # produce the embedding of all sequences for training of lstm model
        # process the windows in sequence to get their VAE embeddings
        lstm_model.produce_embeddings(config, model_vae, data, sess)

        # Create a basic model instance
        lstm_nn_model = lstm_model.create_lstm_model(config)
        lstm_nn_model.summary()  # Display the model's architecture
        # checkpoint path
        checkpoint_path = config['checkpoint_dir_lstm'] + "cp.ckpt"
        # Create a callback that saves the model's weights
        cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         save_weights_only=True,
                                                         verbose=1)
        # load weights if possible
        lstm_model.load_model(lstm_nn_model, config, checkpoint_path)

        # start training
        if config['num_epochs_lstm'] > 0:
            lstm_model.train(config, lstm_nn_model, cp_callback)

        # make a prediction on the test set using the trained model
        lstm_embedding = lstm_nn_model.predict(lstm_model.x_test, batch_size=config['batch_size_lstm'])
        print(lstm_embedding.shape)

        # visualise the first 10 test sequences (fewer if the test set is smaller)
        for i in range(min(10, len(lstm_embedding))):
            lstm_model.plot_lstm_embedding_prediction(i, config, model_vae, sess, data, lstm_embedding)
71 |
72 |
73 | if __name__ == '__main__':
74 | main()
75 |
--------------------------------------------------------------------------------
/codes/utils.py:
--------------------------------------------------------------------------------
1 | """ util function.py """
2 |
3 | import json
4 | import os
5 | import argparse
6 | import tensorflow as tf
7 | from datetime import datetime
8 |
9 |
def get_config_from_json(json_file):
    """
    Read a JSON configuration file and hand back its parsed content.
    :param json_file: path of the JSON file to read
    :return: config(dictionary)
    """
    with open(json_file, 'r') as handle:
        parsed = json.load(handle)
    return parsed
21 |
22 |
def save_config(config):
    """
    Dump the config as JSON into a timestamped text file in the result directory.

    The file is named ``training_config_<dd-Mon-YYYY-HH-MM>.txt`` and is written
    to ``config['result_dir']``, which is concatenated as-is and therefore must
    end with a path separator.
    :param config: JSON-serialisable dictionary containing at least 'result_dir'
    :return: None
    """
    timestamp = datetime.now().strftime("%d-%b-%Y-%H-%M")
    filename = config['result_dir'] + 'training_config_{}.txt'.format(timestamp)
    # `with` guarantees the handle is closed even if the write fails
    with open(filename, "w") as f:
        f.write(json.dumps(config))
31 |
32 |
def process_config(json_file):
    """
    Load the JSON config and add the output directory entries derived from it.

    Adds 'summary_dir', 'result_dir', 'checkpoint_dir' and 'checkpoint_dir_lstm',
    all located under ``<save_dir>/<save_name>/``.
    :param json_file: path of the JSON configuration file
    :return: config(dictionary) with the four directory entries added
    """
    config = get_config_from_json(json_file)

    # root folder: a default location built from the config, or the user-given one
    if config['load_dir'] == "default":
        save_dir = "../experiments/local-results/{}/{}/batch-{}".format(
            config['exp_name'], config['dataset'], config['batch_size'])
    else:
        save_dir = config['load_dir']

    # folder name of this experiment; the suffix records how sigma is handled
    name_stem = '{}-{}-{}-{}-{}'.format(config['exp_name'],
                                        config['dataset'],
                                        config['l_win'],
                                        config['l_seq'],
                                        config['code_size'])
    if config['TRAIN_sigma'] == 1:
        save_name = name_stem + '-trainSigma'
    else:
        save_name = name_stem + '-fixedSigma-{}'.format(config['sigma'])

    experiment_root = os.path.join(save_dir, save_name)
    sub_folders = (('summary_dir', "summary/"),
                   ('result_dir', "result/"),
                   ('checkpoint_dir', "checkpoint/"),
                   ('checkpoint_dir_lstm', "checkpoint/lstm/"))
    for key, sub_folder in sub_folders:
        config[key] = os.path.join(experiment_root, sub_folder)

    return config
62 |
63 |
def create_dirs(dirs):
    """
    Create every directory in ``dirs`` (including parents) that does not exist yet.

    Uses ``exist_ok=True`` rather than a separate existence check, so a directory
    created by another process between check and creation is not an error. A path
    that exists but is not a directory is reported as a failure.
    :param dirs: a list of directory paths
    :return exit_code: 0:success; on failure the error is printed and the
        process exits with status -1
    """
    try:
        for dir_ in dirs:
            os.makedirs(dir_, exist_ok=True)
        return 0
    except Exception as err:
        print("Creating directories error: {0}".format(err))
        exit(-1)
78 |
79 |
def count_trainable_variables(scope_name):
    """
    Count and print the number of trainable parameters under a variable scope.
    :param scope_name: scope passed to ``tf.trainable_variables``
    :return: total number of elements over all trainable variables in the scope
    """
    total_parameters = 0
    for variable in tf.trainable_variables(scope_name):
        # the static shape is iterated dimension by dimension; the number of
        # elements of the variable is the product of the dimension values
        n_elements = 1
        for dim in variable.get_shape():
            n_elements = n_elements * dim.value
        total_parameters = total_parameters + n_elements
    print(
        'The total number of trainable parameters in the {} model is: {}'.format(scope_name, total_parameters))
    return total_parameters
92 |
93 |
def get_args():
    """
    Parse the command line of the training script.

    Recognises ``-c/--config``; when it is omitted the value is the string 'None'.
    :return: the argparse namespace, with the config path in ``.config``
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-c', '--config',
                        metavar='C',
                        default='None',
                        help='The Configuration file')
    return parser.parse_args()
103 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VAE-LSTM for anomaly detection (ICASSP'20)
2 |
3 | This Github repository hosts our code and pre-processed data to train a VAE-LSTM hybrid model for anomaly detection, as proposed in our paper:
4 | [Anomaly Detection for Time Series Using VAE-LSTM Hybrid Model](https://ieeexplore.ieee.org/document/9053558).
5 |
6 | [Shuyu Lin1](https://shuyulin.co.uk/), [Ronald Clark2](http://www.ronnieclark.co.uk), Robert Birke3, Sandro Schönborn3, Niki Trigoni1, [Stephen Roberts1](https://www.robots.ox.ac.uk/~sjrob/)
7 |
8 | 1University of Oxford, 2Imperial College London, 3ABB Corporate Research
9 |
10 | In short, our anomaly detection model contains:
11 | * a VAE unit which summarizes the local information of a short window into a low-dimensional embedding,
12 | * an LSTM model, which acts on the low-dimensional embeddings produced by the VAE model, to manage the sequential patterns over the longer term.
13 |
14 | An overview of our model is shown below:
15 |
16 |
17 |
18 |
19 | An example of anomaly detection on a time series of office temperature, which is provided by Numenta anomaly benchmark (NAB) datasets in their known anomaly subgroup [link](https://github.com/numenta/NAB/tree/master/data/realKnownCause):
20 |
21 |
22 |
23 |
24 |
25 |
26 | To run our code, please follow the instructions shown below.
27 |
28 | ## Environment
29 | Our code is written in Python 3 with the TensorFlow 1.15 library (the exact version is pinned in requirements.txt).
30 | Please install the Python libraries listed in requirements.txt. We suggest building a virtual environment using the virtualenv package. To install and set up virtualenv, please follow the procedures [here](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
31 |
32 | ## Data pre-processing
33 | We pre-processed the NAB data, which consists of 5 sequences, by creating a training set that does not contain any anomalies while keeping the rest of each sequence as a test set for evaluation. In addition, we removed the mean and standardized the time series. The pre-processed data are included in the `datasets/` subfolder of this repository for the convenience of running our code. A demo IPython notebook is also provided in `datasets/` to show how the pre-processing is done. An explanation of the different features saved in the .npz files of the pre-processed datasets is given in the [discussion](https://github.com/lin-shuyu/VAE-LSTM-for-anomaly-detection/issues/3).
34 |
35 | If you want to use the data for your own project, please cite and refer to the [NAB project](https://numenta.com/machine-intelligence-technology/numenta-anomaly-benchmark/) and access the raw data if needed.
36 |
37 | ## Training
38 | Our VAE-LSTM model is defined in several files in the `codes/` subfolder, including train.py, base.py, utils.py, data_loader.py, models.py, trainers.py. To train our model, simply run
39 | `python3 train.py --config NAB_config.json`,
40 | where NAB_config.json defines all the hyper-parameters of our model and you can experiment by using different values.
41 |
42 | ## Anomaly detection using the trained model
43 | After the model has been trained, you can use the IPython notebook NAB-anomaly-detection.ipynb to detect anomalies on the test set. All you need to do is run the notebook, making sure that NAB_config.json is set up so that the right trained model is loaded. The only thing you need to specify in order to achieve a reasonable anomaly detection result is a threshold on the anomaly detection metric. We suggest inspecting the histogram of the anomaly detection metric and setting the threshold accordingly.
44 |
45 | Hope you enjoy playing with our code and find it helpful for your projects! Happy anomaly detection!
46 |
47 | If you find our codes/project relevant to your work, please cite us in your work:
48 |
49 | ```
50 | @INPROCEEDINGS{VAE-LSTM-AD,
51 | author={S. {Lin} and R. {Clark} and R. {Birke} and S. {Schönborn} and N. {Trigoni} and S. {Roberts}},
52 | booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
53 | title={Anomaly Detection for Time Series Using VAE-LSTM Hybrid Model},
54 | year={2020}}
55 | ```
56 |
--------------------------------------------------------------------------------
/codes/data_loader.py:
--------------------------------------------------------------------------------
1 | from base import BaseDataGenerator
2 | import numpy as np
3 | import matplotlib.pylab as plt
4 | from matplotlib.pyplot import savefig
5 |
6 |
class DataGenerator(BaseDataGenerator):
    """Loads one pre-processed NAB sequence and slices it into VAE and LSTM data sets.

    After construction the instance holds:
      * ``train_set_vae`` / ``val_set_vae`` / ``test_set_vae`` - dicts whose 'data'
        entry contains rolling windows of length ``l_win`` with a trailing
        singleton axis appended;
      * ``train_set_lstm`` / ``val_set_lstm`` - dicts whose 'data' entry contains
        sequences of ``l_seq`` consecutive non-overlapping windows, again with a
        trailing singleton axis;
      * ``n_train_vae``, ``n_val_vae``, ``n_train_lstm``, ``n_val_lstm`` - set sizes.
    """

    def __init__(self, config):
        """Store the config and load the dataset named by ``config['dataset']``."""
        super(DataGenerator, self).__init__(config)
        # load data here: generate 3 state variables: train_set, val_set and test_set
        self.load_NAB_dataset(self.config['dataset'], self.config['y_scale'])

    def load_NAB_dataset(self, dataset, y_scale=6):
        """Load ``<dataset>.npz``, plot the normalised readings and build all data sets.

        :param dataset: file stem of the .npz archive in ../datasets/NAB-known-anomaly/
        :param y_scale: half-height of the y axis of the overview plot
        :return: None; results are stored on the instance and the plot is saved to
            ``config['result_dir']`` as raw_data_normalised.pdf
        """
        data_dir = '../datasets/NAB-known-anomaly/'
        # Indexing an .npz archive reads the member from disk on every access, so
        # each array is pulled out exactly once here instead of inside the loops
        # below; the context manager also closes the archive afterwards.
        with np.load(data_dir + dataset + '.npz') as data:
            train_m = data['train_m']
            train_std = data['train_std']
            readings = data['readings']
            t = data['t']
            t_unit = data['t_unit']
            idx_split = data['idx_split']
            idx_anomaly = data['idx_anomaly']
            training = data['training']

        # normalise the dataset by training set mean and std
        readings_normalised = (readings - train_m) / train_std

        # plot normalised data
        fig, axs = plt.subplots(1, 1, figsize=(18, 4), edgecolor='k')
        fig.subplots_adjust(hspace=.4, wspace=.4)
        axs.plot(t, readings_normalised)
        # vertical line(s) marking the train/test split
        if idx_split[0] == 0:
            axs.plot(idx_split[1] * np.ones(20), np.linspace(-y_scale, y_scale, 20), 'b-')
        else:
            for i in range(2):
                axs.plot(idx_split[i] * np.ones(20), np.linspace(-y_scale, y_scale, 20), 'b-')
        # NOTE(review): this call unpacks 20 scalars as positional arguments, which
        # does not draw a vertical marker like its neighbours and adds extra lines
        # ahead of the anomaly markers in the legend. It is kept unchanged because
        # the intended x position is not recoverable from this file - confirm and
        # fix or remove.
        axs.plot(*np.ones(20), np.linspace(-y_scale, y_scale, 20), 'b--')
        # one dashed vertical line per labelled anomaly
        for j in range(len(idx_anomaly)):
            axs.plot(idx_anomaly[j] * np.ones(20), np.linspace(-y_scale, 0.75 * y_scale, 20), 'r--')
        axs.grid(True)
        axs.set_xlim(0, len(t))
        axs.set_ylim(-y_scale, y_scale)
        axs.set_xlabel("timestamp (every {})".format(t_unit))
        axs.set_ylabel("readings")
        axs.set_title("{} dataset\n(normalised by train mean {:.4f} and std {:.4f})".format(dataset, train_m, train_std))
        axs.legend(('data', 'train test set split', 'anomalies'))
        savefig(self.config['result_dir'] + '/raw_data_normalised.pdf')

        l_win = self.config['l_win']
        l_seq = self.config['l_seq']

        # slice training set into rolling windows (stride 1)
        n_train_sample = len(training)
        n_train_vae = n_train_sample - l_win + 1
        rolling_windows = np.zeros((n_train_vae, l_win))
        for i in range(n_train_vae):
            rolling_windows[i] = training[i:i + l_win]

        # create VAE training and validation set
        idx_train, idx_val, self.n_train_vae, self.n_val_vae = self.separate_train_and_val_set(n_train_vae)
        self.train_set_vae = dict(data=np.expand_dims(rolling_windows[idx_train], -1))
        self.val_set_vae = dict(data=np.expand_dims(rolling_windows[idx_val], -1))
        # the "test" set is the first batch_size windows of the validation split
        self.test_set_vae = dict(data=np.expand_dims(rolling_windows[idx_val[:self.config['batch_size']]], -1))

        # create LSTM training and validation set: for every start offset k inside
        # one window length, cut the series into non-overlapping windows and group
        # l_seq consecutive windows into one sequence
        lstm_chunks = []
        for k in range(l_win):
            n_not_overlap_wins = (n_train_sample - k) // l_win
            n_train_lstm = n_not_overlap_wins - l_seq + 1
            cur_lstm_seq = np.zeros((n_train_lstm, l_seq, l_win))
            for i in range(n_train_lstm):
                cur_seq = np.zeros((l_seq, l_win))
                for j in range(l_seq):
                    cur_seq[j] = training[k + l_win * (j + i): k + l_win * (j + i + 1)]
                cur_lstm_seq[i] = cur_seq
            lstm_chunks.append(cur_lstm_seq)
        # a single concatenation instead of re-copying the growing array per offset
        lstm_seq = np.concatenate(lstm_chunks, axis=0)

        n_train_lstm = lstm_seq.shape[0]
        idx_train, idx_val, self.n_train_lstm, self.n_val_lstm = self.separate_train_and_val_set(n_train_lstm)
        self.train_set_lstm = dict(data=np.expand_dims(lstm_seq[idx_train], -1))
        self.val_set_lstm = dict(data=np.expand_dims(lstm_seq[idx_val], -1))

    def plot_time_series(self, data, time, data_list):
        """Plot the first four columns of ``data`` against ``time / 60`` and save the figure.

        :param data: 2-D array; columns 0..3 are plotted
        :param time: time stamps; divided by 60 and labelled as hours on the x axis
        :param data_list: titles of the four sub-plots
        :return: None; the figure is saved to ``config['result_dir']`` as
            raw_training_set_normalised.pdf
        """
        fig, axs = plt.subplots(1, 4, figsize=(18, 2.5), edgecolor='k')
        fig.subplots_adjust(hspace=.8, wspace=.4)
        axs = axs.ravel()
        for i in range(4):
            axs[i].plot(time / 60., data[:, i])
            axs[i].set_title(data_list[i])
            axs[i].set_xlabel('time (h)')
            axs[i].set_xlim((np.amin(time) / 60., np.amax(time) / 60.))
        savefig(self.config['result_dir'] + '/raw_training_set_normalised.pdf')
87 |
--------------------------------------------------------------------------------
/codes/trainers.py:
--------------------------------------------------------------------------------
1 | from base import BaseTrain
2 | import numpy as np
3 | import matplotlib.pylab as plt
4 | from matplotlib.pyplot import savefig
5 | from scipy.stats import multivariate_normal
6 |
7 |
class vaeTrainer(BaseTrain):
    """Trainer that runs the VAE for one epoch at a time and produces diagnostic plots.

    Relies on attributes and helpers that are not defined in this class
    (e.g. ``train_loss``, ``val_loss``, ``test_sigma2``, ``save_variables_VAE``,
    ``plot_train_and_val_loss``); presumably these come from ``BaseTrain``.
    """

    def __init__(self, sess, model, data, config):
        """Forward the session, model, data generator and config to ``BaseTrain``."""
        super(vaeTrainer, self).__init__(sess, model, data, config)

    def train_epoch(self):
        """Run one training epoch, one validation pass, then save, print and plot.

        The test step is evaluated once, on the last training iteration of the
        epoch; its outputs feed the printed summary and the reconstruction plot.
        """
        self.cur_epoch = self.model.cur_epoch_tensor.eval(self.sess)

        # training
        self.sess.run(self.model.iterator.initializer,
                      feed_dict={self.model.original_signal: self.data.train_set_vae['data'],
                                 self.model.seed: self.cur_epoch})
        self.n_train_iter = self.data.n_train_vae // self.config['batch_size']
        idx_check_point = (self.n_train_iter - 1)
        train_loss_cur_epoch = 0.0
        for i in range(self.n_train_iter):
            loss = self.train_step()
            self.sess.run(self.model.increment_global_step_tensor)
            self.train_loss.append(np.squeeze(loss))
            train_loss_cur_epoch = train_loss_cur_epoch + loss
            if i == idx_check_point:
                test_loss, test_recons_loss_weighted, test_kl, test_sigma_regularisor, test_code_std_norm, test_cur_sigma2, test_recons_loss_ls = self.test_step()
        self.train_loss_ave_epoch.append(train_loss_cur_epoch / self.n_train_iter)

        # validation
        self.iter_epochs_list.append(self.n_train_iter * (self.cur_epoch + 1))
        self.sess.run(self.model.iterator.initializer,
                      feed_dict={self.model.original_signal: self.data.val_set_vae['data'],
                                 self.model.seed: self.cur_epoch})
        self.n_val_iter = self.data.n_val_vae // self.config['batch_size']
        val_loss_cur_epoch = 0.0
        for i in range(self.n_val_iter):
            val_loss = self.val_step()
            val_loss_cur_epoch = val_loss_cur_epoch + val_loss
        self.val_loss_ave_epoch.append(val_loss_cur_epoch / self.n_val_iter)

        # save the model parameters at the end of this epoch
        self.model.save(self.sess)

        print(
            "{}/{}, test loss: -elbo: {:.4f}, recons_loss_weighted: {:.4f}, recons_loss_ls: {:.4f}, KL_loss: {:.4f}, sigma_regularisor: {:.4f}, code_std_dev: {}".format(
                self.cur_epoch,
                self.config['num_epochs_vae'] - 1,
                test_loss,
                test_recons_loss_weighted,
                np.squeeze(np.mean(test_recons_loss_ls)),
                test_kl,
                test_sigma_regularisor,
                np.squeeze(test_code_std_norm)))
        print("Loss on training and val sets:\ntrain: {:.4f}, val: {:.4f}".format(
            self.train_loss_ave_epoch[self.cur_epoch],
            self.val_loss_ave_epoch[self.cur_epoch]))
        print("Current sigma2: {:.7f}".format(test_cur_sigma2))

        # save the current variables
        self.save_variables_VAE()

        # reconstruction plot
        self.plot_reconstructed_signal()

        # generate samples from prior
        self.generate_samples_from_prior()

        # plot the training and validation loss over iterations/epochs
        self.plot_train_and_val_loss()

    def train_step(self):
        """Fetch the next batch from the model's iterator and apply one gradient step.

        The learning rate is ``learning_rate_vae * 0.98 ** cur_epoch``.
        :return: the ELBO loss evaluated on the batch
        """
        batch_image = self.sess.run(self.model.input_image)
        feed_dict = {self.model.original_signal: batch_image,
                     self.model.is_code_input: False,
                     self.model.code_input: np.zeros((1, self.config['code_size'])),
                     self.model.lr: self.config['learning_rate_vae'] * (0.98 ** self.cur_epoch)}
        train_loss, _ = self.sess.run([self.model.elbo_loss, self.model.train_step_gradient],
                                      feed_dict=feed_dict)
        return train_loss

    def val_step(self):
        """Evaluate the losses on the next validation batch and record them.

        Appends to ``val_loss``, ``recons_loss_val`` and ``KL_loss_val``.
        :return: the ELBO loss evaluated on the batch
        """
        input_image_val = self.sess.run(self.model.input_image)
        val_cost, recon_loss_val, kl_loss_val, std_dev_loss_val = self.sess.run(
            [self.model.elbo_loss,
             self.model.ls_reconstruction_error,
             self.model.KL_loss,
             self.model.std_dev_norm],
            feed_dict={self.model.original_signal: input_image_val,
                       self.model.is_code_input: False,
                       self.model.code_input: np.zeros((1, self.config['code_size']))})
        self.val_loss.append(np.squeeze(val_cost))
        self.recons_loss_val.append(np.squeeze(np.mean(recon_loss_val)))
        self.KL_loss_val.append(kl_loss_val)
        return val_cost

    def test_step(self):
        """Evaluate the model on ``data.test_set_vae`` and keep the reconstructions.

        Stores the decoded signals in ``self.output_test`` and appends the current
        sigma2 to ``self.test_sigma2``.
        :return: tuple (elbo loss, weighted reconstruction error, KL loss,
            sigma regulariser, code std-dev norm, squeezed sigma2,
            least-squares reconstruction error)
        """
        feed_dict = {self.model.original_signal: self.data.test_set_vae['data'],
                     self.model.is_code_input: False,
                     self.model.code_input: np.zeros((1, self.config['code_size']))}
        self.output_test, test_loss, test_recons_loss_weighted, test_kl, test_sigma_regularisor, test_code_std_norm, test_cur_sigma2, test_recons_loss_ls = self.sess.run(
            [self.model.decoded,
             self.model.elbo_loss,
             self.model.weighted_reconstruction_error_dataset,
             self.model.KL_loss,
             self.model.sigma_regularisor_dataset,
             self.model.std_dev_norm,
             self.model.sigma2,
             self.model.ls_reconstruction_error],
            feed_dict=feed_dict)
        self.test_sigma2.append(np.squeeze(test_cur_sigma2))
        return test_loss, test_recons_loss_weighted, test_kl, test_sigma_regularisor, test_code_std_norm, np.squeeze(
            test_cur_sigma2), test_recons_loss_ls

    def plot_reconstructed_signal(self):
        """Plot test windows against their reconstructions, one PDF per channel.

        Up to 20 windows are drawn on a 4x5 grid. The test set holds at most
        ``batch_size`` windows, so the count is capped by what is available
        instead of indexing past the end when ``batch_size < 20``.
        Must be called after ``test_step`` has filled ``self.output_test``.
        """
        input_images = np.squeeze(self.data.test_set_vae['data'])
        decoded_images = np.squeeze(self.output_test)
        n_images = min(20, input_images.shape[0])
        # plot the reconstructed image for a shape
        for j in range(self.config['n_channel']):
            fig, axs = plt.subplots(4, 5, figsize=(18, 10), edgecolor='k')
            fig.subplots_adjust(hspace=.4, wspace=.4)
            axs = axs.ravel()
            for i in range(n_images):
                if self.config['n_channel'] == 1:
                    axs[i].plot(input_images[i])
                    axs[i].plot(decoded_images[i])
                else:
                    axs[i].plot(input_images[i, :, j])
                    axs[i].plot(decoded_images[i, :, j])
                axs[i].grid(True)
                axs[i].set_xlim(0, self.config['l_win'])
                axs[i].set_ylim(-5, 5)
                # legend only on the last drawn panel
                if i == n_images - 1:
                    axs[i].legend(('original', 'reconstructed'))
            plt.suptitle('Channel {}'.format(j))
            savefig(self.config['result_dir'] + 'test_reconstructed_{}_{}.pdf'.format(self.cur_epoch, j))
            fig.clf()
            plt.close()

    def generate_samples_from_prior(self):
        """Decode 20 codes drawn from a standard normal prior and plot them per channel.

        The codes are fed through ``code_input`` with ``is_code_input`` set to
        True; ``original_signal`` is fed with zeros of the matching shape.
        """
        rv = multivariate_normal(np.zeros(self.config['code_size']), np.diag(np.ones(self.config['code_size'])))
        # Generate a batch size of samples from the prior samples
        n_images = 20
        samples_code_prior = rv.rvs(n_images)
        sampled_images = self.sess.run(self.model.decoded,
                                       feed_dict={self.model.original_signal: np.zeros(
                                           (n_images, self.config['l_win'], self.config['n_channel'])),
                                           self.model.is_code_input: True,
                                           self.model.code_input: samples_code_prior})
        sampled_images = np.squeeze(sampled_images)
        for j in range(self.config['n_channel']):
            fig, axs = plt.subplots(4, 5, figsize=(18, 10), edgecolor='k')
            fig.subplots_adjust(hspace=.4, wspace=.4)
            axs = axs.ravel()
            for i in range(n_images):
                if self.config['n_channel'] == 1:
                    axs[i].plot(sampled_images[i])
                else:
                    axs[i].plot(sampled_images[i, :, j])
                axs[i].grid(True)
                axs[i].set_xlim(0, self.config['l_win'])
                axs[i].set_ylim(-5, 5)
            plt.suptitle('Channel {}'.format(j))
            savefig(self.config['result_dir'] + 'generated_samples_{}_{}.pdf'.format(self.cur_epoch, j))
            fig.clf()
            plt.close()
170 |
--------------------------------------------------------------------------------
/codes/base.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_probability as tfp
3 | import random
4 | import numpy as np
5 | import time
6 | import matplotlib.pylab as plt
7 | from matplotlib.pyplot import plot, savefig, figure
8 | from utils import count_trainable_variables
9 | tfd = tfp.distributions
10 |
11 |
class BaseDataGenerator:
    """Base class of the data generators: keeps the config and splits index ranges."""

    def __init__(self, config):
        """Keep a reference to the experiment configuration."""
        self.config = config

    def separate_train_and_val_set(self, n_win):
        """Randomly split the indices 0..n_win-1 into a 90% training and 10% validation part.

        :param n_win: number of items to split
        :return: (training indices, validation indices, n_train, n_val), where
            n_train is floor(0.9 * n_win) and n_val is the remainder
        """
        every_index = range(n_win)
        train_count = int(np.floor((n_win * 0.9)))
        chosen_for_training = random.sample(every_index, train_count)
        # whatever was not drawn for training becomes the validation set
        left_for_validation = list(set(chosen_for_training) ^ set(every_index))
        return chosen_for_training, left_for_validation, train_count, n_win - train_count
23 |
24 |
25 | class BaseModel:
26 | def __init__(self, config):
27 | self.config = config
28 | # init the global step
29 | self.init_global_step()
30 | # init the epoch counter
31 | self.init_cur_epoch()
32 | self.two_pi = tf.constant(2 * np.pi)
33 |
34 | # save function that saves the checkpoint in the path defined in the config file
def save(self, sess):
    """Write a checkpoint of the session to config['checkpoint_dir'].

    The current value of the global step tensor is passed to the saver as the
    step of the checkpoint. ``self.saver`` is created outside this method.
    :param sess: the TensorFlow session whose variables are saved
    """
    print("Saving model...")
    checkpoint_prefix = self.config['checkpoint_dir']
    self.saver.save(sess, checkpoint_prefix, self.global_step_tensor)
    print("Model saved.")
40 |
41 | # load latest checkpoint from the experiment path defined in the config file
def load(self, sess):
    """Restore the most recent checkpoint from config['checkpoint_dir'], if there is one.

    Prints which checkpoint is loaded, or that none was found; in the latter
    case the session is left untouched.
    :param sess: the TensorFlow session to restore into
    """
    checkpoint_dir = self.config['checkpoint_dir']
    print("checkpoint_dir at loading: {}".format(checkpoint_dir))
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)

    if not latest_checkpoint:
        print("No model loaded.")
        return

    print("Loading model checkpoint {} ...\n".format(latest_checkpoint))
    self.saver.restore(sess, latest_checkpoint)
    print("Model loaded.")
52 |
53 | # initialize a tensorflow variable to use it as epoch counter
def init_cur_epoch(self):
    """Create the non-trainable epoch counter and the op that increments it by one."""
    with tf.variable_scope('cur_epoch'):
        epoch_counter = tf.Variable(0, trainable=False, name='cur_epoch')
        self.cur_epoch_tensor = epoch_counter
        self.increment_cur_epoch_tensor = tf.assign(epoch_counter, epoch_counter + 1)
58 |
59 | # just initialize a tensorflow variable to use it as global step counter
def init_global_step(self):
    """Create the non-trainable global step counter and the op that increments it by one."""
    # DON'T forget to add the global step tensor to the tensorflow trainer
    with tf.variable_scope('global_step'):
        step_counter = tf.Variable(0, trainable=False, name='global_step')
        self.global_step_tensor = step_counter
        self.increment_global_step_tensor = tf.assign(step_counter, step_counter + 1)
66 |
67 | def define_loss(self):
68 | with tf.name_scope("loss"):
69 | # KL divergence loss - analytical result
70 | KL_loss = 0.5 * (tf.reduce_sum(tf.square(self.code_mean), 1)
71 | + tf.reduce_sum(tf.square(self.code_std_dev), 1)
72 | - tf.reduce_sum(tf.log(tf.square(self.code_std_dev)), 1)
73 | - self.config['code_size'])
74 | self.KL_loss = tf.reduce_mean(KL_loss)
75 |
76 | # norm 1 of standard deviation of the sample-wise encoder prediction
77 | self.std_dev_norm = tf.reduce_mean(self.code_std_dev, axis=0)
78 |
79 | weighted_reconstruction_error_dataset = tf.reduce_sum(
80 | tf.square(self.original_signal - self.decoded), [1, 2])
81 | weighted_reconstruction_error_dataset = tf.reduce_mean(weighted_reconstruction_error_dataset)
82 | self.weighted_reconstruction_error_dataset = weighted_reconstruction_error_dataset / (2 * self.sigma2)
83 |
84 | # least squared reconstruction error
85 | ls_reconstruction_error = tf.reduce_sum(
86 | tf.square(self.original_signal - self.decoded), [1, 2])
87 | self.ls_reconstruction_error = tf.reduce_mean(ls_reconstruction_error)
88 |
89 | # sigma regularisor - input elbo
90 | self.sigma_regularisor_dataset = self.input_dims / 2 * tf.log(self.sigma2)
91 | two_pi = self.input_dims / 2 * tf.constant(2 * np.pi)
92 |
93 | self.elbo_loss = two_pi + self.sigma_regularisor_dataset + \
94 | 0.5 * self.weighted_reconstruction_error_dataset + self.KL_loss
95 |
96 | def training_variables(self):
97 | encoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoder")
98 | decoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "decoder")
99 | sigma_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "sigma2_dataset")
100 | self.train_vars_VAE = encoder_vars + decoder_vars + sigma_vars
101 |
102 | num_encoder = count_trainable_variables('encoder')
103 | num_decoder = count_trainable_variables('decoder')
104 | num_sigma2 = count_trainable_variables('sigma2_dataset')
105 | self.num_vars_total = num_decoder + num_encoder + num_sigma2
106 | print("Total number of trainable parameters in the VAE network is: {}".format(self.num_vars_total))
107 |
108 | def compute_gradients(self):
109 | self.lr = tf.placeholder(tf.float32, [])
110 | opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.95)
111 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
112 | gvs_dataset = opt.compute_gradients(self.elbo_loss, var_list=self.train_vars_VAE)
113 | print('gvs for dataset: {}'.format(gvs_dataset))
114 | capped_gvs = [(self.ClipIfNotNone(grad), var) for grad, var in gvs_dataset]
115 |
116 | with tf.control_dependencies(update_ops):
117 | self.train_step_gradient = opt.apply_gradients(capped_gvs)
118 | print("Reach the definition of loss for VAE")
119 |
120 | def ClipIfNotNone(self, grad):
121 | if grad is None:
122 | return grad
123 | return tf.clip_by_value(grad, -1, 1)
124 |
125 | def init_saver(self):
126 | self.saver = tf.train.Saver(max_to_keep=1, var_list=self.train_vars_VAE)
127 |
128 |
class BaseTrain:
    """Base trainer: initialises the graph variables and keeps the training history.

    ``train_epoch`` and the ``n_train_iter`` / ``n_val_iter`` attributes that
    ``save_variables_VAE`` reads are not defined in this class.
    """

    def __init__(self, sess, model, data, config):
        self.model = model
        self.config = config
        self.sess = sess
        self.data = data
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)

        # keep a record of the training result; every history starts as its own empty list
        for history_name in ('train_loss', 'val_loss',
                             'train_loss_ave_epoch', 'val_loss_ave_epoch',
                             'recons_loss_train', 'recons_loss_val',
                             'KL_loss_train', 'KL_loss_val',
                             'sample_std_dev_train', 'sample_std_dev_val',
                             'iter_epochs_list', 'test_sigma2'):
            setattr(self, history_name, [])

    def train(self):
        """Run ``config['num_epochs_vae']`` epochs, printing elapsed and estimated remaining minutes."""
        self.start_time = time.time()
        for cur_epoch in range(self.config['num_epochs_vae']):
            self.train_epoch()

            # extrapolate the remaining time from the average duration of the epochs done so far
            self.current_time = time.time()
            seconds_so_far = self.current_time - self.start_time
            epochs_left = self.config['num_epochs_vae'] - cur_epoch - 1
            elapsed_time = seconds_so_far / 60
            est_remaining_time = seconds_so_far / (cur_epoch + 1) * epochs_left / 60
            print("Already trained for {} min; Remaining {} min.".format(elapsed_time, est_remaining_time))
            self.sess.run(self.model.increment_cur_epoch_tensor)

    def save_variables_VAE(self):
        """Dump the recorded training history to an ``.npz`` file for later inspection."""
        cfg = self.config
        file_name = "{}{}-batch-{}-epoch-{}-code-{}-lr-{}.npz".format(
            cfg['result_dir'], cfg['exp_name'], cfg['batch_size'],
            cfg['num_epochs_vae'], cfg['code_size'], cfg['learning_rate_vae'])
        record = dict(iter_list_val=self.iter_epochs_list,
                      train_loss=self.train_loss,
                      val_loss=self.val_loss,
                      n_train_iter=self.n_train_iter,
                      n_val_iter=self.n_val_iter,
                      recons_loss_train=self.recons_loss_train,
                      recons_loss_val=self.recons_loss_val,
                      KL_loss_train=self.KL_loss_train,
                      KL_loss_val=self.KL_loss_val,
                      num_para_all=self.model.num_vars_total,
                      sigma2=self.test_sigma2)
        np.savez(file_name, **record)

    def plot_train_and_val_loss(self):
        """Save ``loss.png``, ``val-loss.png`` and ``sigma2.png`` under ``config['result_dir']``."""

        def start_figure():
            plt.clf()
            figure(num=1, figsize=(8, 6))

        def finish_figure(title, y_label, x_label, file_name):
            plt.title(title)
            plt.ylabel(y_label)
            plt.xlabel(x_label)
            plt.grid(True)
            savefig(self.config['result_dir'] + file_name)

        # training loss per iteration against the per-epoch validation loss
        start_figure()
        plot(self.train_loss, 'b-')
        plot(self.iter_epochs_list, self.val_loss_ave_epoch, 'r-')
        plt.legend(('training loss (total)', 'validation loss'))
        finish_figure('training loss over iterations (val @ epochs)', 'total loss', 'iterations', '/loss.png')

        # individual components of the validation loss
        start_figure()
        plot(self.recons_loss_val, 'b-')
        plot(self.KL_loss_val, 'r-')
        plt.legend(('Reconstruction loss', 'KL loss'))
        finish_figure('validation loss breakdown', 'loss', 'num of batch', '/val-loss.png')

        # recorded sigma2 values
        start_figure()
        plot(self.test_sigma2, 'b-')
        finish_figure('sigma2 over training', 'sigma2', 'iter', '/sigma2.png')
223 |
--------------------------------------------------------------------------------
/codes/models.py:
--------------------------------------------------------------------------------
1 | from base import BaseModel
2 | import os
3 | import numpy as np
4 | import matplotlib.pylab as plt
5 | from matplotlib.pyplot import savefig
6 | import tensorflow as tf
7 | import tensorflow_probability as tfp
8 |
9 | tfd = tfp.distributions
10 |
11 |
class VAEmodel(BaseModel):
    """Convolutional VAE over fixed-length windows.

    Three window lengths (``config['l_win']``) are supported: 24, 48 and 144.
    Any other value makes :meth:`build_model` raise ``ValueError``.
    """

    # Encoder layout per supported window length:
    #   (symmetric padding added to both ends of the time axis,
    #    time-axis strides of the three strided convolutions,
    #    kernel height of the closing 'valid' convolution)
    _ENCODER_LAYOUTS = {
        24: (4, (2, 2, 2), 4),
        48: (0, (2, 2, 2), 6),
        144: (0, (4, 4, 3), 3),
    }

    def __init__(self, config):
        super(VAEmodel, self).__init__(config)
        self.input_dims = self.config['l_win'] * self.config['n_channel']

        self.define_iterator()
        self.build_model()
        self.define_loss()
        self.training_variables()
        self.compute_gradients()
        self.init_saver()

    def define_iterator(self):
        """Create the input placeholders and the shuffled, repeated, batched dataset iterator."""
        self.original_signal = tf.placeholder(tf.float32, [None, self.config['l_win'], self.config['n_channel']])
        self.seed = tf.placeholder(tf.int64, shape=())
        self.dataset = tf.data.Dataset.from_tensor_slices(self.original_signal)
        self.dataset = self.dataset.shuffle(buffer_size=60000, seed=self.seed)
        self.dataset = self.dataset.repeat(8000)
        self.dataset = self.dataset.batch(self.config['batch_size'], drop_remainder=True)
        self.iterator = self.dataset.make_initializable_iterator()
        self.input_image = self.iterator.get_next()
        self.code_input = tf.placeholder(tf.float32, [None, self.config['code_size']])
        self.is_code_input = tf.placeholder(tf.bool)
        self.sigma2_offset = tf.constant(self.config['sigma2_offset'])

    def _decoder_layout(self):
        """Return the decoder hyper-parameters for the configured window length.

        Keys of the returned dict:
            ``expand_filters``: filters of the first 1x1 convolution;
            ``expand_shape``:   shape its output is reshaped to;
            ``stages``:         three ``(filters, depth_to_space block size, reshape target)``;
            ``out_kernel``:     kernel height of the output convolution;
            ``out_padding``:    padding mode of the output convolution.

        Raises:
            ValueError: if ``config['l_win']`` is not 24, 48 or 144.
        """
        l_win = self.config['l_win']
        n_hidden = self.config['num_hidden_units']
        if l_win == 24:
            return {
                'expand_filters': n_hidden,
                'expand_shape': [-1, 4, 1, n_hidden // 4],
                'stages': [(n_hidden // 4, 2, [-1, 8, 1, n_hidden // 8]),
                           (n_hidden // 8, 2, [-1, 16, 1, n_hidden // 16]),
                           (n_hidden // 16, 2, [-1, n_hidden // 16, 1, 16])],
                'out_kernel': 9,
                'out_padding': 'valid',
            }
        if l_win == 48:
            return {
                'expand_filters': 256 * 3,
                'expand_shape': [-1, 3, 1, 256],
                'stages': [(256, 2, [-1, 6, 1, 128]),
                           (128, 2, [-1, 24, 1, 32]),
                           (32, 2, [-1, 48, 1, 16])],
                'out_kernel': 5,
                'out_padding': 'same',
            }
        if l_win == 144:
            return {
                'expand_filters': 32 * 27,
                'expand_shape': [-1, 3, 1, 32 * 9],
                'stages': [(32 * 9, 3, [-1, 9, 1, 32 * 3]),
                           (32 * 3, 2, [-1, 36, 1, 24]),
                           (24, 2, [-1, 144, 1, 6])],
                'out_kernel': 9,
                'out_padding': 'same',
            }
        raise ValueError("Unsupported l_win {}; supported window lengths are {}".format(
            l_win, sorted(self._ENCODER_LAYOUTS)))

    def build_model(self):
        """Build the encoder, the decoder and the ``sigma2`` tensor.

        Sets ``code_mean``, ``code_std_dev``, ``code_sample``, ``decoded`` and
        ``sigma2``. Layers are created in a fixed order inside the ``encoder``,
        ``decoder`` and ``sigma2_dataset`` variable scopes.

        Raises:
            ValueError: if ``config['l_win']`` is not 24, 48 or 144.
        """
        l_win = self.config['l_win']
        if l_win not in self._ENCODER_LAYOUTS:
            # fail before any layer is created instead of hitting an undefined tensor later
            raise ValueError("Unsupported l_win {}; supported window lengths are {}".format(
                l_win, sorted(self._ENCODER_LAYOUTS)))
        n_channel = self.config['n_channel']
        n_hidden = self.config['num_hidden_units']
        code_size = self.config['code_size']

        init = tf.contrib.layers.xavier_initializer()
        with tf.variable_scope('encoder'):
            pad, time_strides, last_kernel = self._ENCODER_LAYOUTS[l_win]
            features = tf.expand_dims(self.original_signal, -1)
            if pad:
                features = tf.pad(features, [[0, 0], [pad, pad], [0, 0], [0, 0]], "SYMMETRIC")
            # integer division: the number of filters of a layer has to be an int
            filter_counts = (n_hidden // 16, n_hidden // 8, n_hidden // 4)
            for layer_no, (n_filters, stride) in enumerate(zip(filter_counts, time_strides), start=1):
                features = tf.layers.conv2d(inputs=features,
                                            filters=n_filters,
                                            kernel_size=(3, n_channel),
                                            strides=(stride, 1),
                                            padding='same',
                                            activation=tf.nn.leaky_relu,
                                            kernel_initializer=init)
                print("conv_{}: {}".format(layer_no, features))
            # 'valid' convolution whose kernel spans the remaining time steps and all channels
            conv_4 = tf.layers.conv2d(inputs=features,
                                      filters=n_hidden,
                                      kernel_size=(last_kernel, n_channel),
                                      strides=1,
                                      padding='valid',
                                      activation=tf.nn.leaky_relu,
                                      kernel_initializer=init)
            print("conv_4: {}".format(conv_4))

            encoded_signal = tf.layers.flatten(conv_4)
            encoded_signal = tf.layers.dense(encoded_signal,
                                             units=code_size * 4,
                                             activation=tf.nn.leaky_relu,
                                             kernel_initializer=init)
            self.code_mean = tf.layers.dense(encoded_signal,
                                             units=code_size,
                                             activation=None,
                                             kernel_initializer=init,
                                             name='code_mean')
            self.code_std_dev = tf.layers.dense(encoded_signal,
                                                units=code_size,
                                                activation=tf.nn.relu,
                                                kernel_initializer=init,
                                                name='code_std_dev')
            # relu output can be exactly 0; the offset keeps the std dev strictly positive
            self.code_std_dev = self.code_std_dev + 1e-2
            mvn = tfp.distributions.MultivariateNormalDiag(loc=self.code_mean, scale_diag=self.code_std_dev)
            self.code_sample = mvn.sample()
        print("finish encoder: \n{}".format(self.code_sample))
        print("\n")

        with tf.variable_scope('decoder'):
            layout = self._decoder_layout()
            # decode either an externally supplied code or the encoder's sample
            encoded = tf.cond(self.is_code_input, lambda: self.code_input, lambda: self.code_sample)
            decoded_1 = tf.layers.dense(encoded,
                                        units=n_hidden,
                                        activation=tf.nn.leaky_relu,
                                        kernel_initializer=init)
            decoded_1 = tf.reshape(decoded_1, [-1, 1, 1, n_hidden])
            upsampled = tf.layers.conv2d(decoded_1,
                                         filters=layout['expand_filters'],
                                         kernel_size=1,
                                         padding='same',
                                         activation=tf.nn.leaky_relu)
            upsampled = tf.reshape(upsampled, layout['expand_shape'])
            print("decoded_2 is: {}".format(upsampled))
            # each stage: convolution, depth_to_space, then reshape back to a width-1 map
            for stage_no, (n_filters, block_size, stage_shape) in enumerate(layout['stages'], start=3):
                upsampled = tf.layers.conv2d(upsampled,
                                             filters=n_filters,
                                             kernel_size=(3, 1),
                                             strides=1,
                                             padding='same',
                                             activation=tf.nn.leaky_relu,
                                             kernel_initializer=init)
                upsampled = tf.nn.depth_to_space(input=upsampled,
                                                 block_size=block_size)
                upsampled = tf.reshape(upsampled, stage_shape)
                print("decoded_{} is: {}".format(stage_no, upsampled))
            # One output filter per signal channel, so that the reshape below is
            # valid for any n_channel (identical to a single filter when n_channel == 1).
            decoded = tf.layers.conv2d(inputs=upsampled,
                                       filters=n_channel,
                                       kernel_size=(layout['out_kernel'], 1),
                                       strides=1,
                                       padding=layout['out_padding'],
                                       activation=None,
                                       kernel_initializer=init)
            print("decoded_6 is: {}".format(decoded))
            self.decoded = tf.reshape(decoded, [-1, l_win, n_channel])
        print("finish decoder: \n{}".format(self.decoded))
        print('\n')

        # define sigma2 parameter to be trained to optimise ELBO
        with tf.variable_scope('sigma2_dataset'):
            train_sigma = self.config['TRAIN_sigma'] == 1
            if train_sigma:
                sigma = tf.Variable(tf.cast(self.config['sigma'], tf.float32),
                                    dtype=tf.float32, trainable=True)
            else:
                sigma = tf.cast(self.config['sigma'], tf.float32)
            self.sigma2 = tf.square(sigma)
            if train_sigma:
                # the offset is only added when sigma is a trainable variable
                self.sigma2 = self.sigma2 + self.sigma2_offset

        print("sigma2: \n{}\n".format(self.sigma2))
335 |
336 |
class lstmKerasModel:
    """Keras LSTM that predicts the next VAE embedding in a sequence of windows,
    together with helpers to produce the embeddings and plot the results."""

    def __init__(self, data):
        # nothing is stored at construction time; ``data`` is accepted but unused
        pass

    def create_lstm_model(self, config):
        """Build and compile a three-layer LSTM mapping ``l_seq - 1`` codes to ``l_seq - 1`` codes."""
        lstm_input = tf.keras.layers.Input(shape=(config['l_seq'] - 1, config['code_size']))
        LSTM1 = tf.keras.layers.LSTM(config['num_hidden_units_lstm'], return_sequences=True)(lstm_input)
        LSTM2 = tf.keras.layers.LSTM(config['num_hidden_units_lstm'], return_sequences=True)(LSTM1)
        lstm_output = tf.keras.layers.LSTM(config['code_size'], return_sequences=True, activation=None)(LSTM2)
        lstm_model = tf.keras.Model(lstm_input, lstm_output)
        lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=config['learning_rate_lstm']),
                           loss='mse',
                           metrics=['mse'])
        return lstm_model

    def _embed_sequences(self, config, model_vae, sequences, n_seq, sess):
        """Run the VAE encoder on ``n_seq`` sequences and return their ``code_mean`` values."""
        embeddings = np.zeros((n_seq, config['l_seq'], config['code_size']))
        for i in range(n_seq):
            # code_input is not used when is_code_input is False; a dummy value is fed
            feed_dict = {model_vae.original_signal: sequences[i],
                         model_vae.is_code_input: False,
                         model_vae.code_input: np.zeros((1, config['code_size']))}
            embeddings[i] = sess.run(model_vae.code_mean, feed_dict=feed_dict)
        return embeddings

    def produce_embeddings(self, config, model_vae, data, sess):
        """Embed the LSTM training and validation sequences with the VAE encoder.

        Sets ``embedding_lstm_train`` / ``embedding_lstm_test`` and the
        derived pairs ``x_*`` (all windows but the last) and ``y_*`` (all
        windows but the first).
        """
        self.embedding_lstm_train = self._embed_sequences(
            config, model_vae, data.train_set_lstm['data'], data.n_train_lstm, sess)
        print("Finish processing the embeddings of the entire dataset.")
        print("The first a few embeddings are\n{}".format(self.embedding_lstm_train[0, 0:5]))
        self.x_train = self.embedding_lstm_train[:, :config['l_seq'] - 1]
        self.y_train = self.embedding_lstm_train[:, 1:]

        self.embedding_lstm_test = self._embed_sequences(
            config, model_vae, data.val_set_lstm['data'], data.n_val_lstm, sess)
        self.x_test = self.embedding_lstm_test[:, :config['l_seq'] - 1]
        self.y_test = self.embedding_lstm_test[:, 1:]

    def load_model(self, lstm_model, config, checkpoint_path):
        """Load weights from ``checkpoint_path`` if a ``checkpoint`` file exists in
        ``config['checkpoint_dir_lstm']`` (with or without a trailing separator)."""
        checkpoint_index = os.path.join(config['checkpoint_dir_lstm'], 'checkpoint')
        print(checkpoint_index)
        if os.path.isfile(checkpoint_index):
            lstm_model.load_weights(checkpoint_path)
            print("LSTM model loaded.")
        else:
            print("No LSTM model loaded.")

    def train(self, config, lstm_model, cp_callback):
        """Fit ``lstm_model`` on the embeddings prepared by :meth:`produce_embeddings`."""
        lstm_model.fit(self.x_train, self.y_train,
                       validation_data=(self.x_test, self.y_test),
                       batch_size=config['batch_size_lstm'],
                       epochs=config['num_epochs_lstm'],
                       callbacks=[cp_callback])

    def plot_reconstructed_lt_seq(self, idx_test, config, model_vae, sess, data, lstm_embedding_test):
        """Plot one validation sequence against its decoded VAE and LSTM embeddings and save it as PDF."""
        l_seq, l_win, n_channel = config['l_seq'], config['l_win'], config['n_channel']

        # original_signal is not used when is_code_input is True; zeros are fed as a placeholder value
        feed_dict_vae = {model_vae.original_signal: np.zeros((l_seq, l_win, n_channel)),
                         model_vae.is_code_input: True,
                         model_vae.code_input: self.embedding_lstm_test[idx_test]}
        decoded_seq_vae = np.squeeze(sess.run(model_vae.decoded, feed_dict=feed_dict_vae))
        print("Decoded seq from VAE: {}".format(decoded_seq_vae.shape))

        feed_dict_lstm = {model_vae.original_signal: np.zeros((l_seq - 1, l_win, n_channel)),
                          model_vae.is_code_input: True,
                          model_vae.code_input: lstm_embedding_test[idx_test]}
        decoded_seq_lstm = np.squeeze(sess.run(model_vae.decoded, feed_dict=feed_dict_lstm))
        print("Decoded seq from lstm: {}".format(decoded_seq_lstm.shape))

        fig, axs = plt.subplots(n_channel, 2, figsize=(15, 4.5 * n_channel), edgecolor='k')
        fig.subplots_adjust(hspace=.4, wspace=.4)
        axs = axs.ravel()
        for j in range(n_channel):
            # ground truth on both panels of this channel's row
            for i in range(2):
                axs[i + j * 2].plot(np.arange(0, l_seq * l_win),
                                    np.reshape(data.val_set_lstm['data'][idx_test, :, :, j],
                                               (l_seq * l_win)))
                axs[i + j * 2].grid(True)
                axs[i + j * 2].set_xlim(0, l_seq * l_win)
                axs[i + j * 2].set_xlabel('samples')
            if n_channel == 1:
                # np.squeeze above removed the channel axis
                vae_trace = decoded_seq_vae
                lstm_trace = decoded_seq_lstm
            else:
                vae_trace = decoded_seq_vae[:, :, j]
                lstm_trace = decoded_seq_lstm[:, :, j]
            axs[0 + j * 2].plot(np.arange(0, l_seq * l_win),
                                np.reshape(vae_trace, (l_seq * l_win)), 'r--')
            # the LSTM output covers windows 1..l_seq-1, hence the offset of one window
            axs[1 + j * 2].plot(np.arange(l_win, l_seq * l_win),
                                np.reshape(lstm_trace, ((l_seq - 1) * l_win)), 'g--')
            axs[0 + j * 2].set_title('VAE reconstruction - channel {}'.format(j))
            axs[1 + j * 2].set_title('LSTM reconstruction - channel {}'.format(j))
            for i in range(2):
                axs[i + j * 2].legend(('ground truth', 'reconstruction'))
        savefig(config['result_dir'] + "lstm_long_seq_recons_{}.pdf".format(idx_test))
        fig.clf()
        plt.close()

    def plot_lstm_embedding_prediction(self, idx_test, config, model_vae, sess, data, lstm_embedding_test):
        """Plot, per embedding dimension, the VAE embedding against the LSTM prediction and save it as PDF.

        Also calls :meth:`plot_reconstructed_lt_seq` for the same sequence.
        """
        self.plot_reconstructed_lt_seq(idx_test, config, model_vae, sess, data, lstm_embedding_test)

        # round the column count up so that an odd code_size still gets one axis per dimension
        n_cols = (config['code_size'] + 1) // 2
        fig, axs = plt.subplots(2, n_cols, figsize=(15, 5.5), edgecolor='k')
        fig.subplots_adjust(hspace=.4, wspace=.4)
        axs = axs.ravel()
        for i in range(config['code_size']):
            axs[i].plot(np.arange(1, config['l_seq']), np.squeeze(self.embedding_lstm_test[idx_test, 1:, i]))
            axs[i].plot(np.arange(1, config['l_seq']), np.squeeze(lstm_embedding_test[idx_test, :, i]))
            axs[i].set_xlim(1, config['l_seq'] - 1)
            axs[i].set_ylim(-2.5, 2.5)
            axs[i].grid(True)
            axs[i].set_title('Embedding dim {}'.format(i))
            axs[i].set_xlabel('windows')
            if i == config['code_size'] - 1:
                axs[i].legend(('VAE\nembedding', 'LSTM\nembedding'))
        savefig(config['result_dir'] + "lstm_seq_embedding_{}.pdf".format(idx_test))
        fig.clf()
        plt.close()
449 |
--------------------------------------------------------------------------------
/datasets/NAB-dataset-preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import csv\n",
11 | "import numpy as np\n",
12 | "import matplotlib.pylab as plt\n",
13 | "from matplotlib.pyplot import plot, ion, show, savefig, cla, figure"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## Helper functions to load and process original csv files"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# this function loads one .csv file (a single sequence)\n",
30 | "def load_data(dataset, csv_folder='./NAB-known-anomaly/csv-files/'):\n",
31 | " if dataset == 'ambient_temp':\n",
32 | " data_file = os.path.join(csv_folder, 'ambient_temperature_system_failure.csv')\n",
33 | " anomalies = ['2013-12-22 20:00:00', '2014-04-13 09:00:00']\n",
34 | " t_unit = 'hour'\n",
35 | " elif dataset == 'cpu_utilization':\n",
36 | " data_file = os.path.join(csv_folder, 'cpu_utilization_asg_misconfiguration.csv')\n",
37 | " anomalies = ['2014-07-12 02:04:00', '2014-07-14 21:44:00']\n",
38 | " t_unit = '5 min'\n",
39 | " elif dataset == 'ec2_request':\n",
40 | " data_file = os.path.join(csv_folder, 'ec2_request_latency_system_failure.csv')\n",
41 | " anomalies = ['2014-03-14 09:06:00', '2014-03-18 22:41:00', '2014-03-21 03:01:00']\n",
42 | " t_unit = '5 min'\n",
43 | " elif dataset == 'machine_temp':\n",
44 | " data_file = os.path.join(csv_folder, 'machine_temperature_system_failure.csv')\n",
45 | " anomalies = ['2013-12-11 06:00:00', '2013-12-16 17:25:00', '2014-01-28 13:55:00', '2014-02-08 14:30:00']\n",
46 | " t_unit = '5 min'\n",
47 | " elif dataset == 'rogue_agent_key_hold':\n",
48 | " data_file = os.path.join(csv_folder, 'rogue_agent_key_hold.csv')\n",
49 | " anomalies = ['2014-07-15 08:30:00', '2014-07-17 09:50:00']\n",
50 | " t_unit = '5 min'\n",
51 | " elif dataset == 'rogue_agent_key_updown':\n",
52 | " data_file = os.path.join(csv_folder, 'rogue_agent_key_updown.csv')\n",
53 | " anomalies = ['2014-07-15 04:00:00', '2014-07-17 08:50:00']\n",
54 | " t_unit = '5 min'\n",
55 | " elif dataset == 'nyc_taxi':\n",
56 | " data_file = os.path.join(csv_folder, 'nyc_taxi.csv')\n",
57 | " anomalies = ['2014-11-01 19:00:00', '2014-11-27 15:30:00', '2014-12-25 15:00:00', '2015-01-01 01:00:00', \n",
58 | " '2015-01-27 00:00:00']\n",
59 | " t_unit = '30 min'\n",
60 | " \n",
61 | " t = []\n",
62 | " readings = []\n",
63 | " idx_anomaly = []\n",
64 | " i = 0\n",
65 | " with open(data_file) as csvfile:\n",
66 | " readCSV = csv.reader(csvfile, delimiter=',')\n",
67 | " print(\"\\n--> Anomalies occur at:\")\n",
68 | " for row in readCSV:\n",
69 | " if i > 0:\n",
70 | " t.append(i)\n",
71 | " readings.append(float(row[1]))\n",
72 | " for j in range(len(anomalies)):\n",
73 | " if row[0] == anomalies[j]:\n",
74 | " idx_anomaly.append(i)\n",
75 | " print(\" timestamp #{}: {}\".format(j, row[0]))\n",
76 | " i = i + 1\n",
77 | " t = np.asarray(t)\n",
78 | " readings = np.asarray(readings)\n",
79 | " print(\"\\nOriginal csv file contains {} timestamps.\".format(t.shape))\n",
80 | " print(\"Processed time series contain {} readings.\".format(readings.shape))\n",
81 | " print(\"Anomaly indices are {}\".format(idx_anomaly))\n",
82 | " \n",
83 | " return t, t_unit, readings, idx_anomaly"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 3,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# This function plots a dataset with the train/test split and known anomalies\n",
93 | "# Relies on helper function load_data()\n",
94 | "\n",
95 | "def process_and_save_specified_dataset(dataset, idx_split, y_scale=5, save_file=False):\n",
96 | " t, t_unit, readings, idx_anomaly = load_data(dataset)\n",
97 | " \n",
98 | " # split into training and test sets\n",
99 | " training = readings[idx_split[0]:idx_split[1]]\n",
100 | " t_train = t[idx_split[0]:idx_split[1]]\n",
101 | " \n",
102 | " # normalise by training mean and std \n",
103 | " train_m = np.mean(training)\n",
104 | " train_std = np.std(training)\n",
105 | " print(\"\\nTraining set mean is {}\".format(train_m))\n",
106 | " print(\"Training set std is {}\".format(train_std))\n",
107 | " readings_normalised = (readings - train_m) / train_std\n",
108 | " \n",
109 | " training = readings_normalised[idx_split[0]:idx_split[1]]\n",
110 | " if idx_split[0] == 0:\n",
111 | " test = readings_normalised[idx_split[1]:]\n",
112 | " t_test = t[idx_split[1]:] - idx_split[1]\n",
113 | " idx_anomaly_test = np.asarray(idx_anomaly) - idx_split[1]\n",
114 | " else:\n",
115 | " test = [readings_normalised[:idx_split[0]], readings_normalised[idx_split[1]:]]\n",
116 | " t_test = [t[:idx_split[0]], t[idx_split[1]:] - idx_split[1]]\n",
117 | " idx_anomaly_split = np.squeeze(np.argwhere(np.asarray(idx_anomaly)>idx_split[0]))\n",
118 | " idx_anomaly_test = [np.asarray(idx_anomaly[:idx_anomaly_split[0]]), \n",
119 | " np.asarray(idx_anomaly[idx_anomaly_split[0]:]) - idx_split[1]]\n",
120 | " print(\"Anomaly indices in the test set are {}\".format(idx_anomaly_test))\n",
121 | " \n",
122 | " if save_file:\n",
123 | " save_dir = './datasets/NAB-known-anomaly/'\n",
124 | " np.savez(save_dir+dataset+'.npz', t=t, t_unit=t_unit, readings=readings, idx_anomaly=idx_anomaly,\n",
125 | " idx_split=idx_split, training=training, test=test, train_m=train_m, train_std=train_std,\n",
126 | " t_train=t_train, t_test=t_test, idx_anomaly_test=idx_anomaly_test)\n",
127 | " print(\"\\nProcessed time series are saved at {}\".format(save_dir+dataset+'.npz'))\n",
128 | " else:\n",
129 | " print(\"\\nProcessed time series are not saved.\")\n",
130 | " \n",
131 | " # plot the whole normalised sequence\n",
132 | " fig, axs = plt.subplots(1, 1, figsize=(18, 4), edgecolor='k')\n",
133 | " fig.subplots_adjust(hspace=.4, wspace=.4)\n",
134 | " # axs = axs.ravel()\n",
135 | " # for i in range(4):\n",
136 | " axs.plot(t, readings_normalised)\n",
137 | " if idx_split[0] == 0:\n",
138 | " axs.plot(idx_split[1]*np.ones(20), np.linspace(-y_scale,y_scale,20), 'b--')\n",
139 | " else:\n",
140 | " for i in range(2):\n",
141 | " axs.plot(idx_split[i]*np.ones(20), np.linspace(-y_scale,y_scale,20), 'b--')\n",
142 | " for j in range(len(idx_anomaly)):\n",
143 | " axs.plot(idx_anomaly[j]*np.ones(20), np.linspace(-y_scale,y_scale,20), 'r--')\n",
144 | " # axs.plot(data[:,1])\n",
145 | " axs.grid(True)\n",
146 | " axs.set_xlim(0, len(t))\n",
147 | " axs.set_ylim(-y_scale, y_scale)\n",
148 | " axs.set_xlabel(\"timestamp (every {})\".format(t_unit))\n",
149 | " axs.set_ylabel(\"normalised readings\")\n",
150 | " axs.set_title(\"{} dataset\\n(normalised by train mean {:.2f} and std {:.2f})\".format(dataset, train_m, train_std))\n",
151 | " axs.legend(('data', 'train test set split', 'anomalies'))\n",
152 | " \n",
153 | " return t, readings_normalised"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "## Example: the ambient temperature series (`ambient_temp`)"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 4,
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
172 | "\n",
173 | "--> Anomalies occur at:\n",
174 | " timestamp #0: 2013-12-22 20:00:00\n",
175 | " timestamp #1: 2014-04-13 09:00:00\n",
176 | "\n",
177 | "Original csv file contains (7267,) timestamps.\n",
178 | "Processed time series contain (7267,) readings.\n",
179 | "Anomaly indices are [3722, 6181]\n",
180 | "\n",
181 | "Training set mean is 72.04114695024849\n",
182 | "Training set std is 3.323831562717656\n",
183 | "Anomaly indices in the test set are [ 422 2881]\n",
184 | "\n",
185 | "Processed time series are not saved.\n"
186 | ]
187 | },
188 | {
189 | "data": {
190 | "image/png": "\n",
191 | "text/plain": [
192 | ""
193 | ]
194 | },
195 | "metadata": {
196 | "needs_background": "light"
197 | },
198 | "output_type": "display_data"
199 | }
200 | ],
201 | "source": [
202 | "dataset = 'ambient_temp'\n",
203 | "idx_split = [0,3300]\n",
204 | "\n",
205 | "t, readings_normalised = process_and_save_specified_dataset(dataset, idx_split)"
206 | ]
207 | }
208 | ],
209 | "metadata": {
210 | "kernelspec": {
211 | "display_name": "anomaly-env",
212 | "language": "python",
213 | "name": "anomaly-env"
214 | },
215 | "language_info": {
216 | "codemirror_mode": {
217 | "name": "ipython",
218 | "version": 3
219 | },
220 | "file_extension": ".py",
221 | "mimetype": "text/x-python",
222 | "name": "python",
223 | "nbconvert_exporter": "python",
224 | "pygments_lexer": "ipython3",
225 | "version": "3.6.9"
226 | }
227 | },
228 | "nbformat": 4,
229 | "nbformat_minor": 2
230 | }
231 |
--------------------------------------------------------------------------------