├── Analysis.png ├── Hyperparameters.png ├── NAB dataset.py ├── Preprocess ├── Preprocesser.py ├── __pycache__ │ ├── Preprocesser.cpython-36.pyc │ └── import_data.cpython-36.pyc └── import_data.py ├── Preprocesser.py ├── README.md ├── anaomaly detection ├── __pycache__ │ ├── anomaly_detection.cpython-36.pyc │ ├── detection_modes.cpython-36.pyc │ ├── time_series_analysis.cpython-36.pyc │ └── ts_plots.cpython-36.pyc ├── anomaly_detection.py ├── detection_modes.py ├── time_series_analysis.py └── ts_plots.py ├── experiments ├── 1d conv │ └── params.json ├── MLP │ └── params.json ├── general_settings.json └── wavenet │ └── params.json ├── import_data.py ├── main.py ├── model ├── Model.py ├── Param.py ├── __pycache__ │ ├── Model.cpython-36.pyc │ ├── Param.cpython-36.pyc │ ├── train_and_evaluate.cpython-36.pyc │ ├── utils.cpython-36.pyc │ └── visualizing.cpython-36.pyc ├── train_and_evaluate.py ├── utils.py └── visualizing.py ├── results.png ├── search hyperparameter ├── __pycache__ │ └── metrics_aggregation.cpython-36.pyc ├── metrics_aggregation.py ├── search_hyperparameters.py └── visulize_experiment_results.py └── test_synthetic_data ├── data_generator.py └── test_synthetic_data.py /Analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/Analysis.png -------------------------------------------------------------------------------- /Hyperparameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/Hyperparameters.png -------------------------------------------------------------------------------- /NAB dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import pandas as pd 4 | from model.Param import * 5 | from sklearn.preprocessing import RobustScaler 6 | import matplotlib.pyplot as plt 7 | from model.visualizing import plot_loss, plot_prediction 8 | from model.Model import * 9 | from pathlib import Path 10 | from matplotlib2tikz import save as tikz_save 11 | from scipy.stats import norm 12 | 13 | def reconstruction_error(inputs, outputs): 14 | """Return the mean square errors""" 15 | inputs = np.array(inputs) 16 | # inputs = pd.DataFrame(inputs) 17 | # inputs = inputs.rolling(2, win_type='triang').mean().fillna(method='backfill').values.squeeze() 18 | outputs = np.array(outputs) 19 | error_reconstructed = inputs - outputs 20 | return error_reconstructed 21 | 22 | def control_limits(variance_estimation, test_predicted): 23 | upper_control_limit = np.array(test_predicted) + 3 * (np.array(variance_estimation[0:-1]) ** (1 / 2)) # [0:-1] not using the error in current step 24 | lower_control_limit = np.array(test_predicted) - 3 * (np.array(variance_estimation[0:-1]) ** (1 / 2)) 25 | return upper_control_limit, lower_control_limit 26 | 27 | rs = RobustScaler() 28 | 29 | def rob_scal(data): 30 | """Apply the RobustScaler to depress the effect of outliers 31 | 32 | Args: 33 | data: 2D array [examples, features], 34 | becasue it's raw data, so it should be seen as n examples with only one feature(each point) 35 | 36 | Returns: 37 | signal_scaled: List 38 | """ 39 | signal = data.reshape(-1,1) # It returns the reshaped array but 
doesn't change the original one 40 | signal_scaled = rs.fit_transform(signal) 41 | signal_scaled = signal_scaled.reshape(1, -1).squeeze() 42 | return signal_scaled 43 | 44 | # define paths 45 | # use the Python3 Pathlib modul to create platform independent path 46 | general_settings = Params.update( 47 | Path("C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/experiments/general_settings.json")) 48 | 49 | # load the parameters for the experiment params.json file in model dir 50 | model_dir = os.path.join(general_settings.experiments_path, general_settings.model_type) 51 | json_path = Path(model_dir) / 'params.json' 52 | params_train = Params.update(json_path) 53 | 54 | ResultPath = "D:/NAB-master/NAB-master/results/1dConv" 55 | DataPath = "D:/NAB-master/NAB-master/data" 56 | for diret in os.listdir(DataPath): 57 | if os.path.isdir(os.path.join(DataPath,diret)): 58 | for file in os.listdir(os.path.join(DataPath,diret)): 59 | df = pd.read_csv(os.path.join(DataPath,diret,file)) 60 | signal = df["value"].values 61 | input = rob_scal(signal) 62 | train = np.reshape(input,(1,-1)) 63 | 64 | models, loss, predictions = Convolutioanl_autoencoder.train_and_predict(params_train, train, 65 | general_settings) 66 | 67 | # models, loss, predictions = Wavenet.train_and_predict(params_train, train, general_settings) 68 | 69 | error = predictions[1][0] 70 | prediction = predictions[1][0] 71 | data = train.squeeze()[:len(prediction)] 72 | error_data = reconstruction_error(data, prediction) 73 | mu, std = norm.fit(error_data) 74 | plt.figure() 75 | plt.plot(error_data) 76 | plt.figure() 77 | plt.plot(data) 78 | plt.plot(prediction) 79 | plt.legend(["preprocessed data", "prediction"]) 80 | LCL = prediction + mu - 6 * std 81 | UCL = prediction + mu + 6 * std 82 | plt.figure() 83 | plt.plot(data) 84 | mask_anomaly = (LCL > data).astype(int) + (UCL < data).astype(int) 85 | anomaly = mask_anomaly * data 86 | x = np.array(range(len(anomaly)))[anomaly != 0] 87 | y = anomaly[anomaly != 0] 88 | plt.plot(x, y, 'rx') 89 | plt.legend(["original signal", "anomalies"], loc='upper center', bbox_to_anchor=(0.5, -0.15)) 90 | plt.fill_between(list(range(len(prediction))), UCL, LCL, color='k', alpha=.25) 91 | score = pd.DataFrame(mask_anomaly, columns=["anomaly_score"]) 92 | result = pd.DataFrame(mask_anomaly, columns=["label"]) 93 | output = pd.concat([df,score,result],axis=1).fillna(0) 94 | DirPath = os.path.join(ResultPath,diret) 95 | if not os.path.exists(DirPath): 96 | os.makedirs(DirPath) 97 | output.to_csv(os.path.join(DirPath,"1dConv_"+file), index=False) 98 | -------------------------------------------------------------------------------- /Preprocess/Preprocesser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from Preprocess.import_data import open_data 3 | from sklearn.preprocessing import RobustScaler 4 | import matplotlib.pyplot as plt 5 | 6 | rs = RobustScaler() 7 | 8 | def cut_head_tail(data, cut_num=50): 9 | """remove first/last few points 10 | 11 | Args: 12 | data: List with each channel as sublist. 
Each channel itself contains the signal in one file as 1D array 13 | 14 | Returns: 15 | data_cut: Same format as data, with head and tailed cut out 16 | """ 17 | if cut_num == 0: 18 | data_cut = data 19 | else: 20 | data_cut = data[cut_num:-cut_num] 21 | return data_cut 22 | 23 | def rob_scal(data): 24 | """Apply the RobustScaler to depress the effect of outliers 25 | 26 | Args: 27 | data: 2D array [examples, features], 28 | becasue it's raw data, so it should be seen as n examples with only one feature(each point) 29 | 30 | Returns: 31 | signal_scaled: List 32 | """ 33 | signal = data.reshape(-1,1) # It returns the reshaped array but doesn't change the original one 34 | signal_scaled = rs.fit_transform(signal) 35 | signal_scaled = signal_scaled.reshape(1, -1).squeeze() 36 | return signal_scaled 37 | 38 | class Preprocesser(): 39 | # define the parameters as class attributes for preprocessing, which are not quite often changed 40 | # remark: when the cut_num is too large e.g. 50, then there will be sample with empty content, thus raising error 41 | cut_position = 0 42 | def __init__(self, raw_data, processed_data, channel_list): 43 | self.data = processed_data 44 | self.raw_data = raw_data 45 | self.channels = channel_list 46 | def save(self, path): 47 | np.save(path, self.data) 48 | 49 | def select_channel(self, channel_name): 50 | # list.index() method returns the first value found in the list and has linear complexity, which is no problem for a short list in this case 51 | index = self.channels.index(channel_name) 52 | return self.data[index] 53 | 54 | @classmethod 55 | def from_tdm(cls, path, channels, time_index): 56 | data_read = open_data(path, channels) 57 | data_array = cls.preprocess(data_read, time_index) 58 | return cls(data_read, data_array, channels) 59 | 60 | # remark, because the length of each file is different, so the simple slicing can not be applied to the array, 61 | # and the data can only be represented as a nested array, outside as 2D array(channel x file) and each element itsself is automatically set to list type(with different length) 62 | # use for loop direktly, put the for loop outside, then it can be applied once to all the preprocessing functions 63 | @staticmethod 64 | def preprocess(data_read, time_index): 65 | num_of_channels = len(data_read) 66 | data_list = [[] for i in range(num_of_channels)] 67 | for channel in range(len(data_read)): 68 | # select the channel 69 | signal_list = data_read[channel] 70 | signal_list = list(signal_list) 71 | # Filewise preprocessing 72 | for file_number in range(data_read.shape[1]): 73 | signal = signal_list[file_number] 74 | # cut the head and tail 75 | signal_cut = cut_head_tail(signal, Preprocesser.cut_position) 76 | plt.figure() 77 | plt.plot(signal_cut) 78 | if not(time_index==True and channel==0): 79 | # scale the data 80 | signal_scaled = rob_scal(signal_cut) 81 | data_list[channel].append(signal_scaled) 82 | else: 83 | data_list[channel].append(signal_cut) 84 | 85 | data_array = np.array(data_list) 86 | return data_array -------------------------------------------------------------------------------- /Preprocess/__pycache__/Preprocesser.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/Preprocess/__pycache__/Preprocesser.cpython-36.pyc 
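A minimal usage sketch of the Preprocesser class above, assuming a directory of TDM files; the directory path and the output file name are placeholders, and the channel names are the ones that appear in experiments/general_settings.json:

from Preprocess.Preprocesser import Preprocesser

# placeholder path to a folder containing *.TDM files
data_path = "path/to/tdm_files"
channels = ["p_0", "p_1"]

# read all TDM files, cut head/tail and robust-scale each file per channel
preprocesser = Preprocesser.from_tdm(data_path, channels, time_index=0)

# pick one channel (a list of 1D arrays, one per file) and save the processed data
p0_files = preprocesser.select_channel("p_0")
preprocesser.save("processed_data.npy")  # placeholder output file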
-------------------------------------------------------------------------------- /Preprocess/__pycache__/import_data.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/Preprocess/__pycache__/import_data.cpython-36.pyc -------------------------------------------------------------------------------- /Preprocess/import_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fnmatch 3 | import amp_tdm_loader 4 | import numpy as np 5 | 6 | def open_data(path, channels): 7 | """ 8 | Read from TDM/TDX files 9 | 10 | Args: 11 | path: where to read the data from 12 | channels: specifies selected channels as a list [channel_name1, channel_name2, ...]; 13 | 14 | Returns: 15 | data_array: # channels x # files . Each element itself contains the signal of every file as 1D array 16 | num_of_files: the file number 17 | """ 18 | namelist = fnmatch.filter(os.listdir(path), "*.TDM") 19 | data_list = [[] for i in range(len(channels))] 20 | for i in range(len(namelist)): 21 | file = amp_tdm_loader.OpenFile(os.path.join(path, namelist[i])) 22 | # Save each channel as 2 D array: #files * #signal length 23 | for index in range(0, len(channels)): 24 | data_list[index].append(file[channels[index]]) 25 | data_array = np.asarray(data_list) 26 | return data_array 27 | 28 | # Remark: the shape of the ndarray is just two dimensional, with each signal of one file as an element, 29 | # because the length of each file is different 30 | -------------------------------------------------------------------------------- /Preprocesser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from import_data import open_data 3 | from sklearn.preprocessing import RobustScaler 4 | 5 | rs = RobustScaler() 6 | 7 | def cut_head_tail(data, cut_num=50): 8 | """remove first/last few points 9 | 10 | Args: 11 | data: List with each channel as sublist. Each channel itself contains the signal in one file as 1D array 12 | 13 | Returns: 14 | data_cut: Same format as data, with head and tailed cut out 15 | """ 16 | if cut_num == 0: 17 | data_cut = data 18 | else: 19 | data_cut = data[cut_num:-cut_num] 20 | return data_cut 21 | 22 | def rob_scal(data): 23 | """Apply the RobustScaler to depress the effect of outliers 24 | 25 | Args: 26 | data: 2D array [examples, features], 27 | becasue it's raw data, so it should be seen as n examples with only one feature(each point 28 | 29 | Returns: 30 | signal_scaled: List 31 | """ 32 | signal = data.reshape(-1,1) # It returns the reshaped array but doesn't change the original one 33 | signal_scaled = rs.fit_transform(signal) 34 | signal_scaled = signal_scaled.reshape(1, -1).squeeze() 35 | return signal_scaled 36 | 37 | def to_three_d_array(lists): 38 | """Expand a list to 3D array 39 | 40 | Args: List [Example1, Example2......](outer [] as first dimensions when transformed to array 41 | 42 | Returs: 3D array [examples, features, channels] 43 | """ 44 | arrays = np.array(lists) 45 | arrays = np.reshape(arrays,(arrays.shape[0],-1,1)) 46 | return arrays 47 | 48 | class Preprocesser(): 49 | # define the parameters as class attributes for preprocessing, which are not quite often changed 50 | # remark: when the cut_num is too large e.g. 
50, some samples will be left empty, raising an error 51 | cut_position = 0 52 | def __init__(self, raw_data, processed_data, channel_list): 53 | self.data = processed_data 54 | self.raw_data = raw_data 55 | self.channels = channel_list 56 | def save(self, path): 57 | np.save(path, self.data) 58 | 59 | def select_channel(self, channel_name): 60 | # list.index() returns the first match and has linear complexity, which is no problem for a short list like this one 61 | index = self.channels.index(channel_name) 62 | return self.data[index] 63 | 64 | @classmethod 65 | def from_tdm(cls, path, channels, time_index): 66 | data_read = open_data(path, channels) 67 | data_array = cls.preprocess(data_read, time_index) 68 | return cls(data_read, data_array, channels) 69 | 70 | # remark: because the length of each file differs, simple slicing cannot be applied to the array, 71 | # and the data can only be represented as a nested array: a 2D array (channel x file) whose elements are automatically stored as lists (of different lengths) 72 | # the for loop is placed outside, so it can be applied once to all the preprocessing functions 73 | @staticmethod 74 | def preprocess(data_read, time_index): 75 | num_of_channels = len(data_read) 76 | data_list = [[] for i in range(num_of_channels)] 77 | for channel in range(len(data_read)): 78 | # select the channel 79 | signal_list = data_read[channel] 80 | signal_list = list(signal_list) 81 | # Filewise preprocessing 82 | for file_number in range(data_read.shape[1]): 83 | signal = signal_list[file_number] 84 | # cut the head and tail 85 | signal_cut = cut_head_tail(signal, Preprocesser.cut_position) 86 | if not(time_index==True and channel==0): 87 | # scale the data 88 | signal_scaled = rob_scal(signal_cut) 89 | data_list[channel].append(signal_scaled) 90 | else: 91 | data_list[channel].append(signal_cut) 92 | data_array = np.array(data_list) 93 | return data_array -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unsupervised-Online-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series- 2 | Unsupervised deep learning framework with online and seq2seq settings (MLP: prediction-based; 1D Conv and VAE: reconstruction-based; Wavenet: prediction-based) for anomaly detection in time series data 3 | 4 | # Update 5 | The amp_tdm_loader is a modified version of the tdm_loader for our internal use only; I simply forgot to remove this part. 6 | 7 | I recommend trying out this framework on the NAB dataset using the NAB dataset.py file instead. If you want to use it on TDM files, you have to write the read_data function yourself. 8 | 9 | ## Anomaly detection in time series data 10 | There are several common difficulties for anomaly detection in time series data: 11 | * Unbalanced data set: by the very definition of an anomaly, anomalous data are always a minority of the full data set as well as of any sample drawn from it. Indeed, anomalies are very rare in reality, forming together with the dominant normal data an extremely 12 | unbalanced set. 13 | * Sparse labels: on the one hand, labels denoting whether an instance is normal or anomalous are in many applications time-consuming and prohibitively expensive to obtain. 
This is especially true for time series data, where the sampling frequency can reach 1000 Hz or the recordings can span decades, generating an enormous number of data points. On the other hand, anomalous data can rarely be reproduced or enumerated exhaustively in reality. For example, a failure in the electronics of a sensor gives rise to one kind of anomalous signal, but another kind of failure will very likely cause a new form of anomalous signal. In some areas, anomalous instances can be fatal and hence extremely 14 | rare. 15 | * Concept drift: this phenomenon usually occurs in time series data, where the common i.i.d. assumption of machine learning models is often violated due to varying latent conditions. In [4] it is defined as follows: patterns and relations in such data often evolve over time; thus, models built for analyzing such data quickly become obsolete. In machine learning and data mining this phenomenon is referred to as concept drift. 16 | 17 | To tackle the concept drift problem in time series data, I also employ the idea of online training. That is to say, the model is trained and generates outputs continuously on only a few examples at a time along the time axis, which enables adaptation to the 18 | varying streaming data. The target data of this framework, as in most real situations, lacks labels for anomalous examples, is unbalanced towards a large amount of normal points, and has a distribution that drifts along the time axis. 19 | 20 | ## Unsupervised learning 21 | 22 | Approach with unsupervised learning: without any labels for normal or abnormal examples, the anomaly detection problem is formulated differently: by either reconstructing the given input or predicting unseen examples from only part of the data set, a sequence of errors between the original data and the generated data can be obtained. Based on these errors, anomaly scores can then be calculated, e.g. via mean square errors. 23 | 24 | Among these unsupervised methods, two main approaches are implemented and investigated, namely prediction-based and 25 | reconstruction-based anomaly detection in time series data: 26 | 27 | * Prediction-based method: the model is given a segment of time series data, and the 28 | output is the predicted value of the next few successive points based on the previous 29 | segment. 30 | * Reconstruction-based method: techniques such as auto-encoders or Principal Component 31 | Analysis first map the input data to a lower-dimensional space and then try to 32 | reconstruct it without losing the main information (see the sketch below). 
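As a concrete illustration, here is a minimal sketch of how one series is turned into (input, target) pairs for the two approaches; the window size and prediction steps below are example values, not the tuned settings:

```python
import numpy as np

series = np.sin(np.linspace(0, 20, 500))  # stand-in for a preprocessed signal
window, pred_steps = 64, 5

# slide a window over the series; each window is one training example
segments = np.array([series[i:i + window] for i in range(len(series) - window)])

# prediction-based (e.g. MLP, Wavenet): features are the first part of the window,
# the target is the last pred_steps points
x_pred, y_pred = segments[:, :-pred_steps], segments[:, -pred_steps:]

# reconstruction-based (e.g. 1D conv AE, VAE): features and target are the same segment
x_rec = segments[..., np.newaxis]  # add a channel axis for Conv1D layers
y_rec = x_rec
```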
33 | 34 | ## Main Files 35 | 36 | **Model.py** 37 | 38 | In the **online** setting, the time series data are divided into fixed-size segments, and each segment is treated as one example: 39 | 40 | * **Multilayer Perceptron (MLP)**: predicts the next elements based on the previous segment 41 | * **1D Convolutional Auto-encoder**: reconstructs the given input segment 42 | * **Variational 1D Convolutional Auto-encoder**: reconstructs the given input segment 43 | 44 | In the **seq2seq** setting, the following sequence-to-sequence models are employed for so-called iterative inference, i.e. predict only one step ahead per iteration and, with the predicted point appended to the input, move on to the next iteration: 45 | * **Wavenet** 46 | * **LSTM** 47 | 48 | **search_hyperparameters.py** 49 | 50 | The hyperparameters are all saved in a JSON file in order to configure and record the hyperparameter settings for one experiment automatically and efficiently. In addition, a Params class is provided so that at each iteration of the grid search the settings can be updated and saved into a corresponding JSON file under the test subdirectory. Take a look at the code for details. 51 | 52 | **metrics_aggregation.py** 53 | 54 | To get an overview of the experiment results under different hyperparameter settings, the variants of one hyperparameter are compared via one or more user-defined metrics. This file aggregates these results automatically during grid search, creating a table summarizing the metric values. 55 | 56 | ## Test on NAB dataset 57 | For more details about the Numenta Anomaly Benchmark, see [NAB dataset](https://numenta.com/machine-intelligence-technology/numenta-anomaly-benchmark/) 58 | 59 | Hyperparameter settings (can be further optimized): 60 | 61 | * Prediction step: 10 62 | * Window size: 128 63 | * Detection method: Gaussian 64 | * Threshold: 6 sigma 65 | * Epochs: 10 66 | 67 | ![Test results in comparison to entries in the 2016 NAB competition](results.png) 68 | 69 | ![Result analysis](Analysis.png) 70 | 71 | ## References 72 | https://github.com/cs231n/cs231n.github.io 73 | 74 | https://github.com/numenta/NAB 75 | -------------------------------------------------------------------------------- /anaomaly detection/__pycache__/anomaly_detection.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/anaomaly detection/__pycache__/anomaly_detection.cpython-36.pyc -------------------------------------------------------------------------------- /anaomaly detection/__pycache__/detection_modes.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/anaomaly detection/__pycache__/detection_modes.cpython-36.pyc -------------------------------------------------------------------------------- /anaomaly detection/__pycache__/time_series_analysis.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/anaomaly detection/__pycache__/time_series_analysis.cpython-36.pyc 
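The Gaussian detection mode with a k-sigma threshold listed above reduces to fitting a normal distribution to the model errors and flagging points outside the resulting control band; a minimal sketch with placeholder arrays (in the framework the errors come from the model predictions, see anomaly_detection.py below):

import numpy as np
from scipy.stats import norm

errors = np.random.randn(1000)      # placeholder reconstruction/prediction errors
prediction = np.zeros(1000)         # placeholder model output
data = prediction + errors          # placeholder original signal
k = 6                               # threshold in standard deviations, as in the settings above

mu, std = norm.fit(errors)          # fit a Gaussian to the errors
lcl = prediction + mu - k * std     # lower control limit
ucl = prediction + mu + k * std     # upper control limit
anomaly_mask = (data < lcl) | (data > ucl)  # points outside the band are flagged as anomalies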
-------------------------------------------------------------------------------- /anaomaly detection/__pycache__/ts_plots.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/anaomaly detection/__pycache__/ts_plots.cpython-36.pyc -------------------------------------------------------------------------------- /anaomaly detection/anomaly_detection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm 3 | import matplotlib.pyplot as plt 4 | from detection_modes import arima 5 | from matplotlib2tikz import save as tikz_save 6 | 7 | def reconstruction_error(inputs, outputs): 8 | """Return the mean square errors""" 9 | inputs = np.array(inputs) 10 | outputs = np.array(outputs) 11 | error_reconstructed = inputs - outputs 12 | return error_reconstructed 13 | 14 | def control_limits(variance_estimation, test_predicted): 15 | upper_control_limit = np.array(test_predicted) + 3 * (np.array(variance_estimation[0:-1]) ** (1 / 2)) # [0:-1] not using the error in current step 16 | lower_control_limit = np.array(test_predicted) - 3 * (np.array(variance_estimation[0:-1]) ** (1 / 2)) 17 | return upper_control_limit, lower_control_limit 18 | 19 | def anomaly_detection(train, predictions, detection_mode): 20 | for counter, file in enumerate(range(len(predictions[0]))): 21 | prediction = predictions[1][file] 22 | data = train[file][:len(prediction)] 23 | error_data = reconstruction_error(data, prediction) 24 | plt.figure() 25 | plt.plot(error_data) 26 | # ts_analysis(error_data) 27 | if detection_mode == "ARIMA": 28 | 29 | arima(error_data, data, prediction) 30 | 31 | 32 | if detection_mode == "Gaussian": 33 | # calculate the threshold by fitting a gaussian model 34 | # fit a normal distribution 35 | mu, std = norm.fit(error_data) 36 | # plt.figure() 37 | # # plot the histogram 38 | # plt.hist(error_data, bins = 25, density=True, alpha=0.6, color='g') 39 | # # plot the pdf 40 | # # Plot the PDF. 41 | # xmin, xmax = plt.xlim() 42 | # x = np.linspace(xmin, xmax, 100) 43 | # p = norm.pdf(x, mu, std) 44 | # plt.plot(x, p, 'k', linewidth=2) 45 | # title = "Fit results: mu = %.2f, std = %.2f" % (mu, std) 46 | # plt.title(title) 47 | # draw confidence bound(gray) 48 | LCL = prediction + mu-3*std 49 | UCL = prediction + mu+3*std 50 | plt.figure() 51 | plt.plot(data) 52 | mask_anomaly = (LCL > data).astype(int) + (UCL < data).astype(int) 53 | anomaly = mask_anomaly * data 54 | x = np.array(range(len(anomaly)))[anomaly != 0] 55 | y = anomaly[anomaly != 0] 56 | plt.plot(x, y, 'rx') 57 | plt.legend(["original signal","anomalies"],loc='upper center', bbox_to_anchor=(0.5,-0.15)) 58 | plt.fill_between(list(range(len(prediction))), UCL, LCL, color='k', alpha=.25) 59 | # style in plot can not be displayed by matplotlib2tikz!!!! 
60 | # If we plot the boundary, sometimes it will be too large to show in the plot(on anomalous points) 61 | tikz_save("wavefile{}.tex".format(counter)) 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /anaomaly detection/detection_modes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import itertools 5 | import statsmodels.api as sm 6 | import sys 7 | from time_series_analysis import adf_test 8 | 9 | def arima(error_data, data, prediction): 10 | data = pd.DataFrame(data) 11 | error = pd.DataFrame(error_data) 12 | # define the p, d and q parameters to take any value between 0 and 2 13 | p = q = range(3) 14 | if adf_test(error_data): 15 | d = range(3) 16 | elif adf_test(error_data): 17 | d = range(1,3) 18 | else: 19 | d = range(2,3) 20 | # generate all different combinations of p, d and q triples 21 | # itertools.product works as a nested for loop 22 | pdq = list(itertools.product(p, d, q)) 23 | 24 | # generate all different combinations of seasonal p,q and q triplets 25 | # seasonal_pdq = [(x[0],x[1],x[2],100) for x in pdq] 26 | # default no seasonal effect 27 | # fit the model 28 | best_aic = np.inf 29 | best_pdq = None 30 | best_seasonal_pdq = None 31 | tmp_model = None 32 | best_mdl = None 33 | best_res = None 34 | AICs = [] 35 | for param in pdq: 36 | try: 37 | tmp_mdl = sm.tsa.SARIMAX(error, order = param, enforce_stationary=True, 38 | enforce_invetibility=True) 39 | res = tmp_mdl.fit() 40 | AICs.append((res.aic)) 41 | if res.aic <= best_aic: 42 | best_aic = res.aic 43 | best_pdq = param 44 | best_mdl = tmp_mdl 45 | best_res = res 46 | except: 47 | print('Unexpected error:', sys.exc_info()[0]) 48 | AICs.append(0) 49 | continue 50 | print('Best SARIMAX{}x{} model - AIC:{}'.format(best_pdq, best_seasonal_pdq,best_aic)) 51 | 52 | # visualizing result 53 | best_res.summary() 54 | best_res.plot_diagnostics() 55 | fig=plt.figure() 56 | ax = fig.add_subplot(111) 57 | plt.plot(range(0,len(pdq)), AICs, '*-') 58 | plt.title('AIC values') 59 | for i in range(len(pdq)): 60 | ax.annotate(pdq[i], xy=(i, AICs[i])) 61 | 62 | pred = best_res.get_prediction(start=0, end=len(error) - 1) 63 | # maybe we could adjust the confidence interval dynamically, according to the mse niveau of each file 64 | # 0.02 means 98% confidence interval 65 | pred_ci = pred.conf_int(0.01) 66 | pred_ci.iloc[0] = [0,0] 67 | 68 | # must reverse the error back to original data 69 | ax = data.plot(label='Observed') 70 | (pred.predicted_mean+prediction).plot(ax=ax, label='Prediction', alpha=0.5) 71 | plt.legend(loc='best') 72 | 73 | 74 | # draw confidence bound (gray) 75 | # remark: the first confidence interval is extremely large, so we must substitute it 76 | pred_ci.iloc[0,:] = pred_ci.iloc[1,:] 77 | LCL = pred_ci.iloc[:, 0] + prediction 78 | UCL = pred_ci.iloc[:, 1] + prediction 79 | plt.fill_between(pred_ci.index, 80 | LCL, 81 | UCL, color='g', alpha=.25) 82 | LCL = pd.DataFrame(LCL) 83 | LCL.columns = [0] 84 | UCL = pd.DataFrame(UCL) 85 | UCL.columns = [0] 86 | # create a mask array for anomaly points 87 | mask_anomaly = (LCL>data).astype(int)+(UCL=dftest[4]["1%"] 16 | 17 | def ts_analysis(data): 18 | # if time series is not a Series object, convert it to 19 | if not isinstance(data, pd.Series): 20 | data = pd.Series(data) 21 | 22 | # apply time series analysis to the data 23 | adf_test(data) 24 | ts_plot(data, 10) 25 | scatter_plot_of_lags(data, 
9) 26 | 27 | # apply time series analysis to the 1 order differenced data 28 | data_diff1 = data.diff().fillna(0) 29 | adf_test(data_diff1) 30 | ts_plot(data_diff1,10) 31 | scatter_plot_of_lags(data_diff1, 9) 32 | 33 | # apply rolling average to observe the trend 34 | plot_rolling_average(data, 12) 35 | 36 | # Decomposition of the original signal 37 | decomposition_plot(data, period=10) -------------------------------------------------------------------------------- /anaomaly detection/ts_plots.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import statsmodels.tsa.api as smt 5 | import statsmodels.graphics.gofplots as smg 6 | import math 7 | from statsmodels.tsa.seasonal import seasonal_decompose 8 | from stldecompose import decompose 9 | from scipy.stats.distributions import t 10 | 11 | def ts_plot(y, lags=None, title=''): 12 | """Calculate acf, pacf, histogram, and qq plot for a given time sereis 13 | """ 14 | # if time series is not a Series object, convert it to 15 | if not isinstance(y, pd.Series): 16 | y = pd.Series(y) 17 | 18 | # initialize figure and axes 19 | fig = plt.figure(figsize=(14, 12)) 20 | layout = (3, 2) 21 | ts_ax = plt.subplot2grid(layout, (0, 0), colspan = 2) 22 | acf_ax = plt.subplot2grid(layout, (1, 0)) 23 | pacf_ax = plt.subplot2grid(layout, (1, 1)) 24 | qq_ax = plt.subplot2grid(layout, (2, 0)) 25 | hist_ax = plt.subplot2grid(layout, (2, 1)) 26 | 27 | # time series plot 28 | y.plot(ax=ts_ax) 29 | plt.legend(loc="best") 30 | ts_ax.set_title(title) 31 | 32 | # acf and pacf plot 33 | smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5) 34 | smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5) 35 | 36 | # qq plot 37 | smg.qqplot(y, line='s', dist=t, fit=True,ax=qq_ax) 38 | qq_ax.set_title('Normal QQ Plot') 39 | 40 | # hist plot 41 | y.plot(ax=hist_ax, kind='hist', bins=25) 42 | hist_ax.set_title('Histogram') 43 | plt.tight_layout() 44 | # Remark: ion() turns the interactive mode on, unless the program will be blocked when the figure shows 45 | # But is only used for debug and interactive mode 46 | plt.ion() 47 | plt.show() 48 | plt.pause(0.01) 49 | return 50 | 51 | def scatter_plot_of_lags(series_data, lags): 52 | """ 53 | automatically adjust the layout of subplots to the input lags 54 | 55 | Args: 56 | series_data: pandas Series object 57 | lags: number lags to be plotted 58 | """ 59 | # if time series is not a Series object, convert it to 60 | if not isinstance(series_data, pd.Series): 61 | series_data = pd.Series(series_data) 62 | 63 | ncols = 3 64 | # calculate the layout of subplots 65 | nrows = math.ceil(lags/ncols) 66 | fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows)) 67 | 68 | for ax, lag in zip(axes.flat, np.arange(1, lags + 1, 1)): 69 | # create the title string automatically according to the lag 70 | lag_str = 't-{}'.format(lag) 71 | # concatenate the lagged series to a Pandas DataFrame object 72 | X = (pd.concat([series_data, series_data.shift(-lag)], axis=1, keys=['y'] + [lag_str])) 73 | 74 | # plot data 75 | X.plot(ax=ax, kind='scatter', y='y', x=lag_str) 76 | # use the DataFrame method to get the correlation 77 | corr = X.corr().values[0][1] 78 | ax.set_ylabel('Original') 79 | ax.set_title('Lag: {} (corr={:.2f}'.format(lag_str, corr)) 80 | ax.set_aspect('equal') 81 | 82 | fig.tight_layout() 83 | # Remark: ion() turns the interactive mode on, unless the program will be blocked when the 
figure shows 84 | # But is only used for debug and interactive mode 85 | plt.ion() 86 | plt.show() 87 | plt.pause(0.01) 88 | 89 | def plot_rolling_average(series_data, window=12): 90 | """ 91 | Plot rolling mean and rolling standard deviation for a given time series and window 92 | 93 | Args: 94 | series_data: pandas Series object 95 | lags: number lags to be plotted 96 | """ 97 | # if time series is not a Series object, convert it to 98 | if not isinstance(series_data, pd.Series): 99 | series_data = pd.Series(series_data) 100 | # calculate moving averages 101 | rolling_mean = series_data.rolling(window).mean() 102 | # median is more robust to outliers 103 | # rolling_median = data.rolling(window).median() 104 | rolling_std = series_data.rolling(window).std() 105 | 106 | # plot statistics 107 | plt.figure() 108 | plt.plot(series_data, label='Original') 109 | plt.plot(rolling_mean, color='crimson', label='Moving average mean') 110 | plt.plot(rolling_std, color='darkslateblue', label='Moving average standard deviation') 111 | plt.legend(loc='best') 112 | plt.title('Rolling Mean % Standard Deviation') 113 | plt.ion() 114 | plt.show() 115 | plt.pause(0.01) 116 | return 117 | 118 | def decomposition_plot(series_data, period): 119 | """ 120 | decomposition of the original signal for preliminary analysis 121 | 122 | Args: 123 | series_data: Pandas Series object 124 | period: estimated seasonal frequency 125 | """ 126 | # if time series is not a Series object, convert it to 127 | if not isinstance(series_data, pd.Series): 128 | series_data = pd.Series(series_data) 129 | # naive additive decomposition 130 | decomp = seasonal_decompose(series_data.values, model='additive', freq=period) 131 | decomp.plot() 132 | 133 | # stl decompose 134 | stl = decompose(series_data.values, period=period) 135 | stl.plot() 136 | plt.show() 137 | plt.pause(0.01) 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /experiments/1d conv/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_epochs": 10, 3 | "batch_size": 128, 4 | "save_summary_steps": 100, 5 | "callbacks": "None", 6 | "lags": 63, 7 | "filter_size": 5, 8 | "learning_rate": 0.01 9 | } -------------------------------------------------------------------------------- /experiments/MLP/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_epochs": 10, 3 | "batch_size": 128, 4 | "save_summary_steps": 100, 5 | "callbacks": "None", 6 | "lags": 63, 7 | "filter_size": 1, 8 | "learning_rate": 0.01 9 | } -------------------------------------------------------------------------------- /experiments/general_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "experiments_path": "C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/experiments", 3 | "processed_data_path": "C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/Processed Data", 4 | "data_path": "C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/data", 5 | "channels": ["p_0", "p_1"], 6 | "model_type": "wavenet", 7 | "detection_mode": "Gaussian", 8 | "time_index": 0, 9 | "prediction_steps": 5 10 | } -------------------------------------------------------------------------------- /experiments/wavenet/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_epochs": 10, 3 | "batch_size": 128, 4 | "save_summary_steps": 100, 5 | "callbacks": "None", 6 | 
"lags": 63, 7 | "filter_size": 5, 8 | "learning_rate": 0.01 9 | } -------------------------------------------------------------------------------- /import_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fnmatch 3 | import amp_tdm_loader 4 | import numpy as np 5 | 6 | def open_data(path, channels): 7 | """ 8 | Read from TDM/TDX files 9 | 10 | Args: 11 | path: where to read the data from 12 | channels: specifies selected channels as a list [channel_name1, channel_name2, ...]; 13 | 14 | Returns: 15 | data_array: # channels x # files . Each element itself contains the signal of every file as 1D array 16 | num_of_files: the file number 17 | """ 18 | namelist = fnmatch.filter(os.listdir(path), "*.TDM") 19 | data_list = [[] for i in range(len(channels))] 20 | for i in range(len(namelist)): 21 | file = amp_tdm_loader.OpenFile(os.path.join(path, namelist[i])) 22 | # Save each channel as 2 D array: #files * #signal length 23 | for index in range(0, len(channels)): 24 | data_list[index].append(file[channels[index]]) 25 | data_array = np.asarray(data_list) 26 | return data_array 27 | 28 | # Remark: the shape of the ndarray is just two dimensional, with each signal of one file as an element, 29 | # because the length of each file is different 30 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from model.Param import * 3 | from Preprocess.Preprocesser import Preprocesser 4 | from model.visualizing import plot_loss, plot_prediction 5 | from model.Model import * 6 | import os 7 | import warnings 8 | from anomaly_detection import anomaly_detection 9 | from time_series_analysis import * 10 | warnings.filterwarnings("ignore") 11 | 12 | # Here the prediction[1] is to be used, cause it's real prediction 13 | def mse_metric(train, predictions, prediction_steps): 14 | MSE = [] 15 | for file in range(len(predictions[1])): 16 | # if the sliding_step is large, maybe the last few points on the end could not be covered(less than the sliding_step) 17 | error = train[file][:len(predictions[1][file])]-predictions[1][file] 18 | # Median value is more stable than mean 19 | mse = np.median((error)**2) 20 | MSE.append(mse) 21 | return MSE 22 | 23 | ##### Initializing ##### 24 | # define paths 25 | # use the Python3 Pathlib modul to create platform independent path 26 | general_settings = Params.update( 27 | Path("C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/experiments/general_settings.json")) 28 | 29 | # choose the channel for training 30 | channel_name = 'p_0' 31 | 32 | # remark: for prediction-based model, they only use the lagged data as feature and the original data as label, 33 | # for reconstruction-based, they also include the original data as feature as well as label 34 | # and the number of features should be even number(multiple of cpus), if it's odd number, then Keras will raise error message 35 | 36 | # load the parameters for the experiment params.json file in model dir 37 | model_dir = os.path.join(general_settings.experiments_path, general_settings.model_type) 38 | json_path = Path(model_dir) / 'params.json' 39 | params_train = Params.update(json_path) 40 | 41 | ##### handle the data ##### 42 | # preprocess the data 43 | preprocesser = Preprocesser.from_tdm(general_settings.data_path, general_settings.channels, general_settings.time_index) 44 | data_preprocessed_labeled = 
preprocesser.data 45 | 46 | # select the channel out of the whole array 47 | data_all_files = preprocesser.select_channel(channel_name) 48 | 49 | # even if read just one file, indexing like e.g. 3:4 should be used in order to keep the outer [] 50 | train = data_all_files[:2] 51 | 52 | # train and evaluate the model setting 53 | if general_settings.model_type == "1d conv": 54 | models, loss, predictions = Convolutioanl_autoencoder.train_and_predict(params_train, train, general_settings) 55 | elif general_settings.model_type == "MLP": 56 | models, loss, predictions = Multilayer_Perceptron.train_and_predict(params_train, train, general_settings) 57 | elif general_settings.model_type == "wavenet": 58 | models, loss, predictions = Wavenet.train_and_predict(params_train, train, general_settings) 59 | else: 60 | models, loss, predictions = Variational_Autoecnoder.train_and_predict(params_train, train, general_settings) 61 | 62 | MSE = mse_metric(train, predictions, general_settings.prediction_steps) 63 | anomaly_detection(train, predictions, general_settings.detection_mode) -------------------------------------------------------------------------------- /model/Model.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Lambda, Input, Dense, Conv1D, MaxPooling1D, UpSampling1D, Dropout 2 | from keras.models import Model, Sequential 3 | from keras.losses import mse 4 | from keras import backend as K 5 | import sys 6 | from model.utils import * 7 | import numpy as np 8 | 9 | # Remark: for prediction-based model, they only use the lagged data as feature and the original data as label, 10 | # for reconstruction-based, they also include the original data as feature as well as label 11 | # and the number of features should be even number(multiple of cpus), if it's odd number, then Keras will raise error message 12 | 13 | # Python 2/3 compatibility layer. Based on six. 14 | 15 | PY3 = sys.version_info[0] == 3 16 | 17 | if PY3: 18 | def get_im_class(meth): 19 | return meth.__self__.__class__ 20 | 21 | else: 22 | def get_im_class(meth): 23 | return meth.im_class 24 | 25 | # for python3, all classes are new style classes, but for python2, there could exist old syle classes whoes type is 'instance' 26 | def _mro(cls): 27 | """ 28 | Return the method resolution order for ``cls`` -- i.e., a list 29 | containing ``cls`` and all its base classes, in the order in which 30 | they would be checked by ``getattr``. For new-style classes, this 31 | is just cls.__mro__. For classic classes, this can be obtained by 32 | a depth-first left-to-right traversal of ``__bases__``. 
33 | """ 34 | if isinstance(cls, type): 35 | return cls.__mro__ 36 | else: 37 | mro = [cls] 38 | for base in cls.__bases__: mro.extend(_mro(base)) 39 | return mro 40 | 41 | # remark: don't need to split the data set inside the function, unless it would not be flexible enough, 42 | # especially for the case where the data set for online and offline mode are totally differently split 43 | def seq2seq_mode(models, application_set, num_epochs, learning_rate, sliding_step, model_type, callbacks=None): 44 | # define inference step 45 | def predict_sequence(input_sequence): 46 | history_sequence = input_sequence.copy() 47 | print("history sequence shape: ", history_sequence.shape) 48 | pred_sequence = np.zeros((sliding_step, 1)) # initialize output (pred_steps time steps) 49 | print(pred_sequence.shape) 50 | for i in range(sliding_step): 51 | # record next time step prediction (last time step of model output) 52 | # remark, if direkt indexing one dimension with integer, then this dimension will be reduced 53 | last_step_pred = models[0].predict(history_sequence)[:,-1:, :] 54 | print("last step prediction first 10 channels") 55 | print(last_step_pred.shape) 56 | pred_sequence[i, 0] = last_step_pred 57 | 58 | # add the next time step prediction to the history sequence 59 | # After experiments, it's better not to truncate the first element of history sequence 60 | history_sequence = np.concatenate([history_sequence, 61 | last_step_pred.reshape(1, 1,1)], axis=1) 62 | 63 | return pred_sequence 64 | 65 | CallBacks = create_callbacks(callbacks) 66 | # opt = optimizers.Adam(lr=learning_rate) 67 | models[0].compile(optimizer="Adam", loss='MSE') 68 | # train and predict alternatively on the application set, using a universal validation set for all the updating steps 69 | loss = [[], []] 70 | train_losses = [] 71 | predictions = [[], []] # the first element is prediction on training data, the second on unseen data 72 | # remark, if we really want to avoid overlapping, we can choose to loop with a step size equal to window size, but achtually we 73 | # don't need to bother, just choose the original time column, then it's fine 74 | # it's already prepadded 75 | for i in range(sliding_step-1,len(application_set)-sliding_step,sliding_step): 76 | # each data set contains pairs of features and labels in 0 and 1 position 77 | x = application_set[i] # Dimension from outer to inner 78 | x = np.expand_dims(x, 0) # restore the dimension after slicing 79 | # extract next row of the lagged matrix, which is #stride step further than x at every column position 80 | x_next = application_set[i + sliding_step] 81 | x_next = np.expand_dims(x_next, 0) 82 | # Using teacher forcing during training, which means during inference every previous ground truth is fed to the network 83 | # Remark: keep the first points at training, the result is better than removing them 84 | history = models[0].fit(x[:,:-1,:], x[:,-sliding_step:,:], epochs=num_epochs, verbose=1, shuffle=False,callbacks=CallBacks) 85 | train_losses.append(history.history['loss']) 86 | # Special case(one step ahead prediction): for MLP, each prediction itself is a (1,) array, after squeeze then can not be converted to list 87 | if sliding_step > 1: 88 | pred = predict_sequence(x_next[:,:-sliding_step, :]) 89 | predict = list(pred.squeeze()) 90 | predictions[1].extend(predict) 91 | predict = list(predict_sequence(x[:,:-sliding_step, :]).squeeze()) 92 | predictions[0].extend(predict) 93 | else: 94 | predictions[1].extend(predict_sequence(x_next[:,:-sliding_step, :])) 95 | 
predictions[0].extend(predict_sequence(x[:,:-sliding_step, :])) 96 | # convert the list of n (1,) arrays to a (n,1) array 97 | if sliding_step > 1: 98 | predictions[0] = np.expand_dims(np.array(predictions[0]), 1) 99 | predictions[1] = np.expand_dims(np.array(predictions[1]), 1) 100 | else: 101 | predictions[0] = np.array(predictions[0]) 102 | predictions[1] = np.array(predictions[1]) 103 | # rolling loss takes the training loss of last epoch for each example 104 | rolling_loss = np.array(train_losses)[:, (num_epochs-1)] 105 | loss[0] = rolling_loss 106 | # train loss take the average value of the losses over the whole data set for each epoch 107 | train_loss = np.average(train_losses, axis=0).flatten() 108 | loss[1] = train_loss 109 | return models, loss, predictions 110 | 111 | 112 | def online_mode(models, application_set, num_epochs, learning_rate, sliding_step, model_type, callbacks=None): 113 | CallBacks = create_callbacks(callbacks) 114 | # opt = optimizers.Adam(lr=learning_rate) 115 | models[0].compile(optimizer="Adam", loss='MSE') 116 | # train and predict alternatively on the application set, using a universal validation set for all the updating steps 117 | loss = [[], []] 118 | train_losses = [] 119 | predictions = [[], []] # the first element is prediction on training data, the second on unseen data 120 | # remark, if we really want to avoid overlapping, we can choose to loop with a step size equal to window size, but achtually we 121 | # don't need to bother, just choose the original time column, then it's fine 122 | for i in range(0,len(application_set[0])-sliding_step,sliding_step): 123 | # each data set contains pairs of features and labels in 0 and 1 position 124 | x = application_set[0][i] # Dimension from outer to inner 125 | x = np.expand_dims(x, 0) # restore the dimension after slicing 126 | y = application_set[1][i] 127 | y = np.expand_dims(y, 0) 128 | # extract next row of the lagged matrix, which is sliding_step further than x at every column position 129 | x_next = application_set[0][i + sliding_step] 130 | x_next = np.expand_dims(x_next, 0) 131 | history = models[0].fit(x, y, epochs=num_epochs, verbose=1, shuffle=False,callbacks=CallBacks) 132 | train_losses.append(history.history['loss']) 133 | # Special case(one step ahead prediction): for MLP, each prediction itself is a (1,) array, after squeeze then can not be converted to list 134 | if sliding_step > 1: 135 | predict = list(models[0].predict(x_next).squeeze())[:sliding_step] 136 | predictions[1].extend(predict) 137 | predict = list(models[0].predict(x).squeeze())[:sliding_step] 138 | predictions[0].extend(predict) 139 | else: 140 | predictions[1].extend(models[0].predict(x_next)) 141 | predictions[0].extend(models[0].predict(x)) 142 | # convert the list of n (1,) arrays to a (n,1) array 143 | if sliding_step > 1: 144 | predictions[0] = np.expand_dims(np.array(predictions[0]), 1) 145 | predictions[1] = np.expand_dims(np.array(predictions[1]), 1) 146 | else: 147 | predictions[0] = np.array(predictions[0]) 148 | predictions[1] = np.array(predictions[1]) 149 | # rolling loss takes the training loss of last epoch for each example 150 | rolling_loss = np.array(train_losses)[:, (num_epochs-1)] 151 | loss[0] = rolling_loss 152 | # train loss take the average value of the losses over the whole data set for each epoch 153 | train_loss = np.average(train_losses, axis=0).flatten() 154 | loss[1] = train_loss 155 | return models, loss, predictions 156 | 157 | # in python2 this will create a new style class object, but in 
python3 there's no need to do it 158 | class neural_network_model(object): 159 | """ 160 | A processing interface: 161 | 162 | Subclasses must define: 163 | - "__build_model", "__format_input" 164 | - may include "__init__" 165 | 166 | """ 167 | @staticmethod 168 | def build_model(lags, filter_size, prediction_steps=None): 169 | """ 170 | return: Keras model instance 171 | """ 172 | raise NotImplementedError() 173 | 174 | @staticmethod 175 | def format_input(data, prediction_steps=None): 176 | """ 177 | return: formatted input suitable for specific network 178 | """ 179 | raise NotImplementedError() 180 | 181 | @classmethod 182 | def _train_and_predict(cls, params, train, general_settings): 183 | # submodels are e.g. encoder and decoder part of an autoencoder model, use * to recieve potentially multiple outputs 184 | 185 | results = [[],[]] 186 | # the first element is rolling loss, the second training loss 187 | losses =[[],[]] 188 | 189 | for file in range(len(train)): 190 | # put the build_model here, then for each file train from scratch 191 | models = cls.build_model(params.lags, params.filter_size, general_settings.prediction_steps) 192 | if len(train[file])<50: 193 | results[0].append([0]*len(train[file])) 194 | results[1].append([0]*len(train[file])) 195 | losses[0].append(0) 196 | losses[1].append(0) 197 | else: 198 | print(file) 199 | data = train[file] 200 | data_combined = create_lagged_df(data, params.lags) 201 | print("data input shape: ", data_combined.shape) 202 | train_set = cls.format_input(data_combined, general_settings.prediction_steps) 203 | if general_settings.model_type != "wavenet": 204 | models, loss, predictions = online_mode(models, train_set, params.num_epochs, params.learning_rate, general_settings.prediction_steps, general_settings.model_type, params.callbacks) 205 | else: 206 | models, loss, predictions = seq2seq_mode(models, train_set, params.num_epochs, params.learning_rate, 207 | general_settings.prediction_steps, general_settings.model_type, params.callbacks) 208 | # take the first column out, or averaging over the window is also ok, because of the overlapping 209 | # numpy.squeeze() guarantees the dimension suitable for plot functions 210 | results[0].append(predictions[0][:, 0].squeeze()) 211 | results[1].append(predictions[1][:, 0].squeeze()) 212 | losses[0].append(loss[0]) 213 | losses[1].append(loss[1]) 214 | 215 | return models, losses, results 216 | 217 | # def classify(self, featureset): 218 | # """ 219 | # :return: the most appropriate label for the given featureset. 
220 | # :rtype: label 221 | # """ 222 | # if overridden(self.classify_many): 223 | # return self.classify_many([featureset])[0] 224 | # else: 225 | # raise NotImplementedError() 226 | 227 | class Wavenet(neural_network_model): 228 | 229 | def __init__(self, model, losses, results): 230 | self.model = model 231 | self.losses = losses 232 | self.results = results 233 | 234 | @staticmethod 235 | def build_model(input_dim, filter_width, prediction_steps=None): 236 | """ 237 | return: Keras model instance 238 | """ 239 | # convolutional layer oparameters 240 | n_filters = 128 241 | dilation_rates = [2 ** i for i in range(8)] 242 | 243 | # define an input history series and pass it through a stack of dilated causal convolutions 244 | history_seq = Input(shape=(None, 1)) 245 | x = history_seq 246 | 247 | for dilation_rate in dilation_rates: 248 | x = Conv1D(filters=n_filters, 249 | kernel_size=filter_width, 250 | padding='causal', 251 | dilation_rate=dilation_rate)(x) 252 | 253 | x = Dense(128, activation='relu')(x) 254 | x = Dropout(.8)(x) 255 | x = Dense(64)(x) 256 | x = Dense(1)(x) 257 | 258 | # extract the last 16 time steps as the training target 259 | def slice(x, seq_length): 260 | return x[:, -seq_length:, :] 261 | 262 | pred_seq_train = Lambda(slice, arguments={'seq_length': prediction_steps})(x) 263 | 264 | model = Model(history_seq, pred_seq_train) 265 | 266 | return [model] 267 | 268 | 269 | @staticmethod 270 | def format_input(data, prediction_steps=None): 271 | data = to_three_d_array(data) 272 | return data 273 | 274 | @classmethod 275 | def train_and_predict(cls, params, train, general_settings): 276 | models, losses, results = cls._train_and_predict(params, train, general_settings) 277 | return models, losses, results 278 | 279 | 280 | class Convolutioanl_autoencoder(neural_network_model): 281 | 282 | def __init__(self, model, losses, results): 283 | self.model = model 284 | self.losses = losses 285 | self.results = results 286 | 287 | @staticmethod 288 | def build_model(lags, filter_size, prediction_steps=None): 289 | """ 290 | return: Keras model instance 291 | """ 292 | # Can also use None to define a flexible placeholder for input data 293 | input_dim = lags + 1 294 | input_segment = Input(shape=(input_dim, 1)) 295 | 296 | # Define encoder part 297 | x = Conv1D(128, filter_size, activation='relu', padding='same')(input_segment) 298 | x = MaxPooling1D(2, padding='same')(x) 299 | x = Conv1D(64, filter_size, activation='relu', padding='same')(x) 300 | x = MaxPooling1D(2, padding='same')(x) 301 | x = Conv1D(32, filter_size, activation='relu', padding='same')(x) 302 | x = MaxPooling1D(2, padding='same')(x) 303 | x = Conv1D(16, filter_size, activation='relu', padding='same')(x) 304 | encoded = MaxPooling1D(2, padding='same')(x) 305 | 306 | # Define decoder part 307 | x = Conv1D(16, filter_size, activation='relu', padding='same')(encoded) 308 | x = UpSampling1D(2)(x) 309 | x = Conv1D(32, filter_size, activation='relu', padding='same')(x) 310 | x = UpSampling1D(2)(x) 311 | x = Conv1D(64, filter_size, activation='relu', padding='same')(x) 312 | x = UpSampling1D(2)(x) 313 | x = Conv1D(128, filter_size, activation='relu', padding='same')(x) 314 | x = UpSampling1D(2)(x) 315 | decoded = Conv1D(1, filter_size, activation='linear', padding= 'same')(x) 316 | autoencoder = Model(input_segment, decoded) 317 | encoder = Model(input_segment, encoded) 318 | ## TODO 319 | # how to define the input shape of encoding, anyway to access the input_segement shape? 
320 | # encoded_input = Input(shape=()) 321 | return (autoencoder, encoder) 322 | 323 | @staticmethod 324 | def format_input(data, prediction_steps=None): 325 | """ 326 | return: formatted input suitable for specific network 327 | """ 328 | data_combined = to_three_d_array(data) 329 | # for reconstruction-based approach, label and features are the same 330 | # the first elment is features X, the second is label Y 331 | data_set = [data_combined, data_combined] 332 | 333 | return data_set 334 | 335 | @classmethod 336 | def train_and_predict(cls, params, train, general_settings): 337 | models, losses, results = cls._train_and_predict(params, train, general_settings) 338 | return models, losses, results 339 | 340 | class Multilayer_Perceptron(neural_network_model): 341 | 342 | def __init__(self, model, losses, results): 343 | self.model = model 344 | self.losses = losses 345 | self.results = results 346 | 347 | @staticmethod 348 | def build_model(lags, filter_size, prediction_steps=1): 349 | """ 350 | return: Keras model instance 351 | """ 352 | input_dim = lags - prediction_steps + 1 353 | mdl = Sequential() 354 | # remark: input_shape: tuple, but input_dim: integer(especially for 1 D layer) 355 | mdl.add(Dense(128, input_dim=input_dim, activation='relu')) 356 | mdl.add(Dense(64, activation='relu')) 357 | mdl.add(Dense(prediction_steps)) 358 | return [mdl] 359 | 360 | @staticmethod 361 | def format_input(data, prediction_steps=1): 362 | """ 363 | return: formatted input suitable for specific network 364 | """ 365 | data_set = [data[:, :-prediction_steps], data[:, -prediction_steps:]] 366 | 367 | return data_set 368 | 369 | @classmethod 370 | def train_and_predict(cls, params, train, general_settings): 371 | models, losses, results = cls._train_and_predict(params, train, general_settings) 372 | return models, losses, results 373 | 374 | class Variational_Autoecnoder(neural_network_model): 375 | 376 | def __init__(self, model, losses, results): 377 | self.model = model 378 | self.losses = losses 379 | self.results = results 380 | 381 | @staticmethod 382 | def build_model(lags, filter_size, prediction_steps=None): 383 | """ 384 | return: Keras model instance 385 | """ 386 | input_dim = lags+1 387 | def sampling(args): 388 | """Reparameterization trick by sampling fr an isotropic unit Gaussian. 
389 | # Arguments: 390 | args (tensor): mean and log of variance of Q(z|X) 391 | # Returns: 392 | z (tensor): sampled latent vector 393 | """ 394 | 395 | z_mean, z_log_var = args 396 | # by default, random_normal has mean=0 and std=1.0 397 | epsilon = K.random_normal(shape=K.shape(z_mean)) 398 | return z_mean + K.exp(0.5 * z_log_var) * epsilon 399 | 400 | def vae_loss(x, x_decoded_mean, z_log_var, z_mean): 401 | mse_loss = K.sum(mse(x, x_decoded_mean), axis=1) 402 | kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=[1, 2]) 403 | return K.mean(mse_loss + kl_loss) 404 | 405 | input_segment = Input(shape=(input_dim, 1)) 406 | 407 | # Define encoder part 408 | x = Conv1D(32, filter_size, activation='relu', padding='same')(input_segment) 409 | x = MaxPooling1D(2, padding='same')(x) 410 | x = Conv1D(16, filter_size, activation='relu', padding='same')(x) 411 | x = MaxPooling1D(2, padding='same')(x) 412 | x = Conv1D(4, filter_size, activation='relu', padding='same')(x) 413 | x = MaxPooling1D(2, padding='same')(x) 414 | z_mean = Dense(8)(x) 415 | z_log_sigma = Dense(8)(x) 416 | # Remark: this layer will cause the training to fail on the last batch if that batch is shorter, 417 | # so a padding trick must be applied 418 | encoded = Lambda(sampling)([z_mean, z_log_sigma]) 419 | 420 | # Define decoder part 421 | x = Conv1D(4, filter_size, activation='relu', padding='same')(encoded)  # decode from the sampled latent vector, not the pre-sampling activations 422 | x = UpSampling1D(2)(x) 423 | x = Conv1D(16, filter_size, activation='relu', padding='same')(x) 424 | x = UpSampling1D(2)(x) 425 | x = Conv1D(32, filter_size, activation='relu', padding='same')(x) 426 | x = UpSampling1D(2)(x) 427 | decoded = Conv1D(1, filter_size, activation='linear', padding='same')(x) 428 | autoencoder = Model(input_segment, decoded) 429 | vae_losses = vae_loss(input_segment, decoded, z_log_sigma, z_mean) 430 | autoencoder.add_loss(vae_losses) 431 | encoder = Model(input_segment, encoded) 432 | return (autoencoder, encoder) 433 | 434 | @staticmethod 435 | def format_input(data, prediction_steps=None): 436 | """ 437 | return: formatted input suitable for the specific network 438 | """ 439 | data_combined = to_three_d_array(data) 440 | # for the reconstruction-based approach, label and features are the same 441 | # the first element is the features X, the second is the label Y 442 | data_set = [data_combined, data_combined] 443 | 444 | return data_set 445 | 446 | @classmethod 447 | def train_and_predict(cls, params, train, general_settings): 448 | models, losses, results = cls._train_and_predict(params, train, general_settings) 449 | return models, losses, results -------------------------------------------------------------------------------- /model/Param.py: -------------------------------------------------------------------------------- 1 | """General utility functions""" 2 | 3 | import json 4 | import os 5 | import pathlib 6 | import numpy as np 7 | 8 | class Params(): 9 | """Class that loads hyperparameters from a json file. 10 | Example: 11 | ``` 12 | params = Params.update(json_path) 13 | print(params.learning_rate) 14 | params.learning_rate = 0.5 # change the value of learning_rate in params 15 | ``` 16 | """ 17 | # ** is the unpack operator, e.g. 
function(name,age) can take dict = {'name':'alon', 'age':18} 18 | # as keyword arguments: function(**dict) 19 | def __init__(self, **kwargs): 20 | # method for constructing a class from a dictionary 21 | self.__dict__.update(kwargs) 22 | pass 23 | 24 | def save(self, json_path): 25 | """Saves parameters to json file""" 26 | with open(json_path, 'w') as f: 27 | json.dump(self.__dict__, f, indent=4) 28 | 29 | @classmethod 30 | def update(cls, json_path): 31 | """Loads parameters from json file""" 32 | with open(json_path) as f: 33 | params = json.load(f) 34 | return cls(**params) 35 | 36 | @property 37 | def dict(self): 38 | """Gives dict-like access to Params instance by `params.dict['learning_rate']`""" 39 | return self.__dict__ 40 | 41 | def save_dict_to_json(d, json_path): 42 | """Saves dict of floats in json file 43 | Args: 44 | d: (dict) of float-castable values (np.float, int, float, etc.) 45 | json_path: (string) path to json file 46 | """ 47 | with open(json_path, 'w') as f: 48 | # We need to convert the values to float for json (it doesn't accept np.array, np.float, ) 49 | d = {k: float(v) for k, v in d.items()} 50 | json.dump(d, f, indent=4) 51 | -------------------------------------------------------------------------------- /model/__pycache__/Model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/model/__pycache__/Model.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/Param.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/model/__pycache__/Param.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/train_and_evaluate.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/model/__pycache__/train_and_evaluate.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/model/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/visualizing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/model/__pycache__/visualizing.cpython-36.pyc -------------------------------------------------------------------------------- /model/train_and_evaluate.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from model.Param import save_dict_to_json 3 | from Preprocess.Preprocesser import Preprocesser 4 | 
from model.visualizing import plot_loss, plot_prediction 5 | from model.Model import Convolutioanl_autoencoder, Multilayer_Perceptron, Variational_Autoecnoder, Wavenet 6 | import os 7 | import numpy as np 8 | warnings.filterwarnings("ignore") 9 | 10 | def mse_metric(train, predictions, prediction_steps): 11 | MSE = [] 12 | for file in range(len(predictions[0])): 13 | # if the sliding_step is large, maybe the last few points on the end could not be covered(less than the sliding_step) 14 | error = train[file][prediction_steps:prediction_steps+len(predictions[0][file][prediction_steps:])]-predictions[0][file][prediction_steps:] 15 | # Median value is more stable than mean 16 | mse = np.median((error)**2) 17 | MSE.append(mse) 18 | return MSE 19 | 20 | def train_and_evaluate(params_train, general_settings, job_dir): 21 | 22 | ##### handle the data ##### 23 | # preprocess the data 24 | preprocesser = Preprocesser.from_tdm(general_settings.data_path, general_settings.channels, general_settings.time_index) 25 | 26 | for channel_name in general_settings.channels: 27 | # select the channel out of the whole array 28 | data_all_files = preprocesser.select_channel(channel_name) 29 | 30 | # even if read just one file, indexing like e.g. 3:4 should be used in order to keep the outer [] 31 | train = data_all_files[0:5] 32 | 33 | # train and evaluate the model setting 34 | if general_settings.model_type == "1d conv": 35 | models, loss, predictions = Convolutioanl_autoencoder.train_and_predict(params_train, train, general_settings) 36 | elif general_settings.model_type == "MLP": 37 | models, loss, predictions = Multilayer_Perceptron.train_and_predict(params_train, train, general_settings) 38 | elif general_settings.model_type == "wavenet": 39 | models, loss, predictions = Wavenet.train_and_predict(params_train, train, general_settings) 40 | else: 41 | models, loss, predictions = Variational_Autoecnoder.train_and_predict(params_train, train, general_settings) 42 | 43 | models[0].save(os.path.join(job_dir,channel_name+"_Model.h5")) 44 | np.save(os.path.join(job_dir,channel_name+"_loss"), loss) 45 | np.save(os.path.join(job_dir,channel_name+"_predictions"), predictions) 46 | np.save(os.path.join(job_dir, channel_name+"_train"), train) 47 | plot_loss(loss) 48 | plot_prediction(train,predictions, general_settings.prediction_steps) 49 | MSE = mse_metric(train, predictions, general_settings.prediction_steps) 50 | metric = {channel_name+"_file"+str(i):mse for i, mse in enumerate(MSE)} 51 | json_path = os.path.join(job_dir, channel_name+"_metric.json") 52 | save_dict_to_json(metric, json_path) 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def to_three_d_array(lists): 5 | """Expand a list to 3D array 6 | 7 | Args: List [Example1, Example2......](outer [] as first dimensions when transformed to array 8 | 9 | Returs: 3D array [examples, features, channels] 10 | """ 11 | arrays = np.array(lists) 12 | arrays = np.reshape(arrays,(arrays.shape[0],-1,1)) 13 | return arrays 14 | 15 | def create_lagged_df(data, lags): 16 | data = pd.DataFrame(data) 17 | # Apply rolling average to smoothen the data 18 | # data.rolling(2, win_type='triang').mean() 19 | df = pd.concat([data.shift(lag) for lag in range(-lags,0)], axis=1) 20 | df.columns = ['lag {}'.format(-lag) for lag in 
range(-lags,0)] 21 | data_combined = df.join(data) 22 | # Padded in the left, in order to synchronize with the original data 23 | data_combined = data_combined.fillna(0).values 24 | return data_combined 25 | 26 | def create_callbacks(callbacks): 27 | if callbacks == None: 28 | CallBacks = None 29 | else: 30 | CallBacks = [] 31 | for CallBack in callbacks: 32 | if CallBack == "early stopping": 33 | from keras.callbacks import EarlyStopping 34 | # Because stochastic gradient descent is noisy, patience must be set to a relative large number 35 | early_stopping_monitor = EarlyStopping(patience=10) 36 | CallBacks.append(early_stopping_monitor) 37 | 38 | if CallBack == "TensorBoard": 39 | from keras.callbacks import TensorBoard 40 | log_dir = 'C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/experiments/logdir' 41 | CallBacks.append(TensorBoard(log_dir=log_dir)) 42 | return CallBacks 43 | -------------------------------------------------------------------------------- /model/visualizing.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plot_loss(loss): 5 | for file in range(len(loss[0])): 6 | plt.figure() 7 | plt.title("file {}".format(file)) 8 | plt.subplot(2,1,1) 9 | plt.plot(loss[0][file]) 10 | plt.legend("train loss") 11 | 12 | plt.subplot(2,1,2) 13 | plt.plot(loss[1][file]) 14 | plt.legend("rolling loss") 15 | 16 | # plt.ion() 17 | # plt.show() 18 | # plt.pause(0.001) 19 | 20 | def plot_prediction(train, predictions,prediction_steps): 21 | for file in range(len(predictions[0])): 22 | plt.figure() 23 | plt.plot( predictions[0][file]) 24 | plt.plot(train[file]) 25 | plt.plot(predictions[1][file]) 26 | # prediction = [0]*prediction_steps 27 | # prediction.extend(predictions[1][file]) 28 | # plt.plot(prediction) 29 | plt.legend(["prediction on train(lagged) data","data","prediction"]) 30 | # plt.ion() 31 | # plt.show() 32 | # plt.pause(0.001) 33 | 34 | -------------------------------------------------------------------------------- /results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/results.png -------------------------------------------------------------------------------- /search hyperparameter/__pycache__/metrics_aggregation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhouYuxuanYX/Unsupervised-Deep-Learning-Framework-for-Anomaly-Detection-in-Time-Series-/993f32d16c0fd25c9046fec31e8700b139856f7b/search hyperparameter/__pycache__/metrics_aggregation.cpython-36.pyc -------------------------------------------------------------------------------- /search hyperparameter/metrics_aggregation.py: -------------------------------------------------------------------------------- 1 | """Aggregates results from the metrics_eval_best_weights.json in a parent folder""" 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | from tabulate import tabulate 8 | 9 | def aggregate_metrics(parent_dir, metrics, channel_name): 10 | """Aggregate the metrics of all experiments in folder `parent_dir`. 
11 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 12 | `parent_dir/subdir/metrics_dev.json` 13 | Args: 14 | parent_dir: (string) path to directory containing experiments results 15 | metrics: (dict) subdir -> {'accuracy': ..., ...} 16 | """ 17 | # Get the metrics for the folder if it has results from an experiment 18 | metrics_file = os.path.join(parent_dir, channel_name+'_metric.json') 19 | if os.path.isfile(metrics_file): 20 | with open(metrics_file, 'r') as f: 21 | metrics[parent_dir] = json.load(f) 22 | 23 | # Check every subdirectory of parent_dir 24 | for subdir in os.listdir(parent_dir): 25 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 26 | continue 27 | else: 28 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics, channel_name) 29 | 30 | 31 | def metrics_to_table(metrics): 32 | # Get the headers from the first subdir. Assumes everything has the same metrics 33 | headers = metrics[list(metrics.keys())[0]].keys() 34 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 35 | res = tabulate(table, headers, tablefmt='pipe') 36 | 37 | return res 38 | 39 | -------------------------------------------------------------------------------- /search hyperparameter/search_hyperparameters.py: -------------------------------------------------------------------------------- 1 | from model.train_and_evaluate import train_and_evaluate 2 | from pathlib import Path 3 | from model.Param import * 4 | import os 5 | from metrics_aggregation import metrics_to_table, aggregate_metrics 6 | 7 | def launch_training_job(model_dir, parameter_name, parameter_value, params, general_settings): 8 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name 9 | Args: 10 | parameter_naem: select which parameter to be varied 11 | params: (dict) containing basic setting of hyperparameters 12 | """ 13 | # Create a new folder in parent_dir with unique_name "job_name" 14 | parameter_dir = os.path.join(model_dir, parameter_name+"_pred_step_"+str(general_settings.prediction_steps)) 15 | if not os.path.exists(parameter_dir): 16 | os.makedirs(parameter_dir) 17 | job_dir = os.path.join(parameter_dir, parameter_name+"_"+str(parameter_value)) 18 | if not os.path.exists(job_dir): 19 | os.makedirs(job_dir) 20 | 21 | # Write parameters in json file 22 | json_path = os.path.join(job_dir, 'params.json') 23 | params.save(json_path) 24 | 25 | # Launch training with this config 26 | train_and_evaluate(params, general_settings, job_dir) 27 | 28 | if __name__ == "__main__": 29 | ##### Initializing ##### 30 | 31 | # define paths 32 | # use the Python3 Pathlib modul to create platform independent path 33 | general_settings = Params.update( 34 | Path("C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/experiments/general_settings.json")) 35 | 36 | model_dir = os.path.join(general_settings.experiments_path, general_settings.model_type) 37 | if not os.path.exists(model_dir): 38 | os.makedirs(model_dir) 39 | 40 | # load the parameters for the experiment params.json file in model path 41 | json_path = Path(model_dir) / 'params.json' 42 | params = Params.update(json_path) 43 | 44 | ### hyperparameter search #### 45 | # # learning rate search 46 | # for lr in [0.1, 0.05]: 47 | # params.learning_rate = lr 48 | # # Launch a training in this directory -- it will call `train_and_evaluate.py` 49 | # launch_training_job(model_dir, "learning_rate", lr, params, general_settings) 50 | 51 | # epochs search 52 | for epochs in [5]: 53 | 
params.num_epochs = epochs 54 | launch_training_job(model_dir, "num_epochs", epochs, params, general_settings) 55 | 56 | 57 | # # Aggregate metrics from args.parent_dir directory 58 | # # parent_dir = os.path.join(model_dir, "num_epochs_pred_step_1") 59 | # parent_dir = model_dir 60 | # for channel_name in general_settings.channels: 61 | # metrics = dict() 62 | # aggregate_metrics(parent_dir, metrics, channel_name) 63 | # table = metrics_to_table(metrics) 64 | # 65 | # # Display the table to terminal 66 | # print(table) 67 | # 68 | # # Save results in parent_dir/results.md 69 | # save_file = os.path.join(parent_dir, channel_name+"_results.md") 70 | # with open(save_file, 'w') as f: 71 | # f.write(table) 72 | -------------------------------------------------------------------------------- /search hyperparameter/visulize_experiment_results.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from model.Param import * 3 | import os 4 | import numpy as np 5 | from model.visualizing import plot_loss, plot_prediction 6 | from anomaly_detection import anomaly_detection 7 | 8 | # define paths 9 | # use the Python3 Pathlib modul to create platform independent path 10 | general_settings = Params.update( 11 | Path("C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/experiments/general_settings.json")) 12 | 13 | model_dir = os.path.join(general_settings.experiments_path, "MLP") 14 | experiment_dir = os.path.join(model_dir, "num_epochs_pred_step_1") 15 | 16 | for channel_name in general_settings.channels: 17 | # Check every subdirectory of parent_dir 18 | for subdir in os.listdir(experiment_dir): 19 | loss = np.load(os.path.join(experiment_dir,subdir,channel_name+"_loss.npy")) 20 | predictions = np.load(os.path.join(experiment_dir, subdir, channel_name+"_predictions.npy")) 21 | train = np.load(os.path.join(experiment_dir, subdir,channel_name+"_train.npy")) 22 | plot_loss(loss) 23 | plot_prediction(train, predictions, 5) # choose prediction steps 24 | anomaly_detection(train, predictions, general_settings.detection_mode) -------------------------------------------------------------------------------- /test_synthetic_data/data_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def generate_sinus(noise=True): 5 | signal = [] 6 | for i in range(5): 7 | x = np.linspace(-np.pi, np.random.randint(1,6)*np.pi,np.random.randint(100,1000)) 8 | # Generate different frequency 9 | signal.extend(list(np.sin(np.random.randint(1,2)*x))) 10 | # add noise 11 | if noise == True: 12 | signal =np.random.randint(5,20)*np.array(signal) + 3*np.random.rand(len(signal)) 13 | else: 14 | signal=np.random.randint(5,20)*np.array(signal) 15 | return signal 16 | 17 | def generate_dataset(noise=True,trend=True, error=True): 18 | data_list = [[]] 19 | for i in range(5): 20 | x = generate_sinus(noise) 21 | for j in range(40): 22 | if error == True: 23 | x[np.random.randint(0,len(x))] =np.random.randint(-20,20) 24 | if trend == True: 25 | # # Line 26 | # x = x + np.linspace(0,2,len(x)) 27 | # # broken line 28 | # x = x +np.concatenate((np.linspace(0,2,len(x)//2),np.linspace(2,0,len(x)-len(x)//2))) 29 | # # hyperbola 30 | # x = x + np.linspace(0,1.4,len(x))**2 31 | # sudden mean shift 32 | x = x + np.concatenate((np.linspace(0, 1, len(x) // 2), np.linspace(0.7, 1, len(x) - (len(x) // 2)))) 33 | data_list[0].append(x) 34 | data = np.array(data_list) 35 | return data 36 
| 37 | def show(): 38 | data = generate_dataset() 39 | for i in range(len(data[0])): 40 | plt.figure() 41 | plt.plot(data[0][i]) 42 | show() -------------------------------------------------------------------------------- /test_synthetic_data/test_synthetic_data.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from model.Param import * 3 | from Preprocess.Preprocesser import Preprocesser 4 | from model.visualizing import plot_loss, plot_prediction 5 | from model.Model import * 6 | import os 7 | import warnings 8 | from anomaly_detection import anomaly_detection 9 | from data_generator import generate_dataset 10 | warnings.filterwarnings("ignore") 11 | 12 | # Here the prediction[1] is to be used, cause it's real prediction 13 | def mse_metric(train, predictions, prediction_steps): 14 | MSE = [] 15 | for file in range(len(predictions[1])): 16 | # if the sliding_step is large, maybe the last few points on the end could not be covered(less than the sliding_step) 17 | error = train[file][:len(predictions[1][file])]-predictions[1][file] 18 | # Median value is more stable than mean 19 | mse = np.median((error)**2) 20 | MSE.append(mse) 21 | return MSE 22 | 23 | ##### Initializing ##### 24 | 25 | # define paths 26 | # use the Python3 Pathlib modul to create platform independent path 27 | general_settings = Params.update( 28 | Path("C:/Users/zhouyuxuan/PycharmProjects/Masterarbeit/experiments/general_settings.json")) 29 | 30 | # choose the channel for training 31 | channel_name = 'p_0' 32 | 33 | # remark: for prediction-based model, they only use the lagged data as feature and the original data as label, 34 | # for reconstruction-based, they also include the original data as feature as well as label 35 | # and the number of features should be even number(multiple of cpus), if it's odd number, then Keras will raise error message 36 | 37 | # load the parameters for the experiment params.json file in model dir 38 | model_dir = os.path.join(general_settings.experiments_path, general_settings.model_type) 39 | json_path = Path(model_dir) / 'params.json' 40 | params_train = Params.update(json_path) 41 | ##### handle the data ##### 42 | # prepare the synthetic data 43 | data = generate_dataset(noise=True, trend=True,error=False) 44 | 45 | # preprocess the data 46 | preprocessed_data = Preprocesser.preprocess(data,0) 47 | train = preprocessed_data[0] 48 | 49 | # train and evaluate the model setting 50 | # train and evaluate the model setting 51 | if general_settings.model_type == "1d conv": 52 | models, loss, predictions = Convolutioanl_autoencoder.train_and_predict(params_train, train, general_settings) 53 | elif general_settings.model_type == "MLP": 54 | models, loss, predictions = Multilayer_Perceptron.train_and_predict(params_train, train, general_settings) 55 | elif general_settings.model_type == "wavenet": 56 | models, loss, predictions = Wavenet.train_and_predict(params_train, train, general_settings) 57 | else: 58 | models, loss, predictions = Variational_Autoecnoder.train_and_predict(params_train, train, general_settings) 59 | 60 | MSE = mse_metric(train, predictions, general_settings.prediction_steps) 61 | import matplotlib.pyplot as plt 62 | plt.figure() 63 | plt.plot(MSE) 64 | plot_loss(loss) 65 | plot_prediction(train, predictions,general_settings.prediction_steps) 66 | anomaly_detection(train, predictions, general_settings.detection_mode) --------------------------------------------------------------------------------
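The scripts above all begin by loading experiments/general_settings.json and a per-model params.json through Params.update, but the contents of those files do not appear in this listing. The sketch below writes a minimal, illustrative pair of such files containing only the keys that the code shown here actually reads (data_path, channels, time_index, experiments_path, model_type, prediction_steps, detection_mode; learning_rate, num_epochs). All concrete values are placeholders, not taken from the repository, and further keys consumed inside _train_and_predict (e.g. lags, filter_size, batch size) are likely required but are not visible here.

    import json
    from pathlib import Path

    # Illustrative settings only: keys mirror the attributes accessed in the code above,
    # values are assumptions that must be adapted to the local setup.
    general_settings = {
        "data_path": "path/to/tdm/files",      # consumed by Preprocesser.from_tdm
        "channels": ["p_0"],                   # channel names to train on
        "time_index": 0,                       # index of the time column (assumed value)
        "experiments_path": "experiments",     # parent folder holding one subfolder per model
        "model_type": "1d conv",               # one of "1d conv", "MLP", "wavenet"; anything else falls back to the VAE
        "prediction_steps": 1,                 # horizon used by mse_metric and the plots
        "detection_mode": "threshold"          # forwarded to anomaly_detection (assumed value)
    }

    params = {
        "learning_rate": 0.05,                 # varied by the commented-out learning-rate search
        "num_epochs": 5                        # varied by the epochs search in search_hyperparameters.py
    }

    # Write the two json files where Params.update expects to find them.
    model_dir = Path(general_settings["experiments_path"]) / general_settings["model_type"]
    model_dir.mkdir(parents=True, exist_ok=True)
    with open(Path(general_settings["experiments_path"]) / "general_settings.json", "w") as f:
        json.dump(general_settings, f, indent=4)
    with open(model_dir / "params.json", "w") as f:
        json.dump(params, f, indent=4)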