├── .DS_Store ├── README.md ├── code ├── config.py ├── detect.py ├── domi.py ├── evaluate.py ├── model.py ├── pot.py └── util.py └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NetManAIOps/DOMI_code/94b5a415c5b6c38dfd7169a07a40b08df3821cad/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DOMI 2 | 3 | 4 | ### Detecting Outlier Machine Instances through One Dimensional CNN Gaussian Mixture Variational AutoEncoder 5 | 6 | DOMI is a VAE-based model that combines a one-dimensional convolutional neural network with a Gaussian mixture variational auto-encoder. 7 | It aims to detect outlier machine instances; its core idea is to learn the normal patterns of multivariate time series 8 | and use the reconstruction probability to judge outliers. 9 | Moreover, for a detected outlier machine instance, DOMI provides an interpretation based on the reconstruction probability changes of its univariate time series. 10 | 11 | 12 | 13 | ## Getting Started 14 | 15 | #### Clone the repo 16 | 17 | ``` 18 | git clone https://github.com/Tsinghuasuya/DOMI_code 19 | ``` 20 | 21 | #### Get data from GitHub and unzip 22 | 23 | ``` 24 | git lfs clone https://github.com/Tsinghuasuya/DOMI_dataset && cd DOMI_dataset && unzip publicDataset.zip && cd ../DOMI_code 25 | ``` 26 | 27 | 28 | #### Install dependencies (with Python 3.6) 29 | 30 | (virtualenv is recommended) 31 | 32 | ```shell 33 | pip install -r requirements.txt 34 | ``` 35 | 36 | 37 | #### Run the code 38 | 39 | ``` 40 | cd code && python domi.py 41 | ``` 42 | 43 | If you want to change the default configuration, you can edit `ExpConfig` in `config.py` or 44 | override the config in `domi.py` using command-line args. For example: 45 | 46 | ``` 47 | python domi.py --noExp=2 --max_epoch=100 --initial_lr=0.0001 48 | ``` 49 | 50 | 51 | ## Result 52 | 53 | After running the program, the output is written to the result directory that you set in the config. For each instance, you get the total outlier score as well as the score of each univariate time series. 54 | All the results are in the folder `{config.result_dir}/`, with the trained model in `{config.result_dir}/DOMI_{noExp}.model`, the output and config of DOMI in the folder `{config.result_dir}/DOMI_{noExp}/`, 55 | and the detailed detection results in the folder `DOMI_{noExp}/result_summary/`, which consists of the following parts: 56 | * `OutlierScores_metric.txt`: score of each univariate time series for each instance in the test dataset. 57 | * `OutlierScores.txt`: total outlier score for each instance in the test dataset. 58 | * `MetricResult.txt`: interpretation result, i.e., the univariate time series of each machine instance ranked by their contribution to the outlier score. 59 | * `PRF.txt`: summary of the overall statistics, including the average score of each univariate time series and the threshold, precision, recall, and F1-score. 
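A minimal sketch (not part of the repo) for inspecting the detection output is shown below; the result path assumes the default `result_dir` and `noExp=1`, so adjust it to wherever your run wrote `OutlierScores.txt`:

```python
# Hypothetical helper: list the most anomalous instances from OutlierScores.txt.
# Each line has the form "<labelFileName>+<lineNo>,<score>", where a lower score
# (reconstruction log-probability) means the instance is more likely an outlier.
path = "results/domi_1/result_summary/OutlierScores.txt"  # assumed location, adjust to your config

scores = []
with open(path) as f:
    for line in f:
        name, value = line.strip().rsplit(",", 1)
        scores.append((float(value), name))

# print the 10 lowest-scoring (most anomalous) machine instances
for value, name in sorted(scores)[:10]:
    print("{:<40s} {:.2f}".format(name, value))
```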
60 | -------------------------------------------------------------------------------- /code/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tfsnippet as spt 3 | 4 | 5 | class ExpConfig(spt.Config): 6 | # Data options 7 | noExp = 1 8 | GPU_number = '0' 9 | channels_last = True 10 | datapathForTrain = "../../DOMI_dataset/publicDataset/train_data" 11 | datapathForTest = "../../DOMI_dataset/publicDataset/test_data" 12 | dataReadformat = "each" # or all 13 | labelpath = "../../DOMI_dataset/publicDataset/test_label/" 14 | interpret_filepath = "../../DOMI_dataset/publicDataset/interpretation_label.txt" 15 | result_dir = "results" 16 | 17 | # model parameters 18 | n_c = 4 19 | strides1 = 4 20 | strides2 = 3 21 | kernel_size1 = 12 22 | kernel_size2 = 6 23 | timeLength = 288 24 | metricNumber = 19 25 | x_dim = timeLength*metricNumber 26 | z_dim = 10 27 | norm = False 28 | VALID_PORTION = 0.1 29 | act_norm = True 30 | l2_reg = 0.0001 31 | shortcut_kernel_size= 1 32 | 33 | # Training parameters 34 | batch_size = 32 # 32 35 | initial_lr = 0.001 # 0.0005, 0.001 36 | lr_anneal_factor = 0.5 # 0.5, 0.75 37 | lr_anneal_epoch_freq= 5 # 20 38 | max_epoch = 10 # 50, 100, 200 39 | lr_anneal_step_freq = None 40 | max_step = None 41 | write_summary = False 42 | grad_clip_norm = 1.0 43 | check_numerics = True 44 | std_epsilon = 1e-10 45 | 46 | # Evaluation parameters 47 | test_batch_size = 32 # 64, 128, 256 48 | batchTest = True 49 | test_n_z = 500 # 5000, 1000 50 | train_n_samples = None 51 | savetrainDS = True 52 | savetestDS = True 53 | savetestTS = False 54 | evaluation = True 55 | saveMetricInfo = True 56 | 57 | # Test 58 | q = 1e-4 59 | level = 0.2 60 | 61 | @property 62 | def x_shape(self): 63 | return (self.timeLength, self.metricNumber, 1) if self.channels_last else (1, self.timeLength, self.metricNumber) 64 | -------------------------------------------------------------------------------- /code/detect.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from pot import POT 4 | 5 | 6 | def pot_eval(init_score, q, level): 7 | """ 8 | Run POT method on given score. 9 | init_score : The data to get init threshold. the outlier score of train set. 10 | q (float): Detection level (risk) 11 | level (float): Probability associated with the initial threshold t 12 | return the threshold under POT estimation algorithm. 13 | """ 14 | s = POT(q) # SPOT object 15 | pot_th = s.initialize(init_score, level=level) # initialization step 16 | return pot_th 17 | 18 | 19 | def cal_scoreChanges(outlierScore_list, ave_twoMetricScore = None, twoMetricScore = None): 20 | """ 21 | get the change score of each metric 22 | return the list of outlier score change. 23 | """ 24 | TwoMetricScoreList = [] 25 | for i in range(0, len(outlierScore_list)): 26 | TwoMetricScoreList.append(-1*(np.array(twoMetricScore[i]) - np.array(ave_twoMetricScore))) 27 | return TwoMetricScoreList 28 | 29 | 30 | def cal_binaryResult(outlierScore_list, threshold, timeIndex, saveMetricInfo = False, 31 | labelFileNameLineCntList = None): 32 | """ 33 | output result according the threshold 34 | return the binary result whether it's an outlier. 
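An instance is flagged as an outlier when its outlier score (reconstruction log-probability) falls below the threshold; the function returns the time indexes of the flagged instances and, when saveMetricInfo is set, their fileName+lineCnt identifiers as well.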
35 | """ 36 | result_dict = dict() 37 | fileNameLineCntList = [] 38 | for i in range(0, len(outlierScore_list)): 39 | if outlierScore_list[i] < threshold: 40 | result_dict[i] = outlierScore_list[i] 41 | if saveMetricInfo: 42 | fileNameLineCntList.append(labelFileNameLineCntList[i]) 43 | resultArray = [timeIndex[index] for index, value in result_dict.items()] 44 | if saveMetricInfo: 45 | return resultArray, fileNameLineCntList 46 | else: 47 | return resultArray 48 | 49 | -------------------------------------------------------------------------------- /code/domi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import functools 3 | import sys 4 | import os 5 | import time 6 | import numpy as np 7 | np.set_printoptions(precision=2) 8 | from argparse import ArgumentParser 9 | import tensorflow as tf 10 | from pprint import pformat 11 | from tensorflow.contrib.framework import arg_scope 12 | 13 | import tfsnippet as spt 14 | from tfsnippet.dataflows import DataFlow 15 | from tfsnippet.scaffold import CheckpointSaver 16 | from tfsnippet.utils import split_numpy_array, get_batch_size 17 | from tfsnippet.examples.utils import MLResults, print_with_title, MultiGPU 18 | 19 | from util import save_file, read_file, load_matrix_allData, get_machineID, cat_List 20 | from evaluate import evaluate, interpretation_hit_ratio 21 | from detect import pot_eval, cal_binaryResult, cal_scoreChanges 22 | from model import q_net, p_net 23 | 24 | from config import ExpConfig 25 | config = ExpConfig() 26 | 27 | 28 | def main(): 29 | # parse the arguments 30 | arg_parser = ArgumentParser() 31 | spt.register_config_arguments(config, arg_parser, title='Model options') 32 | spt.register_config_arguments(spt.settings, arg_parser, prefix='tfsnippet', title='TFSnippet options') 33 | arg_parser.parse_args(sys.argv[1:]) 34 | 35 | # print the config 36 | print_with_title('Configurations', pformat(config.to_dict()), after='\n') 37 | 38 | # open the result object and prepare for result directories 39 | model_file = config.result_dir + "/" + os.path.basename(__file__).split(".py")[0] + "_" + \ 40 | str(config.noExp) + ".model" 41 | dirName = os.path.basename(__file__).split(".py")[0] + "_" + str(config.noExp) 42 | results = MLResults(os.path.join(config.result_dir, dirName)) 43 | results.save_config(config) # save experiment settings 44 | results.make_dirs('train_summary', exist_ok=True) 45 | results.make_dirs('result_summary', exist_ok=True) 46 | results.make_dirs('mid_summary', exist_ok=True) 47 | 48 | # os.environ["CUDA_VISIBLE_DEVICES"] = config.GPU_number 49 | 50 | # input placeholders 51 | input_x = tf.placeholder(dtype=tf.float32, shape=(None,) + config.x_shape, name='input_x') 52 | learning_rate = spt.AnnealingVariable('learning_rate', config.initial_lr, config.lr_anneal_factor, min_value=1e-6) 53 | multi_gpu = MultiGPU(disable_prebuild=True) 54 | # multi_gpu = MultiGPU() 55 | 56 | # derive the training operation 57 | gradses = [] 58 | grad_vars = [] 59 | train_losses = [] 60 | BATCH_SIZE = get_batch_size(input_x) 61 | 62 | for dev, pre_build, [dev_input_x] in multi_gpu.data_parallel(BATCH_SIZE, [input_x]): 63 | with tf.device(dev), multi_gpu.maybe_name_scope(dev): 64 | # derive the loss for initializing 65 | with tf.name_scope('initialization'), \ 66 | arg_scope([p_net, q_net], is_initializing=True), \ 67 | spt.utils.scoped_set_config(spt.settings, auto_histogram=False): 68 | init_q_net = q_net(dev_input_x, n_z=config.train_n_samples) 69 | init_chain = 
init_q_net.chain(p_net, latent_axis=0, observed={'x': dev_input_x}) 70 | init_loss = tf.reduce_mean(init_chain.vi.training.vimco()) 71 | 72 | # derive the loss and lower-bound for training 73 | with tf.name_scope('training'), \ 74 | arg_scope([p_net, q_net], is_training=True): 75 | train_q_net = q_net(dev_input_x, n_z=config.train_n_samples) 76 | train_chain = train_q_net.chain(p_net, latent_axis=0, observed={'x': dev_input_x}) 77 | train_loss = ( 78 | tf.reduce_mean(train_chain.vi.training.vimco()) + 79 | tf.losses.get_regularization_loss() 80 | ) 81 | train_losses.append(train_loss) 82 | 83 | # derive the logits output for testing 84 | with tf.name_scope('testing'): 85 | test_q_net = q_net(dev_input_x, n_z=config.test_n_z) 86 | test_chain = test_q_net.chain(p_net, latent_axis=0, observed={'x': dev_input_x}) 87 | # log_prob of X and each univariate time series of X 88 | log_prob = tf.reduce_mean(test_chain.model['x'].distribution.log_prob(dev_input_x), 0) 89 | log_prob_per_element = tf.reduce_sum(log_prob) 90 | log_prob_per_element_univariate_TS = tf.reduce_sum(log_prob, [0, 1, 3]) 91 | log_prob_per_element_univariate_TS_All = tf.reduce_sum(log_prob, [1, 3]) 92 | 93 | # derive the optimizer 94 | with tf.name_scope('optimizing'): 95 | params = tf.trainable_variables() 96 | optimizer = tf.train.AdamOptimizer(learning_rate) 97 | grads = optimizer.compute_gradients(train_loss, params) 98 | for grad, var in grads: 99 | if grad is not None and var is not None: 100 | if config.grad_clip_norm: 101 | grad = tf.clip_by_norm(grad, config.grad_clip_norm) 102 | if config.check_numerics: 103 | grad = tf.check_numerics(grad, 'gradient for {} has numeric issue'.format(var.name)) 104 | grad_vars.append((grad, var)) 105 | gradses.append(grad_vars) 106 | 107 | # merge multi-gpu outputs and operations 108 | [train_loss] = multi_gpu.average([train_losses], BATCH_SIZE) 109 | train_op = multi_gpu.apply_grads( 110 | grads=multi_gpu.average_grads(gradses), 111 | optimizer=optimizer, 112 | control_inputs=tf.get_collection(tf.GraphKeys.UPDATE_OPS) 113 | ) 114 | 115 | # sort the contribution of each univariate_TS of input 116 | SORT_UNIVARIATE_TS_INPUT = tf.placeholder(dtype=tf.float32, shape=(None, None), name='SORT_UNIVARIATE_TS_INPUT') 117 | SORT_UNIVARIATE_TS = tf.nn.top_k(SORT_UNIVARIATE_TS_INPUT, k=config.metricNumber).indices + 1 118 | 119 | # load the training and testing data 120 | print("="*10+"Shape of Input data"+"="*10) 121 | x, time_indexs, x_test, time_indexs2 = load_matrix_allData( 122 | config.dataReadformat, config.datapathForTrain, config.datapathForTest, config.timeLength, config.metricNumber, 123 | "TrainFileNameList.txt", "TestFileNameList.txt", results, config.norm 124 | ) 125 | 126 | x_test = x_test.reshape([-1, config.timeLength, config.metricNumber, 1]) 127 | print("Test:", x_test.shape) 128 | if config.batchTest: 129 | test_flow = DataFlow.arrays([x_test], config.test_batch_size) # DataFlow is iterator 130 | del x_test 131 | x_train, x_val = split_numpy_array(x, portion=config.VALID_PORTION) 132 | x_train = x_train.reshape([-1, config.timeLength, config.metricNumber, 1]) 133 | x_val = x_val.reshape([-1, config.timeLength, config.metricNumber, 1]) 134 | train_flow = DataFlow.arrays([x_train], config.batch_size, shuffle=False, skip_incomplete=True) 135 | val_flow = DataFlow.arrays([x_val], config.test_batch_size) 136 | print("Note:", config.x_dim, ", x_dim = size of datapoint = timeLength * metricNumber") 137 | print("Input data shape:", x.shape, "Train data shape:", x_train.shape, 
"Validation data shape:", x_val.shape) 138 | del x_train, x_val, x 139 | 140 | # training part 141 | with spt.utils.create_session().as_default() as session: 142 | spt.utils.ensure_variables_initialized() 143 | saver = CheckpointSaver(tf.trainable_variables(), model_file) 144 | if os.path.exists(model_file): 145 | # load the parameters of trained model 146 | saver.restore_latest() 147 | else: 148 | # initialize the network 149 | while True: 150 | breakFlag = 0 151 | for [x] in train_flow: 152 | INITLOSS = session.run(init_loss, feed_dict={input_x: x}) 153 | print('Network initialized, first-batch loss is {:.6g}.'.format(INITLOSS)) 154 | if np.isnan(INITLOSS) or np.isinf(INITLOSS) or INITLOSS > 10 ** 5: 155 | pass 156 | else: 157 | breakFlag = 1 158 | break 159 | if breakFlag: 160 | break 161 | 162 | # train the network 163 | with train_flow.threaded(10) as train_flow: 164 | with spt.TrainLoop(params, 165 | var_groups=['q_net', 'p_net'], 166 | max_epoch=config.max_epoch, 167 | max_step=config.max_step, 168 | summary_dir=(results.system_path('train_summary') if config.write_summary else None), 169 | summary_graph=tf.get_default_graph(), 170 | early_stopping=True) as loop: 171 | trainer = spt.Trainer( 172 | loop, train_op, [input_x], train_flow, 173 | metrics={'loss': train_loss}, 174 | summaries=tf.summary.merge_all(spt.GraphKeys.AUTO_HISTOGRAM) 175 | ) 176 | # anneal the learning rate 177 | trainer.anneal_after( 178 | learning_rate, 179 | epochs=config.lr_anneal_epoch_freq, 180 | steps=config.lr_anneal_step_freq 181 | ) 182 | validator = spt.Validator( 183 | loop, train_loss, [input_x], val_flow, 184 | ) 185 | trainer.evaluate_after_epochs(validator, freq=10) 186 | trainer.log_after_epochs(freq=1) 187 | trainer.run() 188 | saver.save() 189 | 190 | # save the training infomation 191 | firWrite = True 192 | num = 0 193 | time0 = time.time() 194 | for [x_train] in train_flow: 195 | if config.savetrainDS: 196 | # log prob of each metric of each instance 197 | log_prob_per_element_univariate_TS_list_item_Train = (session.run( 198 | log_prob_per_element_univariate_TS_All, 199 | feed_dict={input_x: x_train} 200 | )) 201 | log_prob_per_element_univariate_TS_list_Train = log_prob_per_element_univariate_TS_list_item_Train 202 | log_prob_per_element_list_Train = np.sum( 203 | np.array(log_prob_per_element_univariate_TS_list_item_Train), axis=1 204 | ).tolist() 205 | if firWrite: 206 | save_file( 207 | results.system_path("train_summary"), "OutlierScores_metric.txt", 208 | log_prob_per_element_univariate_TS_list_Train 209 | ) 210 | save_file( 211 | results.system_path("train_summary"), "OutlierScores.txt", log_prob_per_element_list_Train) 212 | else: 213 | save_file( 214 | results.system_path("train_summary"), "OutlierScores_metric.txt", 215 | log_prob_per_element_univariate_TS_list_Train, "\n", "a" 216 | ) 217 | save_file( 218 | results.system_path("train_summary"), "OutlierScores.txt", 219 | log_prob_per_element_list_Train, "\n", "a" 220 | ) 221 | 222 | firWrite = False 223 | num += 1 224 | if num % 1000 == 0: 225 | print( 226 | "-----Train %s >>>>>:Sum time of batch instances:%s" % 227 | (num, float(time.time()-time0)/float(num)) 228 | ) 229 | del train_flow, val_flow 230 | 231 | # online test 232 | time2 = time.time() 233 | log_prob_per_element_list, log_prob_per_element_univariate_TS_list = [], [] 234 | if config.batchTest: 235 | num = 0 236 | for [x_test] in test_flow: 237 | if config.savetestDS: 238 | # log prob of each metric of each instance 239 | log_prob_per_element_univariate_TS_list_item = 
(session.run( 240 | log_prob_per_element_univariate_TS_All, 241 | feed_dict={input_x: x_test} 242 | )) 243 | log_prob_per_element_univariate_TS_list += log_prob_per_element_univariate_TS_list_item.tolist() 244 | log_prob_per_element_list += np.sum(np.array( 245 | log_prob_per_element_univariate_TS_list_item), axis=1 246 | ).tolist() 247 | 248 | num += 1 249 | if num % 200 == 0: 250 | print( 251 | "-----Test %s >>>>>:Sum time of batch instances:%s" % 252 | (num, float(time.time()-time2)/float(num)) 253 | ) 254 | else: 255 | num = 1 256 | for batch_x in x_test: 257 | if config.savetestTS: 258 | log_prob_per_element_list_item = (session.run(log_prob_per_element, feed_dict={input_x: [batch_x]})) 259 | log_prob_per_element_list.append(log_prob_per_element_list_item) 260 | 261 | if config.savetestDS: 262 | log_prob_per_element_univariate_TS_list_item = (session.run( 263 | log_prob_per_element_univariate_TS, 264 | feed_dict={input_x: [batch_x]} 265 | )) 266 | log_prob_per_element_univariate_TS_list.append(log_prob_per_element_univariate_TS_list_item) 267 | log_prob_per_element_list.append(sum(log_prob_per_element_univariate_TS_list_item)) 268 | 269 | if num % 200 == 0: 270 | print( 271 | "-----Test>>>>>:%d, average time of each instance:%s" % 272 | (num, float(time.time()-time2)/float(num)) 273 | ) 274 | num += 1 275 | 276 | # get the lable file name and its line cnt number 277 | allLabelFileNameLineCntList = get_machineID(results, config.labelpath) 278 | 279 | print("No of OutlierScores for all dataPoint:(%s):" % len(log_prob_per_element_list)) 280 | if config.savetestDS: 281 | save_file( 282 | results.system_path("result_summary"), "OutlierScores_metric.txt", 283 | cat_List(allLabelFileNameLineCntList, log_prob_per_element_univariate_TS_list) 284 | ) 285 | save_file( 286 | results.system_path("result_summary"), "OutlierScores.txt", 287 | cat_List(allLabelFileNameLineCntList, log_prob_per_element_list) 288 | ) 289 | 290 | if config.evaluation: 291 | # Prepraration for the hitory two-metric results 292 | twoMetricScore = read_file(results.system_path("train_summary"), "OutlierScores_metric.txt") 293 | ave_twoMetricScore = np.mean(np.array(twoMetricScore), axis=0).tolist() 294 | save_file(results.system_path("result_summary"), "PRF.txt", 295 | ["Average score of each univariate time series", "\n"], ",") 296 | save_file(results.system_path("result_summary"), "PRF.txt", 297 | ave_twoMetricScore+["\n"], ",", "a") 298 | save_file(results.system_path("result_summary"), "PRF.txt", 299 | ["Threshold", "F", "Precision", "Recall", "TP", "FP", "FN", "\n"], ",", "a") 300 | 301 | # get the sorted item each metric by change score 302 | twoMetricScoreList = cal_scoreChanges(log_prob_per_element_list, 303 | ave_twoMetricScore, log_prob_per_element_univariate_TS_list) 304 | MetricResult = session.run(SORT_UNIVARIATE_TS, 305 | feed_dict={SORT_UNIVARIATE_TS_INPUT: twoMetricScoreList}) 306 | save_file(results.system_path("result_summary"), "MetricResult.txt", 307 | cat_List(allLabelFileNameLineCntList, MetricResult)) 308 | 309 | # POT evalution 310 | POT_TH = pot_eval( 311 | read_file(results.system_path("train_summary"), "OutlierScores.txt", "float"), config.q, config.level 312 | ) 313 | resultArray, outlierLabelfileNameLineCntList = cal_binaryResult( 314 | log_prob_per_element_list, POT_TH, time_indexs2, config.saveMetricInfo, allLabelFileNameLineCntList 315 | ) 316 | evaluate(results, config.labelpath, resultArray, time_indexs2, POT_TH) 317 | 318 | # print the final metrics and close the results object 319 | 
print_with_title('Results', results.format_metrics(), before='\n') 320 | results.close() 321 | 322 | interpretation_hit_ratio( 323 | truth_filepath=config.interpret_filepath, 324 | prediction_filepath=os.path.join(config.result_dir, dirName, "result_summary", "MetricResult.txt") 325 | ) 326 | 327 | 328 | if __name__ == '__main__': 329 | main() 330 | -------------------------------------------------------------------------------- /code/evaluate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from util import read_file, save_file 3 | import numpy as np 4 | import os 5 | 6 | 7 | def evaluate(MLResult, labelpath, resultArray, timeIndex, threshold): 8 | """ 9 | evalute the results 10 | return F score of prediction and truth. 11 | """ 12 | groundTruthArray = [] 13 | TPArray = [] 14 | num = 0 15 | for fileName in read_file(MLResult.system_path("mid_summary"), "TestFileNameList.txt"): 16 | with open(labelpath + fileName,"r") as f: 17 | fline = f.readlines() 18 | for line in fline: 19 | count = line.strip("\n") 20 | if int(count) == 1 and num in timeIndex: 21 | groundTruthArray.append(num) 22 | num += 1 23 | 24 | TP = 0 25 | for i in resultArray: 26 | if i in groundTruthArray: 27 | TP += 1 28 | TPArray.append(i) 29 | 30 | FP = len(resultArray) - TP 31 | FN = len(groundTruthArray) - TP 32 | Precision = TP / (float(TP + FP)) if TP + FP != 0 else 1 33 | Recall = TP/(float(TP + FN)) if TP+FN != 0 else 1 34 | F = 0 if Recall + Precision == 0 else (2 * Recall * Precision)/(Recall + Precision) 35 | save_file( 36 | MLResult.system_path("result_summary"), "PRF.txt", 37 | [threshold, F, Precision, Recall, TP, FP, FN, "\n"], ",", "a" 38 | ) 39 | return F 40 | 41 | 42 | def interpretation_hit_ratio(truth_filepath, prediction_filepath): 43 | """ 44 | compute top 100%/120% interpretation hit ratio given truth lists of univariate time series 45 | that contribute to outlier judgement and predicted lists of univariate time series. 46 | return top 100%/120% interpretation hit ratio 47 | """ 48 | with open(truth_filepath, 'r') as f: 49 | gt = f.readlines() 50 | 51 | with open(prediction_filepath, 'r') as f: 52 | result = f.readlines() 53 | 54 | gtDict = {} 55 | for i in gt: 56 | iList = i.strip("\n").strip("\r").split(",") 57 | gtDict[iList[0]] = iList[1:] 58 | 59 | resultDict = {} 60 | for i in result: 61 | iList = i.strip("\n").strip("\r").replace(".txt", "").split(",") 62 | resultDict[iList[0]] = iList[1:] 63 | 64 | for rate in [1.0, 1.2]: 65 | accurate_list = [] 66 | for k in gtDict.keys(): 67 | t1 = resultDict[k] 68 | t2 = gtDict[k] 69 | t3 = list(set(t2).intersection(t1[0:int(len(t2) * rate)])) 70 | accurate_list.append(float(len(t3)) / float(len(t2))) 71 | print("top {}% interpretation hit ratio: ".format(rate * 100), sum(accurate_list) / len(accurate_list)) 72 | 73 | -------------------------------------------------------------------------------- /code/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tfsnippet as spt 3 | import functools 4 | import tensorflow as tf 5 | from tensorflow.contrib.framework import arg_scope, add_arg_scope 6 | 7 | from config import ExpConfig 8 | config = ExpConfig() 9 | 10 | 11 | @spt.global_reuse 12 | @add_arg_scope 13 | def q_net(x, observed=None, n_z=None, is_training=False, is_initializing=False): 14 | """ 15 | Inference net 16 | param x: input X, multivariate time series data. 17 | return q net structure. 
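The input is reshaped to (timeLength, metricNumber, 1) and passed through four ResNet convolution blocks that convolve and downsample along the time axis only; the flattened features then produce the categorical logits of q(c|x) and the mean/log-std of the Gaussian q(z|x).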
18 | """ 19 | net = spt.BayesianNet(observed=observed) 20 | 21 | normalizer_fn = None if not config.act_norm else functools.partial( 22 | spt.layers.act_norm, 23 | axis=-1 if config.channels_last else -3, 24 | initializing=is_initializing, 25 | value_ndims=3, 26 | ) 27 | print("="*10+"qnet"+"="*10) 28 | 29 | # compute the hidden features 30 | with arg_scope([spt.layers.resnet_conv2d_block], 31 | kernel_size=config.kernel_size2, 32 | shortcut_kernel_size=config.shortcut_kernel_size, 33 | activation_fn=tf.nn.elu, 34 | normalizer_fn=normalizer_fn, 35 | kernel_regularizer=spt.layers.l2_regularizer(config.l2_reg), 36 | channels_last=config.channels_last): 37 | print("qx:%s"%x.get_shape()) 38 | h_x = tf.reshape( 39 | tf.to_float(x), 40 | [-1, config.timeLength, config.metricNumber, 1] 41 | if config.channels_last 42 | else [-1, 1, config.timeLength, config.metricNumber] 43 | ) 44 | print("q1:%s"%h_x.get_shape()) 45 | h_x = spt.layers.resnet_conv2d_block( 46 | h_x, 1, kernel_size=(config.kernel_size1, 1), strides=(config.strides1, 1) 47 | ) 48 | print("q2:%s"%h_x.get_shape()) 49 | h_x = spt.layers.resnet_conv2d_block( 50 | h_x, 1, kernel_size=(config.kernel_size1, 1), strides=(config.strides1, 1) 51 | ) 52 | print("q3:%s"%h_x.get_shape()) 53 | h_x = spt.layers.resnet_conv2d_block( 54 | h_x, 1, kernel_size=(config.kernel_size2, 1), strides=(config.strides2, 1) 55 | ) 56 | print("q4:%s"%h_x.get_shape()) 57 | h_x = spt.layers.resnet_conv2d_block( 58 | h_x, 1, kernel_size=(config.kernel_size2, 1), strides=(config.strides2, 1) 59 | ) 60 | print("q5:%s"%h_x.get_shape()) 61 | 62 | h_x = spt.ops.reshape_tail(h_x, ndims=3, shape=[-1]) 63 | print("q6:%s" % h_x.get_shape()) 64 | 65 | # sample y ~ q(y|x) 66 | c_logits = spt.layers.dense(h_x, config.n_c, name='c_logits') 67 | c = net.add('c', spt.Categorical(c_logits)) 68 | c_one_hot = tf.one_hot(c, config.n_c, dtype=tf.float32) 69 | print("qc:%s, %s, %s" % (c_logits.shape, c.shape, c_one_hot.shape)) 70 | h_z = h_x 71 | 72 | # sample z ~ q(z|x) 73 | z_mean = spt.layers.dense(h_z, config.z_dim, name='z_mean') 74 | z_logstd = spt.layers.dense(h_z, config.z_dim, name='z_logstd', activation_fn=tf.nn.elu) + config.std_epsilon 75 | z = net.add('z', spt.Normal(mean=z_mean, logstd=z_logstd), n_samples=n_z, group_ndims=1) 76 | print("q7:%s, %s, %s" % (z_mean.get_shape(), z_logstd.get_shape(), z.get_shape())) 77 | 78 | return net 79 | 80 | 81 | @spt.global_reuse 82 | @add_arg_scope 83 | def p_net(observed=None, n_z=None, is_training=False, is_initializing=False): 84 | """ 85 | Generative net 86 | return p net structure. 87 | """ 88 | net = spt.BayesianNet(observed=observed) 89 | 90 | normalizer_fn = None if not config.act_norm else functools.partial( 91 | spt.layers.act_norm, 92 | axis=-1 if config.channels_last else -3, 93 | initializing=is_initializing, 94 | value_ndims=3, 95 | ) 96 | 97 | def make_component(i): 98 | normal = spt.Normal( 99 | mean=tf.get_variable('mean_{}'.format(i), shape=[1, config.z_dim], 100 | dtype=tf.float32, trainable=True), 101 | logstd=tf.maximum( 102 | tf.get_variable('logstd_{}'.format(i), shape=[1, config.z_dim], 103 | dtype=tf.float32, trainable=True), 104 | -1. 
105 | ) 106 | ) 107 | return normal.expand_value_ndims(1) 108 | 109 | components = [make_component(i) for i in range(config.n_c)] 110 | mixture = spt.Mixture( 111 | categorical=spt.Categorical(logits=tf.zeros([1, config.n_c])), 112 | components=components, 113 | is_reparameterized=True 114 | ) 115 | z = net.add('z', mixture, n_samples=n_z) 116 | 117 | print("="*10+"pnet"+"="*10) 118 | # compute the hidden features 119 | with arg_scope([spt.layers.resnet_deconv2d_block], 120 | kernel_size=config.kernel_size2, 121 | shortcut_kernel_size=config.shortcut_kernel_size, 122 | activation_fn=tf.nn.elu, 123 | normalizer_fn=normalizer_fn, 124 | kernel_regularizer=spt.layers.l2_regularizer(config.l2_reg), 125 | channels_last=config.channels_last): 126 | print("px:%s"%z.get_shape()) 127 | h_z = spt.layers.dense( 128 | z, int(config.timeLength / (config.strides1 ** 2) / (config.strides2 ** 2) * int(config.metricNumber)) 129 | ) 130 | h_z = spt.ops.reshape_tail( 131 | h_z, 132 | ndims=1, 133 | shape=( 134 | int(config.timeLength / (config.strides1 ** 2) / (config.strides2 ** 2)), 135 | int(config.metricNumber), 1 136 | ) 137 | if config.channels_last else ( 138 | 1, int(config.timeLength / (config.strides1 ** 2) / (config.strides2 ** 2)), 139 | int(config.metricNumber) 140 | ) 141 | ) 142 | print("p1:%s"%h_z.get_shape()) 143 | h_z = spt.layers.resnet_deconv2d_block( 144 | h_z, 1, kernel_size=(config.kernel_size2, 1), strides=(config.strides2, 1) 145 | ) 146 | print("p2:%s"%h_z.get_shape()) 147 | h_z = spt.layers.resnet_deconv2d_block( 148 | h_z, 1, kernel_size=(config.kernel_size2, 1), strides=(config.strides2, 1) 149 | ) 150 | print("p3:%s"%h_z.get_shape()) 151 | h_z = spt.layers.resnet_deconv2d_block( 152 | h_z, 1, kernel_size=(config.kernel_size1, 1), strides=(config.strides1, 1) 153 | ) 154 | print("p4:%s"%h_z.get_shape()) 155 | h_z = spt.layers.resnet_deconv2d_block( 156 | h_z, 1, kernel_size=(config.kernel_size1, 1), strides=(config.strides1, 1) 157 | ) 158 | print("p5:%s"%h_z.get_shape()) 159 | 160 | # sample x ~ p(x|z) 161 | x_mean = spt.layers.conv2d( 162 | h_z, 1, (1, 1), padding='same', name='x_mean', 163 | channels_last=config.channels_last 164 | ) 165 | x_logstd = spt.layers.conv2d( 166 | h_z, 1, (1, 1), padding='same', name='x_logstd', 167 | channels_last=config.channels_last, activation_fn=tf.nn.elu, 168 | ) + config.std_epsilon 169 | x = net.add('x', spt.Normal(mean=x_mean, logstd=x_logstd), n_samples=n_z, group_ndims=3) 170 | print("p6:%s, %s, %s" % (x_mean.get_shape(), x_logstd.get_shape(), x.get_shape())) 171 | 172 | return net 173 | -------------------------------------------------------------------------------- /code/pot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from math import log 3 | import numpy as np 4 | from scipy.optimize import minimize 5 | 6 | 7 | def _rootsFinder(fun, jac, bounds, npoints, method): 8 | """ 9 | Find possible roots of a scalar function 10 | method : str 11 | 'regular' : regular sample of the search interval, 12 | 'random' : uniform (distribution) sample of the search interval 13 | 14 | Return possible roots of the function 15 | """ 16 | if method == 'regular': 17 | step = (bounds[1] - bounds[0]) / (npoints + 1) 18 | X0 = np.arange(bounds[0] + step, bounds[1], step) 19 | elif method == 'random': 20 | X0 = np.random.uniform(bounds[0], bounds[1], npoints) 21 | 22 | def objFun(X, f, jac): 23 | g = 0 24 | j = np.zeros(X.shape) 25 | i = 0 26 | for x in X: 27 | fx = f(x) 28 | g = g + fx ** 2 
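# the Jacobian of the squared residual f(x)**2 is 2*f(x)*f'(x), fed to L-BFGS-B below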
29 | j[i] = 2 * fx * jac(x) 30 | i = i + 1 31 | return g, j 32 | 33 | opt = minimize(lambda X: objFun(X, fun, jac), X0, 34 | method='L-BFGS-B', 35 | jac=True, bounds=[bounds] * len(X0)) 36 | 37 | X = opt.x 38 | np.round(X, decimals=5) 39 | return np.unique(X) 40 | 41 | 42 | def _log_likelihood(Y, gamma, sigma): 43 | """ 44 | Compute the log-likelihood for the Generalized Pareto Distribution (μ=0) 45 | Returns log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 46 | """ 47 | n = Y.size 48 | if gamma != 0: 49 | tau = gamma / sigma 50 | L = -n * log(sigma) - (1 + (1 / gamma)) * (np.log(1 + tau * Y)).sum() 51 | else: 52 | L = n * (1 + log(Y.mean())) 53 | return L 54 | 55 | 56 | class POT: 57 | """ 58 | This class allows to run POT algorithm on univariate dataset (upper-bound) 59 | """ 60 | 61 | def __init__(self, q=1e-4): 62 | self.proba = q 63 | self.extreme_quantile = None 64 | self.init_data = None 65 | self.init_threshold = None 66 | self.peaks = None 67 | self.n = 0 68 | self.Nt = 0 69 | 70 | def initialize(self, init_data, level=0.02, min_extrema=False): 71 | self.init_data = np.array(init_data) 72 | n_init = self.init_data.size 73 | 74 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 75 | self.init_threshold = S[int(level * n_init)] # t is fixed for the whole algorithm 76 | 77 | # initial peaks 78 | self.peaks = -1*self.init_data[self.init_data < self.init_threshold] + self.init_threshold 79 | self.Nt = self.peaks.size 80 | self.n = n_init 81 | g, s, l = self._grimshaw() 82 | self.extreme_quantile = self._quantile(g, s) 83 | return self.extreme_quantile 84 | 85 | def _grimshaw(self, epsilon=1e-8, n_points=10): 86 | """ 87 | Compute the GPD parameters estimation with the Grimshaw's trick 88 | """ 89 | def u(s): 90 | return 1 + np.log(s).mean() 91 | 92 | def v(s): 93 | return np.mean(1 / s) 94 | 95 | def w(Y, t): 96 | s = 1 + t * Y 97 | us = u(s) 98 | vs = v(s) 99 | return us * vs - 1 100 | 101 | def jac_w(Y, t): 102 | s = 1 + t * Y 103 | us = u(s) 104 | vs = v(s) 105 | jac_us = (1 / t) * (1 - vs) 106 | jac_vs = (1 / t) * (-vs + np.mean(1 / s ** 2)) 107 | return us * jac_vs + vs * jac_us 108 | 109 | Ym = self.peaks.min() 110 | YM = self.peaks.max() 111 | Ymean = self.peaks.mean() 112 | 113 | a = -1 / YM 114 | if abs(a) < 2 * epsilon: 115 | epsilon = abs(a) / n_points 116 | 117 | a = a + epsilon 118 | b = 2 * (Ymean - Ym) / (Ymean * Ym) 119 | c = 2 * (Ymean - Ym) / (Ym ** 2) 120 | 121 | # We look for possible roots 122 | left_zeros = _rootsFinder(lambda t: w(self.peaks, t), 123 | lambda t: jac_w(self.peaks, t), 124 | (a + epsilon, -epsilon), 125 | n_points, 'regular') 126 | 127 | right_zeros = _rootsFinder(lambda t: w(self.peaks, t), 128 | lambda t: jac_w(self.peaks, t), 129 | (b, c), 130 | n_points, 'regular') 131 | 132 | # all the possible roots 133 | zeros = np.concatenate((left_zeros, right_zeros)) 134 | 135 | # 0 is always a solution so we initialize with it 136 | gamma_best = 0 137 | sigma_best = Ymean 138 | ll_best = _log_likelihood(self.peaks, gamma_best, sigma_best) 139 | 140 | # we look for better candidates 141 | for z in zeros: 142 | gamma = u(1 + z * self.peaks) - 1 143 | sigma = gamma / z 144 | ll = _log_likelihood(self.peaks, gamma, sigma) 145 | if ll > ll_best: 146 | gamma_best = gamma 147 | sigma_best = sigma 148 | ll_best = ll 149 | 150 | return gamma_best, sigma_best, ll_best 151 | 152 | def _quantile(self, gamma, sigma): 153 | """ 154 | Compute the quantile at level 1-q 155 | Returns quantile at level 1-q for the GPD(γ,σ,μ=0) 156 | """ 
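# r = q*n/Nt rescales the risk q by the proportion of peaks below the initial threshold;
# the GPD tail quantile is then offset from the initial threshold to obtain the final detection threshold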
157 | r = self.n * self.proba / self.Nt 158 | if gamma != 0: 159 | return self.init_threshold - (sigma / gamma) * (pow(r, -gamma) - 1) 160 | else: 161 | return self.init_threshold + sigma * (r) 162 | 163 | -------------------------------------------------------------------------------- /code/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import time 4 | import pandas as pd 5 | import csv 6 | import numpy as np 7 | from functools import partial 8 | np.seterr(divide='ignore', invalid='ignore') 9 | import multiprocessing as mul 10 | 11 | delEXTREVALUE = True 12 | 13 | 14 | def read_file(pathName, fileName, Type="string", name=False): 15 | """ 16 | read the content from txt file to a matrix 17 | return the matrix of the file 18 | """ 19 | matrix = [] 20 | with open(os.path.join(pathName, fileName), "r") as f: 21 | lines = f.readlines() 22 | for line in lines: 23 | if "," in line: 24 | vector = line.strip("\r").strip("\n").split(',') 25 | matrix.append([float(v) for v in vector[1:]] if name else [float(v) for v in vector]) 26 | else: 27 | if Type == "string": 28 | matrix.append(line.strip("\r").strip("\n")) 29 | if Type == "float": 30 | matrix.append(float(line.strip("\r").strip("\n"))) 31 | return matrix 32 | 33 | 34 | def save_file(pathName, fileName, resultList, cat="\n", writeType = "w"): 35 | """ 36 | save the 'resultList' in a 'fileName' File 37 | """ 38 | with open(os.path.join(pathName, fileName), writeType) as f: 39 | if len(np.array(resultList).shape) == 1: 40 | f.write(cat.join(str(x) for x in resultList) + '\n') 41 | if len(np.array(resultList).shape) == 2: 42 | w = csv.writer(f, delimiter=',') 43 | w.writerows(resultList) 44 | 45 | 46 | def data_norm_all(path, dirPath, timeLength, metricNumber, norm): 47 | """ 48 | use the entire metric for normalize 49 | return all normalized matrix data and filepath. 50 | """ 51 | df = pd.read_csv(os.path.join(dirPath, path)).astype(float) 52 | matrix = np.array(df.values.tolist()) 53 | matrix = np.around(matrix, decimals=2) 54 | if norm: 55 | if delEXTREVALUE: 56 | Y = np.sort(matrix, axis=0) 57 | a, _ = Y.shape 58 | Z = Y[int(0.01*a):int(0.99*a), :] 59 | m_mean = np.mean(Z, axis=0, keepdims=True).astype(float) 60 | m_std = np.std(Z, axis=0, keepdims=True).astype(float) 61 | else: 62 | m_mean = np.mean(matrix, axis=0).astype(float) 63 | m_std = np.std(matrix, axis=0).astype(float) 64 | 65 | norm_matrix = (matrix - m_mean) / m_std 66 | norm_matrix = np.where(np.isnan(norm_matrix), 0, norm_matrix) 67 | norm_matrix = np.around(norm_matrix, decimals=2) 68 | norm_matrix = norm_matrix.reshape(-1, timeLength*metricNumber) 69 | return norm_matrix.tolist(), path, norm_matrix.shape[0] 70 | else: 71 | matrix = np.around(matrix, decimals=2) 72 | matrix = np.array(matrix).reshape(-1, timeLength*metricNumber) 73 | return matrix.tolist(), path, matrix.shape[0] 74 | 75 | 76 | def get_data_eachday(path, dirPath, timeLength, metricNumber, norm): 77 | """ 78 | use the each day metric for normalize 79 | return normalized matrix data for each day and filepath. 
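Each instance (a window of timeLength points per metric) is standardized with its own per-metric mean and std; when delEXTREVALUE is True, the lowest and highest 1% of the sorted values are discarded before these statistics are computed.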
80 | """ 81 | df = pd.read_csv(os.path.join(dirPath, path)).astype(float) 82 | matrix = np.array(df.values.tolist()) 83 | matrix = np.around(matrix, decimals=2) 84 | 85 | if norm: 86 | matrix = matrix.reshape(-1, timeLength, metricNumber) 87 | if delEXTREVALUE: 88 | Y = np.sort(matrix, axis=1) 89 | a,b,c = Y.shape 90 | Z = Y[:, int(0.01*b):int(0.99*b), :] 91 | m_mean = np.mean(Z, axis=1, keepdims=True).astype(float) 92 | m_std = np.std(Z, axis=1, keepdims=True).astype(float) 93 | else: 94 | m_mean = np.mean(matrix, axis=1, keepdims=True).astype(float) 95 | m_std = np.std(matrix, axis=1, keepdims=True).astype(float) 96 | 97 | norm_matrix = (matrix - m_mean) / m_std 98 | norm_matrix = np.where(np.isnan(norm_matrix), 0, norm_matrix) 99 | norm_matrix = np.around(norm_matrix, decimals=2) 100 | norm_matrix = norm_matrix.reshape(-1, timeLength*metricNumber) 101 | return norm_matrix.tolist(), path, norm_matrix.shape[0] 102 | else: 103 | matrix = np.around(matrix, decimals=2) 104 | matrix = np.array(matrix).reshape(-1, timeLength*metricNumber) 105 | return matrix.tolist(), path, matrix.shape[0] 106 | 107 | 108 | def load_matrix_allData(dataReadformat, dirPath1, dirPath2, 109 | timeLength, metricNumber, fileInfo1, fileInfo2, MLResult, norm): 110 | """ 111 | read and normalize the data by Parallel using pool 112 | return the two matrix data and corresponding time index. 113 | """ 114 | st = time.time() 115 | 116 | matrix1, matrix2 = [], [] 117 | fileDirList1, fileDirList2 = [], [] 118 | 119 | WORKERS = mul.cpu_count() 120 | pool = mul.Pool(processes=WORKERS, maxtasksperchild=WORKERS) 121 | 122 | paras1 = [path for path in os.listdir(dirPath1) if ".txt" in path] 123 | paras2 = [path for path in os.listdir(dirPath2) if ".txt" in path] 124 | 125 | if dataReadformat == "all": 126 | get_data_partial1 = partial( 127 | data_norm_all, dirPath=dirPath1, timeLength=timeLength, metricNumber=metricNumber, norm=norm 128 | ) 129 | get_data_partial2 = partial( 130 | data_norm_all, dirPath=dirPath2, timeLength=timeLength, metricNumber=metricNumber, norm=norm 131 | ) 132 | result1 = pool.map_async(get_data_partial1, paras1) 133 | result2 = pool.map_async(get_data_partial2, paras2) 134 | else: 135 | get_data_partial1 = partial( 136 | get_data_eachday, dirPath=dirPath1, timeLength=timeLength, metricNumber=metricNumber, norm=norm 137 | ) 138 | get_data_partial2 = partial( 139 | get_data_eachday, dirPath=dirPath2, timeLength=timeLength, metricNumber=metricNumber, norm=norm 140 | ) 141 | result1 = pool.map_async(get_data_partial1, paras1) 142 | result2 = pool.map_async(get_data_partial2, paras2) 143 | 144 | pool.close() 145 | pool.join() 146 | 147 | for i in result1.get(): 148 | matrix1 += i[0] 149 | for j in range(1, i[2]+1): 150 | fileDirList1.append(i[1]+'+'+str(j)) 151 | for i in result2.get(): 152 | matrix2 += i[0] 153 | for j in range(1, i[2] + 1): 154 | fileDirList2.append(i[1]+'+'+str(j)) 155 | time_indexs1 = [i for i in range(0, len(matrix1))] 156 | time_indexs2 = [i for i in range(0, len(matrix2))] 157 | save_file(MLResult.system_path("mid_summary"), fileInfo1, fileDirList1) 158 | save_file(MLResult.system_path("mid_summary"), fileInfo2, fileDirList2) 159 | print("-----Get data>>>>>:Time:%s" % (time.time()-st)) 160 | 161 | return np.array(matrix1), np.array(time_indexs1), np.array(matrix2), np.array(time_indexs2) 162 | 163 | 164 | def cat_List(a, b): 165 | """ 166 | cat the a: fileNameLineCnt list and b: resultList 167 | return the combined list. 
168 | """ 169 | c = [] 170 | i = 0 171 | while i < len(a): 172 | if isinstance(b[i], list): 173 | c.append([a[i]] + b[i]) 174 | elif isinstance(b[i], np.ndarray): 175 | c.append([a[i]] + b[i].tolist()) 176 | else: 177 | c.append([a[i]] + [b[i]]) 178 | i += 1 179 | return c 180 | 181 | 182 | def get_machineID(MLResult, labelpath=None): 183 | """ 184 | get the list: fileName + lineNum 185 | return the result list. 186 | """ 187 | if labelpath is None: 188 | return read_file(MLResult.system_path("mid_summary"), "TestFileNameList.txt") 189 | else: 190 | labelFileNameLineCntList = [] 191 | for fileName in read_file(MLResult.system_path("mid_summary"), "TestFileNameList.txt"): 192 | with open(labelpath + fileName, "r") as f: 193 | fline = f.readlines() 194 | lineCnt = 1 195 | while lineCnt <= len(fline): 196 | labelFileNameLineCntList.append(fileName+"+"+str(lineCnt)) 197 | lineCnt += 1 198 | return labelFileNameLineCntList 199 | 200 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib == 3.0.2 2 | numpy == 1.15.4 3 | pandas == 0.23.4 4 | scipy == 1.2.0 5 | scikit_learn == 0.20.2 6 | tensorflow == 1.14.0 7 | tensorflow_estimator == 1.14.0 8 | git+https://github.com/thu-ml/zhusuan.git 9 | git+https://github.com/haowen-xu/tfsnippet.git@v0.2.0-alpha4 10 | imageio == 2.4.1 11 | fs == 2.3.0 12 | click == 7.0 --------------------------------------------------------------------------------