├── data_factory └── data_loader.py ├── img ├── DCdetector.jpg ├── art-compare.png ├── result_1.png ├── result_2.png ├── result_3.png ├── result_4.png ├── result_count.jpg └── workflow.png ├── main.py ├── metrics ├── AUC.py ├── Matthews_correlation_coefficient.py ├── affiliation │ ├── _affiliation_zone.py │ ├── _integral_interval.py │ ├── _single_ground_truth_event.py │ ├── generics.py │ └── metrics.py ├── combine_all_scores.py ├── customizable_f1_score.py ├── evaluate_utils.py ├── evaluator.py ├── f1_score_f1_pa.py ├── f1_series.py ├── fc_score.py ├── metrics.py ├── precision_at_k.py └── vus │ ├── analysis │ ├── robustness_eval.py │ └── score_computation.py │ ├── metrics.py │ ├── models │ ├── distance.py │ └── feature.py │ └── utils │ ├── metrics.py │ └── slidingWindows.py ├── model ├── DCdetector.py ├── RevIN.py ├── attn.py └── embed.py ├── readme.md ├── requirements.txt ├── result_count.jpg ├── scripts ├── Ablation_Multiscale.sh ├── Ablation_Window_Size.sh ├── Ablation_attention_head.sh ├── Ablation_encoder_layer.sh ├── MSL.sh ├── NIPS_TS_Swan.sh ├── NIPS_TS_Water.sh ├── PSM.sh ├── SMAP.sh ├── SMD.sh ├── SWAT.sh ├── UCR.sh └── UCR_AUG.sh ├── solver.py └── utils ├── logger.py └── utils.py /data_factory/data_loader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import random 4 | from torch.utils.data import Dataset 5 | from torch.utils.data import DataLoader 6 | from PIL import Image 7 | import numpy as np 8 | import collections 9 | import numbers 10 | import math 11 | import pandas as pd 12 | from sklearn.preprocessing import StandardScaler 13 | import pickle 14 | 15 | 16 | class PSMSegLoader(object): 17 | def __init__(self, data_path, win_size, step, mode="train"): 18 | self.mode = mode 19 | self.step = step 20 | self.win_size = win_size 21 | self.scaler = StandardScaler() 22 | data = pd.read_csv(data_path + '/train.csv') 23 | data = data.values[:, 1:] 24 | data = np.nan_to_num(data) 25 | self.scaler.fit(data) 26 | data = self.scaler.transform(data) 27 | test_data = pd.read_csv(data_path + '/test.csv') 28 | test_data = test_data.values[:, 1:] 29 | test_data = np.nan_to_num(test_data) 30 | self.test = self.scaler.transform(test_data) 31 | self.train = data 32 | self.val = self.test 33 | self.test_labels = pd.read_csv(data_path + '/test_label.csv').values[:, 1:] 34 | 35 | def __len__(self): 36 | """ 37 | Number of images in the object dataset. 
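# --- Illustrative aside (not part of the original source) --------------------
# The loaders in this file all follow the same preprocessing recipe: replace
# NaNs, fit a StandardScaler on the raw training split only, then apply that
# same scaler to both train and test so the two splits share one normalization.
# A minimal sketch of that recipe on synthetic arrays (the array names here are
# stand-ins, not part of the repo):
import numpy as np
from sklearn.preprocessing import StandardScaler

raw_train = np.nan_to_num(np.random.randn(1000, 25))   # stand-in for train.csv values
raw_test = np.nan_to_num(np.random.randn(500, 25))     # stand-in for test.csv values
scaler = StandardScaler().fit(raw_train)               # statistics from train only
train = scaler.transform(raw_train)
test = scaler.transform(raw_test)                      # reuses the train mean/std
# ------------------------------------------------------------------------------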
38 | """ 39 | if self.mode == "train": 40 | return (self.train.shape[0] - self.win_size) // self.step + 1 41 | elif (self.mode == 'val'): 42 | return (self.val.shape[0] - self.win_size) // self.step + 1 43 | elif (self.mode == 'test'): 44 | return (self.test.shape[0] - self.win_size) // self.step + 1 45 | else: 46 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 47 | 48 | def __getitem__(self, index): 49 | index = index * self.step 50 | if self.mode == "train": 51 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 52 | elif (self.mode == 'val'): 53 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 54 | elif (self.mode == 'test'): 55 | return np.float32(self.test[index:index + self.win_size]), np.float32( 56 | self.test_labels[index:index + self.win_size]) 57 | else: 58 | return np.float32(self.test[ 59 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 60 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 61 | 62 | 63 | class MSLSegLoader(object): 64 | def __init__(self, data_path, win_size, step, mode="train"): 65 | self.mode = mode 66 | self.step = step 67 | self.win_size = win_size 68 | self.scaler = StandardScaler() 69 | data = np.load(data_path + "/MSL_train.npy") 70 | self.scaler.fit(data) 71 | data = self.scaler.transform(data) 72 | test_data = np.load(data_path + "/MSL_test.npy") 73 | self.test = self.scaler.transform(test_data) 74 | self.train = data 75 | self.val = self.test 76 | self.test_labels = np.load(data_path + "/MSL_test_label.npy") 77 | 78 | def __len__(self): 79 | if self.mode == "train": 80 | return (self.train.shape[0] - self.win_size) // self.step + 1 81 | elif (self.mode == 'val'): 82 | return (self.val.shape[0] - self.win_size) // self.step + 1 83 | elif (self.mode == 'test'): 84 | return (self.test.shape[0] - self.win_size) // self.step + 1 85 | else: 86 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 87 | 88 | def __getitem__(self, index): 89 | index = index * self.step 90 | if self.mode == "train": 91 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 92 | elif (self.mode == 'val'): 93 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 94 | elif (self.mode == 'test'): 95 | return np.float32(self.test[index:index + self.win_size]), np.float32( 96 | self.test_labels[index:index + self.win_size]) 97 | else: 98 | return np.float32(self.test[ 99 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 100 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 101 | 102 | 103 | class SMAPSegLoader(object): 104 | def __init__(self, data_path, win_size, step, mode="train"): 105 | self.mode = mode 106 | self.step = step 107 | self.win_size = win_size 108 | self.scaler = StandardScaler() 109 | data = np.load(data_path + "/SMAP_train.npy") 110 | self.scaler.fit(data) 111 | data = self.scaler.transform(data) 112 | test_data = np.load(data_path + "/SMAP_test.npy") 113 | self.test = self.scaler.transform(test_data) 114 | self.train = data 115 | self.val = self.test 116 | self.test_labels = np.load(data_path + "/SMAP_test_label.npy") 117 | 118 | def __len__(self): 119 | if self.mode == "train": 120 | return 
(self.train.shape[0] - self.win_size) // self.step + 1 121 | elif (self.mode == 'val'): 122 | return (self.val.shape[0] - self.win_size) // self.step + 1 123 | elif (self.mode == 'test'): 124 | return (self.test.shape[0] - self.win_size) // self.step + 1 125 | else: 126 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 127 | 128 | def __getitem__(self, index): 129 | index = index * self.step 130 | if self.mode == "train": #train and val did not use label 131 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 132 | elif (self.mode == 'val'): 133 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 134 | elif (self.mode == 'test'): 135 | return np.float32(self.test[index:index + self.win_size]), np.float32( 136 | self.test_labels[index:index + self.win_size]) 137 | else: 138 | return np.float32(self.test[ 139 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 140 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 141 | 142 | 143 | class SMDSegLoader(object): 144 | def __init__(self, data_path, win_size, step, mode="train"): 145 | self.mode = mode 146 | self.step = step 147 | self.win_size = win_size 148 | self.scaler = StandardScaler() 149 | data = np.load(data_path + "/SMD_train.npy")[:,:] 150 | self.scaler.fit(data) 151 | data = self.scaler.transform(data) 152 | test_data = np.load(data_path + "/SMD_test.npy")[:,:] 153 | self.test = self.scaler.transform(test_data) 154 | self.train = data 155 | data_len = len(self.train) 156 | self.val = self.train[(int)(data_len * 0.8):] 157 | self.test_labels = np.load(data_path + "/SMD_test_label.npy")[:] 158 | 159 | def __len__(self): 160 | if self.mode == "train": 161 | return (self.train.shape[0] - self.win_size) // self.step + 1 162 | elif (self.mode == 'val'): 163 | return (self.val.shape[0] - self.win_size) // self.step + 1 164 | elif (self.mode == 'test'): 165 | return (self.test.shape[0] - self.win_size) // self.step + 1 166 | else: 167 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 168 | 169 | def __getitem__(self, index): 170 | index = index * self.step 171 | if self.mode == "train": 172 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 173 | elif (self.mode == 'val'): 174 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 175 | elif (self.mode == 'test'): 176 | return np.float32(self.test[index:index + self.win_size]), np.float32( 177 | self.test_labels[index:index + self.win_size]) 178 | else: 179 | return np.float32(self.test[ 180 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 181 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 182 | 183 | 184 | 185 | class UCRSegLoader(object): 186 | def __init__(self, index, data_path, win_size, step, mode="train"): 187 | self.mode = mode 188 | self.step = step 189 | self.index = index 190 | self.win_size = win_size 191 | self.scaler = StandardScaler() 192 | data = np.load(data_path + "/UCR_"+str(index)+"_train.npy") 193 | self.scaler.fit(data) 194 | data = self.scaler.transform(data) 195 | test_data = np.load(data_path + "/UCR_"+str(index)+"_test.npy") 196 | self.test = self.scaler.transform(test_data) 197 | 198 | self.train = 
data 199 | self.val = self.test 200 | self.test_labels = np.load(data_path + "/UCR_"+str(index)+"_test_label.npy") 201 | if self.mode == "val": 202 | print("train:", self.train.shape) 203 | print("test:", self.test.shape) 204 | 205 | def __len__(self): 206 | if self.mode == "train": 207 | return (self.train.shape[0] - self.win_size) // self.step + 1 208 | elif (self.mode == 'val'): 209 | return (self.val.shape[0] - self.win_size) // self.step + 1 210 | elif (self.mode == 'test'): 211 | return (self.test.shape[0] - self.win_size) // self.step + 1 212 | else: 213 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 214 | 215 | def __getitem__(self, index): 216 | index = index * self.step 217 | if self.mode == "train": 218 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 219 | elif (self.mode == 'val'): 220 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 221 | elif (self.mode == 'test'): 222 | return np.float32(self.test[index:index + self.win_size]), np.float32( 223 | self.test_labels[index:index + self.win_size]) 224 | else: 225 | return np.float32(self.test[ 226 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 227 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 228 | 229 | 230 | class UCRAUGSegLoader(object): 231 | def __init__(self, index, data_path, win_size, step, mode="train"): 232 | self.mode = mode 233 | self.step = step 234 | self.index = index 235 | self.win_size = win_size 236 | self.scaler = StandardScaler() 237 | data = np.load(data_path + "/UCR_AUG_"+str(index)+"_train.npy") 238 | self.scaler.fit(data) 239 | data = self.scaler.transform(data) 240 | test_data = np.load(data_path + "/UCR_AUG_"+str(index)+"_test.npy") 241 | self.test = self.scaler.transform(test_data) 242 | 243 | self.train = data 244 | self.val = self.test 245 | self.test_labels = np.load(data_path + "/UCR_AUG_"+str(index)+"_test_label.npy") 246 | if self.mode == "val": 247 | print("train:", self.train.shape) 248 | print("test:", self.test.shape) 249 | 250 | def __len__(self): 251 | if self.mode == "train": 252 | return (self.train.shape[0] - self.win_size) // self.step + 1 253 | elif (self.mode == 'val'): 254 | return (self.val.shape[0] - self.win_size) // self.step + 1 255 | elif (self.mode == 'test'): 256 | return (self.test.shape[0] - self.win_size) // self.step + 1 257 | else: 258 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 259 | 260 | def __getitem__(self, index): 261 | index = index * self.step 262 | if self.mode == "train": 263 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 264 | elif (self.mode == 'val'): 265 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 266 | elif (self.mode == 'test'): 267 | return np.float32(self.test[index:index + self.win_size]), np.float32( 268 | self.test_labels[index:index + self.win_size]) 269 | else: 270 | return np.float32(self.test[ 271 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 272 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 273 | 274 | 275 | class NIPS_TS_WaterSegLoader(object): 276 | def __init__(self, data_path, win_size, step, mode="train"): 277 | self.mode = mode 278 | 
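# --- Illustrative aside (not part of the original source) --------------------
# Every *SegLoader in this file shares the same sliding-window arithmetic: for
# a series of length T, window size w and stride `step`, __len__ returns
# (T - w) // step + 1 windows, while the fall-through ("thre") branch slides by
# w, i.e. non-overlapping windows. A minimal sketch of that count on toy data
# (the helper name `count_windows` is ours, not part of the repo):
import numpy as np

def count_windows(n_samples, win_size, step):
    # same formula as the __len__ methods above
    return (n_samples - win_size) // step + 1

series = np.arange(10)                                       # T = 10
windows = [series[i:i + 4] for i in range(0, len(series) - 4 + 1)]
assert len(windows) == count_windows(10, 4, 1) == 7          # stride 1: overlapping windows
assert count_windows(10, 4, 4) == 2                          # stride w: non-overlapping ("thre" mode)
# ------------------------------------------------------------------------------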
self.step = step 279 | self.win_size = win_size 280 | self.scaler = StandardScaler() 281 | data = np.load(data_path + "/NIPS_TS_Water_train.npy") 282 | self.scaler.fit(data) 283 | data = self.scaler.transform(data) 284 | test_data = np.load(data_path + "/NIPS_TS_Water_test.npy") 285 | self.test = self.scaler.transform(test_data) 286 | 287 | self.train = data 288 | self.val = self.test 289 | self.test_labels = np.load(data_path + "/NIPS_TS_Water_test_label.npy") 290 | print("test:", self.test.shape) 291 | print("train:", self.train.shape) 292 | 293 | def __len__(self): 294 | 295 | if self.mode == "train": 296 | return (self.train.shape[0] - self.win_size) // self.step + 1 297 | elif (self.mode == 'val'): 298 | return (self.val.shape[0] - self.win_size) // self.step + 1 299 | elif (self.mode == 'test'): 300 | return (self.test.shape[0] - self.win_size) // self.step + 1 301 | else: 302 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 303 | 304 | def __getitem__(self, index): 305 | index = index * self.step 306 | if self.mode == "train": 307 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 308 | elif (self.mode == 'val'): 309 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 310 | elif (self.mode == 'test'): 311 | return np.float32(self.test[index:index + self.win_size]), np.float32( 312 | self.test_labels[index:index + self.win_size]) 313 | else: 314 | return np.float32(self.test[ 315 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 316 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 317 | 318 | 319 | 320 | class NIPS_TS_SwanSegLoader(object): 321 | def __init__(self, data_path, win_size, step, mode="train"): 322 | self.mode = mode 323 | self.step = step 324 | self.win_size = win_size 325 | self.scaler = StandardScaler() 326 | data = np.load(data_path + "/NIPS_TS_Swan_train.npy") 327 | self.scaler.fit(data) 328 | data = self.scaler.transform(data) 329 | test_data = np.load(data_path + "/NIPS_TS_Swan_test.npy") 330 | self.test = self.scaler.transform(test_data) 331 | 332 | self.train = data 333 | self.val = self.test 334 | self.test_labels = np.load(data_path + "/NIPS_TS_Swan_test_label.npy") 335 | print("test:", self.test.shape) 336 | print("train:", self.train.shape) 337 | 338 | def __len__(self): 339 | if self.mode == "train": 340 | return (self.train.shape[0] - self.win_size) // self.step + 1 341 | elif (self.mode == 'val'): 342 | return (self.val.shape[0] - self.win_size) // self.step + 1 343 | elif (self.mode == 'test'): 344 | return (self.test.shape[0] - self.win_size) // self.step + 1 345 | else: 346 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 347 | 348 | def __getitem__(self, index): 349 | index = index * self.step 350 | if self.mode == "train": 351 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 352 | elif (self.mode == 'val'): 353 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 354 | elif (self.mode == 'test'): 355 | return np.float32(self.test[index:index + self.win_size]), np.float32( 356 | self.test_labels[index:index + self.win_size]) 357 | else: 358 | return np.float32(self.test[ 359 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 360 | 
self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 361 | 362 | 363 | class NIPS_TS_CCardSegLoader(object): 364 | def __init__(self, data_path, win_size, step, mode="train"): 365 | self.mode = mode 366 | self.step = step 367 | self.win_size = win_size 368 | self.scaler = StandardScaler() 369 | data = np.load(data_path + "/NIPS_TS_CCard_train.npy") 370 | self.scaler.fit(data) 371 | data = self.scaler.transform(data) 372 | test_data = np.load(data_path + "/NIPS_TS_CCard_test.npy") 373 | self.test = self.scaler.transform(test_data) 374 | 375 | self.train = data 376 | self.val = self.test 377 | self.test_labels = np.load(data_path + "/NIPS_TS_CCard_test_label.npy") 378 | 379 | def __len__(self): 380 | 381 | if self.mode == "train": 382 | return (self.train.shape[0] - self.win_size) // self.step + 1 383 | elif (self.mode == 'val'): 384 | return (self.val.shape[0] - self.win_size) // self.step + 1 385 | elif (self.mode == 'test'): 386 | return (self.test.shape[0] - self.win_size) // self.step + 1 387 | else: 388 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 389 | 390 | def __getitem__(self, index): 391 | index = index * self.step 392 | if self.mode == "train": 393 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 394 | elif (self.mode == 'val'): 395 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 396 | elif (self.mode == 'test'): 397 | return np.float32(self.test[index:index + self.win_size]), np.float32( 398 | self.test_labels[index:index + self.win_size]) 399 | else: 400 | return np.float32(self.test[ 401 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 402 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 403 | 404 | 405 | 406 | 407 | class SMD_OriSegLoader(object): 408 | def __init__(self, index, data_path, win_size, step, mode="train"): 409 | self.mode = mode 410 | self.step = step 411 | self.index = index 412 | self.win_size = win_size 413 | self.scaler = StandardScaler() 414 | data = np.load(data_path + "/SMD_Ori_"+str(index)+"_train.npy") 415 | self.scaler.fit(data) 416 | data = self.scaler.transform(data) 417 | test_data = np.load(data_path + "/SMD_Ori_"+str(index)+"_test.npy") 418 | self.test = self.scaler.transform(test_data) 419 | 420 | self.train = data 421 | self.val = self.test 422 | self.test_labels = np.load(data_path + "/SMD_Ori_"+str(index)+"_test_label.npy") 423 | if self.mode == "val": 424 | print("train:", self.train.shape) 425 | print("test:", self.test.shape) 426 | 427 | def __len__(self): 428 | if self.mode == "train": 429 | return (self.train.shape[0] - self.win_size) // self.step + 1 430 | elif (self.mode == 'val'): 431 | return (self.val.shape[0] - self.win_size) // self.step + 1 432 | elif (self.mode == 'test'): 433 | return (self.test.shape[0] - self.win_size) // self.step + 1 434 | else: 435 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 436 | 437 | def __getitem__(self, index): 438 | index = index * self.step 439 | if self.mode == "train": 440 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 441 | elif (self.mode == 'val'): 442 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 443 | elif (self.mode == 'test'): 444 | return 
np.float32(self.test[index:index + self.win_size]), np.float32( 445 | self.test_labels[index:index + self.win_size]) 446 | else: 447 | return np.float32(self.test[ 448 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 449 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 450 | 451 | class SWATSegLoader(Dataset): 452 | def __init__(self, root_path, win_size, step=1, flag="train"): 453 | self.flag = flag 454 | self.step = step 455 | self.win_size = win_size 456 | self.scaler = StandardScaler() 457 | 458 | train_data = pd.read_csv(os.path.join(root_path, 'swat_train2.csv')) 459 | test_data = pd.read_csv(os.path.join(root_path, 'swat2.csv')) 460 | labels = test_data.values[:, -1:] 461 | train_data = train_data.values[:, :-1] 462 | test_data = test_data.values[:, :-1] 463 | 464 | self.scaler.fit(train_data) 465 | train_data = self.scaler.transform(train_data) 466 | test_data = self.scaler.transform(test_data) 467 | self.train = train_data 468 | self.test = test_data 469 | data_len = len(self.train) 470 | self.val = self.train[(int)(data_len * 0.8):] 471 | self.test_labels = labels 472 | print("test:", self.test.shape) 473 | print("train:", self.train.shape) 474 | 475 | def __len__(self): 476 | """ 477 | Number of images in the object dataset. 478 | """ 479 | if self.flag == "train": 480 | return (self.train.shape[0] - self.win_size) // self.step + 1 481 | elif (self.flag == 'val'): 482 | return (self.val.shape[0] - self.win_size) // self.step + 1 483 | elif (self.flag == 'test'): 484 | return (self.test.shape[0] - self.win_size) // self.step + 1 485 | else: 486 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 487 | 488 | def __getitem__(self, index): 489 | index = index * self.step 490 | if self.flag == "train": 491 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 492 | elif (self.flag == 'val'): 493 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 494 | elif (self.flag == 'test'): 495 | return np.float32(self.test[index:index + self.win_size]), np.float32( 496 | self.test_labels[index:index + self.win_size]) 497 | else: 498 | return np.float32(self.test[ 499 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 500 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 501 | 502 | 503 | def get_loader_segment(index, data_path, batch_size, win_size=100, step=100, mode='train', dataset='KDD'): 504 | if (dataset == 'SMD'): 505 | dataset = SMDSegLoader(data_path, win_size, 1, mode) 506 | elif (dataset == 'MSL'): 507 | dataset = MSLSegLoader(data_path, win_size, 1, mode) 508 | elif (dataset == 'SMAP'): 509 | dataset = SMAPSegLoader(data_path, win_size, 1, mode) 510 | elif (dataset == 'PSM'): 511 | dataset = PSMSegLoader(data_path, win_size, 1, mode) 512 | elif (dataset =='SWAT'): 513 | dataset = SWATSegLoader(data_path,win_size,1,mode) 514 | elif (dataset == 'UCR'): 515 | dataset = UCRSegLoader(index, data_path, win_size, 1, mode) 516 | elif (dataset == 'UCR_AUG'): 517 | dataset = UCRAUGSegLoader(index, data_path, win_size, 1, mode) 518 | elif (dataset == 'NIPS_TS_Water'): 519 | dataset = NIPS_TS_WaterSegLoader(data_path, win_size, 1, mode) 520 | elif (dataset == 'NIPS_TS_Swan'): 521 | dataset = NIPS_TS_SwanSegLoader(data_path, win_size, 1, mode) 522 | elif 
(dataset == 'NIPS_TS_CCard'): 523 | dataset = NIPS_TS_CCardSegLoader(data_path, win_size, 1, mode) 524 | elif (dataset == 'SMD_Ori'): 525 | dataset = SMD_OriSegLoader(index, data_path, win_size, 1, mode) 526 | 527 | shuffle = False 528 | if mode == 'train': 529 | shuffle = True 530 | 531 | data_loader = DataLoader(dataset=dataset, 532 | batch_size=batch_size, 533 | shuffle=shuffle, 534 | num_workers=8, 535 | drop_last=True) 536 | return data_loader 537 | -------------------------------------------------------------------------------- /img/DCdetector.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/DCdetector.jpg -------------------------------------------------------------------------------- /img/art-compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/art-compare.png -------------------------------------------------------------------------------- /img/result_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_1.png -------------------------------------------------------------------------------- /img/result_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_2.png -------------------------------------------------------------------------------- /img/result_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_3.png -------------------------------------------------------------------------------- /img/result_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_4.png -------------------------------------------------------------------------------- /img/result_count.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_count.jpg -------------------------------------------------------------------------------- /img/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/workflow.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from torch.backends import cudnn 5 | from utils.utils import * 6 | from solver import Solver 7 | import time 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | import sys 12 | 13 | class Logger(object): 14 | def __init__(self, filename='default.log', add_flag=True, stream=sys.stdout): 15 | self.terminal = stream 16 | self.filename = filename 17 | self.add_flag = add_flag 18 | 19 | def write(self, 
message): 20 | if self.add_flag: 21 | with open(self.filename, 'a+') as log: 22 | self.terminal.write(message) 23 | log.write(message) 24 | else: 25 | with open(self.filename, 'w') as log: 26 | self.terminal.write(message) 27 | log.write(message) 28 | 29 | def flush(self): 30 | pass 31 | 32 | 33 | def str2bool(v): 34 | return v.lower() in ('true') 35 | 36 | 37 | def find_nearest(array, value): 38 | array = np.asarray(array) 39 | idx = (np.abs(array - value)).argmin() 40 | return int(array[idx-1]) 41 | 42 | 43 | def main(config): 44 | cudnn.benchmark = True 45 | if (not os.path.exists(config.model_save_path)): 46 | mkdir(config.model_save_path) 47 | solver = Solver(vars(config)) 48 | 49 | if config.mode == 'train': 50 | solver.train() 51 | elif config.mode == 'test': 52 | solver.test() 53 | 54 | return solver 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser() 58 | 59 | # Alternative 60 | parser.add_argument('--win_size', type=int, default=100) 61 | parser.add_argument('--patch_size', type=list, default=[5]) 62 | parser.add_argument('--lr', type=float, default=1e-4) 63 | parser.add_argument('--loss_fuc', type=str, default='MSE') 64 | parser.add_argument('--n_heads', type=int, default=1) 65 | parser.add_argument('--e_layers', type=int, default=3) 66 | parser.add_argument('--d_model', type=int, default=256) 67 | parser.add_argument('--rec_timeseries', action='store_true', default=True) 68 | 69 | 70 | parser.add_argument('--use_gpu', type=bool, default=True, help='use gpu') 71 | parser.add_argument('--gpu', type=int, default=0, help='gpu') 72 | parser.add_argument('--use_multi_gpu', action='store_true', help='use multiple gpus', default=True) 73 | parser.add_argument('--devices', type=str, default='0,1,2,3',help='device ids of multile gpus') 74 | 75 | # Default 76 | parser.add_argument('--index', type=int, default=137) 77 | parser.add_argument('--num_epochs', type=int, default=10) 78 | parser.add_argument('--batch_size', type=int, default=128) 79 | parser.add_argument('--input_c', type=int, default=9) 80 | parser.add_argument('--output_c', type=int, default=9) 81 | parser.add_argument('--k', type=int, default=3) 82 | parser.add_argument('--dataset', type=str, default='credit') 83 | parser.add_argument('--mode', type=str, default='train', choices=['train', 'test']) 84 | parser.add_argument('--data_path', type=str, default='./dataset/creditcard_ts.csv') 85 | parser.add_argument('--model_save_path', type=str, default='checkpoints') 86 | 87 | parser.add_argument('--anormly_ratio', type=float, default=4.00) 88 | 89 | config = parser.parse_args() 90 | args = vars(config) 91 | config.patch_size = [int(patch_index) for patch_index in config.patch_size] 92 | 93 | 94 | if config.dataset == 'UCR': 95 | batch_size_buffer = [2,4,8,16,32,64,128,256] 96 | data_len = np.load('dataset/'+config.data_path + "/UCR_"+str(config.index)+"_train.npy").shape[0] 97 | config.batch_size = find_nearest(batch_size_buffer, data_len / config.win_size) 98 | elif config.dataset == 'UCR_AUG': 99 | batch_size_buffer = [2,4,8,16,32,64,128,256] 100 | data_len = np.load('dataset/'+config.data_path + "/UCR_AUG_"+str(config.index)+"_train.npy").shape[0] 101 | config.batch_size = find_nearest(batch_size_buffer, data_len / config.win_size) 102 | elif config.dataset == 'SMD_Ori': 103 | batch_size_buffer = [2,4,8,16,32,64,128,256,512] 104 | data_len = np.load('dataset/'+config.data_path + "/SMD_Ori_"+str(config.index)+"_train.npy").shape[0] 105 | config.batch_size = find_nearest(batch_size_buffer, data_len / 
config.win_size) 106 | 107 | 108 | config.use_gpu = True if torch.cuda.is_available() and config.use_gpu else False 109 | if config.use_gpu and config.use_multi_gpu: 110 | config.devices = config.devices.replace(' ','') 111 | device_ids = config.devices.split(',') 112 | config.device_ids = [int(id_) for id_ in device_ids] 113 | config.gpu = config.device_ids[0] 114 | 115 | 116 | sys.stdout = Logger("result/"+ config.data_path +".log", sys.stdout) 117 | if config.mode == 'train': 118 | print("\n\n") 119 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 120 | print('================ Hyperparameters ===============') 121 | for k, v in sorted(args.items()): 122 | print('%s: %s' % (str(k), str(v))) 123 | print('==================== Train ===================') 124 | 125 | main(config) 126 | 127 | 128 | -------------------------------------------------------------------------------- /metrics/AUC.py: -------------------------------------------------------------------------------- 1 | # used by paper: TSB-UAD as the main evaluator 2 | # github: https://github.com/johnpaparrizos/TSB-UAD/blob/main/TSB_AD/utils/metrics.py 3 | import numpy as np 4 | from sklearn import metrics 5 | from metrics.evaluate_utils import find_length,range_convers_new 6 | 7 | 8 | def extend_postive_range(x, window=16): 9 | label = x.copy().astype(float) 10 | # print(label) 11 | L = range_convers_new(label) # index of non-zero segments 12 | # print(L) 13 | length = len(label) 14 | for k in range(len(L)): 15 | s = L[k][0] 16 | e = L[k][1] 17 | # x1 is the extended list like [1,2,3] which are non-zero(from the end-e) 18 | x1 = np.arange(e, min(e + window // 2, length)) 19 | label[x1] += np.sqrt(1 - (x1 - e) / (window)) 20 | # before the start-s 21 | x2 = np.arange(max(s - window // 2, 0), s) 22 | label[x2] += np.sqrt(1 - (s - x2) / (window)) 23 | 24 | label = np.minimum(np.ones(length), label) 25 | return label 26 | 27 | 28 | def extend_postive_range_individual(x, percentage=0.2): 29 | label = x.copy().astype(float) 30 | L = range_convers_new(label) # index of non-zero segments 31 | length = len(label) 32 | for k in range(len(L)): 33 | s = L[k][0] 34 | e = L[k][1] 35 | 36 | l0 = int((e - s + 1) * percentage) 37 | 38 | x1 = np.arange(e, min(e + l0, length)) 39 | label[x1] += np.sqrt(1 - (x1 - e) / (2 * l0)) 40 | 41 | x2 = np.arange(max(s - l0, 0), s) 42 | label[x2] += np.sqrt(1 - (s - x2) / (2 * l0)) 43 | 44 | label = np.minimum(np.ones(length), label) 45 | return label 46 | 47 | 48 | def TPR_FPR_RangeAUC(labels, pred, P, L): 49 | product = labels * pred 50 | 51 | TP = np.sum(product) 52 | 53 | # recall = min(TP/P,1) 54 | P_new = (P + np.sum(labels)) / 2 # so TPR is neither large nor small 55 | # P_new = np.sum(labels) 56 | recall = min(TP / P_new, 1) 57 | # recall = TP/np.sum(labels) 58 | # print('recall '+str(recall)) 59 | 60 | existence = 0 61 | for seg in L: 62 | if np.sum(product[seg[0]:(seg[1] + 1)]) > 0: 63 | existence += 1 64 | 65 | existence_ratio = existence / len(L) 66 | # print(existence_ratio) 67 | 68 | # TPR_RangeAUC = np.sqrt(recall*existence_ratio) 69 | # print(existence_ratio) 70 | TPR_RangeAUC = recall * existence_ratio 71 | 72 | FP = np.sum(pred) - TP 73 | # TN = np.sum((1-pred) * (1-labels)) 74 | 75 | # FPR_RangeAUC = FP/(FP+TN) 76 | N_new = len(labels) - P_new 77 | FPR_RangeAUC = FP / N_new 78 | 79 | Precision_RangeAUC = TP / np.sum(pred) 80 | 81 | return TPR_RangeAUC, FPR_RangeAUC, Precision_RangeAUC 82 | 83 | 84 | def Range_AUC(score_t_test, y_test, window=5, percentage=0, plot_ROC=False, 
AUC_type='window'): 85 | # AUC_type='window'/'percentage' 86 | score = score_t_test 87 | labels = y_test 88 | score_sorted = -np.sort(-score) 89 | 90 | P = np.sum(labels) 91 | # print(np.sum(labels)) 92 | if AUC_type == 'window': 93 | labels = extend_postive_range(labels, window=window) 94 | else: 95 | labels = extend_postive_range_individual(labels, percentage=percentage) 96 | 97 | # print(np.sum(labels)) 98 | L = range_convers_new(labels) 99 | TPR_list = [0] 100 | FPR_list = [0] 101 | Precision_list = [1] 102 | 103 | for i in np.linspace(0, len(score) - 1, 250).astype(int): 104 | threshold = score_sorted[i] 105 | # print('thre='+str(threshold)) 106 | pred = score >= threshold 107 | TPR, FPR, Precision = TPR_FPR_RangeAUC(labels, pred, P, L) 108 | 109 | TPR_list.append(TPR) 110 | FPR_list.append(FPR) 111 | Precision_list.append(Precision) 112 | 113 | TPR_list.append(1) 114 | FPR_list.append(1) # otherwise, range-AUC will stop earlier than (1,1) 115 | 116 | tpr = np.array(TPR_list) 117 | fpr = np.array(FPR_list) 118 | prec = np.array(Precision_list) 119 | 120 | width = fpr[1:] - fpr[:-1] 121 | height = (tpr[1:] + tpr[:-1]) / 2 122 | AUC_range = np.sum(width * height) 123 | 124 | width_PR = tpr[1:-1] - tpr[:-2] 125 | height_PR = (prec[1:] + prec[:-1]) / 2 126 | AP_range = np.sum(width_PR * height_PR) 127 | 128 | if plot_ROC: 129 | return AUC_range, AP_range, fpr, tpr, prec 130 | 131 | return AUC_range 132 | 133 | 134 | def point_wise_AUC(score_t_test, y_test, plot_ROC=False): 135 | # area under curve 136 | label = y_test 137 | score = score_t_test 138 | auc = metrics.roc_auc_score(label, score) 139 | # plor ROC curve 140 | if plot_ROC: 141 | fpr, tpr, thresholds = metrics.roc_curve(label, score) 142 | # display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc) 143 | # display.plot() 144 | return auc, fpr, tpr 145 | else: 146 | return auc 147 | 148 | 149 | def main(): 150 | y_test = np.zeros(100) 151 | y_test[10:20] = 1 152 | y_test[50:60] = 1 153 | pred_labels = np.zeros(100) 154 | pred_labels[15:17] = 0.5 155 | pred_labels[55:62] = 0.7 156 | # pred_labels[51:55] = 1 157 | # true_events = get_events(y_test) 158 | point_auc = point_wise_AUC(pred_labels, y_test) 159 | range_auc = Range_AUC(pred_labels, y_test) 160 | print("point_auc: {}, range_auc: {}".format(point_auc, range_auc)) 161 | 162 | 163 | if __name__ == "__main__": 164 | main() -------------------------------------------------------------------------------- /metrics/Matthews_correlation_coefficient.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import confusion_matrix 2 | import numpy as np 3 | 4 | 5 | def MCC(y_test, pred_labels): 6 | tn, fp, fn, tp = confusion_matrix(y_test, pred_labels).ravel() 7 | MCC_score = (tp*tn-fp*fn)/(((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))**0.5) 8 | 9 | return MCC_score 10 | 11 | 12 | def main(): 13 | y_test = np.zeros(100) 14 | y_test[10:20] = 1 15 | y_test[50:60] = 1 16 | pred_labels = np.zeros(100) 17 | pred_labels[15:17] = 1 18 | pred_labels[55:62] = 1 19 | # pred_labels[51:55] = 1 20 | # true_events = get_events(y_test) 21 | confusion_matric = MCC(y_test, pred_labels) 22 | # print(confusion_matric) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /metrics/affiliation/_affiliation_zone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from 
metrics.affiliation._integral_interval import interval_intersection 4 | 5 | def t_start(j, Js = [(1,2),(3,4),(5,6)], Trange = (1,10)): 6 | """ 7 | Helper for `E_gt_func` 8 | 9 | :param j: index from 0 to len(Js) (included) on which to get the start 10 | :param Js: ground truth events, as a list of couples 11 | :param Trange: range of the series where Js is included 12 | :return: generalized start such that the middle of t_start and t_stop 13 | always gives the affiliation zone 14 | """ 15 | b = max(Trange) 16 | n = len(Js) 17 | if j == n: 18 | return(2*b - t_stop(n-1, Js, Trange)) 19 | else: 20 | return(Js[j][0]) 21 | 22 | def t_stop(j, Js = [(1,2),(3,4),(5,6)], Trange = (1,10)): 23 | """ 24 | Helper for `E_gt_func` 25 | 26 | :param j: index from 0 to len(Js) (included) on which to get the stop 27 | :param Js: ground truth events, as a list of couples 28 | :param Trange: range of the series where Js is included 29 | :return: generalized stop such that the middle of t_start and t_stop 30 | always gives the affiliation zone 31 | """ 32 | if j == -1: 33 | a = min(Trange) 34 | return(2*a - t_start(0, Js, Trange)) 35 | else: 36 | return(Js[j][1]) 37 | 38 | def E_gt_func(j, Js, Trange): 39 | """ 40 | Get the affiliation zone of element j of the ground truth 41 | 42 | :param j: index from 0 to len(Js) (excluded) on which to get the zone 43 | :param Js: ground truth events, as a list of couples 44 | :param Trange: range of the series where Js is included, can 45 | be (-math.inf, math.inf) for distance measures 46 | :return: affiliation zone of element j of the ground truth represented 47 | as a couple 48 | """ 49 | range_left = (t_stop(j-1, Js, Trange) + t_start(j, Js, Trange))/2 50 | range_right = (t_stop(j, Js, Trange) + t_start(j+1, Js, Trange))/2 51 | return((range_left, range_right)) 52 | 53 | def get_all_E_gt_func(Js, Trange): 54 | """ 55 | Get the affiliation partition from the ground truth point of view 56 | 57 | :param Js: ground truth events, as a list of couples 58 | :param Trange: range of the series where Js is included, can 59 | be (-math.inf, math.inf) for distance measures 60 | :return: affiliation partition of the events 61 | """ 62 | # E_gt is the limit of affiliation/attraction for each ground truth event 63 | E_gt = [E_gt_func(j, Js, Trange) for j in range(len(Js))] 64 | return(E_gt) 65 | 66 | def affiliation_partition(Is = [(1,1.5),(2,5),(5,6),(8,9)], E_gt = [(1,2.5),(2.5,4.5),(4.5,10)]): 67 | """ 68 | Cut the events into the affiliation zones 69 | The presentation given here is from the ground truth point of view, 70 | but it is also used in the reversed direction in the main function. 71 | 72 | :param Is: events as a list of couples 73 | :param E_gt: range of the affiliation zones 74 | :return: a list of list of intervals (each interval represented by either 75 | a couple or None for empty interval). The outer list is indexed by each 76 | affiliation zone of `E_gt`. The inner list is indexed by the events of `Is`. 
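# --- Illustrative aside (not part of the original source) --------------------
# Sanity check of the zones built above: get_all_E_gt_func splits Trange into
# one affiliation zone per ground-truth event, with boundaries at the midpoints
# between consecutive events, and the outermost zones clipped to the ends of
# Trange. A minimal sketch, using the same import path as the rest of the repo:
from metrics.affiliation._affiliation_zone import get_all_E_gt_func

Js = [(1, 2), (4, 5), (8, 9)]             # three ground-truth events
zones = get_all_E_gt_func(Js, (0, 10))    # expected: [(0.0, 3.0), (3.0, 6.5), (6.5, 10.0)]
assert zones[1] == (3.0, 6.5)             # boundary = midpoint between events (1,2) and (4,5)
# ------------------------------------------------------------------------------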
77 | """ 78 | out = [None] * len(E_gt) 79 | for j in range(len(E_gt)): 80 | E_gt_j = E_gt[j] 81 | discarded_idx_before = [I[1] < E_gt_j[0] for I in Is] # end point of predicted I is before the begin of E 82 | discarded_idx_after = [I[0] > E_gt_j[1] for I in Is] # start of predicted I is after the end of E 83 | kept_index = [not(a or b) for a, b in zip(discarded_idx_before, discarded_idx_after)] 84 | Is_j = [x for x, y in zip(Is, kept_index)] 85 | out[j] = [interval_intersection(I, E_gt[j]) for I in Is_j] 86 | return(out) 87 | -------------------------------------------------------------------------------- /metrics/affiliation/_integral_interval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import math 4 | from metrics.affiliation.generics import _sum_wo_nan 5 | """ 6 | In order to shorten the length of the variables, 7 | the general convention in this file is to let: 8 | - I for a predicted event (start, stop), 9 | - Is for a list of predicted events, 10 | - J for a ground truth event, 11 | - Js for a list of ground truth events. 12 | """ 13 | 14 | def interval_length(J = (1,2)): 15 | """ 16 | Length of an interval 17 | 18 | :param J: couple representating the start and stop of an interval, or None 19 | :return: length of the interval, and 0 for a None interval 20 | """ 21 | if J is None: 22 | return(0) 23 | return(J[1] - J[0]) 24 | 25 | def sum_interval_lengths(Is = [(1,2),(3,4),(5,6)]): 26 | """ 27 | Sum of length of the intervals 28 | 29 | :param Is: list of intervals represented by starts and stops 30 | :return: sum of the interval length 31 | """ 32 | return(sum([interval_length(I) for I in Is])) 33 | 34 | def interval_intersection(I = (1, 3), J = (2, 4)): 35 | """ 36 | Intersection between two intervals I and J 37 | I and J should be either empty or represent a positive interval (no point) 38 | 39 | :param I: an interval represented by start and stop 40 | :param J: a second interval of the same form 41 | :return: an interval representing the start and stop of the intersection (or None if empty) 42 | """ 43 | if I is None: 44 | return(None) 45 | if J is None: 46 | return(None) 47 | 48 | I_inter_J = (max(I[0], J[0]), min(I[1], J[1])) 49 | if I_inter_J[0] >= I_inter_J[1]: 50 | return(None) 51 | else: 52 | return(I_inter_J) 53 | 54 | def interval_subset(I = (1, 3), J = (0, 6)): 55 | """ 56 | Checks whether I is a subset of J 57 | 58 | :param I: an non empty interval represented by start and stop 59 | :param J: a second non empty interval of the same form 60 | :return: True if I is a subset of J 61 | """ 62 | if (I[0] >= J[0]) and (I[1] <= J[1]): 63 | return True 64 | else: 65 | return False 66 | 67 | def cut_into_three_func(I, J): 68 | """ 69 | Cut an interval I into a partition of 3 subsets: 70 | the elements before J, 71 | the elements belonging to J, 72 | and the elements after J 73 | 74 | :param I: an interval represented by start and stop, or None for an empty one 75 | :param J: a non empty interval 76 | :return: a triplet of three intervals, each represented by either (start, stop) or None 77 | """ 78 | if I is None: 79 | return((None, None, None)) 80 | 81 | I_inter_J = interval_intersection(I, J) 82 | if I == I_inter_J: 83 | I_before = None 84 | I_after = None 85 | elif I[1] <= J[0]: 86 | I_before = I 87 | I_after = None 88 | elif I[0] >= J[1]: 89 | I_before = None 90 | I_after = I 91 | elif (I[0] <= J[0]) and (I[1] >= J[1]): 92 | I_before = (I[0], I_inter_J[0]) 93 | I_after = 
(I_inter_J[1], I[1]) 94 | elif I[0] <= J[0]: 95 | I_before = (I[0], I_inter_J[0]) 96 | I_after = None 97 | elif I[1] >= J[1]: 98 | I_before = None 99 | I_after = (I_inter_J[1], I[1]) 100 | else: 101 | raise ValueError('unexpected unconsidered case') 102 | return(I_before, I_inter_J, I_after) 103 | 104 | def get_pivot_j(I, J): 105 | """ 106 | Get the single point of J that is the closest to I, called 'pivot' here, 107 | with the requirement that I should be outside J 108 | 109 | :param I: a non empty interval (start, stop) 110 | :param J: another non empty interval, with empty intersection with I 111 | :return: the element j of J that is the closest to I 112 | """ 113 | if interval_intersection(I, J) is not None: 114 | raise ValueError('I and J should have a void intersection') 115 | 116 | j_pivot = None # j_pivot is a border of J 117 | if max(I) <= min(J): 118 | j_pivot = min(J) 119 | elif min(I) >= max(J): 120 | j_pivot = max(J) 121 | else: 122 | raise ValueError('I should be outside J') 123 | return(j_pivot) 124 | 125 | def integral_mini_interval(I, J): 126 | """ 127 | In the specific case where interval I is located outside J, 128 | integral of distance from x to J over the interval x \in I. 129 | This is the *integral* i.e. the sum. 130 | It's not the mean (not divided by the length of I yet) 131 | 132 | :param I: a interval (start, stop), or None 133 | :param J: a non empty interval, with empty intersection with I 134 | :return: the integral of distances d(x, J) over x \in I 135 | """ 136 | if I is None: 137 | return(0) 138 | 139 | j_pivot = get_pivot_j(I, J) 140 | a = min(I) 141 | b = max(I) 142 | return((b-a)*abs((j_pivot - (a+b)/2))) 143 | 144 | def integral_interval_distance(I, J): 145 | """ 146 | For any non empty intervals I, J, compute the 147 | integral of distance from x to J over the interval x \in I. 148 | This is the *integral* i.e. the sum. 
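# --- Illustrative aside (not part of the original source) --------------------
# A small worked case for the helpers above. For I = (4, 6) entirely to the
# right of J = (1, 2), the pivot is max(J) = 2 and the integral of d(x, J) over
# x in I is (b - a) * |pivot - (a + b)/2| = 2 * |2 - 5| = 6, which matches
# integrating (x - 2) from 4 to 6 by hand. Sketch, with the import path used
# elsewhere in this repo:
from metrics.affiliation._integral_interval import (
    interval_intersection, cut_into_three_func, integral_mini_interval)

assert interval_intersection((1, 3), (2, 4)) == (2, 3)
assert cut_into_three_func((0, 10), (3, 5)) == ((0, 3), (3, 5), (5, 10))
assert integral_mini_interval((4, 6), (1, 2)) == 6
# ------------------------------------------------------------------------------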
149 | It's not the mean (not divided by the length of I yet) 150 | The interval I can intersect J or not 151 | 152 | :param I: a interval (start, stop), or None 153 | :param J: a non empty interval 154 | :return: the integral of distances d(x, J) over x \in I 155 | """ 156 | # I and J are single intervals (not generic sets) 157 | # I is a predicted interval in the range of affiliation of J 158 | 159 | def f(I_cut): 160 | return(integral_mini_interval(I_cut, J)) 161 | # If I_middle is fully included into J, it is 162 | # the distance to J is always 0 163 | def f0(I_middle): 164 | return(0) 165 | 166 | cut_into_three = cut_into_three_func(I, J) 167 | # Distance for now, not the mean: 168 | # Distance left: Between cut_into_three[0] and the point min(J) 169 | d_left = f(cut_into_three[0]) 170 | # Distance middle: Between cut_into_three[1] = I inter J, and J 171 | d_middle = f0(cut_into_three[1]) 172 | # Distance right: Between cut_into_three[2] and the point max(J) 173 | d_right = f(cut_into_three[2]) 174 | # It's an integral so summable 175 | return(d_left + d_middle + d_right) 176 | 177 | def integral_mini_interval_P_CDFmethod__min_piece(I, J, E): 178 | """ 179 | Helper of `integral_mini_interval_Pprecision_CDFmethod` 180 | In the specific case where interval I is located outside J, 181 | compute the integral $\int_{d_min}^{d_max} \min(m, x) dx$, with: 182 | - m the smallest distance from J to E, 183 | - d_min the smallest distance d(x, J) from x \in I to J 184 | - d_max the largest distance d(x, J) from x \in I to J 185 | 186 | :param I: a single predicted interval, a non empty interval (start, stop) 187 | :param J: ground truth interval, a non empty interval, with empty intersection with I 188 | :param E: the affiliation/influence zone for J, represented as a couple (start, stop) 189 | :return: the integral $\int_{d_min}^{d_max} \min(m, x) dx$ 190 | """ 191 | if interval_intersection(I, J) is not None: 192 | raise ValueError('I and J should have a void intersection') 193 | if not interval_subset(J, E): 194 | raise ValueError('J should be included in E') 195 | if not interval_subset(I, E): 196 | raise ValueError('I should be included in E') 197 | 198 | e_min = min(E) 199 | j_min = min(J) 200 | j_max = max(J) 201 | e_max = max(E) 202 | i_min = min(I) 203 | i_max = max(I) 204 | 205 | d_min = max(i_min - j_max, j_min - i_max) 206 | d_max = max(i_max - j_max, j_min - i_min) 207 | m = min(j_min - e_min, e_max - j_max) 208 | A = min(d_max, m)**2 - min(d_min, m)**2 209 | B = max(d_max, m) - max(d_min, m) 210 | C = (1/2)*A + m*B 211 | return(C) 212 | 213 | def integral_mini_interval_Pprecision_CDFmethod(I, J, E): 214 | """ 215 | Integral of the probability of distances over the interval I. 216 | In the specific case where interval I is located outside J, 217 | compute the integral $\int_{x \in I} Fbar(dist(x,J)) dx$. 218 | This is the *integral* i.e. 
the sum (not the mean) 219 | 220 | :param I: a single predicted interval, a non empty interval (start, stop) 221 | :param J: ground truth interval, a non empty interval, with empty intersection with I 222 | :param E: the affiliation/influence zone for J, represented as a couple (start, stop) 223 | :return: the integral $\int_{x \in I} Fbar(dist(x,J)) dx$ 224 | """ 225 | integral_min_piece = integral_mini_interval_P_CDFmethod__min_piece(I, J, E) 226 | 227 | e_min = min(E) 228 | j_min = min(J) 229 | j_max = max(J) 230 | e_max = max(E) 231 | i_min = min(I) 232 | i_max = max(I) 233 | d_min = max(i_min - j_max, j_min - i_max) 234 | d_max = max(i_max - j_max, j_min - i_min) 235 | integral_linear_piece = (1/2)*(d_max**2 - d_min**2) 236 | integral_remaining_piece = (j_max - j_min)*(i_max - i_min) 237 | 238 | DeltaI = i_max - i_min 239 | DeltaE = e_max - e_min 240 | 241 | output = DeltaI - (1/DeltaE)*(integral_min_piece + integral_linear_piece + integral_remaining_piece) 242 | return(output) 243 | 244 | def integral_interval_probaCDF_precision(I, J, E): 245 | """ 246 | Integral of the probability of distances over the interval I. 247 | Compute the integral $\int_{x \in I} Fbar(dist(x,J)) dx$. 248 | This is the *integral* i.e. the sum (not the mean) 249 | 250 | :param I: a single (non empty) predicted interval in the zone of affiliation of J 251 | :param J: ground truth interval 252 | :param E: affiliation/influence zone for J 253 | :return: the integral $\int_{x \in I} Fbar(dist(x,J)) dx$ 254 | """ 255 | # I and J are single intervals (not generic sets) 256 | def f(I_cut): 257 | if I_cut is None: 258 | return(0) 259 | else: 260 | return(integral_mini_interval_Pprecision_CDFmethod(I_cut, J, E)) 261 | 262 | # If I_middle is fully included into J, it is 263 | # integral of 1 on the interval I_middle, so it's |I_middle| 264 | def f0(I_middle): 265 | if I_middle is None: 266 | return(0) 267 | else: 268 | return(max(I_middle) - min(I_middle)) 269 | 270 | cut_into_three = cut_into_three_func(I, J) 271 | # Distance for now, not the mean: 272 | # Distance left: Between cut_into_three[0] and the point min(J) 273 | d_left = f(cut_into_three[0]) 274 | # Distance middle: Between cut_into_three[1] = I inter J, and J 275 | d_middle = f0(cut_into_three[1]) 276 | # Distance right: Between cut_into_three[2] and the point max(J) 277 | d_right = f(cut_into_three[2]) 278 | # It's an integral so summable 279 | return(d_left + d_middle + d_right) 280 | 281 | def cut_J_based_on_mean_func(J, e_mean): 282 | """ 283 | Helper function for the recall. 284 | Partition J into two intervals: before and after e_mean 285 | (e_mean represents the center element of E the zone of affiliation) 286 | 287 | :param J: ground truth interval 288 | :param e_mean: a float number (center value of E) 289 | :return: a couple partitionning J into (J_before, J_after) 290 | """ 291 | if J is None: 292 | J_before = None 293 | J_after = None 294 | elif e_mean >= max(J): 295 | J_before = J 296 | J_after = None 297 | elif e_mean <= min(J): 298 | J_before = None 299 | J_after = J 300 | else: # e_mean is across J 301 | J_before = (min(J), e_mean) 302 | J_after = (e_mean, max(J)) 303 | 304 | return((J_before, J_after)) 305 | 306 | def integral_mini_interval_Precall_CDFmethod(I, J, E): 307 | """ 308 | Integral of the probability of distances over the interval J. 309 | In the specific case where interval J is located outside I, 310 | compute the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$. 311 | This is the *integral* i.e. 
the sum (not the mean) 312 | 313 | :param I: a single (non empty) predicted interval 314 | :param J: ground truth (non empty) interval, with empty intersection with I 315 | :param E: the affiliation/influence zone for J, represented as a couple (start, stop) 316 | :return: the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$ 317 | """ 318 | # The interval J should be located outside I 319 | # (so it's either the left piece or the right piece w.r.t I) 320 | i_pivot = get_pivot_j(J, I) 321 | e_min = min(E) 322 | e_max = max(E) 323 | e_mean = (e_min + e_max) / 2 324 | 325 | # If i_pivot is outside E (it's possible), then 326 | # the distance is worst that any random element within E, 327 | # so we set the recall to 0 328 | if i_pivot <= min(E): 329 | return(0) 330 | elif i_pivot >= max(E): 331 | return(0) 332 | # Otherwise, we have at least i_pivot in E and so d < M so min(d,M)=d 333 | 334 | cut_J_based_on_e_mean = cut_J_based_on_mean_func(J, e_mean) 335 | J_before = cut_J_based_on_e_mean[0] 336 | J_after = cut_J_based_on_e_mean[1] 337 | 338 | iemin_mean = (e_min + i_pivot)/2 339 | cut_Jbefore_based_on_iemin_mean = cut_J_based_on_mean_func(J_before, iemin_mean) 340 | J_before_closeE = cut_Jbefore_based_on_iemin_mean[0] # before e_mean and closer to e_min than i_pivot ~ J_before_before 341 | J_before_closeI = cut_Jbefore_based_on_iemin_mean[1] # before e_mean and closer to i_pivot than e_min ~ J_before_after 342 | 343 | iemax_mean = (e_max + i_pivot)/2 344 | cut_Jafter_based_on_iemax_mean = cut_J_based_on_mean_func(J_after, iemax_mean) 345 | J_after_closeI = cut_Jafter_based_on_iemax_mean[0] # after e_mean and closer to i_pivot than e_max ~ J_after_before 346 | J_after_closeE = cut_Jafter_based_on_iemax_mean[1] # after e_mean and closer to e_max than i_pivot ~ J_after_after 347 | 348 | if J_before_closeE is not None: 349 | j_before_before_min = min(J_before_closeE) # == min(J) 350 | j_before_before_max = max(J_before_closeE) 351 | else: 352 | j_before_before_min = math.nan 353 | j_before_before_max = math.nan 354 | 355 | if J_before_closeI is not None: 356 | j_before_after_min = min(J_before_closeI) # == j_before_before_max if existing 357 | j_before_after_max = max(J_before_closeI) # == max(J_before) 358 | else: 359 | j_before_after_min = math.nan 360 | j_before_after_max = math.nan 361 | 362 | if J_after_closeI is not None: 363 | j_after_before_min = min(J_after_closeI) # == min(J_after) 364 | j_after_before_max = max(J_after_closeI) 365 | else: 366 | j_after_before_min = math.nan 367 | j_after_before_max = math.nan 368 | 369 | if J_after_closeE is not None: 370 | j_after_after_min = min(J_after_closeE) # == j_after_before_max if existing 371 | j_after_after_max = max(J_after_closeE) # == max(J) 372 | else: 373 | j_after_after_min = math.nan 374 | j_after_after_max = math.nan 375 | 376 | # <-- J_before_closeE --> <-- J_before_closeI --> <-- J_after_closeI --> <-- J_after_closeE --> 377 | # j_bb_min j_bb_max j_ba_min j_ba_max j_ab_min j_ab_max j_aa_min j_aa_max 378 | # (with `b` for before and `a` for after in the previous variable names) 379 | 380 | # vs e_mean m = min(t-e_min, e_max-t) d=|i_pivot-t| min(d,m) \int min(d,m)dt \int d dt \int_(min(d,m)+d)dt \int_{t \in J}(min(d,m)+d)dt 381 | # Case J_before_closeE & i_pivot after J before t-e_min i_pivot-t min(i_pivot-t,t-e_min) = t-e_min t^2/2-e_min*t i_pivot*t-t^2/2 t^2/2-e_min*t+i_pivot*t-t^2/2 = (i_pivot-e_min)*t (i_pivot-e_min)*tB - (i_pivot-e_min)*tA = (i_pivot-e_min)*(tB-tA) 382 | # Case J_before_closeI & i_pivot after J before t-e_min 
i_pivot-t min(i_pivot-t,t-e_min) = i_pivot-t i_pivot*t-t^2/2 i_pivot*t-t^2/2 i_pivot*t-t^2/2+i_pivot*t-t^2/2 = 2*i_pivot*t-t^2 2*i_pivot*tB-tB^2 - 2*i_pivot*tA + tA^2 = 2*i_pivot*(tB-tA) - (tB^2 - tA^2) 383 | # Case J_after_closeI & i_pivot after J after e_max-t i_pivot-t min(i_pivot-t,e_max-t) = i_pivot-t i_pivot*t-t^2/2 i_pivot*t-t^2/2 i_pivot*t-t^2/2+i_pivot*t-t^2/2 = 2*i_pivot*t-t^2 2*i_pivot*tB-tB^2 - 2*i_pivot*tA + tA^2 = 2*i_pivot*(tB-tA) - (tB^2 - tA^2) 384 | # Case J_after_closeE & i_pivot after J after e_max-t i_pivot-t min(i_pivot-t,e_max-t) = e_max-t e_max*t-t^2/2 i_pivot*t-t^2/2 e_max*t-t^2/2+i_pivot*t-t^2/2 = (e_max+i_pivot)*t-t^2 (e_max+i_pivot)*tB-tB^2 - (e_max+i_pivot)*tA + tA^2 = (e_max+i_pivot)*(tB-tA) - (tB^2 - tA^2) 385 | # 386 | # Case J_before_closeE & i_pivot before J before t-e_min t-i_pivot min(t-i_pivot,t-e_min) = t-e_min t^2/2-e_min*t t^2/2-i_pivot*t t^2/2-e_min*t+t^2/2-i_pivot*t = t^2-(e_min+i_pivot)*t tB^2-(e_min+i_pivot)*tB - tA^2 + (e_min+i_pivot)*tA = (tB^2 - tA^2) - (e_min+i_pivot)*(tB-tA) 387 | # Case J_before_closeI & i_pivot before J before t-e_min t-i_pivot min(t-i_pivot,t-e_min) = t-i_pivot t^2/2-i_pivot*t t^2/2-i_pivot*t t^2/2-i_pivot*t+t^2/2-i_pivot*t = t^2-2*i_pivot*t tB^2-2*i_pivot*tB - tA^2 + 2*i_pivot*tA = (tB^2 - tA^2) - 2*i_pivot*(tB-tA) 388 | # Case J_after_closeI & i_pivot before J after e_max-t t-i_pivot min(t-i_pivot,e_max-t) = t-i_pivot t^2/2-i_pivot*t t^2/2-i_pivot*t t^2/2-i_pivot*t+t^2/2-i_pivot*t = t^2-2*i_pivot*t tB^2-2*i_pivot*tB - tA^2 + 2*i_pivot*tA = (tB^2 - tA^2) - 2*i_pivot*(tB-tA) 389 | # Case J_after_closeE & i_pivot before J after e_max-t t-i_pivot min(t-i_pivot,e_max-t) = e_max-t e_max*t-t^2/2 t^2/2-i_pivot*t e_max*t-t^2/2+t^2/2-i_pivot*t = (e_max-i_pivot)*t (e_max-i_pivot)*tB - (e_max-i_pivot)*tA = (e_max-i_pivot)*(tB-tA) 390 | 391 | if i_pivot >= max(J): 392 | part1_before_closeE = (i_pivot-e_min)*(j_before_before_max - j_before_before_min) # (i_pivot-e_min)*(tB-tA) # j_before_before_max - j_before_before_min 393 | part2_before_closeI = 2*i_pivot*(j_before_after_max-j_before_after_min) - (j_before_after_max**2 - j_before_after_min**2) # 2*i_pivot*(tB-tA) - (tB^2 - tA^2) # j_before_after_max - j_before_after_min 394 | part3_after_closeI = 2*i_pivot*(j_after_before_max-j_after_before_min) - (j_after_before_max**2 - j_after_before_min**2) # 2*i_pivot*(tB-tA) - (tB^2 - tA^2) # j_after_before_max - j_after_before_min 395 | part4_after_closeE = (e_max+i_pivot)*(j_after_after_max-j_after_after_min) - (j_after_after_max**2 - j_after_after_min**2) # (e_max+i_pivot)*(tB-tA) - (tB^2 - tA^2) # j_after_after_max - j_after_after_min 396 | out_parts = [part1_before_closeE, part2_before_closeI, part3_after_closeI, part4_after_closeE] 397 | elif i_pivot <= min(J): 398 | part1_before_closeE = (j_before_before_max**2 - j_before_before_min**2) - (e_min+i_pivot)*(j_before_before_max-j_before_before_min) # (tB^2 - tA^2) - (e_min+i_pivot)*(tB-tA) # j_before_before_max - j_before_before_min 399 | part2_before_closeI = (j_before_after_max**2 - j_before_after_min**2) - 2*i_pivot*(j_before_after_max-j_before_after_min) # (tB^2 - tA^2) - 2*i_pivot*(tB-tA) # j_before_after_max - j_before_after_min 400 | part3_after_closeI = (j_after_before_max**2 - j_after_before_min**2) - 2*i_pivot*(j_after_before_max - j_after_before_min) # (tB^2 - tA^2) - 2*i_pivot*(tB-tA) # j_after_before_max - j_after_before_min 401 | part4_after_closeE = (e_max-i_pivot)*(j_after_after_max - j_after_after_min) # (e_max-i_pivot)*(tB-tA) # j_after_after_max - j_after_after_min 402 
| out_parts = [part1_before_closeE, part2_before_closeI, part3_after_closeI, part4_after_closeE] 403 | else: 404 | raise ValueError('The i_pivot should be outside J') 405 | 406 | out_integral_min_dm_plus_d = _sum_wo_nan(out_parts) # integral on all J, i.e. sum of the disjoint parts 407 | 408 | # We have for each point t of J: 409 | # \bar{F}_{t, recall}(d) = 1 - (1/|E|) * (min(d,m) + d) 410 | # Since t is a single-point here, and we are in the case where i_pivot is inside E. 411 | # The integral is then given by: 412 | # C = \int_{t \in J} \bar{F}_{t, recall}(D(t)) dt 413 | # = \int_{t \in J} 1 - (1/|E|) * (min(d,m) + d) dt 414 | # = |J| - (1/|E|) * [\int_{t \in J} (min(d,m) + d) dt] 415 | # = |J| - (1/|E|) * out_integral_min_dm_plus_d 416 | DeltaJ = max(J) - min(J) 417 | DeltaE = max(E) - min(E) 418 | C = DeltaJ - (1/DeltaE) * out_integral_min_dm_plus_d 419 | 420 | return(C) 421 | 422 | def integral_interval_probaCDF_recall(I, J, E): 423 | """ 424 | Integral of the probability of distances over the interval J. 425 | Compute the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$. 426 | This is the *integral* i.e. the sum (not the mean) 427 | 428 | :param I: a single (non empty) predicted interval 429 | :param J: ground truth (non empty) interval 430 | :param E: the affiliation/influence zone for J 431 | :return: the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$ 432 | """ 433 | # I and J are single intervals (not generic sets) 434 | # E is the outside affiliation interval of J (even for recall!) 435 | # (in particular J \subset E) 436 | # 437 | # J is the portion of the ground truth affiliated to I 438 | # I is a predicted interval (can be outside E possibly since it's recall) 439 | def f(J_cut): 440 | if J_cut is None: 441 | return(0) 442 | else: 443 | return integral_mini_interval_Precall_CDFmethod(I, J_cut, E) 444 | 445 | # If J_middle is fully included into I, it is 446 | # integral of 1 on the interval J_middle, so it's |J_middle| 447 | def f0(J_middle): 448 | if J_middle is None: 449 | return(0) 450 | else: 451 | return(max(J_middle) - min(J_middle)) 452 | 453 | cut_into_three = cut_into_three_func(J, I) # it's J that we cut into 3, depending on the position w.r.t I 454 | # since we integrate over J this time. 
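# Illustrative example (hypothetical values, only to make the three parts below concrete):
# with I = (4, 6) and J = (2, 8), cut_into_three_func(J, I) gives
#   (2, 4) -> portion of J strictly before I, handled by f (the CDF-based integral)
#   (4, 6) -> portion of J inside I, where the distance is 0 and Fbar is 1, so f0 returns its length
#   (6, 8) -> portion of J strictly after I, handled by f
# and the function returns the sum of these three contributions.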
455 | # 456 | # Distance for now, not the mean: 457 | # Distance left: Between cut_into_three[0] and the point min(I) 458 | d_left = f(cut_into_three[0]) 459 | # Distance middle: Between cut_into_three[1] = J inter I, and I 460 | d_middle = f0(cut_into_three[1]) 461 | # Distance right: Between cut_into_three[2] and the point max(I) 462 | d_right = f(cut_into_three[2]) 463 | # It's an integral so summable 464 | return(d_left + d_middle + d_right) 465 | -------------------------------------------------------------------------------- /metrics/affiliation/_single_ground_truth_event.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import math 4 | from metrics.affiliation._affiliation_zone import ( 5 | get_all_E_gt_func, 6 | affiliation_partition) 7 | from metrics.affiliation._integral_interval import ( 8 | integral_interval_distance, 9 | integral_interval_probaCDF_precision, 10 | integral_interval_probaCDF_recall, 11 | interval_length, 12 | sum_interval_lengths) 13 | 14 | def affiliation_precision_distance(Is = [(1,2),(3,4),(5,6)], J = (2,5.5)): 15 | """ 16 | Compute the individual average distance from Is to a single ground truth J 17 | 18 | :param Is: list of predicted events within the affiliation zone of J 19 | :param J: couple representating the start and stop of a ground truth interval 20 | :return: individual average precision directed distance number 21 | """ 22 | if all([I is None for I in Is]): # no prediction in the current area 23 | return(math.nan) # undefined 24 | return(sum([integral_interval_distance(I, J) for I in Is]) / sum_interval_lengths(Is)) 25 | 26 | def affiliation_precision_proba(Is = [(1,2),(3,4),(5,6)], J = (2,5.5), E = (0,8)): 27 | """ 28 | Compute the individual precision probability from Is to a single ground truth J 29 | 30 | :param Is: list of predicted events within the affiliation zone of J 31 | :param J: couple representating the start and stop of a ground truth interval 32 | :param E: couple representing the start and stop of the zone of affiliation of J 33 | :return: individual precision probability in [0, 1], or math.nan if undefined 34 | """ 35 | if all([I is None for I in Is]): # no prediction in the current area 36 | return(math.nan) # undefined 37 | return(sum([integral_interval_probaCDF_precision(I, J, E) for I in Is]) / sum_interval_lengths(Is)) 38 | 39 | def affiliation_recall_distance(Is = [(1,2),(3,4),(5,6)], J = (2,5.5)): 40 | """ 41 | Compute the individual average distance from a single J to the predictions Is 42 | 43 | :param Is: list of predicted events within the affiliation zone of J 44 | :param J: couple representating the start and stop of a ground truth interval 45 | :return: individual average recall directed distance number 46 | """ 47 | Is = [I for I in Is if I is not None] # filter possible None in Is 48 | if len(Is) == 0: # there is no prediction in the current area 49 | return(math.inf) 50 | E_gt_recall = get_all_E_gt_func(Is, (-math.inf, math.inf)) # here from the point of view of the predictions 51 | Js = affiliation_partition([J], E_gt_recall) # partition of J depending of proximity with Is 52 | return(sum([integral_interval_distance(J[0], I) for I, J in zip(Is, Js)]) / interval_length(J)) 53 | 54 | def affiliation_recall_proba(Is = [(1,2),(3,4),(5,6)], J = (2,5.5), E = (0,8)): 55 | """ 56 | Compute the individual recall probability from a single ground truth J to Is 57 | 58 | :param Is: list of predicted events within the affiliation 
zone of J 59 | :param J: couple representating the start and stop of a ground truth interval 60 | :param E: couple representing the start and stop of the zone of affiliation of J 61 | :return: individual recall probability in [0, 1] 62 | """ 63 | Is = [I for I in Is if I is not None] # filter possible None in Is 64 | if len(Is) == 0: # there is no prediction in the current area 65 | return(0) 66 | E_gt_recall = get_all_E_gt_func(Is, E) # here from the point of view of the predictions 67 | Js = affiliation_partition([J], E_gt_recall) # partition of J depending of proximity with Is 68 | return(sum([integral_interval_probaCDF_recall(I, J[0], E) for I, J in zip(Is, Js)]) / interval_length(J)) 69 | -------------------------------------------------------------------------------- /metrics/affiliation/generics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from itertools import groupby 4 | from operator import itemgetter 5 | import math 6 | import gzip 7 | import glob 8 | import os 9 | 10 | def convert_vector_to_events(vector = [0, 1, 1, 0, 0, 1, 0]): 11 | """ 12 | Convert a binary vector (indicating 1 for the anomalous instances) 13 | to a list of events. The events are considered as durations, 14 | i.e. setting 1 at index i corresponds to an anomalous interval [i, i+1). 15 | 16 | :param vector: a list of elements belonging to {0, 1} 17 | :return: a list of couples, each couple representing the start and stop of 18 | each event 19 | """ 20 | positive_indexes = [idx for idx, val in enumerate(vector) if val > 0] 21 | events = [] 22 | for k, g in groupby(enumerate(positive_indexes), lambda ix : ix[0] - ix[1]): 23 | cur_cut = list(map(itemgetter(1), g)) 24 | events.append((cur_cut[0], cur_cut[-1])) 25 | 26 | # Consistent conversion in case of range anomalies (for indexes): 27 | # A positive index i is considered as the interval [i, i+1), 28 | # so the last index should be moved by 1 29 | events = [(x, y+1) for (x,y) in events] 30 | 31 | return(events) 32 | 33 | def infer_Trange(events_pred, events_gt): 34 | """ 35 | Given the list of events events_pred and events_gt, get the 36 | smallest possible Trange corresponding to the start and stop indexes 37 | of the whole series. 38 | Trange will not influence the measure of distances, but will impact the 39 | measures of probabilities. 40 | 41 | :param events_pred: a list of couples corresponding to predicted events 42 | :param events_gt: a list of couples corresponding to ground truth events 43 | :return: a couple corresponding to the smallest range containing the events 44 | """ 45 | if len(events_gt) == 0: 46 | raise ValueError('The gt events should contain at least one event') 47 | if len(events_pred) == 0: 48 | # empty prediction, base Trange only on events_gt (which is non empty) 49 | return(infer_Trange(events_gt, events_gt)) 50 | 51 | min_pred = min([x[0] for x in events_pred]) 52 | min_gt = min([x[0] for x in events_gt]) 53 | max_pred = max([x[1] for x in events_pred]) 54 | max_gt = max([x[1] for x in events_gt]) 55 | Trange = (min(min_pred, min_gt), max(max_pred, max_gt)) 56 | return(Trange) 57 | 58 | def has_point_anomalies(events): 59 | """ 60 | Checking whether events contain point anomalies, i.e. 61 | events starting and stopping at the same time. 
62 | 63 | :param events: a list of couples corresponding to predicted events 64 | :return: True is the events have any point anomalies, False otherwise 65 | """ 66 | if len(events) == 0: 67 | return(False) 68 | return(min([x[1] - x[0] for x in events]) == 0) 69 | 70 | def _sum_wo_nan(vec): 71 | """ 72 | Sum of elements, ignoring math.isnan ones 73 | 74 | :param vec: vector of floating numbers 75 | :return: sum of the elements, ignoring math.isnan ones 76 | """ 77 | vec_wo_nan = [e for e in vec if not math.isnan(e)] 78 | return(sum(vec_wo_nan)) 79 | 80 | def _len_wo_nan(vec): 81 | """ 82 | Count of elements, ignoring math.isnan ones 83 | 84 | :param vec: vector of floating numbers 85 | :return: count of the elements, ignoring math.isnan ones 86 | """ 87 | vec_wo_nan = [e for e in vec if not math.isnan(e)] 88 | return(len(vec_wo_nan)) 89 | 90 | def read_gz_data(filename = 'data/machinetemp_groundtruth.gz'): 91 | """ 92 | Load a file compressed with gz, such that each line of the 93 | file is either 0 (representing a normal instance) or 1 (representing) 94 | an anomalous instance. 95 | :param filename: file path to the gz compressed file 96 | :return: list of integers with either 0 or 1 97 | """ 98 | with gzip.open(filename, 'rb') as f: 99 | content = f.read().splitlines() 100 | content = [int(x) for x in content] 101 | return(content) 102 | 103 | def read_all_as_events(): 104 | """ 105 | Load the files contained in the folder `data/` and convert 106 | to events. The length of the series is kept. 107 | The convention for the file name is: `dataset_algorithm.gz` 108 | :return: two dictionaries: 109 | - the first containing the list of events for each dataset and algorithm, 110 | - the second containing the range of the series for each dataset 111 | """ 112 | filepaths = glob.glob('data/*.gz') 113 | datasets = dict() 114 | Tranges = dict() 115 | for filepath in filepaths: 116 | vector = read_gz_data(filepath) 117 | events = convert_vector_to_events(vector) 118 | # ad hoc cut for those files 119 | cut_filepath = (os.path.split(filepath)[1]).split('_') 120 | data_name = cut_filepath[0] 121 | algo_name = (cut_filepath[1]).split('.')[0] 122 | if not data_name in datasets: 123 | datasets[data_name] = dict() 124 | Tranges[data_name] = (0, len(vector)) 125 | datasets[data_name][algo_name] = events 126 | return(datasets, Tranges) 127 | 128 | def f1_func(p, r): 129 | """ 130 | Compute the f1 function 131 | :param p: precision numeric value 132 | :param r: recall numeric value 133 | :return: f1 numeric value 134 | """ 135 | return(2*p*r/(p+r)) 136 | -------------------------------------------------------------------------------- /metrics/affiliation/metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from metrics.affiliation.generics import ( 4 | infer_Trange, 5 | has_point_anomalies, 6 | _len_wo_nan, 7 | _sum_wo_nan, 8 | read_all_as_events) 9 | from metrics.affiliation._affiliation_zone import ( 10 | get_all_E_gt_func, 11 | affiliation_partition) 12 | from metrics.affiliation._single_ground_truth_event import ( 13 | affiliation_precision_distance, 14 | affiliation_recall_distance, 15 | affiliation_precision_proba, 16 | affiliation_recall_proba) 17 | 18 | def test_events(events): 19 | """ 20 | Verify the validity of the input events 21 | :param events: list of events, each represented by a couple (start, stop) 22 | :return: None. 
Raise an error for incorrect formed or non ordered events 23 | """ 24 | if type(events) is not list: 25 | raise TypeError('Input `events` should be a list of couples') 26 | if not all([type(x) is tuple for x in events]): 27 | raise TypeError('Input `events` should be a list of tuples') 28 | if not all([len(x) == 2 for x in events]): 29 | raise ValueError('Input `events` should be a list of couples (start, stop)') 30 | if not all([x[0] <= x[1] for x in events]): 31 | raise ValueError('Input `events` should be a list of couples (start, stop) with start <= stop') 32 | if not all([events[i][1] < events[i+1][0] for i in range(len(events) - 1)]): 33 | raise ValueError('Couples of input `events` should be disjoint and ordered') 34 | 35 | def pr_from_events(events_pred, events_gt, Trange): 36 | """ 37 | Compute the affiliation metrics including the precision/recall in [0,1], 38 | along with the individual precision/recall distances and probabilities 39 | 40 | :param events_pred: list of predicted events, each represented by a couple 41 | indicating the start and the stop of the event 42 | :param events_gt: list of ground truth events, each represented by a couple 43 | indicating the start and the stop of the event 44 | :param Trange: range of the series where events_pred and events_gt are included, 45 | represented as a couple (start, stop) 46 | :return: dictionary with precision, recall, and the individual metrics 47 | """ 48 | # testing the inputs 49 | test_events(events_pred) 50 | test_events(events_gt) 51 | 52 | # other tests 53 | minimal_Trange = infer_Trange(events_pred, events_gt) 54 | if not Trange[0] <= minimal_Trange[0]: 55 | raise ValueError('`Trange` should include all the events') 56 | if not minimal_Trange[1] <= Trange[1]: 57 | raise ValueError('`Trange` should include all the events') 58 | 59 | if len(events_gt) == 0: 60 | raise ValueError('Input `events_gt` should have at least one event') 61 | 62 | if has_point_anomalies(events_pred) or has_point_anomalies(events_gt): 63 | raise ValueError('Cannot manage point anomalies currently') 64 | 65 | if Trange is None: 66 | # Set as default, but Trange should be indicated if probabilities are used 67 | raise ValueError('Trange should be indicated (or inferred with the `infer_Trange` function') 68 | 69 | E_gt = get_all_E_gt_func(events_gt, Trange) 70 | aff_partition = affiliation_partition(events_pred, E_gt) 71 | 72 | # Computing precision distance 73 | d_precision = [affiliation_precision_distance(Is, J) for Is, J in zip(aff_partition, events_gt)] 74 | 75 | # Computing recall distance 76 | d_recall = [affiliation_recall_distance(Is, J) for Is, J in zip(aff_partition, events_gt)] 77 | 78 | # Computing precision 79 | p_precision = [affiliation_precision_proba(Is, J, E) for Is, J, E in zip(aff_partition, events_gt, E_gt)] 80 | 81 | # Computing recall 82 | p_recall = [affiliation_recall_proba(Is, J, E) for Is, J, E in zip(aff_partition, events_gt, E_gt)] 83 | 84 | if _len_wo_nan(p_precision) > 0: 85 | p_precision_average = _sum_wo_nan(p_precision) / _len_wo_nan(p_precision) 86 | else: 87 | p_precision_average = p_precision[0] # math.nan 88 | p_recall_average = sum(p_recall) / len(p_recall) 89 | 90 | dict_out = dict({'precision': p_precision_average, 91 | 'recall': p_recall_average, 92 | 'individual_precision_probabilities': p_precision, 93 | 'individual_recall_probabilities': p_recall, 94 | 'individual_precision_distances': d_precision, 95 | 'individual_recall_distances': d_recall}) 96 | return(dict_out) 97 | 98 | def produce_all_results(): 99 
| """ 100 | Produce the affiliation precision/recall for all files 101 | contained in the `data` repository 102 | :return: a dictionary indexed by data names, each containing a dictionary 103 | indexed by algorithm names, each containing the results of the affiliation 104 | metrics (precision, recall, individual probabilities and distances) 105 | """ 106 | datasets, Tranges = read_all_as_events() # read all the events in folder `data` 107 | results = dict() 108 | for data_name in datasets.keys(): 109 | results_data = dict() 110 | for algo_name in datasets[data_name].keys(): 111 | if algo_name != 'groundtruth': 112 | results_data[algo_name] = pr_from_events(datasets[data_name][algo_name], 113 | datasets[data_name]['groundtruth'], 114 | Tranges[data_name]) 115 | results[data_name] = results_data 116 | return(results) 117 | -------------------------------------------------------------------------------- /metrics/combine_all_scores.py: -------------------------------------------------------------------------------- 1 | from f1_score_f1_pa import * 2 | from fc_score import * 3 | from precision_at_k import * 4 | from customizable_f1_score import * 5 | from AUC import * 6 | from Matthews_correlation_coefficient import * 7 | from affiliation.generics import convert_vector_to_events 8 | from affiliation.metrics import pr_from_events 9 | from vus.models.feature import Window 10 | from vus.metrics import get_range_vus_roc 11 | 12 | 13 | 14 | def combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores): 15 | events_pred = convert_vector_to_events(y_test) # [(4, 5), (8, 9)] 16 | events_gt = convert_vector_to_events(pred_labels) # [(3, 4), (7, 10)] 17 | Trange = (0, len(y_test)) 18 | affiliation = pr_from_events(events_pred, events_gt, Trange) 19 | true_events = get_events(y_test) 20 | _, _, _, f1_score_ori, f05_score_ori = get_accuracy_precision_recall_fscore(y_test, pred_labels) 21 | f1_score_pa = get_point_adjust_scores(y_test, pred_labels, true_events)[5] 22 | pa_accuracy, pa_precision, pa_recall, pa_f_score = get_adjust_F1PA(y_test, pred_labels) 23 | range_f_score = customizable_f1_score(y_test, pred_labels) 24 | _, _, f1_score_c = get_composite_fscore_raw(y_test, pred_labels, true_events, return_prec_rec=True) 25 | precision_k = precision_at_k(y_test, anomaly_scores, pred_labels) 26 | point_auc = point_wise_AUC(pred_labels, y_test) 27 | range_auc = Range_AUC(pred_labels, y_test) 28 | MCC_score = MCC(y_test, pred_labels) 29 | results = get_range_vus_roc(y_test, pred_labels, 100) # slidingWindow = 100 default 30 | 31 | 32 | score_list = {"f1_score_ori": f1_score_ori, 33 | "f05_score_ori" : f05_score_ori, 34 | "f1_score_pa": f1_score_pa, 35 | "pa_accuracy":pa_accuracy, 36 | "pa_precision":pa_precision, 37 | "pa_recall":pa_recall, 38 | "pa_f_score":pa_f_score, 39 | "range_f_score": range_f_score, 40 | "f1_score_c": f1_score_c, 41 | "precision_k": precision_k, 42 | "point_auc": point_auc, 43 | "range_auc": range_auc, 44 | "MCC_score":MCC_score, 45 | "Affiliation precision": affiliation['precision'], 46 | "Affiliation recall": affiliation['recall'], 47 | "R_AUC_ROC": results["R_AUC_ROC"], 48 | "R_AUC_PR": results["R_AUC_PR"], 49 | "VUS_ROC": results["VUS_ROC"], 50 | "VUS_PR": results["VUS_PR"]} 51 | 52 | return score_list 53 | 54 | 55 | def main(): 56 | y_test = np.zeros(100) 57 | y_test[10:20] = 1 58 | y_test[50:60] = 1 59 | pred_labels = np.zeros(100) 60 | pred_labels[15:17] = 1 61 | pred_labels[55:62] = 1 62 | anomaly_scores = np.zeros(100) 63 | anomaly_scores[15:17] = 0.7 64 | 
anomaly_scores[55:62] = 0.6 65 | pred_labels[51:55] = 1 66 | true_events = get_events(y_test) 67 | scores = combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores) 68 | # scores = test(y_test, pred_labels) 69 | for key,value in scores.items(): 70 | print(key,' : ',value) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() -------------------------------------------------------------------------------- /metrics/customizable_f1_score.py: -------------------------------------------------------------------------------- 1 | # used by paper: Exathlon: A Benchmark for Explainable Anomaly Detection over Time Series_VLDB 2021 2 | # github: https://github.com/exathlonbenchmark/exathlon 3 | import numpy as np 4 | from metrics.evaluate_utils import range_convers_new 5 | 6 | # the existence reward on the bias 7 | def b(bias, i, length): 8 | if bias == 'flat': 9 | return 1 10 | elif bias == 'front-end bias': 11 | return length - i + 1 12 | elif bias == 'back-end bias': 13 | return i 14 | else: 15 | if i <= length / 2: 16 | return i 17 | else: 18 | return length - i + 1 19 | 20 | 21 | def w(AnomalyRange, p): 22 | MyValue = 0 23 | MaxValue = 0 24 | start = AnomalyRange[0] 25 | AnomalyLength = AnomalyRange[1] - AnomalyRange[0] + 1 26 | # flat/'front-end bias'/'back-end bias' 27 | bias = 'flat' 28 | for i in range(start, start + AnomalyLength): 29 | bi = b(bias, i, AnomalyLength) 30 | MaxValue += bi 31 | if i in p: 32 | MyValue += bi 33 | return MyValue / MaxValue 34 | 35 | 36 | def Cardinality_factor(Anomolyrange, Prange): 37 | score = 0 38 | start = Anomolyrange[0] 39 | end = Anomolyrange[1] 40 | for i in Prange: 41 | if start <= i[0] <= end: 42 | score += 1 43 | elif i[0] <= start <= i[1]: 44 | score += 1 45 | elif i[0] <= end <= i[1]: 46 | score += 1 47 | elif start >= i[0] and end <= i[1]: 48 | score += 1 49 | if score == 0: 50 | return 0 51 | else: 52 | return 1 / score 53 | 54 | 55 | def existence_reward(labels, preds): 56 | ''' 57 | labels: list of ordered pair 58 | preds predicted data 59 | ''' 60 | 61 | score = 0 62 | for i in labels: 63 | if np.sum(np.multiply(preds <= i[1], preds >= i[0])) > 0: 64 | score += 1 65 | return score 66 | 67 | 68 | def range_recall_new(labels, preds, alpha): 69 | p = np.where(preds == 1)[0] # positions of predicted label==1 70 | range_pred = range_convers_new(preds) 71 | range_label = range_convers_new(labels) 72 | 73 | Nr = len(range_label) # total # of real anomaly segments 74 | 75 | ExistenceReward = existence_reward(range_label, p) 76 | 77 | OverlapReward = 0 78 | for i in range_label: 79 | OverlapReward += w(i, p) * Cardinality_factor(i, range_pred) 80 | 81 | score = alpha * ExistenceReward + (1 - alpha) * OverlapReward 82 | if Nr != 0: 83 | return score / Nr, ExistenceReward / Nr, OverlapReward / Nr 84 | else: 85 | return 0, 0, 0 86 | 87 | 88 | def customizable_f1_score(y_test, pred_labels, alpha=0.2): 89 | label = y_test 90 | preds = pred_labels 91 | Rrecall, ExistenceReward, OverlapReward = range_recall_new(label, preds, alpha) 92 | Rprecision = range_recall_new(preds, label, 0)[0] 93 | 94 | if Rprecision + Rrecall == 0: 95 | Rf = 0 96 | else: 97 | Rf = 2 * Rrecall * Rprecision / (Rprecision + Rrecall) 98 | return Rf 99 | 100 | 101 | def main(): 102 | y_test = np.zeros(100) 103 | y_test[10:20] = 1 104 | y_test[50:60] = 1 105 | pred_labels = np.zeros(100) 106 | pred_labels[15:19] = 1 107 | pred_labels[55:62] = 1 108 | # pred_labels[51:55] = 1 109 | # true_events = get_events(y_test) 110 | Rf = customizable_f1_score(y_test, pred_labels) 111 | 
print("Rf: {}".format(Rf)) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() -------------------------------------------------------------------------------- /metrics/evaluate_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from statsmodels.tsa.stattools import acf 3 | from scipy.signal import argrelextrema 4 | 5 | 6 | def get_composite_fscore_from_scores(score_t_test, thres, true_events, prec_t, return_prec_rec=False): 7 | pred_labels = score_t_test > thres 8 | tp = np.sum([pred_labels[start:end + 1].any() for start, end in true_events.values()]) 9 | fn = len(true_events) - tp 10 | rec_e = tp / (tp + fn) 11 | fscore_c = 2 * rec_e * prec_t / (rec_e + prec_t) 12 | if prec_t == 0 and rec_e == 0: 13 | fscore_c = 0 14 | if return_prec_rec: 15 | return prec_t, rec_e, fscore_c 16 | return fscore_c 17 | 18 | 19 | class NptConfig: 20 | def __init__(self, config_dict): 21 | for k, v in config_dict.items(): 22 | setattr(self, k, v) 23 | 24 | def find_length(data): 25 | if len(data.shape) > 1: 26 | return 0 27 | data = data[:min(20000, len(data))] 28 | 29 | base = 3 30 | auto_corr = acf(data, nlags=400, fft=True)[base:] 31 | 32 | local_max = argrelextrema(auto_corr, np.greater)[0] 33 | try: 34 | max_local_max = np.argmax([auto_corr[lcm] for lcm in local_max]) 35 | if local_max[max_local_max] < 3 or local_max[max_local_max] > 300: 36 | return 125 37 | return local_max[max_local_max] + base 38 | except: 39 | return 125 40 | 41 | 42 | def range_convers_new(label): 43 | ''' 44 | input: arrays of binary values 45 | output: list of ordered pair [[a0,b0], [a1,b1]... ] of the inputs 46 | ''' 47 | L = [] 48 | i = 0 49 | j = 0 50 | while j < len(label): 51 | while label[i] == 0: 52 | i += 1 53 | if i >= len(label): 54 | break 55 | j = i + 1 56 | if j >= len(label): 57 | if j == len(label): 58 | L.append((i, j - 1)) 59 | break 60 | while label[j] != 0: 61 | j += 1 62 | if j >= len(label): 63 | L.append((i, j - 1)) 64 | break 65 | if j >= len(label): 66 | break 67 | L.append((i, j - 1)) 68 | i = j 69 | return L -------------------------------------------------------------------------------- /metrics/f1_score_f1_pa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, roc_auc_score, precision_score, recall_score, \ 3 | accuracy_score, fbeta_score, average_precision_score 4 | 5 | 6 | # function: calculate the point-adjust f-scores(whether top k) 7 | def get_point_adjust_scores(y_test, pred_labels, true_events, thereshold_k=0, whether_top_k=False): 8 | tp = 0 9 | fn = 0 10 | for true_event in true_events.keys(): 11 | true_start, true_end = true_events[true_event] 12 | if whether_top_k is False: 13 | if pred_labels[true_start:true_end].sum() > 0: 14 | tp += (true_end - true_start) 15 | else: 16 | fn += (true_end - true_start) 17 | else: 18 | if pred_labels[true_start:true_end].sum() > thereshold_k: 19 | tp += (true_end - true_start) 20 | else: 21 | fn += (true_end - true_start) 22 | fp = np.sum(pred_labels) - np.sum(pred_labels * y_test) 23 | 24 | prec, rec, fscore = get_prec_rec_fscore(tp, fp, fn) 25 | return fp, fn, tp, prec, rec, fscore 26 | 27 | def get_adjust_F1PA(pred, gt): 28 | anomaly_state = False 29 | for i in range(len(gt)): 30 | if gt[i] == 1 and pred[i] == 1 and not anomaly_state: 31 | anomaly_state = True 32 | for j in range(i, 0, -1): 33 | if gt[j] == 0: 34 | break 35 | else: 36 | if pred[j] == 0: 37 
| pred[j] = 1 38 | for j in range(i, len(gt)): 39 | if gt[j] == 0: 40 | break 41 | else: 42 | if pred[j] == 0: 43 | pred[j] = 1 44 | elif gt[i] == 0: 45 | anomaly_state = False 46 | if anomaly_state: 47 | pred[i] = 1 48 | 49 | from sklearn.metrics import precision_recall_fscore_support 50 | from sklearn.metrics import accuracy_score 51 | 52 | accuracy = accuracy_score(gt, pred) 53 | precision, recall, f_score, support = precision_recall_fscore_support(gt, pred, 54 | average='binary') 55 | return accuracy, precision, recall, f_score 56 | 57 | 58 | # calculate the point-adjusted f-score 59 | def get_prec_rec_fscore(tp, fp, fn): 60 | if tp == 0: 61 | precision = 0 62 | recall = 0 63 | else: 64 | precision = tp / (tp + fp) 65 | recall = tp / (tp + fn) 66 | fscore = get_f_score(precision, recall) 67 | return precision, recall, fscore 68 | 69 | 70 | def get_f_score(prec, rec): 71 | if prec == 0 and rec == 0: 72 | f_score = 0 73 | else: 74 | f_score = 2 * (prec * rec) / (prec + rec) 75 | return f_score 76 | 77 | 78 | # function: calculate the normal edition f-scores 79 | def get_accuracy_precision_recall_fscore(y_true: list, y_pred: list): 80 | accuracy = accuracy_score(y_true, y_pred) 81 | # warn_for=() avoids log warnings for any result being zero 82 | # precision, recall, f_score, _ = prf(y_true, y_pred, average='binary', warn_for=()) 83 | precision = precision_score(y_true, y_pred) 84 | recall = recall_score(y_true, y_pred) 85 | f_score = (2 * precision * recall) / (precision + recall) 86 | if precision == 0 and recall == 0: 87 | f05_score = 0 88 | else: 89 | f05_score = fbeta_score(y_true, y_pred, average='binary', beta=0.5) 90 | return accuracy, precision, recall, f_score, f05_score 91 | 92 | 93 | -------------------------------------------------------------------------------- /metrics/f1_series.py: -------------------------------------------------------------------------------- 1 | from fc_score import * 2 | from f1_score_f1_pa import * 3 | from evaluate_utils import * 4 | 5 | default_thres_config = {"top_k_time": {}, 6 | "best_f1_test": {"exact_pt_adj": True}, 7 | "thresholded_score": {}, 8 | "tail_prob": {"tail_prob": 2}, 9 | "tail_prob_1": {"tail_prob": 1}, 10 | "tail_prob_2": {"tail_prob": 2}, 11 | "tail_prob_3": {"tail_prob": 3}, 12 | "tail_prob_4": {"tail_prob": 4}, 13 | "tail_prob_5": {"tail_prob": 5}, 14 | "dyn_gauss": {"long_window": 10000, "short_window": 1, "kernel_sigma": 10}, 15 | "nasa_npt": {"batch_size": 70, "window_size": 30, "telem_only": True, 16 | "smoothing_perc": 0.005, "l_s": 250, "error_buffer": 5, "p": 0.05}} 17 | 18 | 19 | def threshold_and_predict(score_t_test, y_test, true_events, logger, test_anom_frac, thres_method="top_k_time", 20 | point_adjust=False, score_t_train=None, thres_config_dict=dict(), return_auc=False, 21 | composite_best_f1=False): 22 | if thres_method in thres_config_dict.keys(): 23 | config = thres_config_dict[thres_method] 24 | else: 25 | config = default_thres_config[thres_method] 26 | # test_anom_frac = (np.sum(y_test)) / len(y_test) 27 | auroc = None 28 | avg_prec = None 29 | if thres_method == "thresholded_score": 30 | opt_thres = 0.5 31 | if set(score_t_test) - {0, 1}: 32 | logger.error("Score_t_test isn't binary. 
Predicting all as non-anomalous") 33 | pred_labels = np.zeros(len(score_t_test)) 34 | else: 35 | pred_labels = score_t_test 36 | 37 | elif thres_method == "best_f1_test" and point_adjust: 38 | prec, rec, thresholds = precision_recall_curve(y_test, score_t_test, pos_label=1) 39 | if not config["exact_pt_adj"]: 40 | fscore_best_time = [get_f_score(precision, recall) for precision, recall in zip(prec, rec)] 41 | opt_num = np.squeeze(np.argmax(fscore_best_time)) 42 | opt_thres = thresholds[opt_num] 43 | thresholds = np.random.choice(thresholds, size=5000) + [opt_thres] 44 | fscores = [] 45 | for thres in thresholds: 46 | _, _, _, _, _, fscore = get_point_adjust_scores(y_test, score_t_test > thres, true_events) 47 | fscores.append(fscore) 48 | opt_thres = thresholds[np.argmax(fscores)] 49 | pred_labels = score_t_test > opt_thres 50 | 51 | elif thres_method == "best_f1_test" and composite_best_f1: 52 | prec, rec, thresholds = precision_recall_curve(y_test, score_t_test, pos_label=1) 53 | precs_t = prec 54 | fscores_c = [get_composite_fscore_from_scores(score_t_test, thres, true_events, prec_t) for thres, prec_t in 55 | zip(thresholds, precs_t)] 56 | try: 57 | opt_thres = thresholds[np.nanargmax(fscores_c)] 58 | except: 59 | opt_thres = 0.0 60 | pred_labels = score_t_test > opt_thres 61 | 62 | elif thres_method == "top_k_time": 63 | opt_thres = np.nanpercentile(score_t_test, 100 * (1 - test_anom_frac), interpolation='higher') 64 | pred_labels = np.where(score_t_test > opt_thres, 1, 0) 65 | 66 | elif thres_method == "best_f1_test": 67 | prec, rec, thres = precision_recall_curve(y_test, score_t_test, pos_label=1) 68 | fscore = [get_f_score(precision, recall) for precision, recall in zip(prec, rec)] 69 | opt_num = np.squeeze(np.argmax(fscore)) 70 | opt_thres = thres[opt_num] 71 | pred_labels = np.where(score_t_test > opt_thres, 1, 0) 72 | 73 | elif "tail_prob" in thres_method: 74 | tail_neg_log_prob = config["tail_prob"] 75 | opt_thres = tail_neg_log_prob 76 | pred_labels = np.where(score_t_test > opt_thres, 1, 0) 77 | 78 | elif thres_method == "nasa_npt": 79 | opt_thres = 0.5 80 | pred_labels = get_npt_labels(score_t_test, y_test, config) 81 | else: 82 | logger.error("Thresholding method {} not in [top_k_time, best_f1_test, tail_prob]".format(thres_method)) 83 | return None, None 84 | if return_auc: 85 | avg_prec = average_precision_score(y_test, score_t_test) 86 | auroc = roc_auc_score(y_test, score_t_test) 87 | return opt_thres, pred_labels, avg_prec, auroc 88 | return opt_thres, pred_labels 89 | 90 | 91 | # most-top funcion 92 | def evaluate_predicted_labels(pred_labels, y_test, true_events, logger, eval_method="time-wise", breaks=[], 93 | point_adjust=False): 94 | """ 95 | Computes evaluation metrics for the binary classifications given the true and predicted labels 96 | :param point_adjust: used to judge whether is pa 97 | :param pred_labels: array of predicted labels 98 | :param y_test: array of true labels 99 | :param eval_method: string that indicates whether we evaluate the classification time point-wise or event-wise 100 | :param breaks: array of discontinuities in the time series, relevant only if you look at event-wise 101 | :param return_raw: Boolean that indicates whether we want to return tp, fp and fn or prec, recall and f1 102 | :return: tuple of evaluation metrics 103 | """ 104 | 105 | if eval_method == "time-wise": 106 | # point-adjust fscore 107 | if point_adjust: 108 | fp, fn, tp, prec, rec, fscore = get_point_adjust_scores(y_test, pred_labels, true_events) 109 | # normal 
fscore 110 | else: 111 | _, prec, rec, fscore, _ = get_accuracy_precision_recall_fscore(y_test, pred_labels) 112 | tp = np.sum(pred_labels * y_test) 113 | fp = np.sum(pred_labels) - tp 114 | fn = np.sum(y_test) - tp 115 | # event-wise 116 | else: 117 | logger.error("Evaluation method {} not in [time-wise, event-wise]".format(eval_method)) 118 | return 0, 0, 0 119 | 120 | return tp, fp, fn, prec, rec, fscore 121 | -------------------------------------------------------------------------------- /metrics/fc_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import precision_score 3 | 4 | 5 | def get_events(y_test, outlier=1, normal=0): 6 | events = dict() 7 | label_prev = normal 8 | event = 0 # corresponds to no event 9 | event_start = 0 10 | for tim, label in enumerate(y_test): 11 | if label == outlier: 12 | if label_prev == normal: 13 | event += 1 14 | event_start = tim 15 | else: 16 | if label_prev == outlier: 17 | event_end = tim - 1 18 | events[event] = (event_start, event_end) 19 | label_prev = label 20 | 21 | if label_prev == outlier: 22 | event_end = tim - 1 23 | events[event] = (event_start, event_end) 24 | return events 25 | 26 | 27 | def get_composite_fscore_raw(y_test, pred_labels, true_events, return_prec_rec=False): 28 | tp = np.sum([pred_labels[start:end + 1].any() for start, end in true_events.values()]) 29 | fn = len(true_events) - tp 30 | rec_e = tp / (tp + fn) 31 | prec_t = precision_score(y_test, pred_labels) 32 | fscore_c = 2 * rec_e * prec_t / (rec_e + prec_t) 33 | if prec_t == 0 and rec_e == 0: 34 | fscore_c = 0 35 | if return_prec_rec: 36 | return prec_t, rec_e, fscore_c 37 | return fscore_c 38 | 39 | 40 | def main(): 41 | y_test = np.zeros(100) 42 | y_test[10:20] = 1 43 | y_test[50:60] = 1 44 | pred_labels = np.zeros(100) 45 | pred_labels[15:17] = 1 46 | pred_labels[55:62] = 1 47 | # pred_labels[51:55] = 1 48 | # true_events = get_events(y_test) 49 | prec_t, rec_e, fscore_c = get_composite_fscore_raw(pred_labels, y_test, return_prec_rec=True) 50 | # print("Prec_t: {}, rec_e: {}, fscore_c: {}".format(prec_t, rec_e, fscore_c)) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /metrics/metrics.py: -------------------------------------------------------------------------------- 1 | from metrics.f1_score_f1_pa import * 2 | from metrics.fc_score import * 3 | from metrics.precision_at_k import * 4 | from metrics.customizable_f1_score import * 5 | from metrics.AUC import * 6 | from metrics.Matthews_correlation_coefficient import * 7 | from metrics.affiliation.generics import convert_vector_to_events 8 | from metrics.affiliation.metrics import pr_from_events 9 | from metrics.vus.models.feature import Window 10 | from metrics.vus.metrics import get_range_vus_roc 11 | import numpy as np 12 | 13 | def combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores): 14 | events_pred = convert_vector_to_events(y_test) 15 | events_gt = convert_vector_to_events(pred_labels) 16 | Trange = (0, len(y_test)) 17 | affiliation = pr_from_events(events_pred, events_gt, Trange) 18 | true_events = get_events(y_test) 19 | pa_accuracy, pa_precision, pa_recall, pa_f_score = get_adjust_F1PA(y_test, pred_labels) 20 | MCC_score = MCC(y_test, pred_labels) 21 | vus_results = get_range_vus_roc(y_test, pred_labels, 100) # default slidingWindow = 100 22 | 23 | score_list_simple = { 24 | "pa_accuracy":pa_accuracy, 25 | 
"pa_precision":pa_precision, 26 | "pa_recall":pa_recall, 27 | "pa_f_score":pa_f_score, 28 | "MCC_score":MCC_score, 29 | "Affiliation precision": affiliation['precision'], 30 | "Affiliation recall": affiliation['recall'], 31 | "R_AUC_ROC": vus_results["R_AUC_ROC"], 32 | "R_AUC_PR": vus_results["R_AUC_PR"], 33 | "VUS_ROC": vus_results["VUS_ROC"], 34 | "VUS_PR": vus_results["VUS_PR"] 35 | } 36 | 37 | # return score_list, score_list_simple 38 | return score_list_simple 39 | 40 | 41 | if __name__ == '__main__': 42 | y_test = np.load("data/events_pred_MSL.npy")+0 43 | pred_labels = np.load("data/events_gt_MSL.npy")+0 44 | anomaly_scores = np.load("data/events_scores_MSL.npy") 45 | print(len(y_test), max(anomaly_scores), min(anomaly_scores)) 46 | score_list_simple = combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores) 47 | 48 | for key, value in score_list_simple.items(): 49 | print('{0:21} :{1:10f}'.format(key, value)) -------------------------------------------------------------------------------- /metrics/precision_at_k.py: -------------------------------------------------------------------------------- 1 | # k is defined as the number of anomalies 2 | # only calculate the range top k not the whole set 3 | import numpy as np 4 | 5 | 6 | def precision_at_k(y_test, score_t_test, pred_labels): 7 | # top-k 8 | k = int(np.sum(y_test)) 9 | threshold = np.percentile(score_t_test, 100 * (1 - k / len(y_test))) 10 | 11 | # precision_at_k = metrics.top_k_accuracy_score(label, score, k) 12 | p_at_k = np.where(pred_labels > threshold)[0] 13 | TP_at_k = sum(y_test[p_at_k]) 14 | precision_at_k = TP_at_k / k 15 | return precision_at_k 16 | -------------------------------------------------------------------------------- /metrics/vus/analysis/robustness_eval.py: -------------------------------------------------------------------------------- 1 | from random import shuffle 2 | import numpy as np 3 | import math 4 | import matplotlib.pyplot as plt 5 | from matplotlib import cm 6 | import pandas as pd 7 | from tqdm import tqdm as tqdm 8 | import time 9 | from sklearn.preprocessing import MinMaxScaler 10 | import random 11 | 12 | 13 | import os 14 | import sys 15 | module_path = os.path.abspath(os.path.join('../..')) 16 | if module_path not in sys.path: 17 | sys.path.append(module_path) 18 | 19 | from metrics.vus.utils.slidingWindows import find_length 20 | from metrics.vus.utils.metrics import metricor 21 | 22 | from metrics.vus.models.distance import Fourier 23 | from metrics.vus.models.feature import Window 24 | 25 | 26 | def generate_new_label(label,lag): 27 | if lag < 0: 28 | return np.array(list(label[-lag:]) + [0]*(-lag)) 29 | elif lag > 0: 30 | return np.array([0]*lag + list(label[:-lag])) 31 | elif lag == 0: 32 | return label 33 | 34 | def compute_anomaly_acc_lag(methods_scores,label,slidingWindow,methods_keys): 35 | 36 | lag_range = list(range(-slidingWindow//4,slidingWindow//4,5)) 37 | methods_acc = {} 38 | for i,methods_score in enumerate(tqdm(methods_keys)): 39 | dict_acc = { 40 | 'R_AUC_ROC': [], 41 | 'AUC_ROC': [], 42 | 'R_AUC_PR': [], 43 | 'AUC_PR': [], 44 | 'VUS_ROC': [], 45 | 'VUS_PR': [], 46 | 'Precision': [], 47 | 'Recall': [], 48 | 'F': [], 49 | 'ExistenceReward':[], 50 | 'OverlapReward': [], 51 | 'Precision@k': [], 52 | 'Rprecision': [], 53 | 'Rrecall': [], 54 | 'RF': []} 55 | 56 | for lag in tqdm(lag_range): 57 | new_label = generate_new_label(label,lag) 58 | 59 | grader = metricor() 60 | 61 | R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=new_label, 
score=methods_scores[methods_score], window=slidingWindow, plot_ROC=True) 62 | L, fpr, tpr= grader.metric_new(new_label, methods_scores[methods_score], plot_ROC=True) 63 | precision, recall, AP = grader.metric_PR(new_label, methods_scores[methods_score]) 64 | Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d = generate_curve(new_label,methods_scores[methods_score],2*slidingWindow) 65 | L1 = [ elem for elem in L] 66 | 67 | dict_acc['R_AUC_ROC'] +=[R_AUC] 68 | dict_acc['AUC_ROC'] +=[L1[0]] 69 | dict_acc['R_AUC_PR'] +=[R_AP] 70 | dict_acc['AUC_PR'] +=[AP] 71 | dict_acc['VUS_ROC'] +=[avg_auc_3d] 72 | dict_acc['VUS_PR'] +=[avg_ap_3d] 73 | dict_acc['Precision'] +=[L1[1]] 74 | dict_acc['Recall'] +=[L1[2]] 75 | dict_acc['F'] +=[L1[3]] 76 | dict_acc['ExistenceReward']+=[L1[5]] 77 | dict_acc['OverlapReward'] +=[L1[6]] 78 | dict_acc['Precision@k'] +=[L1[9]] 79 | dict_acc['Rprecision'] +=[L1[7]] 80 | dict_acc['Rrecall'] +=[L1[4]] 81 | dict_acc['RF'] +=[L1[8]] 82 | 83 | methods_acc[methods_score] = dict_acc 84 | return methods_acc 85 | 86 | 87 | def compute_anomaly_acc_percentage(methods_scores,label,slidingWindow,methods_keys,pos_first_anom): 88 | 89 | 90 | list_pos = [] 91 | step_a = max(0,(len(label) - pos_first_anom-200))//20 92 | step_b = max(0,pos_first_anom-200)//20 93 | pos_a = min(len(label),pos_first_anom + 200) 94 | pos_b = max(0,pos_first_anom - 200) 95 | list_pos.append((pos_b,pos_a)) 96 | for pos_iter in range(20): 97 | pos_a = min(len(label),pos_a + step_a) 98 | pos_b = max(0,pos_b - step_b) 99 | list_pos.append((pos_b,pos_a)) 100 | methods_acc = {} 101 | print(list_pos) 102 | for i,methods_score in enumerate(tqdm(methods_keys)): 103 | dict_acc = { 104 | 'R_AUC_ROC': [], 105 | 'AUC_ROC': [], 106 | 'R_AUC_PR': [], 107 | 'AUC_PR': [], 108 | 'VUS_ROC': [], 109 | 'VUS_PR': [], 110 | 'Precision': [], 111 | 'Recall': [], 112 | 'F': [], 113 | 'ExistenceReward':[], 114 | 'OverlapReward': [], 115 | 'Precision@k': [], 116 | 'Rprecision': [], 117 | 'Rrecall': [], 118 | 'RF': []} 119 | 120 | for end_pos in tqdm(list_pos): 121 | new_label = label[end_pos[0]:end_pos[1]] 122 | new_score = np.array(methods_scores[methods_score])[end_pos[0]:end_pos[1]] 123 | grader = metricor() 124 | 125 | R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=new_label, score=new_score, window=slidingWindow, plot_ROC=True) 126 | L, fpr, tpr= grader.metric_new(new_label, new_score, plot_ROC=True) 127 | precision, recall, AP = grader.metric_PR(new_label, new_score) 128 | Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d = generate_curve(new_label,new_score,2*slidingWindow) 129 | L1 = [ elem for elem in L] 130 | 131 | dict_acc['R_AUC_ROC'] +=[R_AUC] 132 | dict_acc['AUC_ROC'] +=[L1[0]] 133 | dict_acc['R_AUC_PR'] +=[R_AP] 134 | dict_acc['AUC_PR'] +=[AP] 135 | dict_acc['VUS_ROC'] +=[avg_auc_3d] 136 | dict_acc['VUS_PR'] +=[avg_ap_3d] 137 | dict_acc['Precision'] +=[L1[1]] 138 | dict_acc['Recall'] +=[L1[2]] 139 | dict_acc['F'] +=[L1[3]] 140 | dict_acc['ExistenceReward']+=[L1[5]] 141 | dict_acc['OverlapReward'] +=[L1[6]] 142 | dict_acc['Precision@k'] +=[L1[9]] 143 | dict_acc['Rprecision'] +=[L1[7]] 144 | dict_acc['Rrecall'] +=[L1[4]] 145 | dict_acc['RF'] +=[L1[8]] 146 | 147 | methods_acc[methods_score] = dict_acc 148 | return methods_acc 149 | 150 | def compute_anomaly_acc_noise(methods_scores,label,slidingWindow,methods_keys): 151 | 152 | lag_range = list(range(-slidingWindow//2,slidingWindow//2,10)) 153 | methods_acc = {} 154 | for i,methods_score in enumerate(tqdm(methods_keys)): 155 | dict_acc = { 156 | 'R_AUC_ROC': [], 157 | 
'AUC_ROC': [], 158 | 'R_AUC_PR': [], 159 | 'AUC_PR': [], 160 | 'VUS_ROC': [], 161 | 'VUS_PR': [], 162 | 'Precision': [], 163 | 'Recall': [], 164 | 'F': [], 165 | 'ExistenceReward':[], 166 | 'OverlapReward': [], 167 | 'Precision@k': [], 168 | 'Rprecision': [], 169 | 'Rrecall': [], 170 | 'RF': []} 171 | 172 | for lag in tqdm(lag_range): 173 | new_label = label 174 | 175 | grader = metricor() 176 | 177 | noise = np.random.normal(-0.1,0.1,len(methods_scores[methods_score])) 178 | 179 | new_score = np.array(methods_scores[methods_score]) + noise 180 | new_score = (new_score - min(new_score))/(max(new_score) - min(new_score)) 181 | 182 | R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=new_label, score=new_score, window=slidingWindow, plot_ROC=True) 183 | L, fpr, tpr= grader.metric_new(new_label, new_score, plot_ROC=True) 184 | precision, recall, AP = grader.metric_PR(new_label, new_score) 185 | Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d = generate_curve(new_label,new_score,2*slidingWindow) 186 | L1 = [ elem for elem in L] 187 | 188 | dict_acc['R_AUC_ROC'] +=[R_AUC] 189 | dict_acc['AUC_ROC'] +=[L1[0]] 190 | dict_acc['R_AUC_PR'] +=[R_AP] 191 | dict_acc['AUC_PR'] +=[AP] 192 | dict_acc['VUS_ROC'] +=[avg_auc_3d] 193 | dict_acc['VUS_PR'] +=[avg_ap_3d] 194 | dict_acc['Precision'] +=[L1[1]] 195 | dict_acc['Recall'] +=[L1[2]] 196 | dict_acc['F'] +=[L1[3]] 197 | dict_acc['ExistenceReward']+=[L1[5]] 198 | dict_acc['OverlapReward'] +=[L1[6]] 199 | dict_acc['Precision@k'] +=[L1[9]] 200 | dict_acc['Rprecision'] +=[L1[7]] 201 | dict_acc['Rrecall'] +=[L1[4]] 202 | dict_acc['RF'] +=[L1[8]] 203 | 204 | methods_acc[methods_score] = dict_acc 205 | return methods_acc 206 | 207 | 208 | def compute_anomaly_acc_pairwise(methods_scores,label,slidingWindow,method1,method2): 209 | 210 | lag_range = list(range(-slidingWindow//4,slidingWindow//4,5)) 211 | methods_acc = {} 212 | method_key = [method1] 213 | if method2 is not None: 214 | method_key = [method1,method2] 215 | for i,methods_score in enumerate(tqdm(method_key)): 216 | dict_acc = { 217 | 'R_AUC_ROC': [], 218 | 'AUC_ROC': [], 219 | 'R_AUC_PR': [], 220 | 'AUC_PR': [], 221 | 'VUS_ROC': [], 222 | 'VUS_PR': [], 223 | 'Precision': [], 224 | 'Recall': [], 225 | 'F': [], 226 | 'ExistenceReward':[], 227 | 'OverlapReward': [], 228 | 'Precision@k': [], 229 | 'Rprecision': [], 230 | 'Rrecall': [], 231 | 'RF': []} 232 | 233 | for lag in tqdm(range(60)): 234 | new_lag = random.randint(-slidingWindow//4,slidingWindow//4) 235 | new_label = generate_new_label(label,new_lag) 236 | 237 | noise = np.random.normal(-0.1,0.1,len(methods_scores[methods_score])) 238 | new_score = np.array(methods_scores[methods_score]) + noise 239 | new_score = (new_score - min(new_score))/(max(new_score) - min(new_score)) 240 | 241 | grader = metricor() 242 | 243 | R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=new_label, score=new_score, window=slidingWindow, plot_ROC=True) 244 | L, fpr, tpr= grader.metric_new(new_label, new_score, plot_ROC=True) 245 | precision, recall, AP = grader.metric_PR(new_label, new_score) 246 | #range_anomaly = grader.range_convers_new(new_label) 247 | Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d = generate_curve(new_label,new_score,2*slidingWindow) 248 | L1 = [ elem for elem in L] 249 | 250 | dict_acc['R_AUC_ROC'] +=[R_AUC] 251 | dict_acc['AUC_ROC'] +=[L1[0]] 252 | dict_acc['R_AUC_PR'] +=[R_AP] 253 | dict_acc['AUC_PR'] +=[AP] 254 | dict_acc['VUS_ROC'] +=[avg_auc_3d] 255 | dict_acc['VUS_PR'] +=[avg_ap_3d] 256 | dict_acc['Precision'] +=[L1[1]] 257 | 
dict_acc['Recall'] +=[L1[2]] 258 | dict_acc['F'] +=[L1[3]] 259 | dict_acc['ExistenceReward']+=[L1[5]] 260 | dict_acc['OverlapReward'] +=[L1[6]] 261 | dict_acc['Precision@k'] +=[L1[9]] 262 | dict_acc['Rprecision'] +=[L1[7]] 263 | dict_acc['Rrecall'] +=[L1[4]] 264 | dict_acc['RF'] +=[L1[8]] 265 | 266 | methods_acc[methods_score] = dict_acc 267 | return methods_acc 268 | 269 | 270 | def normalize_dict_exp(methods_acc_lag,methods_keys): 271 | key_metrics = [ 272 | 'VUS_ROC', 273 | 'VUS_PR', 274 | 'R_AUC_ROC', 275 | 'R_AUC_PR', 276 | 'AUC_ROC', 277 | 'AUC_PR', 278 | 'Rprecision', 279 | 'Rrecall', 280 | 'RF', 281 | 'Precision', 282 | 'Recall', 283 | 'F', 284 | 'Precision@k' 285 | ][::-1] 286 | 287 | norm_methods_acc_lag = {} 288 | for key in methods_keys: 289 | norm_methods_acc_lag[key] = {} 290 | for key_metric in key_metrics: 291 | ts = methods_acc_lag[key][key_metric] 292 | new_ts = list(np.array(ts) - np.mean(ts)) 293 | norm_methods_acc_lag[key][key_metric] = new_ts 294 | return norm_methods_acc_lag 295 | 296 | def group_dict(methods_acc_lag,methods_keys): 297 | key_metrics = [ 298 | 'VUS_ROC', 299 | 'VUS_PR', 300 | 'R_AUC_ROC', 301 | 'R_AUC_PR', 302 | 'AUC_ROC', 303 | 'AUC_PR', 304 | 'Rprecision', 305 | 'Rrecall', 306 | 'RF', 307 | 'Precision', 308 | 'Recall', 309 | 'F', 310 | 'Precision@k' 311 | ][::-1] 312 | 313 | norm_methods_acc_lag = {key:[] for key in key_metrics} 314 | for key in methods_keys: 315 | for key_metric in key_metrics: 316 | ts = list(methods_acc_lag[key][key_metric]) 317 | new_ts = list(np.array(ts) - np.mean(ts)) 318 | norm_methods_acc_lag[key_metric] += new_ts 319 | return norm_methods_acc_lag 320 | 321 | 322 | def generate_curve(label,score,slidingWindow): 323 | tpr_3d, fpr_3d, prec_3d, window_3d, avg_auc_3d, avg_ap_3d = metricor().RangeAUC_volume(labels_original=label, score=score, windowSize=1*slidingWindow) 324 | 325 | X = np.array(tpr_3d).reshape(1,-1).ravel() 326 | X_ap = np.array(tpr_3d)[:,:-1].reshape(1,-1).ravel() 327 | Y = np.array(fpr_3d).reshape(1,-1).ravel() 328 | W = np.array(prec_3d).reshape(1,-1).ravel() 329 | Z = np.repeat(window_3d, len(tpr_3d[0])) 330 | Z_ap = np.repeat(window_3d, len(tpr_3d[0])-1) 331 | 332 | return Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d 333 | 334 | def box_plot(data, edge_color, fill_color): 335 | bp = ax.boxplot(data, patch_artist=True) 336 | 337 | for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']: 338 | plt.setp(bp[element], color=edge_color) 339 | 340 | for patch in bp['boxes']: 341 | patch.set(facecolor=fill_color) 342 | 343 | return bp 344 | -------------------------------------------------------------------------------- /metrics/vus/analysis/score_computation.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import math 4 | import matplotlib.pyplot as plt 5 | from matplotlib import cm 6 | import pandas as pd 7 | from tqdm import tqdm as tqdm 8 | import time 9 | from sklearn.preprocessing import MinMaxScaler 10 | import random 11 | 12 | 13 | import os 14 | import sys 15 | module_path = os.path.abspath(os.path.join('../..')) 16 | if module_path not in sys.path: 17 | sys.path.append(module_path) 18 | 19 | from metrics.vus.utils.slidingWindows import find_length 20 | from metrics.vus.utils.metrics import metricor 21 | 22 | from metrics.vus.models.distance import Fourier 23 | from metrics.vus.models.feature import Window 24 | from metrics.vus.models.cnn import cnn 25 | from metrics.vus.models.AE_mlp2 import AE_MLP2 26 | from 
metrics.vus.models.lstm import lstm 27 | from metrics.vus.models.ocsvm import OCSVM 28 | from metrics.vus.models.poly import POLY 29 | from metrics.vus.models.pca import PCA 30 | from metrics.vus.models.norma import NORMA 31 | from metrics.vus.models.matrix_profile import MatrixProfile 32 | from metrics.vus.models.lof import LOF 33 | from metrics.vus.models.iforest import IForest 34 | 35 | def find_section_length(label,length): 36 | best_i = None 37 | best_sum = None 38 | current_subseq = False 39 | for i in range(len(label)): 40 | changed = False 41 | if label[i] == 1: 42 | if current_subseq == False: 43 | current_subseq = True 44 | if best_i is None: 45 | changed = True 46 | best_i = i 47 | best_sum = np.sum(label[max(0,i-200):min(len(label),i+9800)]) 48 | else: 49 | if np.sum(label[max(0,i-200):min(len(label),i+9800)]) < best_sum: 50 | changed = True 51 | best_i = i 52 | best_sum = np.sum(label[max(0,i-200):min(len(label),i+9800)]) 53 | else: 54 | changed = False 55 | if changed: 56 | diff = i+9800 - len(label) 57 | 58 | pos1 = max(0,i-200 - max(0,diff)) 59 | pos2 = min(i+9800,len(label)) 60 | else: 61 | current_subseq = False 62 | if best_i is not None: 63 | return best_i-pos1,(pos1,pos2) 64 | else: 65 | return None,None 66 | 67 | def generate_data(filepath,init_pos,max_length): 68 | 69 | df = pd.read_csv(filepath, header=None).to_numpy() 70 | name = filepath.split('/')[-1] 71 | #max_length = 30000 72 | data = df[init_pos:init_pos+max_length,0].astype(float) 73 | label = df[init_pos:init_pos+max_length,1] 74 | 75 | pos_first_anom,pos = find_section_length(label,max_length) 76 | 77 | data = df[pos[0]:pos[1],0].astype(float) 78 | label = df[pos[0]:pos[1],1] 79 | 80 | slidingWindow = find_length(data) 81 | #slidingWindow = 70 82 | X_data = Window(window = slidingWindow).convert(data).to_numpy() 83 | 84 | data_train = data[:int(0.1*len(data))] 85 | data_test = data 86 | 87 | X_train = Window(window = slidingWindow).convert(data_train).to_numpy() 88 | X_test = Window(window = slidingWindow).convert(data_test).to_numpy() 89 | 90 | return pos_first_anom,slidingWindow,data,X_data,data_train,data_test,X_train,X_test,label 91 | 92 | def compute_score(methods,slidingWindow,data,X_data,data_train,data_test,X_train,X_test): 93 | 94 | methods_scores = {} 95 | for method in methods: 96 | start_time = time.time() 97 | if method == 'IForest': 98 | clf = IForest(n_jobs=1) 99 | x = X_data 100 | clf.fit(x) 101 | score = clf.decision_scores_ 102 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 103 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 104 | 105 | elif method == 'LOF': 106 | clf = LOF(n_neighbors=20, n_jobs=1) 107 | x = X_data 108 | clf.fit(x) 109 | score = clf.decision_scores_ 110 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 111 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 112 | 113 | elif method == 'MatrixProfile': 114 | clf = MatrixProfile(window = slidingWindow) 115 | x = data 116 | clf.fit(x) 117 | score = clf.decision_scores_ 118 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 119 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 120 | 121 | elif method == 'NormA': 122 | clf = NORMA(pattern_length = slidingWindow, nm_size=3*slidingWindow) 123 | x = data 124 | clf.fit(x) 125 | score 
= clf.decision_scores_ 126 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 127 | score = np.array([score[0]]*((slidingWindow-1)//2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 128 | 129 | elif method == 'PCA': 130 | clf = PCA() 131 | x = X_data 132 | clf.fit(x) 133 | score = clf.decision_scores_ 134 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 135 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 136 | 137 | elif method == 'POLY': 138 | clf = POLY(power=3, window = slidingWindow) 139 | x = data 140 | clf.fit(x) 141 | measure = Fourier() 142 | measure.detector = clf 143 | measure.set_param() 144 | clf.decision_function(measure=measure) 145 | score = clf.decision_scores_ 146 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 147 | 148 | elif method == 'OCSVM': 149 | X_train_ = MinMaxScaler(feature_range=(0,1)).fit_transform(X_train.T).T 150 | X_test_ = MinMaxScaler(feature_range=(0,1)).fit_transform(X_test.T).T 151 | clf = OCSVM(nu=0.05) 152 | clf.fit(X_train_, X_test_) 153 | score = clf.decision_scores_ 154 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 155 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 156 | 157 | elif method == 'LSTM': 158 | clf = lstm(slidingwindow = slidingWindow, predict_time_steps=1, epochs = 50, patience = 5, verbose=0) 159 | clf.fit(data_train, data_test) 160 | measure = Fourier() 161 | measure.detector = clf 162 | measure.set_param() 163 | clf.decision_function(measure=measure) 164 | score = clf.decision_scores_ 165 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 166 | 167 | elif method == 'AE': 168 | clf = AE_MLP2(slidingWindow = slidingWindow, epochs=100, verbose=0) 169 | clf.fit(data_train, data_test) 170 | score = clf.decision_scores_ 171 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 172 | 173 | elif method == 'CNN': 174 | clf = cnn(slidingwindow = slidingWindow, predict_time_steps=1, epochs = 100, patience = 5, verbose=0) 175 | clf.fit(data_train, data_test) 176 | measure = Fourier() 177 | measure.detector = clf 178 | measure.set_param() 179 | clf.decision_function(measure=measure) 180 | score = clf.decision_scores_ 181 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 182 | 183 | #end_time = time.time() 184 | #time_exec = end_time - start_time 185 | #print(method,"\t time: {}".format(time_exec)) 186 | methods_scores[method] = score 187 | 188 | return methods_scores 189 | 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /metrics/vus/metrics.py: -------------------------------------------------------------------------------- 1 | from .utils.metrics import metricor 2 | from .analysis.robustness_eval import generate_curve 3 | 4 | 5 | def get_range_vus_roc(score, labels, slidingWindow): 6 | grader = metricor() 7 | R_AUC_ROC, R_AUC_PR, _, _, _ = grader.RangeAUC(labels=labels, score=score, window=slidingWindow, plot_ROC=True) 8 | _, _, _, _, _, _,VUS_ROC, VUS_PR = generate_curve(labels, score, 2*slidingWindow) 9 | metrics = {'R_AUC_ROC': R_AUC_ROC, 'R_AUC_PR': R_AUC_PR, 'VUS_ROC': VUS_ROC, 'VUS_PR': VUS_PR} 10 | 11 | return metrics 12 | 
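A minimal usage sketch for get_range_vus_roc defined just above: the toy series, the window length of 100, and the expectation that the vus utilities accept plain NumPy arrays are illustrative assumptions; the call itself mirrors the signature get_range_vus_roc(score, labels, slidingWindow) and the import path metrics.vus.metrics used elsewhere in this repository.

import numpy as np
from metrics.vus.metrics import get_range_vus_roc

# Toy data: 1000 time steps with one labelled anomaly segment.
labels = np.zeros(1000)
labels[400:430] = 1

# A continuous anomaly score that is elevated around the labelled segment.
score = np.random.rand(1000) * 0.1
score[395:435] += 0.8

# slidingWindow = 100 matches the default used in combine_all_evaluation_scores.
vus_results = get_range_vus_roc(score, labels, 100)
print(vus_results)  # dict with keys R_AUC_ROC, R_AUC_PR, VUS_ROC, VUS_PR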
-------------------------------------------------------------------------------- /metrics/vus/models/feature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Classes of feature mapping for model type B 3 | """ 4 | 5 | import numpy as np 6 | # import matplotlib.pyplot as plt 7 | # import random 8 | # from arch import arch_model 9 | import pandas as pd 10 | import math 11 | # import pmdarima as pm 12 | # from pmdarima import model_selection 13 | # import os 14 | # import dis 15 | # import statistics 16 | # from sklearn import metrics 17 | # import sklearn 18 | from tsfresh import extract_features 19 | 20 | from statsmodels.tsa.seasonal import seasonal_decompose 21 | 22 | # import itertools 23 | # import functools 24 | import warnings 25 | from builtins import range 26 | # from collections import defaultdict 27 | 28 | 29 | from numpy.linalg import LinAlgError 30 | # from scipy.signal import cwt, find_peaks_cwt, ricker, welch 31 | # from scipy.stats import linregress 32 | # from statsmodels.tools.sm_exceptions import MissingDataError 33 | 34 | with warnings.catch_warnings(): 35 | # Ignore warnings of the patsy package 36 | warnings.simplefilter("ignore", DeprecationWarning) 37 | 38 | from statsmodels.tsa.ar_model import AR 39 | # from statsmodels.tsa.stattools import acf, adfuller, pacf 40 | 41 | from hurst import compute_Hc 42 | 43 | class Window: 44 | """ The class for rolling window feature mapping. 45 | The mapping converts the original timeseries X into a matrix. 46 | The matrix consists of rows of sliding windows of original X. 47 | """ 48 | 49 | def __init__(self, window = 100): 50 | self.window = window 51 | self.detector = None 52 | def convert(self, X): 53 | n = self.window 54 | X = pd.Series(X) 55 | L = [] 56 | if n == 0: 57 | df = X 58 | else: 59 | for i in range(n): 60 | L.append(X.shift(i)) 61 | df = pd.concat(L, axis = 1) 62 | df = df.iloc[n-1:] 63 | return df 64 | 65 | class tf_Stat: 66 | '''statisitc feature extraction using the tf_feature package. 67 | It calculates 763 features in total so it might be over complicated for some models. 68 | Recommend to use for methods like Isolation Forest which randomly picks a feature 69 | and then perform the classification. To use for other distance-based model like KNN, 70 | LOF, CBLOF, etc, first train to pass a function that give weights to individual features so that 71 | inconsequential features won't cloud the important ones (mean, variance, kurtosis, etc). 
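    Note (added for clarity): convert() returns an array with (len(X) - window) rows;
    column 0 holds the raw series values and the remaining columns hold the tsfresh
    features extracted from each sliding window.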
72 | 73 | ''' 74 | def __init__(self, window = 100, step = 25): 75 | self.window = window 76 | self.step = step 77 | self.detector = None 78 | def convert(self, X): 79 | window = self.window 80 | step = self.step 81 | pos = math.ceil(window/2) 82 | #step <= window 83 | 84 | length = X.shape[0] 85 | 86 | Xd = pd.DataFrame(X) 87 | Xd.columns = pd.Index(['x'], dtype='object') 88 | Xd['id'] = 1 89 | Xd['time'] = Xd.index 90 | 91 | test = np.array(extract_features(Xd.iloc[0+pos-math.ceil(window/2):0+pos + math.floor(window/2)], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0)) 92 | M = np.zeros((length - window, test.shape[1]+1 )) 93 | 94 | 95 | i = 0 96 | while i + window <= M.shape[0]: 97 | M[i:i+step, 0]= X[pos + i: pos + i + step] 98 | vector = np.array(extract_features(Xd.iloc[i+pos-math.ceil(window/2):i+pos + math.floor(window/2)], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0)) 99 | 100 | M[i:i+step, 1:] = vector 101 | i+= step 102 | num = M.shape[0] 103 | if i < num: 104 | M[i: num, 0]= X[pos + i: pos + num] 105 | M[i: num, 1:] = np.array(extract_features(Xd.iloc[i+pos-math.ceil(window/2):], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0)) 106 | return M 107 | 108 | class Stat: 109 | '''statisitc feature extraction. 110 | Features include [mean, variance, skewness, kurtosis, autocorrelation, maximum, 111 | minimum, entropy, seasonality, hurst component, AR coef] 112 | 113 | ''' 114 | def __init__(self, window = 100, data_step = 10, param = [{"coeff": 0, "k": 5}], lag = 1, freq = 720): 115 | self.window = window 116 | self.data_step = data_step 117 | self.detector = None 118 | self.param = param 119 | self.lag = lag 120 | self.freq =freq 121 | if data_step > int(window/2): 122 | raise ValueError('value step shoudm\'t be greater than half of the window') 123 | 124 | 125 | def convert(self, X): 126 | freq = self.freq 127 | n = self.window 128 | data_step = self.data_step 129 | X = pd.Series(X) 130 | L = [] 131 | if n == 0: 132 | df = X 133 | raise ValueError('window lenght is set to zero') 134 | else: 135 | for i in range(n): 136 | L.append(X.shift(i)) 137 | df = pd.concat(L, axis = 1) 138 | df = df.iloc[n:] 139 | df2 = pd.concat(L[:data_step], axis = 1) 140 | 141 | 142 | 143 | df = df.reset_index() 144 | #value 145 | x0 = df2[math.ceil(n/2) : - math.floor(n/2)].reset_index() 146 | #mean 147 | x1 = (df.mean(axis=1)) 148 | #variance 149 | x2 = df.var(axis=1) 150 | #AR-coef 151 | self.ar_function = lambda x: self.ar_coefficient(x) 152 | x3 = df.apply(self.ar_function, axis =1, result_type='expand' ) 153 | #autocorrelation 154 | self.auto_function = lambda x: self.autocorrelation(x) 155 | x4 = df.apply(self.auto_function, axis =1, result_type='expand' ) 156 | #kurtosis 157 | x5 = (df.kurtosis(axis=1)) 158 | #skewness 159 | x6 = (df.skew(axis=1)) 160 | #maximum 161 | x7 = (df.max(axis=1)) 162 | #minimum 163 | x8 = (df.min(axis=1)) 164 | #entropy 165 | self.entropy_function = lambda x: self.sample_entropy(x) 166 | x9 = df.apply(self.entropy_function, axis =1, result_type='expand') 167 | 168 | #seasonality 169 | result = seasonal_decompose(X, model='additive', freq = freq, extrapolate_trend='freq') 170 | #seasonal 171 | x10 = pd.Series(np.array(result.seasonal[math.ceil(n/2) : - math.floor(n/2)])) 172 | #trend 173 | x11 = pd.Series(np.array(result.trend[math.ceil(n/2) : - math.floor(n/2)])) 174 | #resid 175 | x12 = pd.Series(np.array(result.resid[math.ceil(n/2) : - math.floor(n/2)])) 176 | 
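        # (Clarifying comment, added: x0 above is the raw value window and x1-x12 are the
        #  per-window statistics and seasonal components; together with the Hurst features
        #  below they are concatenated column-wise into the feature matrix M.)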
177 | #Hurst component 178 | self.hurst_function = lambda x: self.hurst_f(x) 179 | x13 = df.apply(self.hurst_function, axis =1, result_type='expand') 180 | 181 | L = [x0, x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12, x13] 182 | M = pd.concat(L, axis = 1) 183 | M = M.drop(columns=['index']) 184 | 185 | return M 186 | def ar_coefficient(self, x): 187 | """ 188 | This feature calculator fits the unconditional maximum likelihood 189 | of an autoregressive AR(k) process. 190 | The k parameter is the maximum lag of the process 191 | 192 | .. math:: 193 | 194 | X_{t}=\\varphi_0 +\\sum _{{i=1}}^{k}\\varphi_{i}X_{{t-i}}+\\varepsilon_{t} 195 | 196 | For the configurations from param which should contain the maxlag "k" and such an AR process is calculated. Then 197 | the coefficients :math:`\\varphi_{i}` whose index :math:`i` contained from "coeff" are returned. 198 | 199 | :param x: the time series to calculate the feature of 200 | :type x: numpy.ndarray 201 | :param param: contains dictionaries {"coeff": x, "k": y} with x,y int 202 | :type param: list 203 | :return x: the different feature values 204 | :return type: pandas.Series 205 | """ 206 | calculated_ar_params = {} 207 | param = self.param 208 | x_as_list = list(x) 209 | 210 | res = {} 211 | 212 | for parameter_combination in param: 213 | k = parameter_combination["k"] 214 | p = parameter_combination["coeff"] 215 | 216 | column_name = "coeff_{}__k_{}".format(p, k) 217 | 218 | if k not in calculated_ar_params: 219 | try: 220 | calculated_AR = AR(x_as_list) 221 | calculated_ar_params[k] = calculated_AR.fit(maxlag=k, solver="mle").params 222 | except (LinAlgError, ValueError): 223 | calculated_ar_params[k] = [np.NaN] * k 224 | 225 | mod = calculated_ar_params[k] 226 | 227 | if p <= k: 228 | try: 229 | res[column_name] = mod[p] 230 | except IndexError: 231 | res[column_name] = 0 232 | else: 233 | res[column_name] = np.NaN 234 | 235 | L = [(key, value) for key, value in res.items()] 236 | L0 = [] 237 | for item in L: 238 | L0.append(item[1]) 239 | return L0 240 | 241 | def autocorrelation(self, x): 242 | """ 243 | Calculates the autocorrelation of the specified lag, according to the formula [1] 244 | 245 | .. math:: 246 | 247 | \\frac{1}{(n-l)\\sigma^{2}} \\sum_{t=1}^{n-l}(X_{t}-\\mu )(X_{t+l}-\\mu) 248 | 249 | where :math:`n` is the length of the time series :math:`X_i`, :math:`\\sigma^2` its variance and :math:`\\mu` its 250 | mean. `l` denotes the lag. 251 | 252 | .. rubric:: References 253 | 254 | [1] https://en.wikipedia.org/wiki/Autocorrelation#Estimation 255 | 256 | :param x: the time series to calculate the feature of 257 | :type x: numpy.ndarray 258 | :param lag: the lag 259 | :type lag: int 260 | :return: the value of this feature 261 | :return type: float 262 | """ 263 | lag = self.lag 264 | # This is important: If a series is passed, the product below is calculated 265 | # based on the index, which corresponds to squaring the series. 
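        # (Worked example, added for clarity: with x = [1, 2, 3, 4] and lag = 1,
        #  y1 = [1, 2, 3], y2 = [2, 3, 4], x_mean = 2.5 and np.var(x) = 1.25, so the
        #  estimator below returns 1.25 / ((4 - 1) * 1.25) = 1/3.)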
266 | if isinstance(x, pd.Series): 267 | x = x.values 268 | if len(x) < lag: 269 | return np.nan 270 | # Slice the relevant subseries based on the lag 271 | y1 = x[:(len(x) - lag)] 272 | y2 = x[lag:] 273 | # Subtract the mean of the whole series x 274 | x_mean = np.mean(x) 275 | # The result is sometimes referred to as "covariation" 276 | sum_product = np.sum((y1 - x_mean) * (y2 - x_mean)) 277 | # Return the normalized unbiased covariance 278 | v = np.var(x) 279 | if np.isclose(v, 0): 280 | return np.NaN 281 | else: 282 | return sum_product / ((len(x) - lag) * v) 283 | def _into_subchunks(self, x, subchunk_length, every_n=1): 284 | """ 285 | Split the time series x into subwindows of length "subchunk_length", starting every "every_n". 286 | 287 | For example, the input data if [0, 1, 2, 3, 4, 5, 6] will be turned into a matrix 288 | 289 | 0 2 4 290 | 1 3 5 291 | 2 4 6 292 | 293 | with the settings subchunk_length = 3 and every_n = 2 294 | """ 295 | len_x = len(x) 296 | 297 | assert subchunk_length > 1 298 | assert every_n > 0 299 | 300 | # how often can we shift a window of size subchunk_length over the input? 301 | num_shifts = (len_x - subchunk_length) // every_n + 1 302 | shift_starts = every_n * np.arange(num_shifts) 303 | indices = np.arange(subchunk_length) 304 | 305 | indexer = np.expand_dims(indices, axis=0) + np.expand_dims(shift_starts, axis=1) 306 | return np.asarray(x)[indexer] 307 | def sample_entropy(self, x): 308 | """ 309 | Calculate and return sample entropy of x. 310 | 311 | .. rubric:: References 312 | 313 | | [1] http://en.wikipedia.org/wiki/Sample_Entropy 314 | | [2] https://www.ncbi.nlm.nih.gov/pubmed/10843903?dopt=Abstract 315 | 316 | :param x: the time series to calculate the feature of 317 | :type x: numpy.ndarray 318 | 319 | :return: the value of this feature 320 | :return type: float 321 | """ 322 | x = np.array(x) 323 | 324 | # if one of the values is NaN, we can not compute anything meaningful 325 | if np.isnan(x).any(): 326 | return np.nan 327 | 328 | m = 2 # common value for m, according to wikipedia... 329 | tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia... 330 | 331 | # Split time series and save all templates of length m 332 | # Basically we turn [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4] 333 | xm = self._into_subchunks(x, m) 334 | 335 | # Now calculate the maximum distance between each of those pairs 336 | # np.abs(xmi - xm).max(axis=1) 337 | # and check how many are below the tolerance. 338 | # For speed reasons, we are not doing this in a nested for loop, 339 | # but with numpy magic. 340 | # Example: 341 | # if x = [1, 2, 3] 342 | # then xm = [[1, 2], [2, 3]] 343 | # so we will substract xm from [1, 2] => [[0, 0], [-1, -1]] 344 | # and from [2, 3] => [[1, 1], [0, 0]] 345 | # taking the abs and max gives us: 346 | # [0, 1] and [1, 0] 347 | # as the diagonal elements are always 0, we substract 1. 
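        # (Clarifying comment, added: B counts, over all m-length templates, the other
        #  templates within the tolerance under the Chebyshev (max) distance; A does the
        #  same for the (m+1)-length templates; the "- 1" terms drop the self-matches,
        #  and the sample entropy returned below is -log(A / B).)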
348 | B = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm]) 349 | 350 | # Similar for computing A 351 | xmp1 = self._into_subchunks(x, m + 1) 352 | 353 | A = np.sum([np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1]) 354 | 355 | # Return SampEn 356 | return -np.log(A / B) 357 | def hurst_f(self, x): 358 | H,c, M = compute_Hc(x) 359 | return [H, c] -------------------------------------------------------------------------------- /metrics/vus/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import numpy as np 3 | import math 4 | # import matplotlib.pyplot as plt 5 | 6 | class metricor: 7 | def __init__(self, a = 1, probability = True, bias = 'flat', ): 8 | self.a = a 9 | self.probability = probability 10 | self.bias = bias 11 | 12 | def detect_model(self, model, label, contamination = 0.1, window = 100, is_A = False, is_threshold = True): 13 | if is_threshold: 14 | score = self.scale_threshold(model.decision_scores_, model._mu, model._sigma) 15 | else: 16 | score = self.scale_contamination(model.decision_scores_, contamination = contamination) 17 | if is_A is False: 18 | scoreX = np.zeros(len(score)+window) 19 | scoreX[math.ceil(window/2): len(score)+window - math.floor(window/2)] = score 20 | else: 21 | scoreX = score 22 | 23 | self.score_=scoreX 24 | L = self.metric(label, scoreX) 25 | return L 26 | 27 | 28 | def labels_conv(self, preds): 29 | '''return indices of predicted anomaly 30 | ''' 31 | 32 | # p = np.zeros(len(preds)) 33 | index = np.where(preds >= 0.5) 34 | return index[0] 35 | 36 | def labels_conv_binary(self, preds): 37 | '''return predicted label 38 | ''' 39 | p = np.zeros(len(preds)) 40 | index = np.where(preds >= 0.5) 41 | p[index[0]] = 1 42 | return p 43 | 44 | 45 | def w(self, AnomalyRange, p): 46 | MyValue = 0 47 | MaxValue = 0 48 | start = AnomalyRange[0] 49 | AnomalyLength = AnomalyRange[1] - AnomalyRange[0] + 1 50 | for i in range(start, start +AnomalyLength): 51 | bi = self.b(i, AnomalyLength) 52 | MaxValue += bi 53 | if i in p: 54 | MyValue += bi 55 | return MyValue/MaxValue 56 | 57 | def Cardinality_factor(self, Anomolyrange, Prange): 58 | score = 0 59 | start = Anomolyrange[0] 60 | end = Anomolyrange[1] 61 | for i in Prange: 62 | if i[0] >= start and i[0] <= end: 63 | score +=1 64 | elif start >= i[0] and start <= i[1]: 65 | score += 1 66 | elif end >= i[0] and end <= i[1]: 67 | score += 1 68 | elif start >= i[0] and end <= i[1]: 69 | score += 1 70 | if score == 0: 71 | return 0 72 | else: 73 | return 1/score 74 | 75 | def b(self, i, length): 76 | bias = self.bias 77 | if bias == 'flat': 78 | return 1 79 | elif bias == 'front-end bias': 80 | return length - i + 1 81 | elif bias == 'back-end bias': 82 | return i 83 | else: 84 | if i <= length/2: 85 | return i 86 | else: 87 | return length - i + 1 88 | 89 | 90 | def scale_threshold(self, score, score_mu, score_sigma): 91 | return (score >= (score_mu + 3*score_sigma)).astype(int) 92 | 93 | 94 | def metric_new(self, label, score, plot_ROC=False, alpha=0.2,coeff=3): 95 | '''input: 96 | Real labels and anomaly score in prediction 97 | 98 | output: 99 | AUC, 100 | Precision, 101 | Recall, 102 | F-score, 103 | Range-precision, 104 | Range-recall, 105 | Range-Fscore, 106 | Precison@k, 107 | 108 | k is chosen to be # of outliers in real labels 109 | ''' 110 | if np.sum(label) == 0: 111 | print('All labels are 0. 
Label must have groud truth value for calculating AUC score.') 112 | return None 113 | 114 | if np.isnan(score).any() or score is None: 115 | print('Score must not be none.') 116 | return None 117 | 118 | #area under curve 119 | auc = metrics.roc_auc_score(label, score) 120 | # plor ROC curve 121 | if plot_ROC: 122 | fpr, tpr, thresholds = metrics.roc_curve(label, score) 123 | # display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc) 124 | # display.plot() 125 | 126 | #precision, recall, F 127 | 128 | preds = score > (np.mean(score)+coeff*np.std(score)) 129 | if np.sum(preds) == 0: 130 | preds = score > (np.mean(score)+2*np.std(score)) 131 | if np.sum(preds) == 0: 132 | preds = score > (np.mean(score)+1*np.std(score)) 133 | Precision, Recall, F, Support = metrics.precision_recall_fscore_support(label, preds, zero_division=0) 134 | precision = Precision[1] 135 | recall = Recall[1] 136 | f = F[1] 137 | 138 | #range anomaly 139 | Rrecall, ExistenceReward, OverlapReward = self.range_recall_new(label, preds, alpha) 140 | Rprecision = self.range_recall_new(preds, label, 0)[0] 141 | 142 | if Rprecision + Rrecall==0: 143 | Rf=0 144 | else: 145 | Rf = 2 * Rrecall * Rprecision / (Rprecision + Rrecall) 146 | 147 | # top-k 148 | k = int(np.sum(label)) 149 | threshold = np.percentile(score, 100 * (1-k/len(label))) 150 | 151 | # precision_at_k = metrics.top_k_accuracy_score(label, score, k) 152 | p_at_k = np.where(preds > threshold)[0] 153 | TP_at_k = sum(label[p_at_k]) 154 | precision_at_k = TP_at_k/k 155 | 156 | L = [auc, precision, recall, f, Rrecall, ExistenceReward, OverlapReward, Rprecision, Rf, precision_at_k] 157 | if plot_ROC: 158 | return L, fpr, tpr 159 | return L 160 | 161 | def metric_PR(self, label, score): 162 | precision, recall, thresholds = metrics.precision_recall_curve(label, score) 163 | # plt.figure() 164 | # disp = metrics.PrecisionRecallDisplay(precision=precision, recall=recall) 165 | # disp.plot() 166 | AP = metrics.auc(recall, precision) 167 | #AP = metrics.average_precision_score(label, score) 168 | return precision, recall, AP 169 | 170 | def range_recall_new(self, labels, preds, alpha): 171 | 172 | 173 | p = np.where(preds == 1)[0] # positions of predicted label==1 174 | range_pred = self.range_convers_new(preds) 175 | range_label = self.range_convers_new(labels) 176 | 177 | Nr = len(range_label) # total # of real anomaly segments 178 | 179 | ExistenceReward = self.existence_reward(range_label, p) 180 | 181 | 182 | OverlapReward = 0 183 | for i in range_label: 184 | OverlapReward += self.w(i, p) * self.Cardinality_factor(i, range_pred) 185 | 186 | 187 | score = alpha * ExistenceReward + (1-alpha) * OverlapReward 188 | if Nr != 0: 189 | return score/Nr, ExistenceReward/Nr, OverlapReward/Nr 190 | else: 191 | return 0,0,0 192 | 193 | def range_convers_new(self, label): 194 | ''' 195 | input: arrays of binary values 196 | output: list of ordered pair [[a0,b0], [a1,b1]... 
] of the inputs 197 | ''' 198 | L = [] 199 | i = 0 200 | j = 0 201 | while j < len(label): 202 | # print(i) 203 | while label[i] == 0: 204 | i+=1 205 | if i >= len(label): 206 | break 207 | j = i+1 208 | # print('j'+str(j)) 209 | if j >= len(label): 210 | if j==len(label): 211 | L.append((i,j-1)) 212 | 213 | break 214 | while label[j] != 0: 215 | j+=1 216 | if j >= len(label): 217 | L.append((i,j-1)) 218 | break 219 | if j >= len(label): 220 | break 221 | L.append((i, j-1)) 222 | i = j 223 | return L 224 | 225 | def existence_reward(self, labels, preds): 226 | ''' 227 | labels: list of ordered pair 228 | preds predicted data 229 | ''' 230 | 231 | score = 0 232 | for i in labels: 233 | if np.sum(np.multiply(preds <= i[1], preds >= i[0])) > 0: 234 | score += 1 235 | return score 236 | 237 | def num_nonzero_segments(self, x): 238 | count=0 239 | if x[0]>0: 240 | count+=1 241 | for i in range(1, len(x)): 242 | if x[i]>0 and x[i-1]==0: 243 | count+=1 244 | return count 245 | 246 | def extend_postive_range(self, x, window=5): 247 | label = x.copy().astype(float) 248 | L = self.range_convers_new(label) # index of non-zero segments 249 | length = len(label) 250 | for k in range(len(L)): 251 | s = L[k][0] 252 | e = L[k][1] 253 | 254 | 255 | x1 = np.arange(e,min(e+window//2,length)) 256 | label[x1] += np.sqrt(1 - (x1-e)/(window)) 257 | 258 | x2 = np.arange(max(s-window//2,0),s) 259 | label[x2] += np.sqrt(1 - (s-x2)/(window)) 260 | 261 | label = np.minimum(np.ones(length), label) 262 | return label 263 | 264 | def extend_postive_range_individual(self, x, percentage=0.2): 265 | label = x.copy().astype(float) 266 | L = self.range_convers_new(label) # index of non-zero segments 267 | length = len(label) 268 | for k in range(len(L)): 269 | s = L[k][0] 270 | e = L[k][1] 271 | 272 | l0 = int((e-s+1)*percentage) 273 | 274 | x1 = np.arange(e,min(e+l0,length)) 275 | label[x1] += np.sqrt(1 - (x1-e)/(2*l0)) 276 | 277 | x2 = np.arange(max(s-l0,0),s) 278 | label[x2] += np.sqrt(1 - (s-x2)/(2*l0)) 279 | 280 | label = np.minimum(np.ones(length), label) 281 | return label 282 | 283 | def TPR_FPR_RangeAUC(self, labels, pred, P, L): 284 | product = labels * pred 285 | 286 | TP = np.sum(product) 287 | 288 | # recall = min(TP/P,1) 289 | P_new = (P+np.sum(labels))/2 # so TPR is neither large nor small 290 | # P_new = np.sum(labels) 291 | recall = min(TP/P_new,1) 292 | # recall = TP/np.sum(labels) 293 | # print('recall '+str(recall)) 294 | 295 | 296 | existence = 0 297 | for seg in L: 298 | if np.sum(product[seg[0]:(seg[1]+1)])>0: 299 | existence += 1 300 | 301 | existence_ratio = existence/len(L) 302 | # print(existence_ratio) 303 | 304 | # TPR_RangeAUC = np.sqrt(recall*existence_ratio) 305 | # print(existence_ratio) 306 | TPR_RangeAUC = recall*existence_ratio 307 | 308 | FP = np.sum(pred) - TP 309 | # TN = np.sum((1-pred) * (1-labels)) 310 | 311 | # FPR_RangeAUC = FP/(FP+TN) 312 | N_new = len(labels) - P_new 313 | FPR_RangeAUC = FP/N_new 314 | 315 | Precision_RangeAUC = TP/np.sum(pred) 316 | 317 | return TPR_RangeAUC, FPR_RangeAUC, Precision_RangeAUC 318 | 319 | def RangeAUC(self, labels, score, window=0, percentage=0, plot_ROC=False, AUC_type='window'): 320 | # AUC_type='window'/'percentage' 321 | score_sorted = -np.sort(-score) 322 | 323 | P = np.sum(labels) 324 | # print(np.sum(labels)) 325 | if AUC_type=='window': 326 | labels = self.extend_postive_range(labels, window=window) 327 | else: 328 | labels = self.extend_postive_range_individual(labels, percentage=percentage) 329 | 330 | # print(np.sum(labels)) 331 | L = 
self.range_convers_new(labels) 332 | TPR_list = [0] 333 | FPR_list = [0] 334 | Precision_list = [1] 335 | 336 | for i in np.linspace(0, len(score)-1, 250).astype(int): 337 | threshold = score_sorted[i] 338 | # print('thre='+str(threshold)) 339 | pred = score>= threshold 340 | TPR, FPR, Precision = self.TPR_FPR_RangeAUC(labels, pred, P,L) 341 | 342 | TPR_list.append(TPR) 343 | FPR_list.append(FPR) 344 | Precision_list.append(Precision) 345 | 346 | TPR_list.append(1) 347 | FPR_list.append(1) # otherwise, range-AUC will stop earlier than (1,1) 348 | 349 | tpr = np.array(TPR_list) 350 | fpr = np.array(FPR_list) 351 | prec = np.array(Precision_list) 352 | 353 | width = fpr[1:] - fpr[:-1] 354 | height = (tpr[1:] + tpr[:-1])/2 355 | AUC_range = np.sum(width*height) 356 | 357 | width_PR = tpr[1:-1] - tpr[:-2] 358 | height_PR = (prec[1:] + prec[:-1])/2 359 | AP_range = np.sum(width_PR*height_PR) 360 | 361 | if plot_ROC: 362 | return AUC_range, AP_range, fpr, tpr, prec 363 | 364 | return AUC_range 365 | 366 | 367 | # TPR_FPR_window 368 | def RangeAUC_volume(self, labels_original, score, windowSize): 369 | score_sorted = -np.sort(-score) 370 | 371 | tpr_3d=[] 372 | fpr_3d=[] 373 | prec_3d=[] 374 | 375 | auc_3d=[] 376 | ap_3d=[] 377 | 378 | window_3d = np.arange(0, windowSize+1, 1) 379 | P = np.sum(labels_original) 380 | 381 | for window in window_3d: 382 | labels = self.extend_postive_range(labels_original, window) 383 | 384 | # print(np.sum(labels)) 385 | L = self.range_convers_new(labels) 386 | TPR_list = [0] 387 | FPR_list = [0] 388 | Precision_list = [1] 389 | 390 | for i in np.linspace(0, len(score)-1, 250).astype(int): 391 | threshold = score_sorted[i] 392 | # print('thre='+str(threshold)) 393 | pred = score>= threshold 394 | TPR, FPR, Precision = self.TPR_FPR_RangeAUC(labels, pred, P,L) 395 | 396 | TPR_list.append(TPR) 397 | FPR_list.append(FPR) 398 | Precision_list.append(Precision) 399 | 400 | TPR_list.append(1) 401 | FPR_list.append(1) # otherwise, range-AUC will stop earlier than (1,1) 402 | 403 | 404 | tpr = np.array(TPR_list) 405 | fpr = np.array(FPR_list) 406 | prec = np.array(Precision_list) 407 | 408 | tpr_3d.append(tpr) 409 | fpr_3d.append(fpr) 410 | prec_3d.append(prec) 411 | 412 | width = fpr[1:] - fpr[:-1] 413 | height = (tpr[1:] + tpr[:-1])/2 414 | AUC_range = np.sum(width*height) 415 | auc_3d.append(AUC_range) 416 | 417 | width_PR = tpr[1:-1] - tpr[:-2] 418 | height_PR = (prec[1:] + prec[:-1])/2 419 | AP_range = np.sum(width_PR*height_PR) 420 | ap_3d.append(AP_range) 421 | 422 | 423 | return tpr_3d, fpr_3d, prec_3d, window_3d, sum(auc_3d)/len(window_3d), sum(ap_3d)/len(window_3d) 424 | 425 | 426 | 427 | 428 | def generate_curve(label,score,slidingWindow): 429 | tpr_3d, fpr_3d, prec_3d, window_3d, avg_auc_3d, avg_ap_3d = metricor().RangeAUC_volume(labels_original=label, score=score, windowSize=1*slidingWindow) 430 | 431 | X = np.array(tpr_3d).reshape(1,-1).ravel() 432 | X_ap = np.array(tpr_3d)[:,:-1].reshape(1,-1).ravel() 433 | Y = np.array(fpr_3d).reshape(1,-1).ravel() 434 | W = np.array(prec_3d).reshape(1,-1).ravel() 435 | Z = np.repeat(window_3d, len(tpr_3d[0])) 436 | Z_ap = np.repeat(window_3d, len(tpr_3d[0])-1) 437 | 438 | return Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d 439 | 440 | -------------------------------------------------------------------------------- /metrics/vus/utils/slidingWindows.py: -------------------------------------------------------------------------------- 1 | from statsmodels.tsa.stattools import acf 2 | from scipy.signal import argrelextrema 3 | 
import numpy as np 4 | 5 | import matplotlib.patches as mpatches 6 | import matplotlib.pyplot as plt 7 | # determine sliding window (period) based on ACF 8 | def find_length(data): 9 | if len(data.shape)>1: 10 | return 0 11 | data = data[:min(20000, len(data))] 12 | 13 | base = 3 14 | auto_corr = acf(data, nlags=400, fft=True)[base:] 15 | 16 | 17 | local_max = argrelextrema(auto_corr, np.greater)[0] 18 | try: 19 | max_local_max = np.argmax([auto_corr[lcm] for lcm in local_max]) 20 | if local_max[max_local_max]<3 or local_max[max_local_max]>300: 21 | return 125 22 | return local_max[max_local_max]+base 23 | except: 24 | return 125 -------------------------------------------------------------------------------- /model/DCdetector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from einops import rearrange 5 | from .attn import DAC_structure, AttentionLayer 6 | from .embed import DataEmbedding, TokenEmbedding 7 | from .RevIN import RevIN 8 | from tkinter import _flatten 9 | 10 | 11 | class Encoder(nn.Module): 12 | def __init__(self, attn_layers, norm_layer=None): 13 | super(Encoder, self).__init__() 14 | self.attn_layers = nn.ModuleList(attn_layers) 15 | self.norm = norm_layer 16 | 17 | def forward(self, x_patch_size, x_patch_num, x_ori, patch_index, attn_mask=None): 18 | series_list = [] 19 | prior_list = [] 20 | for attn_layer in self.attn_layers: 21 | series, prior = attn_layer(x_patch_size, x_patch_num, x_ori, patch_index, attn_mask=attn_mask) 22 | series_list.append(series) 23 | prior_list.append(prior) 24 | return series_list, prior_list 25 | 26 | 27 | 28 | class DCdetector(nn.Module): 29 | def __init__(self, win_size, enc_in, c_out, n_heads=1, d_model=256, e_layers=3, patch_size=[3,5,7], channel=55, d_ff=512, dropout=0.0, activation='gelu', output_attention=True): 30 | super(DCdetector, self).__init__() 31 | self.output_attention = output_attention 32 | self.patch_size = patch_size 33 | self.channel = channel 34 | self.win_size = win_size 35 | 36 | # Patching List 37 | self.embedding_patch_size = nn.ModuleList() 38 | self.embedding_patch_num = nn.ModuleList() 39 | for i, patchsize in enumerate(self.patch_size): 40 | self.embedding_patch_size.append(DataEmbedding(patchsize, d_model, dropout)) 41 | self.embedding_patch_num.append(DataEmbedding(self.win_size//patchsize, d_model, dropout)) 42 | 43 | self.embedding_window_size = DataEmbedding(enc_in, d_model, dropout) 44 | 45 | # Dual Attention Encoder 46 | self.encoder = Encoder( 47 | [ 48 | AttentionLayer( 49 | DAC_structure(win_size, patch_size, channel, False, attention_dropout=dropout, output_attention=output_attention), 50 | d_model, patch_size, channel, n_heads, win_size)for l in range(e_layers) 51 | ], 52 | norm_layer=torch.nn.LayerNorm(d_model) 53 | ) 54 | 55 | self.projection = nn.Linear(d_model, c_out, bias=True) 56 | 57 | 58 | def forward(self, x): 59 | B, L, M = x.shape #Batch win_size channel 60 | series_patch_mean = [] 61 | prior_patch_mean = [] 62 | revin_layer = RevIN(num_features=M) 63 | 64 | # Instance Normalization Operation 65 | x = revin_layer(x, 'norm') 66 | x_ori = self.embedding_window_size(x) 67 | 68 | # Mutil-scale Patching Operation 69 | for patch_index, patchsize in enumerate(self.patch_size): 70 | x_patch_size, x_patch_num = x, x 71 | x_patch_size = rearrange(x_patch_size, 'b l m -> b m l') #Batch channel win_size 72 | x_patch_num = rearrange(x_patch_num, 'b l m -> b m l') #Batch channel 
win_size 73 | 74 | x_patch_size = rearrange(x_patch_size, 'b m (n p) -> (b m) n p', p = patchsize) 75 | x_patch_size = self.embedding_patch_size[patch_index](x_patch_size) 76 | x_patch_num = rearrange(x_patch_num, 'b m (p n) -> (b m) p n', p = patchsize) 77 | x_patch_num = self.embedding_patch_num[patch_index](x_patch_num) 78 | 79 | series, prior = self.encoder(x_patch_size, x_patch_num, x_ori, patch_index) 80 | series_patch_mean.append(series), prior_patch_mean.append(prior) 81 | 82 | series_patch_mean = list(_flatten(series_patch_mean)) 83 | prior_patch_mean = list(_flatten(prior_patch_mean)) 84 | 85 | if self.output_attention: 86 | return series_patch_mean, prior_patch_mean 87 | else: 88 | return None 89 | 90 | 91 | -------------------------------------------------------------------------------- /model/RevIN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class RevIN(nn.Module): 5 | def __init__(self, num_features: int, eps=1e-5, affine=True): 6 | """ 7 | :param num_features: the number of features or channels 8 | :param eps: a value added for numerical stability 9 | :param affine: if True, RevIN has learnable affine parameters 10 | """ 11 | super(RevIN, self).__init__() 12 | self.num_features = num_features 13 | self.eps = eps 14 | self.affine = affine 15 | if self.affine: 16 | self._init_params() 17 | 18 | def forward(self, x, mode:str): 19 | if mode == 'norm': 20 | self._get_statistics(x) 21 | x = self._normalize(x) 22 | elif mode == 'denorm': 23 | x = self._denormalize(x) 24 | else: raise NotImplementedError 25 | return x 26 | 27 | def _init_params(self): 28 | # initialize RevIN params: (C,) 29 | self.affine_weight = torch.ones(self.num_features) 30 | self.affine_bias = torch.zeros(self.num_features) 31 | self.affine_weight=self.affine_weight.to(device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')) 32 | self.affine_bias=self.affine_bias.to(device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')) 33 | 34 | 35 | def _get_statistics(self, x): 36 | dim2reduce = tuple(range(1, x.ndim-1)) 37 | self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() 38 | self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() 39 | 40 | 41 | def _normalize(self, x): 42 | x = x - self.mean 43 | x = x / self.stdev 44 | if self.affine: 45 | x = x * self.affine_weight 46 | x = x + self.affine_bias 47 | return x 48 | 49 | def _denormalize(self, x): 50 | if self.affine: 51 | x = x - self.affine_bias 52 | x = x / (self.affine_weight + self.eps*self.eps) 53 | x = x * self.stdev 54 | x = x + self.mean 55 | return x 56 | -------------------------------------------------------------------------------- /model/attn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import math 6 | from math import sqrt 7 | import os 8 | from einops import rearrange, reduce, repeat 9 | 10 | 11 | class DAC_structure(nn.Module): 12 | def __init__(self, win_size, patch_size, channel, mask_flag=True, scale=None, attention_dropout=0.05, output_attention=False): 13 | super(DAC_structure, self).__init__() 14 | self.scale = scale 15 | self.mask_flag = mask_flag 16 | self.output_attention = output_attention 17 | self.dropout = nn.Dropout(attention_dropout) 18 | self.window_size = win_size 19 | self.patch_size = patch_size 20 | 
self.channel = channel 21 | 22 | def forward(self, queries_patch_size, queries_patch_num, keys_patch_size, keys_patch_num, values, patch_index, attn_mask): 23 | 24 | # Patch-wise Representation 25 | B, L, H, E = queries_patch_size.shape #batch_size*channel, patch_num, n_head, d_model/n_head 26 | scale_patch_size = self.scale or 1. / sqrt(E) 27 | scores_patch_size = torch.einsum("blhe,bshe->bhls", queries_patch_size, keys_patch_size) #batch*ch, nheads, p_num, p_num 28 | attn_patch_size = scale_patch_size * scores_patch_size 29 | series_patch_size = self.dropout(torch.softmax(attn_patch_size, dim=-1)) # B*D_model H N N 30 | 31 | # In-patch Representation 32 | B, L, H, E = queries_patch_num.shape #batch_size*channel, patch_size, n_head, d_model/n_head 33 | scale_patch_num = self.scale or 1. / sqrt(E) 34 | scores_patch_num = torch.einsum("blhe,bshe->bhls", queries_patch_num, keys_patch_num) #batch*ch, nheads, p_size, p_size 35 | attn_patch_num = scale_patch_num * scores_patch_num 36 | series_patch_num = self.dropout(torch.softmax(attn_patch_num, dim=-1)) # B*D_model H S S 37 | 38 | # Upsampling 39 | series_patch_size = repeat(series_patch_size, 'b l m n -> b l (m repeat_m) (n repeat_n)', repeat_m=self.patch_size[patch_index], repeat_n=self.patch_size[patch_index]) 40 | series_patch_num = series_patch_num.repeat(1,1,self.window_size//self.patch_size[patch_index],self.window_size//self.patch_size[patch_index]) 41 | series_patch_size = reduce(series_patch_size, '(b reduce_b) l m n-> b l m n', 'mean', reduce_b=self.channel) 42 | series_patch_num = reduce(series_patch_num, '(b reduce_b) l m n-> b l m n', 'mean', reduce_b=self.channel) 43 | 44 | 45 | if self.output_attention: 46 | return series_patch_size, series_patch_num 47 | else: 48 | return (None) 49 | 50 | 51 | 52 | class AttentionLayer(nn.Module): 53 | def __init__(self, attention, d_model, patch_size, channel, n_heads, win_size, d_keys=None, d_values=None): 54 | super(AttentionLayer, self).__init__() 55 | 56 | d_keys = d_keys or (d_model // n_heads) 57 | d_values = d_values or (d_model // n_heads) 58 | self.norm = nn.LayerNorm(d_model) 59 | self.inner_attention = attention 60 | self.patch_size = patch_size 61 | self.channel = channel 62 | self.window_size = win_size 63 | self.n_heads = n_heads 64 | 65 | self.patch_query_projection = nn.Linear(d_model, d_keys * n_heads) 66 | self.patch_key_projection = nn.Linear(d_model, d_keys * n_heads) 67 | self.out_projection = nn.Linear(d_values * n_heads, d_model) 68 | self.value_projection = nn.Linear(d_model, d_values * n_heads) 69 | 70 | def forward(self, x_patch_size, x_patch_num, x_ori, patch_index, attn_mask): 71 | 72 | # patch_size 73 | B, L, M = x_patch_size.shape 74 | H = self.n_heads 75 | queries_patch_size, keys_patch_size = x_patch_size, x_patch_size 76 | queries_patch_size = self.patch_query_projection(queries_patch_size).view(B, L, H, -1) 77 | keys_patch_size = self.patch_key_projection(keys_patch_size).view(B, L, H, -1) 78 | 79 | # patch_num 80 | B, L, M = x_patch_num.shape 81 | queries_patch_num, keys_patch_num = x_patch_num, x_patch_num 82 | queries_patch_num = self.patch_query_projection(queries_patch_num).view(B, L, H, -1) 83 | keys_patch_num = self.patch_key_projection(keys_patch_num).view(B, L, H, -1) 84 | 85 | # x_ori 86 | B, L, _ = x_ori.shape 87 | values = self.value_projection(x_ori).view(B, L, H, -1) 88 | 89 | series, prior = self.inner_attention( 90 | queries_patch_size, queries_patch_num, 91 | keys_patch_size, keys_patch_num, 92 | values, patch_index, 93 | attn_mask 94 | ) 95 
| 96 | return series, prior 97 | -------------------------------------------------------------------------------- /model/embed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.utils import weight_norm 5 | import math 6 | 7 | 8 | class PositionalEmbedding(nn.Module): 9 | def __init__(self, d_model, max_len=5000): 10 | super(PositionalEmbedding, self).__init__() 11 | # Compute the positional encodings once in log space. 12 | pe = torch.zeros(max_len, d_model).float() 13 | pe.require_grad = False 14 | 15 | position = torch.arange(0, max_len).float().unsqueeze(1) 16 | div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() 17 | 18 | pe[:, 0::2] = torch.sin(position * div_term) 19 | pe[:, 1::2] = torch.cos(position * div_term) 20 | 21 | pe = pe.unsqueeze(0) 22 | self.register_buffer('pe', pe) 23 | 24 | def forward(self, x): 25 | return self.pe[:, :x.size(1)] 26 | 27 | 28 | class TokenEmbedding(nn.Module): 29 | def __init__(self, c_in, d_model): 30 | super(TokenEmbedding, self).__init__() 31 | padding = 1 if torch.__version__ >= '1.5.0' else 2 32 | self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, 33 | kernel_size=3, padding=padding, padding_mode='circular', bias=False) 34 | for m in self.modules(): 35 | if isinstance(m, nn.Conv1d): 36 | nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu') 37 | 38 | def forward(self, x): 39 | x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) 40 | return x 41 | 42 | 43 | class DataEmbedding(nn.Module): 44 | def __init__(self, c_in, d_model, dropout=0.05): 45 | super(DataEmbedding, self).__init__() 46 | 47 | self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) 48 | self.position_embedding = PositionalEmbedding(d_model=d_model) 49 | 50 | self.dropout = nn.Dropout(p=dropout) 51 | 52 | def forward(self, x): 53 | x = self.value_embedding(x) + self.position_embedding(x) 54 | return self.dropout(x) 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # DCdetector (KDD 2023 research track paper) 2 | 3 | **DCdetector: Dual Attention Contrastive Representation Learning for Time Series Anomaly Detection** 4 | [[Paper]](https://arxiv.org/abs/2306.10347) 5 | 6 | 7 | 8 | The most fundamental challenge for time series anomaly detection is to learn a representation map that enables effective discrimination of anomalies. Reconstruction-based methods still dominate, but the representation learning with anomalies might hurt the performance with its large abnormal loss. In this paper, we propose DCdetector, a multi-scale dual attention contrastive representation learning model. 9 | 10 | - **Overall**: DCdetector utilizes a novel dual attention asymmetric design to create the permutated environment and pure contrastive loss to guide the learning process, thus learning a permutation invariant representation with superior discrimination abilities. 11 | 12 | - **Architecture**: A contrastive learning-based dual-branch attention structure is designed to learn a permutation invariant representation that enlarges the representation differences between normal points and anomalies. 13 | 14 | - **Architecture**: Two additional structures are designed for time series. 
Channel independence patching is proposed to enhance local semantic information in time series. A multi-scale design is proposed in the attention module to reduce information loss during patching. 15 | 16 | - **Optimization**: An effective and robust loss function is designed based on the similarity of two branches. Note that the model is trained purely contrastively without reconstruction loss, which reduces distractions from anomalies. 17 | 18 | - **Performance & Justification**: DCdetector achieves performance comparable to or better than SOTA methods on eight time series anomaly detection benchmark datasets. We also provide a justification discussion to explain how our model avoids collapse without negative samples. 19 | 20 | |![Figure1](img/art-compare.png)| 21 | |:--:| 22 | | *Figure 1. Architecture comparison of three approaches.* | 23 | 24 | |![Figure2](img/workflow.png)| 25 | |:--:| 26 | | *Figure 2. The workflow of the DCdetector framework.* | 27 | 28 | 29 | ## Main Result 30 | We compare our model with 26 baselines based on various evaluation criteria. Extensive experiments show that DCdetector achieves the best or comparable performance on eight benchmark datasets compared to various state-of-the-art algorithms. 31 | 32 | |![Figure1](img/DCdetector.jpg)| 33 | |:--:| 34 | | *Table 1. Overall results on real-world multivariate datasets.* | 35 | 36 | |![image](img/result_2.png) | ![image](img/result_count.jpg) 37 | |:--:|:--:| 38 | | *Table 2. Overall results on NIPS-TS datasets.* | *Table 3. Overall results on the univariate dataset.* | 39 | 40 | |![Figure4](img/result_4.png)| 41 | |:--:| 42 | | *Table 4. Multi-metrics results on NIPS-TS datasets.* | 43 | 44 | 45 | ## Code Description 46 | The source is organized into the following files/folders. 47 | 48 | - data_factory: The preprocessing folder/file. All dataset preprocessing code is here. 49 | - dataset: The dataset folder; you can download all datasets [here](https://drive.google.com/drive/folders/1RaIJQ8esoWuhyphhmMaH-VCDh-WIluRR?usp=sharing). 50 | - main.py: The main Python file. You can adjust all parameters there. 51 | - metrics: The evaluation metrics code folder, which includes VUS, the affiliation precision/recall pair, and other common metrics. The details correspond to Section 4.2 of the paper. 52 | - model: The DCdetector model folder. The details correspond to Section 3 of the paper. 53 | - result: The results and training logs of our code demo are automatically saved in this folder. 54 | - scripts: Scripts for all datasets and ablation experiments. You can reproduce the experimental results as shown in the Get Start section. 55 | - solver.py: The training, validation, and testing procedures are all implemented here. 56 | - utils: Other functions for data processing and model building. 57 | - img: Images needed in readme.md. 58 | - requirements.txt: Python packages needed to run this repo. 59 | 60 | 61 | ## Get Start 62 | 1. Install Python 3.6 and PyTorch >= 1.4.0. 63 | 2. Download data. You can obtain all benchmarks from [Google Cloud](https://drive.google.com/drive/folders/1RaIJQ8esoWuhyphhmMaH-VCDh-WIluRR?usp=sharing). All the datasets are well pre-processed. 64 | 3. Train and evaluate. We provide the experiment scripts of all benchmarks under the folder ```./scripts```.
You can reproduce the experiment results as follows: 65 | 66 | ```bash 67 | bash ./scripts/SMD.sh 68 | bash ./scripts/MSL.sh 69 | bash ./scripts/SMAP.sh 70 | bash ./scripts/PSM.sh 71 | bash ./scripts/SWAT.sh 72 | bash ./scripts/NIPS_TS_Swan.sh 73 | bash ./scripts/NIPS_TS_Water.sh 74 | bash ./scripts/UCR.sh 75 | ``` 76 | 77 | Also, some scripts of ablation experiments. 78 | 79 | ```bash 80 | bash ./scripts/Ablation_attention_head.sh 81 | bash ./scripts/Ablation_encoder_layer.sh 82 | bash ./scripts/Ablation_Multiscale.sh 83 | bash ./scripts/Ablation_Window_Size.sh 84 | ``` 85 | 86 | ## Citation 87 | If you find this repo useful, please cite our paper. 88 | 89 | ``` 90 | @inproceedings{yang2023dcdetector, 91 | title={DCdetector: Dual Attention Contrastive Representation Learning for Time Series Anomaly Detection}, 92 | author={Yiyuan Yang and Chaoli Zhang and Tian Zhou and Qingsong Wen and Liang Sun}, 93 | booktitle={Proc. 29th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD 2023)}, 94 | location = {Long Beach, CA}, 95 | pages={3033–3045}, 96 | year={2023} 97 | } 98 | ``` 99 | 100 | ## Contact 101 | If you have any question or want to use the code, please contact yiyuan.yang@cs.ox.ac.uk, chaoli.zcl@alibaba-inc.com,tian.zt@alibaba-inc.com,qingsong.wen@alibaba-inc.com. 102 | 103 | ## Acknowledgement 104 | We appreciate the following github repos a lot for their valuable code: 105 | 106 | https://github.com/thuml/Anomaly-Transformer 107 | 108 | https://github.com/ahstat/affiliation-metrics-py 109 | 110 | https://github.com/TheDatumOrg/VUS 111 | 112 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | arch==6.1.0 2 | einops==0.6.1 3 | matplotlib==3.7.0 4 | numpy==1.23.5 5 | pandas==1.5.3 6 | Pillow==9.4.0 7 | scikit_learn==1.2.2 8 | scipy==1.8.1 9 | statsmodels==0.14.0 10 | torch==1.13.0 11 | tqdm==4.65.0 12 | tsfresh==0.20.1 13 | -------------------------------------------------------------------------------- /result_count.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/result_count.jpg -------------------------------------------------------------------------------- /scripts/Ablation_Multiscale.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | #MSL 4 | for j in {1,3,5,13,15,35,135}; 5 | do 6 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size $j 7 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size $j 8 | done 9 | 10 | 11 | #PSM 12 | for j in {1,3,5,13,15,35,135}; 13 | do 14 | python main.py --anormly_ratio 1 --num_epochs 5 --batch_size 128 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size $j 15 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size $j 16 | done 17 | 18 | 19 | # SMAP 20 | for j in {1,3,5,13,15,35,135}; 21 | do 22 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 128 --mode train --dataset SMAP --data_path SMAP 
--input_c 25 --output_c 25 --win_size 60 --patch_size $j 23 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 128 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size $j 24 | done 25 | -------------------------------------------------------------------------------- /scripts/Ablation_Window_Size.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | # #MSL 4 | for i in {30,45,60,75,90,105,120,135,150,175,195,210}; 5 | do 6 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size $i --patch_size 35 7 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size $i --patch_size 35 8 | done 9 | 10 | 11 | #SMAP 12 | for i in {30,45,60,75,90,105,120,135,150,175,195,210}; 13 | do 14 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 128 --mode train --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size $i --patch_size 35 15 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 128 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size $i --patch_size 35 16 | done 17 | 18 | 19 | #PSM 20 | for i in {30,45,60,75,90,105,120,135,150,175,195,210}; 21 | do 22 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size $i --patch_size 35 23 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size $i --patch_size 35 24 | done 25 | 26 | -------------------------------------------------------------------------------- /scripts/Ablation_attention_head.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | #MSL 4 | for i in {1,2,4,8}; 5 | do 6 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size 35 --n_heads $i 7 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size 35 --n_heads $i 8 | done 9 | 10 | 11 | #PSM 12 | for i in {1,2,4,8}; 13 | do 14 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --n_heads $i 15 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --n_heads $i 16 | done 17 | 18 | 19 | #SMAP 20 | for i in {1,2,4,8}; 21 | do 22 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 128 --mode train --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --n_heads $i 23 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 128 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --n_heads $i 24 | done 25 | -------------------------------------------------------------------------------- /scripts/Ablation_encoder_layer.sh: -------------------------------------------------------------------------------- 1 | export 
CUDA_VISIBLE_DEVICES=0 2 | 3 | #MSL 4 | for i in {1,2,3,4,5}; 5 | do 6 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size 35 --e_layer $i 7 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size 35 --e_layer $i 8 | done 9 | 10 | #SMAP 11 | for i in {1,2,3,4,5}; 12 | do 13 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 128 --mode train --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --e_layer $i 14 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 128 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --e_layer $i 15 | done 16 | 17 | #PSM 18 | for i in {1,2,3,4,5}; 19 | do 20 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --e_layer $i 21 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --e_layer $i 22 | done 23 | -------------------------------------------------------------------------------- /scripts/MSL.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 64 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 90 --patch_size 35 4 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 64 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 90 --patch_size 35 -------------------------------------------------------------------------------- /scripts/NIPS_TS_Swan.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=1 2 | 3 | python main.py --anormly_ratio 0.9 --num_epochs 3 --batch_size 128 --mode train --dataset NIPS_TS_Swan --data_path NIPS_TS_Swan --input_c 38 --output_c 38 --loss_fuc MSE --win_size 36 --patch_size 13 4 | python main.py --anormly_ratio 0.9 --num_epochs 10 --batch_size 128 --mode test --dataset NIPS_TS_Swan --data_path NIPS_TS_Swan --input_c 38 --output_c 38 --loss_fuc MSE --win_size 36 --patch_size 13 5 | -------------------------------------------------------------------------------- /scripts/NIPS_TS_Water.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 256 --mode train --dataset NIPS_TS_Water --data_path NIPS_TS_Water --input_c 9 --output_c 9 --loss_fuc MSE --patch_size 135 --win_size 90 4 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 256 --mode test --dataset NIPS_TS_Water --data_path NIPS_TS_Water --input_c 9 --output_c 9 --loss_fuc MSE --patch_size 135 --win_size 90 5 | 6 | 7 | -------------------------------------------------------------------------------- /scripts/PSM.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 256 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --loss_fuc MSE --win_size 60 --patch_size 135 4 | 
python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 256 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --loss_fuc MSE --win_size 60 --patch_size 135 -------------------------------------------------------------------------------- /scripts/SMAP.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 256 --mode train --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --loss_fuc MSE --patch_size 357 --win_size 105 4 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 256 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --loss_fuc MSE --patch_size 357 --win_size 105 5 | -------------------------------------------------------------------------------- /scripts/SMD.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=3 2 | 3 | python main.py --anormly_ratio 0.6 --num_epochs 2 --batch_size 256 --mode train --dataset SMD --data_path SMD --input_c 38 --output_c 38 --loss_fuc MSE --win_size 105 --patch_size 57 4 | python main.py --anormly_ratio 0.6 --num_epochs 10 --batch_size 256 --mode test --dataset SMD --data_path SMD --input_c 38 --output_c 38 --loss_fuc MSE --win_size 105 --patch_size 57 -------------------------------------------------------------------------------- /scripts/SWAT.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset SWAT --data_path SWAT --input_c 51 --output_c 51 --loss_fuc MSE --patch_size 357 --win_size 105 4 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset SWAT --data_path SWAT --input_c 51 --output_c 51 --loss_fuc MSE --patch_size 357 --win_size 105 5 | -------------------------------------------------------------------------------- /scripts/UCR.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=1 2 | 3 | for i in {1..250}; 4 | do 5 | 6 | python main.py --anormly_ratio 0.5 --num_epochs 3 --batch_size 128 --mode train --dataset UCR --data_path UCR --input_c 1 --output 1 --index $i --win_size 105 --patch_size 357 7 | python main.py --anormly_ratio 0.5 --num_epochs 10 --batch_size 128 --mode test --dataset UCR --data_path UCR --input_c 1 --output 1 --index $i --win_size 105 --patch_size 357 8 | 9 | done 10 | 11 | -------------------------------------------------------------------------------- /scripts/UCR_AUG.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | for i in {1..247}; 4 | do 5 | 6 | python main.py --anormly_ratio 0.5 --num_epochs 3 --batch_size 128 --mode train --dataset UCR_AUG --data_path UCR_AUG --input_c 1 --output 1 --index $i --win_size 60 --patch_size 35 7 | python main.py --anormly_ratio 0.5 --num_epochs 10 --batch_size 128 --mode test --dataset UCR_AUG --data_path UCR_AUG --input_c 1 --output 1 --index $i --win_size 60 --patch_size 35 8 | 9 | done -------------------------------------------------------------------------------- /solver.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | import time 
7 | from utils.utils import * 8 | from model.DCdetector import DCdetector 9 | from data_factory.data_loader import get_loader_segment 10 | from einops import rearrange 11 | from metrics.metrics import * 12 | import warnings 13 | warnings.filterwarnings('ignore') 14 | 15 | def my_kl_loss(p, q): 16 | res = p * (torch.log(p + 0.0001) - torch.log(q + 0.0001)) 17 | return torch.mean(torch.sum(res, dim=-1), dim=1) 18 | 19 | def adjust_learning_rate(optimizer, epoch, lr_): 20 | lr_adjust = {epoch: lr_ * (0.5 ** ((epoch - 1) // 1))} 21 | if epoch in lr_adjust.keys(): 22 | lr = lr_adjust[epoch] 23 | for param_group in optimizer.param_groups: 24 | param_group['lr'] = lr 25 | 26 | class EarlyStopping: 27 | def __init__(self, patience=7, verbose=False, dataset_name='', delta=0): 28 | self.patience = patience 29 | self.verbose = verbose 30 | self.counter = 0 31 | self.best_score = None 32 | self.best_score2 = None 33 | self.early_stop = False 34 | self.val_loss_min = np.Inf 35 | self.val_loss2_min = np.Inf 36 | self.delta = delta 37 | self.dataset = dataset_name 38 | 39 | def __call__(self, val_loss, val_loss2, model, path): 40 | score = -val_loss 41 | score2 = -val_loss2 42 | if self.best_score is None: 43 | self.best_score = score 44 | self.best_score2 = score2 45 | self.save_checkpoint(val_loss, val_loss2, model, path) 46 | elif score < self.best_score + self.delta or score2 < self.best_score2 + self.delta: 47 | self.counter += 1 48 | if self.counter >= self.patience: 49 | self.early_stop = True 50 | else: 51 | self.best_score = score 52 | self.best_score2 = score2 53 | self.save_checkpoint(val_loss, val_loss2, model, path) 54 | self.counter = 0 55 | 56 | def save_checkpoint(self, val_loss, val_loss2, model, path): 57 | torch.save(model.state_dict(), os.path.join(path, str(self.dataset) + '_checkpoint.pth')) 58 | self.val_loss_min = val_loss 59 | self.val_loss2_min = val_loss2 60 | 61 | 62 | class Solver(object): 63 | DEFAULTS = {} 64 | 65 | def __init__(self, config): 66 | 67 | self.__dict__.update(Solver.DEFAULTS, **config) 68 | 69 | self.train_loader = get_loader_segment(self.index, 'dataset/'+self.data_path, batch_size=self.batch_size, win_size=self.win_size, mode='train', dataset=self.dataset, ) 70 | self.vali_loader = get_loader_segment(self.index, 'dataset/'+self.data_path, batch_size=self.batch_size, win_size=self.win_size, mode='val', dataset=self.dataset) 71 | self.test_loader = get_loader_segment(self.index, 'dataset/'+self.data_path, batch_size=self.batch_size, win_size=self.win_size, mode='test', dataset=self.dataset) 72 | self.thre_loader = get_loader_segment(self.index, 'dataset/'+self.data_path, batch_size=self.batch_size, win_size=self.win_size, mode='thre', dataset=self.dataset) 73 | 74 | self.build_model() 75 | 76 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 77 | 78 | if self.loss_fuc == 'MAE': 79 | self.criterion = nn.L1Loss() 80 | elif self.loss_fuc == 'MSE': 81 | self.criterion = nn.MSELoss() 82 | 83 | 84 | def build_model(self): 85 | self.model = DCdetector(win_size=self.win_size, enc_in=self.input_c, c_out=self.output_c, n_heads=self.n_heads, d_model=self.d_model, e_layers=self.e_layers, patch_size=self.patch_size, channel=self.input_c) 86 | 87 | if torch.cuda.is_available(): 88 | self.model.cuda() 89 | 90 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr) 91 | 92 | 93 | def vali(self, vali_loader): 94 | self.model.eval() 95 | loss_1 = [] 96 | loss_2 = [] 97 | for i, (input_data, _) in enumerate(vali_loader): 98 | input 
= input_data.float().to(self.device) 99 | series, prior = self.model(input) 100 | series_loss = 0.0 101 | prior_loss = 0.0 102 | for u in range(len(prior)): 103 | series_loss += (torch.mean(my_kl_loss(series[u], ( 104 | prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 105 | self.win_size)).detach())) + torch.mean( 106 | my_kl_loss( 107 | (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 108 | self.win_size)).detach(), 109 | series[u]))) 110 | prior_loss += (torch.mean( 111 | my_kl_loss((prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 112 | self.win_size)), 113 | series[u].detach())) + torch.mean( 114 | my_kl_loss(series[u].detach(), 115 | (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 116 | self.win_size))))) 117 | 118 | series_loss = series_loss / len(prior) 119 | prior_loss = prior_loss / len(prior) 120 | 121 | loss_1.append((prior_loss - series_loss).item()) 122 | 123 | return np.average(loss_1), np.average(loss_2) 124 | 125 | 126 | def train(self): 127 | 128 | time_now = time.time() 129 | path = self.model_save_path 130 | if not os.path.exists(path): 131 | os.makedirs(path) 132 | early_stopping = EarlyStopping(patience=5, verbose=True, dataset_name=self.data_path) 133 | train_steps = len(self.train_loader) 134 | 135 | for epoch in range(self.num_epochs): 136 | iter_count = 0 137 | 138 | epoch_time = time.time() 139 | self.model.train() 140 | for i, (input_data, labels) in enumerate(self.train_loader): 141 | 142 | self.optimizer.zero_grad() 143 | iter_count += 1 144 | input = input_data.float().to(self.device) 145 | series, prior = self.model(input) 146 | 147 | series_loss = 0.0 148 | prior_loss = 0.0 149 | 150 | for u in range(len(prior)): 151 | series_loss += (torch.mean(my_kl_loss(series[u], ( 152 | prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 153 | self.win_size)).detach())) + torch.mean( 154 | my_kl_loss((prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 155 | self.win_size)).detach(), 156 | series[u]))) 157 | prior_loss += (torch.mean(my_kl_loss( 158 | (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 159 | self.win_size)), 160 | series[u].detach())) + torch.mean( 161 | my_kl_loss(series[u].detach(), ( 162 | prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 163 | self.win_size))))) 164 | 165 | series_loss = series_loss / len(prior) 166 | prior_loss = prior_loss / len(prior) 167 | 168 | loss = prior_loss - series_loss 169 | 170 | if (i + 1) % 100 == 0: 171 | speed = (time.time() - time_now) / iter_count 172 | left_time = speed * ((self.num_epochs - epoch) * train_steps - i) 173 | print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time)) 174 | iter_count = 0 175 | time_now = time.time() 176 | 177 | loss.backward() 178 | self.optimizer.step() 179 | 180 | vali_loss1, vali_loss2 = self.vali(self.test_loader) 181 | 182 | print( 183 | "Epoch: {0}, Cost time: {1:.3f}s ".format( 184 | epoch + 1, time.time() - epoch_time)) 185 | early_stopping(vali_loss1, vali_loss2, self.model, path) 186 | if early_stopping.early_stop: 187 | break 188 | adjust_learning_rate(self.optimizer, epoch + 1, self.lr) 189 | 190 | 191 | def test(self): 192 | self.model.load_state_dict( 193 | torch.load( 194 | os.path.join(str(self.model_save_path), str(self.data_path) + '_checkpoint.pth'))) 195 | self.model.eval() 196 | temperature = 50 197 | 198 | 
# (1) statistics on the train set
199 |         attens_energy = []
200 |         for i, (input_data, labels) in enumerate(self.train_loader):
201 |             input = input_data.float().to(self.device)
202 |             series, prior = self.model(input)
203 |             series_loss = 0.0
204 |             prior_loss = 0.0
205 |             for u in range(len(prior)):
206 |                 if u == 0:
207 |                     series_loss = my_kl_loss(series[u], (
208 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
209 |                                                                                                    self.win_size)).detach()) * temperature
210 |                     prior_loss = my_kl_loss(
211 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
212 |                                                                                                 self.win_size)),
213 |                         series[u].detach()) * temperature
214 |                 else:
215 |                     series_loss += my_kl_loss(series[u], (
216 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
217 |                                                                                                    self.win_size)).detach()) * temperature
218 |                     prior_loss += my_kl_loss(
219 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
220 |                                                                                                 self.win_size)),
221 |                         series[u].detach()) * temperature
222 | 
223 |             metric = torch.softmax((-series_loss - prior_loss), dim=-1)
224 |             cri = metric.detach().cpu().numpy()
225 |             attens_energy.append(cri)
226 | 
227 |         attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1)
228 |         train_energy = np.array(attens_energy)
229 | 
230 |         # (2) find the threshold
231 |         attens_energy = []
232 |         for i, (input_data, labels) in enumerate(self.thre_loader):
233 |             input = input_data.float().to(self.device)
234 |             series, prior = self.model(input)
235 |             series_loss = 0.0
236 |             prior_loss = 0.0
237 |             for u in range(len(prior)):
238 |                 if u == 0:
239 |                     series_loss = my_kl_loss(series[u], (
240 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
241 |                                                                                                    self.win_size)).detach()) * temperature
242 |                     prior_loss = my_kl_loss(
243 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
244 |                                                                                                 self.win_size)),
245 |                         series[u].detach()) * temperature
246 |                 else:
247 |                     series_loss += my_kl_loss(series[u], (
248 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
249 |                                                                                                    self.win_size)).detach()) * temperature
250 |                     prior_loss += my_kl_loss(
251 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
252 |                                                                                                 self.win_size)),
253 |                         series[u].detach()) * temperature
254 | 
255 |             metric = torch.softmax((-series_loss - prior_loss), dim=-1)
256 |             cri = metric.detach().cpu().numpy()
257 |             attens_energy.append(cri)
258 | 
259 |         attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1)
260 |         test_energy = np.array(attens_energy)
261 |         combined_energy = np.concatenate([train_energy, test_energy], axis=0)
262 |         thresh = np.percentile(combined_energy, 100 - self.anormly_ratio)
263 |         print("Threshold :", thresh)
264 | 
265 |         # (3) evaluation on the test set
266 |         test_labels = []
267 |         attens_energy = []
268 |         for i, (input_data, labels) in enumerate(self.thre_loader):
269 |             input = input_data.float().to(self.device)
270 |             series, prior = self.model(input)
271 |             series_loss = 0.0
272 |             prior_loss = 0.0
273 |             for u in range(len(prior)):
274 |                 if u == 0:
275 |                     series_loss = my_kl_loss(series[u], (
276 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
277 |                                                                                                    self.win_size)).detach()) * temperature
278 |                     prior_loss = my_kl_loss(
279 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
280 |                                                                                                 self.win_size)),
281 |                         series[u].detach()) * temperature
282 |                 else:
283 |                     series_loss +=
my_kl_loss(series[u], (
284 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
285 |                                                                                                    self.win_size)).detach()) * temperature
286 |                     prior_loss += my_kl_loss(
287 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
288 |                                                                                                 self.win_size)),
289 |                         series[u].detach()) * temperature
290 |             metric = torch.softmax((-series_loss - prior_loss), dim=-1)
291 |             cri = metric.detach().cpu().numpy()
292 |             attens_energy.append(cri)
293 |             test_labels.append(labels)
294 | 
295 |         attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1)
296 |         test_labels = np.concatenate(test_labels, axis=0).reshape(-1)
297 |         test_energy = np.array(attens_energy)
298 |         test_labels = np.array(test_labels)
299 | 
300 |         pred = (test_energy > thresh).astype(int)
301 |         gt = test_labels.astype(int)
302 | 
303 |         matrix = [self.index]
304 |         scores_simple = combine_all_evaluation_scores(pred, gt, test_energy)
305 |         for key, value in scores_simple.items():
306 |             matrix.append(value)
307 |             print('{0:21} : {1:0.4f}'.format(key, value))
308 | 
309 |         anomaly_state = False
310 |         for i in range(len(gt)):
311 |             if gt[i] == 1 and pred[i] == 1 and not anomaly_state:
312 |                 anomaly_state = True
313 |                 for j in range(i, 0, -1):
314 |                     if gt[j] == 0:
315 |                         break
316 |                     else:
317 |                         if pred[j] == 0:
318 |                             pred[j] = 1
319 |                 for j in range(i, len(gt)):
320 |                     if gt[j] == 0:
321 |                         break
322 |                     else:
323 |                         if pred[j] == 0:
324 |                             pred[j] = 1
325 |             elif gt[i] == 0:
326 |                 anomaly_state = False
327 |             if anomaly_state:
328 |                 pred[i] = 1
329 | 
330 |         pred = np.array(pred)
331 |         gt = np.array(gt)
332 | 
333 |         from sklearn.metrics import precision_recall_fscore_support
334 |         from sklearn.metrics import accuracy_score
335 | 
336 |         accuracy = accuracy_score(gt, pred)
337 |         precision, recall, f_score, support = precision_recall_fscore_support(gt, pred, average='binary')
338 |         print("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format(accuracy, precision, recall, f_score))
339 | 
340 |         if self.data_path == 'UCR' or self.data_path == 'UCR_AUG':
341 |             import csv
342 |             with open('result/'+self.data_path+'.csv', 'a+') as f:
343 |                 writer = csv.writer(f)
344 |                 writer.writerow(matrix)
345 | 
346 |         return accuracy, precision, recall, f_score
347 | 
--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
1 | # Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514
2 | import tensorflow as tf
3 | import numpy as np
4 | import scipy.misc
5 | 
6 | try:
7 |     from StringIO import StringIO  # Python 2.7
8 | except ImportError:
9 |     from io import BytesIO  # Python 3.5+
10 | 
11 | 
12 | class Logger(object):
13 | 
14 |     def __init__(self, log_dir):
15 |         """Create a summary writer logging to log_dir."""
16 |         self.writer = tf.summary.FileWriter(log_dir)
17 | 
18 |     def scalar_summary(self, tag, value, step):
19 |         """Log a scalar variable."""
20 |         summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
21 |         self.writer.add_summary(summary, step)
22 | 
23 |     def image_summary(self, tag, images, step):
24 |         """Log a list of images."""
25 | 
26 |         img_summaries = []
27 |         for i, img in enumerate(images):
28 |             # Write the image to a string
29 |             try:
30 |                 s = StringIO()
31 |             except:
32 |                 s = BytesIO()
33 |             scipy.misc.toimage(img).save(s, format="png")
34 | 
35 |             # Create an Image object
36 |             img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(),
37 |
height=img.shape[0], 38 | width=img.shape[1]) 39 | # Create a Summary value 40 | img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) 41 | 42 | # Create and write Summary 43 | summary = tf.Summary(value=img_summaries) 44 | self.writer.add_summary(summary, step) 45 | 46 | def histo_summary(self, tag, values, step, bins=1000): 47 | """Log a histogram of the tensor of values.""" 48 | 49 | # Create a histogram using numpy 50 | counts, bin_edges = np.histogram(values, bins=bins) 51 | 52 | # Fill the fields of the histogram proto 53 | hist = tf.HistogramProto() 54 | hist.min = float(np.min(values)) 55 | hist.max = float(np.max(values)) 56 | hist.num = int(np.prod(values.shape)) 57 | hist.sum = float(np.sum(values)) 58 | hist.sum_squares = float(np.sum(values ** 2)) 59 | 60 | # Drop the start of the first bin 61 | bin_edges = bin_edges[1:] 62 | 63 | # Add bin edges and counts 64 | for edge in bin_edges: 65 | hist.bucket_limit.append(edge) 66 | for c in counts: 67 | hist.bucket.append(c) 68 | 69 | # Create and write Summary 70 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) 71 | self.writer.add_summary(summary, step) 72 | self.writer.flush() 73 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import numpy as np 7 | 8 | 9 | def to_var(x, volatile=False): 10 | if torch.cuda.is_available(): 11 | x = x.cuda() 12 | return Variable(x, volatile=volatile) 13 | 14 | 15 | def mkdir(directory): 16 | if not os.path.exists(directory): 17 | os.makedirs(directory) 18 | --------------------------------------------------------------------------------
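
Solver.test() builds its anomaly energy from the discrepancy between the two attention branches returned by the model: for every layer it accumulates a smoothed KL term between the patch-wise representation (`series`) and the normalized in-patch representation (`prior`), scales it by a temperature of 50, softmax-normalizes the negated sum over each window, and then flags the points whose energy exceeds the (100 - anormly_ratio)-th percentile of the energies gathered from the train and threshold loaders. The following is a minimal, self-contained sketch of that scoring path, not code from the repository: the helper name window_scores, the toy tensor shapes, and the random stand-ins for the model outputs are illustrative only.

import torch
import numpy as np

def my_kl_loss(p, q):
    # Same smoothed KL term as solver.py: sum over the attention dimension,
    # then average over heads, leaving one value per (window, time step).
    res = p * (torch.log(p + 0.0001) - torch.log(q + 0.0001))
    return torch.mean(torch.sum(res, dim=-1), dim=1)

def window_scores(series, prior, temperature=50):
    # Illustrative helper (not in the repo): accumulate the pair of KL terms
    # over layers, mirroring the test-time loop in Solver.test().
    series_loss, prior_loss = 0.0, 0.0
    for u in range(len(prior)):
        # keepdim broadcasting is equivalent to the unsqueeze(...).repeat(...)
        # normalization written out in solver.py.
        prior_norm = prior[u] / torch.sum(prior[u], dim=-1, keepdim=True)
        series_loss = series_loss + my_kl_loss(series[u], prior_norm.detach()) * temperature
        prior_loss = prior_loss + my_kl_loss(prior_norm, series[u].detach()) * temperature
    # The negated, temperature-scaled discrepancy is softmax-normalized over
    # each window; this matches `metric` in solver.py.
    return torch.softmax(-series_loss - prior_loss, dim=-1)

# Toy shapes standing in for model outputs: 2 windows, 1 head, win_size = 60.
B, H, W = 2, 1, 60
series = [torch.softmax(torch.randn(B, H, W, W), dim=-1)]
prior = [torch.softmax(torch.randn(B, H, W, W), dim=-1)]

energy = window_scores(series, prior).reshape(-1).numpy()

# Threshold at the (100 - anormly_ratio)-th percentile, as test() does on the
# concatenated train/threshold energies (most scripts pass --anormly_ratio 1).
anormly_ratio = 1.0
thresh = np.percentile(energy, 100 - anormly_ratio)
pred = (energy > thresh).astype(int)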
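
Before accuracy, precision, recall, and F-score are computed, test() also applies the common point-adjustment convention: as soon as one point inside a labelled anomaly segment is predicted anomalous, every point of that segment is counted as detected. That is what the anomaly_state loop over gt and pred does in place. A compact stand-alone version of that loop is sketched below; the function name point_adjust and the toy arrays are illustrative, not part of the repository.

import numpy as np

def point_adjust(pred, gt):
    # Expand detections to whole ground-truth segments, mirroring the in-place
    # adjustment loop in Solver.test().
    pred = pred.copy()
    anomaly_state = False
    for i in range(len(gt)):
        if gt[i] == 1 and pred[i] == 1 and not anomaly_state:
            anomaly_state = True
            # Walk backwards and forwards to the segment boundaries.
            for j in range(i, 0, -1):
                if gt[j] == 0:
                    break
                pred[j] = 1
            for j in range(i, len(gt)):
                if gt[j] == 0:
                    break
                pred[j] = 1
        elif gt[i] == 0:
            anomaly_state = False
        if anomaly_state:
            pred[i] = 1
    return pred

# Example: detecting a single point of the segment at indices 2..5 marks the
# whole segment as detected.
gt   = np.array([0, 0, 1, 1, 1, 1, 0, 0])
pred = np.array([0, 0, 0, 1, 0, 0, 0, 0])
print(point_adjust(pred, gt))  # -> [0 0 1 1 1 1 0 0]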