├── data_factory └── data_loader.py ├── img ├── DCdetector.jpg ├── art-compare.png ├── result_1.png ├── result_2.png ├── result_3.png ├── result_4.png ├── result_count.jpg └── workflow.png ├── main.py ├── metrics ├── AUC.py ├── Matthews_correlation_coefficient.py ├── affiliation │ ├── _affiliation_zone.py │ ├── _integral_interval.py │ ├── _single_ground_truth_event.py │ ├── generics.py │ └── metrics.py ├── combine_all_scores.py ├── customizable_f1_score.py ├── evaluate_utils.py ├── evaluator.py ├── f1_score_f1_pa.py ├── f1_series.py ├── fc_score.py ├── metrics.py ├── precision_at_k.py └── vus │ ├── analysis │ ├── robustness_eval.py │ └── score_computation.py │ ├── metrics.py │ ├── models │ ├── distance.py │ └── feature.py │ └── utils │ ├── metrics.py │ └── slidingWindows.py ├── model ├── DCdetector.py ├── RevIN.py ├── attn.py └── embed.py ├── readme.md ├── requirements.txt ├── result_count.jpg ├── scripts ├── Ablation_Multiscale.sh ├── Ablation_Window_Size.sh ├── Ablation_attention_head.sh ├── Ablation_encoder_layer.sh ├── MSL.sh ├── NIPS_TS_Swan.sh ├── NIPS_TS_Water.sh ├── PSM.sh ├── SMAP.sh ├── SMD.sh ├── SWAT.sh ├── UCR.sh └── UCR_AUG.sh ├── solver.py └── utils ├── logger.py └── utils.py /data_factory/data_loader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import random 4 | from torch.utils.data import Dataset 5 | from torch.utils.data import DataLoader 6 | from PIL import Image 7 | import numpy as np 8 | import collections 9 | import numbers 10 | import math 11 | import pandas as pd 12 | from sklearn.preprocessing import StandardScaler 13 | import pickle 14 | 15 | 16 | class PSMSegLoader(object): 17 | def __init__(self, data_path, win_size, step, mode="train"): 18 | self.mode = mode 19 | self.step = step 20 | self.win_size = win_size 21 | self.scaler = StandardScaler() 22 | data = pd.read_csv(data_path + '/train.csv') 23 | data = data.values[:, 1:] 24 | data = np.nan_to_num(data) 25 | self.scaler.fit(data) 26 | data = self.scaler.transform(data) 27 | test_data = pd.read_csv(data_path + '/test.csv') 28 | test_data = test_data.values[:, 1:] 29 | test_data = np.nan_to_num(test_data) 30 | self.test = self.scaler.transform(test_data) 31 | self.train = data 32 | self.val = self.test 33 | self.test_labels = pd.read_csv(data_path + '/test_label.csv').values[:, 1:] 34 | 35 | def __len__(self): 36 | """ 37 | Number of images in the object dataset. 
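# --- Illustrative aside (not part of the original source) --------------------
# The loaders in this file all follow the same preprocessing recipe: replace
# NaNs, fit a StandardScaler on the raw training split only, then apply that
# same scaler to both train and test so the two splits share one normalization.
# A minimal sketch of that recipe on synthetic arrays (the array names here are
# stand-ins, not part of the repo):
import numpy as np
from sklearn.preprocessing import StandardScaler

raw_train = np.nan_to_num(np.random.randn(1000, 25))   # stand-in for train.csv values
raw_test = np.nan_to_num(np.random.randn(500, 25))     # stand-in for test.csv values
scaler = StandardScaler().fit(raw_train)               # statistics from train only
train = scaler.transform(raw_train)
test = scaler.transform(raw_test)                      # reuses the train mean/std
# ------------------------------------------------------------------------------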
38 | """ 39 | if self.mode == "train": 40 | return (self.train.shape[0] - self.win_size) // self.step + 1 41 | elif (self.mode == 'val'): 42 | return (self.val.shape[0] - self.win_size) // self.step + 1 43 | elif (self.mode == 'test'): 44 | return (self.test.shape[0] - self.win_size) // self.step + 1 45 | else: 46 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 47 | 48 | def __getitem__(self, index): 49 | index = index * self.step 50 | if self.mode == "train": 51 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 52 | elif (self.mode == 'val'): 53 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 54 | elif (self.mode == 'test'): 55 | return np.float32(self.test[index:index + self.win_size]), np.float32( 56 | self.test_labels[index:index + self.win_size]) 57 | else: 58 | return np.float32(self.test[ 59 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 60 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 61 | 62 | 63 | class MSLSegLoader(object): 64 | def __init__(self, data_path, win_size, step, mode="train"): 65 | self.mode = mode 66 | self.step = step 67 | self.win_size = win_size 68 | self.scaler = StandardScaler() 69 | data = np.load(data_path + "/MSL_train.npy") 70 | self.scaler.fit(data) 71 | data = self.scaler.transform(data) 72 | test_data = np.load(data_path + "/MSL_test.npy") 73 | self.test = self.scaler.transform(test_data) 74 | self.train = data 75 | self.val = self.test 76 | self.test_labels = np.load(data_path + "/MSL_test_label.npy") 77 | 78 | def __len__(self): 79 | if self.mode == "train": 80 | return (self.train.shape[0] - self.win_size) // self.step + 1 81 | elif (self.mode == 'val'): 82 | return (self.val.shape[0] - self.win_size) // self.step + 1 83 | elif (self.mode == 'test'): 84 | return (self.test.shape[0] - self.win_size) // self.step + 1 85 | else: 86 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 87 | 88 | def __getitem__(self, index): 89 | index = index * self.step 90 | if self.mode == "train": 91 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 92 | elif (self.mode == 'val'): 93 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 94 | elif (self.mode == 'test'): 95 | return np.float32(self.test[index:index + self.win_size]), np.float32( 96 | self.test_labels[index:index + self.win_size]) 97 | else: 98 | return np.float32(self.test[ 99 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 100 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 101 | 102 | 103 | class SMAPSegLoader(object): 104 | def __init__(self, data_path, win_size, step, mode="train"): 105 | self.mode = mode 106 | self.step = step 107 | self.win_size = win_size 108 | self.scaler = StandardScaler() 109 | data = np.load(data_path + "/SMAP_train.npy") 110 | self.scaler.fit(data) 111 | data = self.scaler.transform(data) 112 | test_data = np.load(data_path + "/SMAP_test.npy") 113 | self.test = self.scaler.transform(test_data) 114 | self.train = data 115 | self.val = self.test 116 | self.test_labels = np.load(data_path + "/SMAP_test_label.npy") 117 | 118 | def __len__(self): 119 | if self.mode == "train": 120 | return 
(self.train.shape[0] - self.win_size) // self.step + 1 121 | elif (self.mode == 'val'): 122 | return (self.val.shape[0] - self.win_size) // self.step + 1 123 | elif (self.mode == 'test'): 124 | return (self.test.shape[0] - self.win_size) // self.step + 1 125 | else: 126 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 127 | 128 | def __getitem__(self, index): 129 | index = index * self.step 130 | if self.mode == "train": #train and val did not use label 131 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 132 | elif (self.mode == 'val'): 133 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 134 | elif (self.mode == 'test'): 135 | return np.float32(self.test[index:index + self.win_size]), np.float32( 136 | self.test_labels[index:index + self.win_size]) 137 | else: 138 | return np.float32(self.test[ 139 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 140 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 141 | 142 | 143 | class SMDSegLoader(object): 144 | def __init__(self, data_path, win_size, step, mode="train"): 145 | self.mode = mode 146 | self.step = step 147 | self.win_size = win_size 148 | self.scaler = StandardScaler() 149 | data = np.load(data_path + "/SMD_train.npy")[:,:] 150 | self.scaler.fit(data) 151 | data = self.scaler.transform(data) 152 | test_data = np.load(data_path + "/SMD_test.npy")[:,:] 153 | self.test = self.scaler.transform(test_data) 154 | self.train = data 155 | data_len = len(self.train) 156 | self.val = self.train[(int)(data_len * 0.8):] 157 | self.test_labels = np.load(data_path + "/SMD_test_label.npy")[:] 158 | 159 | def __len__(self): 160 | if self.mode == "train": 161 | return (self.train.shape[0] - self.win_size) // self.step + 1 162 | elif (self.mode == 'val'): 163 | return (self.val.shape[0] - self.win_size) // self.step + 1 164 | elif (self.mode == 'test'): 165 | return (self.test.shape[0] - self.win_size) // self.step + 1 166 | else: 167 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 168 | 169 | def __getitem__(self, index): 170 | index = index * self.step 171 | if self.mode == "train": 172 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 173 | elif (self.mode == 'val'): 174 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 175 | elif (self.mode == 'test'): 176 | return np.float32(self.test[index:index + self.win_size]), np.float32( 177 | self.test_labels[index:index + self.win_size]) 178 | else: 179 | return np.float32(self.test[ 180 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 181 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 182 | 183 | 184 | 185 | class UCRSegLoader(object): 186 | def __init__(self, index, data_path, win_size, step, mode="train"): 187 | self.mode = mode 188 | self.step = step 189 | self.index = index 190 | self.win_size = win_size 191 | self.scaler = StandardScaler() 192 | data = np.load(data_path + "/UCR_"+str(index)+"_train.npy") 193 | self.scaler.fit(data) 194 | data = self.scaler.transform(data) 195 | test_data = np.load(data_path + "/UCR_"+str(index)+"_test.npy") 196 | self.test = self.scaler.transform(test_data) 197 | 198 | self.train = 
data 199 | self.val = self.test 200 | self.test_labels = np.load(data_path + "/UCR_"+str(index)+"_test_label.npy") 201 | if self.mode == "val": 202 | print("train:", self.train.shape) 203 | print("test:", self.test.shape) 204 | 205 | def __len__(self): 206 | if self.mode == "train": 207 | return (self.train.shape[0] - self.win_size) // self.step + 1 208 | elif (self.mode == 'val'): 209 | return (self.val.shape[0] - self.win_size) // self.step + 1 210 | elif (self.mode == 'test'): 211 | return (self.test.shape[0] - self.win_size) // self.step + 1 212 | else: 213 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 214 | 215 | def __getitem__(self, index): 216 | index = index * self.step 217 | if self.mode == "train": 218 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 219 | elif (self.mode == 'val'): 220 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 221 | elif (self.mode == 'test'): 222 | return np.float32(self.test[index:index + self.win_size]), np.float32( 223 | self.test_labels[index:index + self.win_size]) 224 | else: 225 | return np.float32(self.test[ 226 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 227 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 228 | 229 | 230 | class UCRAUGSegLoader(object): 231 | def __init__(self, index, data_path, win_size, step, mode="train"): 232 | self.mode = mode 233 | self.step = step 234 | self.index = index 235 | self.win_size = win_size 236 | self.scaler = StandardScaler() 237 | data = np.load(data_path + "/UCR_AUG_"+str(index)+"_train.npy") 238 | self.scaler.fit(data) 239 | data = self.scaler.transform(data) 240 | test_data = np.load(data_path + "/UCR_AUG_"+str(index)+"_test.npy") 241 | self.test = self.scaler.transform(test_data) 242 | 243 | self.train = data 244 | self.val = self.test 245 | self.test_labels = np.load(data_path + "/UCR_AUG_"+str(index)+"_test_label.npy") 246 | if self.mode == "val": 247 | print("train:", self.train.shape) 248 | print("test:", self.test.shape) 249 | 250 | def __len__(self): 251 | if self.mode == "train": 252 | return (self.train.shape[0] - self.win_size) // self.step + 1 253 | elif (self.mode == 'val'): 254 | return (self.val.shape[0] - self.win_size) // self.step + 1 255 | elif (self.mode == 'test'): 256 | return (self.test.shape[0] - self.win_size) // self.step + 1 257 | else: 258 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 259 | 260 | def __getitem__(self, index): 261 | index = index * self.step 262 | if self.mode == "train": 263 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 264 | elif (self.mode == 'val'): 265 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 266 | elif (self.mode == 'test'): 267 | return np.float32(self.test[index:index + self.win_size]), np.float32( 268 | self.test_labels[index:index + self.win_size]) 269 | else: 270 | return np.float32(self.test[ 271 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 272 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 273 | 274 | 275 | class NIPS_TS_WaterSegLoader(object): 276 | def __init__(self, data_path, win_size, step, mode="train"): 277 | self.mode = mode 278 | 
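# --- Illustrative aside (not part of the original source) --------------------
# Every *SegLoader in this file shares the same sliding-window arithmetic: for
# a series of length T, window size w and stride `step`, __len__ returns
# (T - w) // step + 1 windows, while the fall-through ("thre") branch slides by
# w, i.e. non-overlapping windows. A minimal sketch of that count on toy data
# (the helper name `count_windows` is ours, not part of the repo):
import numpy as np

def count_windows(n_samples, win_size, step):
    # same formula as the __len__ methods above
    return (n_samples - win_size) // step + 1

series = np.arange(10)                                       # T = 10
windows = [series[i:i + 4] for i in range(0, len(series) - 4 + 1)]
assert len(windows) == count_windows(10, 4, 1) == 7          # stride 1: overlapping windows
assert count_windows(10, 4, 4) == 2                          # stride w: non-overlapping ("thre" mode)
# ------------------------------------------------------------------------------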
self.step = step 279 | self.win_size = win_size 280 | self.scaler = StandardScaler() 281 | data = np.load(data_path + "/NIPS_TS_Water_train.npy") 282 | self.scaler.fit(data) 283 | data = self.scaler.transform(data) 284 | test_data = np.load(data_path + "/NIPS_TS_Water_test.npy") 285 | self.test = self.scaler.transform(test_data) 286 | 287 | self.train = data 288 | self.val = self.test 289 | self.test_labels = np.load(data_path + "/NIPS_TS_Water_test_label.npy") 290 | print("test:", self.test.shape) 291 | print("train:", self.train.shape) 292 | 293 | def __len__(self): 294 | 295 | if self.mode == "train": 296 | return (self.train.shape[0] - self.win_size) // self.step + 1 297 | elif (self.mode == 'val'): 298 | return (self.val.shape[0] - self.win_size) // self.step + 1 299 | elif (self.mode == 'test'): 300 | return (self.test.shape[0] - self.win_size) // self.step + 1 301 | else: 302 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 303 | 304 | def __getitem__(self, index): 305 | index = index * self.step 306 | if self.mode == "train": 307 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 308 | elif (self.mode == 'val'): 309 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 310 | elif (self.mode == 'test'): 311 | return np.float32(self.test[index:index + self.win_size]), np.float32( 312 | self.test_labels[index:index + self.win_size]) 313 | else: 314 | return np.float32(self.test[ 315 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 316 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 317 | 318 | 319 | 320 | class NIPS_TS_SwanSegLoader(object): 321 | def __init__(self, data_path, win_size, step, mode="train"): 322 | self.mode = mode 323 | self.step = step 324 | self.win_size = win_size 325 | self.scaler = StandardScaler() 326 | data = np.load(data_path + "/NIPS_TS_Swan_train.npy") 327 | self.scaler.fit(data) 328 | data = self.scaler.transform(data) 329 | test_data = np.load(data_path + "/NIPS_TS_Swan_test.npy") 330 | self.test = self.scaler.transform(test_data) 331 | 332 | self.train = data 333 | self.val = self.test 334 | self.test_labels = np.load(data_path + "/NIPS_TS_Swan_test_label.npy") 335 | print("test:", self.test.shape) 336 | print("train:", self.train.shape) 337 | 338 | def __len__(self): 339 | if self.mode == "train": 340 | return (self.train.shape[0] - self.win_size) // self.step + 1 341 | elif (self.mode == 'val'): 342 | return (self.val.shape[0] - self.win_size) // self.step + 1 343 | elif (self.mode == 'test'): 344 | return (self.test.shape[0] - self.win_size) // self.step + 1 345 | else: 346 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 347 | 348 | def __getitem__(self, index): 349 | index = index * self.step 350 | if self.mode == "train": 351 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 352 | elif (self.mode == 'val'): 353 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 354 | elif (self.mode == 'test'): 355 | return np.float32(self.test[index:index + self.win_size]), np.float32( 356 | self.test_labels[index:index + self.win_size]) 357 | else: 358 | return np.float32(self.test[ 359 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 360 | 
self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 361 | 362 | 363 | class NIPS_TS_CCardSegLoader(object): 364 | def __init__(self, data_path, win_size, step, mode="train"): 365 | self.mode = mode 366 | self.step = step 367 | self.win_size = win_size 368 | self.scaler = StandardScaler() 369 | data = np.load(data_path + "/NIPS_TS_CCard_train.npy") 370 | self.scaler.fit(data) 371 | data = self.scaler.transform(data) 372 | test_data = np.load(data_path + "/NIPS_TS_CCard_test.npy") 373 | self.test = self.scaler.transform(test_data) 374 | 375 | self.train = data 376 | self.val = self.test 377 | self.test_labels = np.load(data_path + "/NIPS_TS_CCard_test_label.npy") 378 | 379 | def __len__(self): 380 | 381 | if self.mode == "train": 382 | return (self.train.shape[0] - self.win_size) // self.step + 1 383 | elif (self.mode == 'val'): 384 | return (self.val.shape[0] - self.win_size) // self.step + 1 385 | elif (self.mode == 'test'): 386 | return (self.test.shape[0] - self.win_size) // self.step + 1 387 | else: 388 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 389 | 390 | def __getitem__(self, index): 391 | index = index * self.step 392 | if self.mode == "train": 393 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 394 | elif (self.mode == 'val'): 395 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 396 | elif (self.mode == 'test'): 397 | return np.float32(self.test[index:index + self.win_size]), np.float32( 398 | self.test_labels[index:index + self.win_size]) 399 | else: 400 | return np.float32(self.test[ 401 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 402 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 403 | 404 | 405 | 406 | 407 | class SMD_OriSegLoader(object): 408 | def __init__(self, index, data_path, win_size, step, mode="train"): 409 | self.mode = mode 410 | self.step = step 411 | self.index = index 412 | self.win_size = win_size 413 | self.scaler = StandardScaler() 414 | data = np.load(data_path + "/SMD_Ori_"+str(index)+"_train.npy") 415 | self.scaler.fit(data) 416 | data = self.scaler.transform(data) 417 | test_data = np.load(data_path + "/SMD_Ori_"+str(index)+"_test.npy") 418 | self.test = self.scaler.transform(test_data) 419 | 420 | self.train = data 421 | self.val = self.test 422 | self.test_labels = np.load(data_path + "/SMD_Ori_"+str(index)+"_test_label.npy") 423 | if self.mode == "val": 424 | print("train:", self.train.shape) 425 | print("test:", self.test.shape) 426 | 427 | def __len__(self): 428 | if self.mode == "train": 429 | return (self.train.shape[0] - self.win_size) // self.step + 1 430 | elif (self.mode == 'val'): 431 | return (self.val.shape[0] - self.win_size) // self.step + 1 432 | elif (self.mode == 'test'): 433 | return (self.test.shape[0] - self.win_size) // self.step + 1 434 | else: 435 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 436 | 437 | def __getitem__(self, index): 438 | index = index * self.step 439 | if self.mode == "train": 440 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 441 | elif (self.mode == 'val'): 442 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 443 | elif (self.mode == 'test'): 444 | return 
np.float32(self.test[index:index + self.win_size]), np.float32( 445 | self.test_labels[index:index + self.win_size]) 446 | else: 447 | return np.float32(self.test[ 448 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 449 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 450 | 451 | class SWATSegLoader(Dataset): 452 | def __init__(self, root_path, win_size, step=1, flag="train"): 453 | self.flag = flag 454 | self.step = step 455 | self.win_size = win_size 456 | self.scaler = StandardScaler() 457 | 458 | train_data = pd.read_csv(os.path.join(root_path, 'swat_train2.csv')) 459 | test_data = pd.read_csv(os.path.join(root_path, 'swat2.csv')) 460 | labels = test_data.values[:, -1:] 461 | train_data = train_data.values[:, :-1] 462 | test_data = test_data.values[:, :-1] 463 | 464 | self.scaler.fit(train_data) 465 | train_data = self.scaler.transform(train_data) 466 | test_data = self.scaler.transform(test_data) 467 | self.train = train_data 468 | self.test = test_data 469 | data_len = len(self.train) 470 | self.val = self.train[(int)(data_len * 0.8):] 471 | self.test_labels = labels 472 | print("test:", self.test.shape) 473 | print("train:", self.train.shape) 474 | 475 | def __len__(self): 476 | """ 477 | Number of images in the object dataset. 478 | """ 479 | if self.flag == "train": 480 | return (self.train.shape[0] - self.win_size) // self.step + 1 481 | elif (self.flag == 'val'): 482 | return (self.val.shape[0] - self.win_size) // self.step + 1 483 | elif (self.flag == 'test'): 484 | return (self.test.shape[0] - self.win_size) // self.step + 1 485 | else: 486 | return (self.test.shape[0] - self.win_size) // self.win_size + 1 487 | 488 | def __getitem__(self, index): 489 | index = index * self.step 490 | if self.flag == "train": 491 | return np.float32(self.train[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 492 | elif (self.flag == 'val'): 493 | return np.float32(self.val[index:index + self.win_size]), np.float32(self.test_labels[0:self.win_size]) 494 | elif (self.flag == 'test'): 495 | return np.float32(self.test[index:index + self.win_size]), np.float32( 496 | self.test_labels[index:index + self.win_size]) 497 | else: 498 | return np.float32(self.test[ 499 | index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]), np.float32( 500 | self.test_labels[index // self.step * self.win_size:index // self.step * self.win_size + self.win_size]) 501 | 502 | 503 | def get_loader_segment(index, data_path, batch_size, win_size=100, step=100, mode='train', dataset='KDD'): 504 | if (dataset == 'SMD'): 505 | dataset = SMDSegLoader(data_path, win_size, 1, mode) 506 | elif (dataset == 'MSL'): 507 | dataset = MSLSegLoader(data_path, win_size, 1, mode) 508 | elif (dataset == 'SMAP'): 509 | dataset = SMAPSegLoader(data_path, win_size, 1, mode) 510 | elif (dataset == 'PSM'): 511 | dataset = PSMSegLoader(data_path, win_size, 1, mode) 512 | elif (dataset =='SWAT'): 513 | dataset = SWATSegLoader(data_path,win_size,1,mode) 514 | elif (dataset == 'UCR'): 515 | dataset = UCRSegLoader(index, data_path, win_size, 1, mode) 516 | elif (dataset == 'UCR_AUG'): 517 | dataset = UCRAUGSegLoader(index, data_path, win_size, 1, mode) 518 | elif (dataset == 'NIPS_TS_Water'): 519 | dataset = NIPS_TS_WaterSegLoader(data_path, win_size, 1, mode) 520 | elif (dataset == 'NIPS_TS_Swan'): 521 | dataset = NIPS_TS_SwanSegLoader(data_path, win_size, 1, mode) 522 | elif 
(dataset == 'NIPS_TS_CCard'): 523 | dataset = NIPS_TS_CCardSegLoader(data_path, win_size, 1, mode) 524 | elif (dataset == 'SMD_Ori'): 525 | dataset = SMD_OriSegLoader(index, data_path, win_size, 1, mode) 526 | 527 | shuffle = False 528 | if mode == 'train': 529 | shuffle = True 530 | 531 | data_loader = DataLoader(dataset=dataset, 532 | batch_size=batch_size, 533 | shuffle=shuffle, 534 | num_workers=8, 535 | drop_last=True) 536 | return data_loader 537 | -------------------------------------------------------------------------------- /img/DCdetector.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/DCdetector.jpg -------------------------------------------------------------------------------- /img/art-compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/art-compare.png -------------------------------------------------------------------------------- /img/result_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_1.png -------------------------------------------------------------------------------- /img/result_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_2.png -------------------------------------------------------------------------------- /img/result_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_3.png -------------------------------------------------------------------------------- /img/result_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_4.png -------------------------------------------------------------------------------- /img/result_count.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/result_count.jpg -------------------------------------------------------------------------------- /img/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/img/workflow.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from torch.backends import cudnn 5 | from utils.utils import * 6 | from solver import Solver 7 | import time 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | import sys 12 | 13 | class Logger(object): 14 | def __init__(self, filename='default.log', add_flag=True, stream=sys.stdout): 15 | self.terminal = stream 16 | self.filename = filename 17 | self.add_flag = add_flag 18 | 19 | def write(self, 
message): 20 | if self.add_flag: 21 | with open(self.filename, 'a+') as log: 22 | self.terminal.write(message) 23 | log.write(message) 24 | else: 25 | with open(self.filename, 'w') as log: 26 | self.terminal.write(message) 27 | log.write(message) 28 | 29 | def flush(self): 30 | pass 31 | 32 | 33 | def str2bool(v): 34 | return v.lower() in ('true') 35 | 36 | 37 | def find_nearest(array, value): 38 | array = np.asarray(array) 39 | idx = (np.abs(array - value)).argmin() 40 | return int(array[idx-1]) 41 | 42 | 43 | def main(config): 44 | cudnn.benchmark = True 45 | if (not os.path.exists(config.model_save_path)): 46 | mkdir(config.model_save_path) 47 | solver = Solver(vars(config)) 48 | 49 | if config.mode == 'train': 50 | solver.train() 51 | elif config.mode == 'test': 52 | solver.test() 53 | 54 | return solver 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser() 58 | 59 | # Alternative 60 | parser.add_argument('--win_size', type=int, default=100) 61 | parser.add_argument('--patch_size', type=list, default=[5]) 62 | parser.add_argument('--lr', type=float, default=1e-4) 63 | parser.add_argument('--loss_fuc', type=str, default='MSE') 64 | parser.add_argument('--n_heads', type=int, default=1) 65 | parser.add_argument('--e_layers', type=int, default=3) 66 | parser.add_argument('--d_model', type=int, default=256) 67 | parser.add_argument('--rec_timeseries', action='store_true', default=True) 68 | 69 | 70 | parser.add_argument('--use_gpu', type=bool, default=True, help='use gpu') 71 | parser.add_argument('--gpu', type=int, default=0, help='gpu') 72 | parser.add_argument('--use_multi_gpu', action='store_true', help='use multiple gpus', default=True) 73 | parser.add_argument('--devices', type=str, default='0,1,2,3',help='device ids of multile gpus') 74 | 75 | # Default 76 | parser.add_argument('--index', type=int, default=137) 77 | parser.add_argument('--num_epochs', type=int, default=10) 78 | parser.add_argument('--batch_size', type=int, default=128) 79 | parser.add_argument('--input_c', type=int, default=9) 80 | parser.add_argument('--output_c', type=int, default=9) 81 | parser.add_argument('--k', type=int, default=3) 82 | parser.add_argument('--dataset', type=str, default='credit') 83 | parser.add_argument('--mode', type=str, default='train', choices=['train', 'test']) 84 | parser.add_argument('--data_path', type=str, default='./dataset/creditcard_ts.csv') 85 | parser.add_argument('--model_save_path', type=str, default='checkpoints') 86 | 87 | parser.add_argument('--anormly_ratio', type=float, default=4.00) 88 | 89 | config = parser.parse_args() 90 | args = vars(config) 91 | config.patch_size = [int(patch_index) for patch_index in config.patch_size] 92 | 93 | 94 | if config.dataset == 'UCR': 95 | batch_size_buffer = [2,4,8,16,32,64,128,256] 96 | data_len = np.load('dataset/'+config.data_path + "/UCR_"+str(config.index)+"_train.npy").shape[0] 97 | config.batch_size = find_nearest(batch_size_buffer, data_len / config.win_size) 98 | elif config.dataset == 'UCR_AUG': 99 | batch_size_buffer = [2,4,8,16,32,64,128,256] 100 | data_len = np.load('dataset/'+config.data_path + "/UCR_AUG_"+str(config.index)+"_train.npy").shape[0] 101 | config.batch_size = find_nearest(batch_size_buffer, data_len / config.win_size) 102 | elif config.dataset == 'SMD_Ori': 103 | batch_size_buffer = [2,4,8,16,32,64,128,256,512] 104 | data_len = np.load('dataset/'+config.data_path + "/SMD_Ori_"+str(config.index)+"_train.npy").shape[0] 105 | config.batch_size = find_nearest(batch_size_buffer, data_len / 
config.win_size) 106 | 107 | 108 | config.use_gpu = True if torch.cuda.is_available() and config.use_gpu else False 109 | if config.use_gpu and config.use_multi_gpu: 110 | config.devices = config.devices.replace(' ','') 111 | device_ids = config.devices.split(',') 112 | config.device_ids = [int(id_) for id_ in device_ids] 113 | config.gpu = config.device_ids[0] 114 | 115 | 116 | sys.stdout = Logger("result/"+ config.data_path +".log", sys.stdout) 117 | if config.mode == 'train': 118 | print("\n\n") 119 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 120 | print('================ Hyperparameters ===============') 121 | for k, v in sorted(args.items()): 122 | print('%s: %s' % (str(k), str(v))) 123 | print('==================== Train ===================') 124 | 125 | main(config) 126 | 127 | 128 | -------------------------------------------------------------------------------- /metrics/AUC.py: -------------------------------------------------------------------------------- 1 | # used by paper: TSB-UAD as the main evaluator 2 | # github: https://github.com/johnpaparrizos/TSB-UAD/blob/main/TSB_AD/utils/metrics.py 3 | import numpy as np 4 | from sklearn import metrics 5 | from metrics.evaluate_utils import find_length,range_convers_new 6 | 7 | 8 | def extend_postive_range(x, window=16): 9 | label = x.copy().astype(float) 10 | # print(label) 11 | L = range_convers_new(label) # index of non-zero segments 12 | # print(L) 13 | length = len(label) 14 | for k in range(len(L)): 15 | s = L[k][0] 16 | e = L[k][1] 17 | # x1 is the extended list like [1,2,3] which are non-zero(from the end-e) 18 | x1 = np.arange(e, min(e + window // 2, length)) 19 | label[x1] += np.sqrt(1 - (x1 - e) / (window)) 20 | # before the start-s 21 | x2 = np.arange(max(s - window // 2, 0), s) 22 | label[x2] += np.sqrt(1 - (s - x2) / (window)) 23 | 24 | label = np.minimum(np.ones(length), label) 25 | return label 26 | 27 | 28 | def extend_postive_range_individual(x, percentage=0.2): 29 | label = x.copy().astype(float) 30 | L = range_convers_new(label) # index of non-zero segments 31 | length = len(label) 32 | for k in range(len(L)): 33 | s = L[k][0] 34 | e = L[k][1] 35 | 36 | l0 = int((e - s + 1) * percentage) 37 | 38 | x1 = np.arange(e, min(e + l0, length)) 39 | label[x1] += np.sqrt(1 - (x1 - e) / (2 * l0)) 40 | 41 | x2 = np.arange(max(s - l0, 0), s) 42 | label[x2] += np.sqrt(1 - (s - x2) / (2 * l0)) 43 | 44 | label = np.minimum(np.ones(length), label) 45 | return label 46 | 47 | 48 | def TPR_FPR_RangeAUC(labels, pred, P, L): 49 | product = labels * pred 50 | 51 | TP = np.sum(product) 52 | 53 | # recall = min(TP/P,1) 54 | P_new = (P + np.sum(labels)) / 2 # so TPR is neither large nor small 55 | # P_new = np.sum(labels) 56 | recall = min(TP / P_new, 1) 57 | # recall = TP/np.sum(labels) 58 | # print('recall '+str(recall)) 59 | 60 | existence = 0 61 | for seg in L: 62 | if np.sum(product[seg[0]:(seg[1] + 1)]) > 0: 63 | existence += 1 64 | 65 | existence_ratio = existence / len(L) 66 | # print(existence_ratio) 67 | 68 | # TPR_RangeAUC = np.sqrt(recall*existence_ratio) 69 | # print(existence_ratio) 70 | TPR_RangeAUC = recall * existence_ratio 71 | 72 | FP = np.sum(pred) - TP 73 | # TN = np.sum((1-pred) * (1-labels)) 74 | 75 | # FPR_RangeAUC = FP/(FP+TN) 76 | N_new = len(labels) - P_new 77 | FPR_RangeAUC = FP / N_new 78 | 79 | Precision_RangeAUC = TP / np.sum(pred) 80 | 81 | return TPR_RangeAUC, FPR_RangeAUC, Precision_RangeAUC 82 | 83 | 84 | def Range_AUC(score_t_test, y_test, window=5, percentage=0, plot_ROC=False, 
AUC_type='window'): 85 | # AUC_type='window'/'percentage' 86 | score = score_t_test 87 | labels = y_test 88 | score_sorted = -np.sort(-score) 89 | 90 | P = np.sum(labels) 91 | # print(np.sum(labels)) 92 | if AUC_type == 'window': 93 | labels = extend_postive_range(labels, window=window) 94 | else: 95 | labels = extend_postive_range_individual(labels, percentage=percentage) 96 | 97 | # print(np.sum(labels)) 98 | L = range_convers_new(labels) 99 | TPR_list = [0] 100 | FPR_list = [0] 101 | Precision_list = [1] 102 | 103 | for i in np.linspace(0, len(score) - 1, 250).astype(int): 104 | threshold = score_sorted[i] 105 | # print('thre='+str(threshold)) 106 | pred = score >= threshold 107 | TPR, FPR, Precision = TPR_FPR_RangeAUC(labels, pred, P, L) 108 | 109 | TPR_list.append(TPR) 110 | FPR_list.append(FPR) 111 | Precision_list.append(Precision) 112 | 113 | TPR_list.append(1) 114 | FPR_list.append(1) # otherwise, range-AUC will stop earlier than (1,1) 115 | 116 | tpr = np.array(TPR_list) 117 | fpr = np.array(FPR_list) 118 | prec = np.array(Precision_list) 119 | 120 | width = fpr[1:] - fpr[:-1] 121 | height = (tpr[1:] + tpr[:-1]) / 2 122 | AUC_range = np.sum(width * height) 123 | 124 | width_PR = tpr[1:-1] - tpr[:-2] 125 | height_PR = (prec[1:] + prec[:-1]) / 2 126 | AP_range = np.sum(width_PR * height_PR) 127 | 128 | if plot_ROC: 129 | return AUC_range, AP_range, fpr, tpr, prec 130 | 131 | return AUC_range 132 | 133 | 134 | def point_wise_AUC(score_t_test, y_test, plot_ROC=False): 135 | # area under curve 136 | label = y_test 137 | score = score_t_test 138 | auc = metrics.roc_auc_score(label, score) 139 | # plor ROC curve 140 | if plot_ROC: 141 | fpr, tpr, thresholds = metrics.roc_curve(label, score) 142 | # display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc) 143 | # display.plot() 144 | return auc, fpr, tpr 145 | else: 146 | return auc 147 | 148 | 149 | def main(): 150 | y_test = np.zeros(100) 151 | y_test[10:20] = 1 152 | y_test[50:60] = 1 153 | pred_labels = np.zeros(100) 154 | pred_labels[15:17] = 0.5 155 | pred_labels[55:62] = 0.7 156 | # pred_labels[51:55] = 1 157 | # true_events = get_events(y_test) 158 | point_auc = point_wise_AUC(pred_labels, y_test) 159 | range_auc = Range_AUC(pred_labels, y_test) 160 | print("point_auc: {}, range_auc: {}".format(point_auc, range_auc)) 161 | 162 | 163 | if __name__ == "__main__": 164 | main() -------------------------------------------------------------------------------- /metrics/Matthews_correlation_coefficient.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import confusion_matrix 2 | import numpy as np 3 | 4 | 5 | def MCC(y_test, pred_labels): 6 | tn, fp, fn, tp = confusion_matrix(y_test, pred_labels).ravel() 7 | MCC_score = (tp*tn-fp*fn)/(((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))**0.5) 8 | 9 | return MCC_score 10 | 11 | 12 | def main(): 13 | y_test = np.zeros(100) 14 | y_test[10:20] = 1 15 | y_test[50:60] = 1 16 | pred_labels = np.zeros(100) 17 | pred_labels[15:17] = 1 18 | pred_labels[55:62] = 1 19 | # pred_labels[51:55] = 1 20 | # true_events = get_events(y_test) 21 | confusion_matric = MCC(y_test, pred_labels) 22 | # print(confusion_matric) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /metrics/affiliation/_affiliation_zone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from 
metrics.affiliation._integral_interval import interval_intersection 4 | 5 | def t_start(j, Js = [(1,2),(3,4),(5,6)], Trange = (1,10)): 6 | """ 7 | Helper for `E_gt_func` 8 | 9 | :param j: index from 0 to len(Js) (included) on which to get the start 10 | :param Js: ground truth events, as a list of couples 11 | :param Trange: range of the series where Js is included 12 | :return: generalized start such that the middle of t_start and t_stop 13 | always gives the affiliation zone 14 | """ 15 | b = max(Trange) 16 | n = len(Js) 17 | if j == n: 18 | return(2*b - t_stop(n-1, Js, Trange)) 19 | else: 20 | return(Js[j][0]) 21 | 22 | def t_stop(j, Js = [(1,2),(3,4),(5,6)], Trange = (1,10)): 23 | """ 24 | Helper for `E_gt_func` 25 | 26 | :param j: index from 0 to len(Js) (included) on which to get the stop 27 | :param Js: ground truth events, as a list of couples 28 | :param Trange: range of the series where Js is included 29 | :return: generalized stop such that the middle of t_start and t_stop 30 | always gives the affiliation zone 31 | """ 32 | if j == -1: 33 | a = min(Trange) 34 | return(2*a - t_start(0, Js, Trange)) 35 | else: 36 | return(Js[j][1]) 37 | 38 | def E_gt_func(j, Js, Trange): 39 | """ 40 | Get the affiliation zone of element j of the ground truth 41 | 42 | :param j: index from 0 to len(Js) (excluded) on which to get the zone 43 | :param Js: ground truth events, as a list of couples 44 | :param Trange: range of the series where Js is included, can 45 | be (-math.inf, math.inf) for distance measures 46 | :return: affiliation zone of element j of the ground truth represented 47 | as a couple 48 | """ 49 | range_left = (t_stop(j-1, Js, Trange) + t_start(j, Js, Trange))/2 50 | range_right = (t_stop(j, Js, Trange) + t_start(j+1, Js, Trange))/2 51 | return((range_left, range_right)) 52 | 53 | def get_all_E_gt_func(Js, Trange): 54 | """ 55 | Get the affiliation partition from the ground truth point of view 56 | 57 | :param Js: ground truth events, as a list of couples 58 | :param Trange: range of the series where Js is included, can 59 | be (-math.inf, math.inf) for distance measures 60 | :return: affiliation partition of the events 61 | """ 62 | # E_gt is the limit of affiliation/attraction for each ground truth event 63 | E_gt = [E_gt_func(j, Js, Trange) for j in range(len(Js))] 64 | return(E_gt) 65 | 66 | def affiliation_partition(Is = [(1,1.5),(2,5),(5,6),(8,9)], E_gt = [(1,2.5),(2.5,4.5),(4.5,10)]): 67 | """ 68 | Cut the events into the affiliation zones 69 | The presentation given here is from the ground truth point of view, 70 | but it is also used in the reversed direction in the main function. 71 | 72 | :param Is: events as a list of couples 73 | :param E_gt: range of the affiliation zones 74 | :return: a list of list of intervals (each interval represented by either 75 | a couple or None for empty interval). The outer list is indexed by each 76 | affiliation zone of `E_gt`. The inner list is indexed by the events of `Is`. 
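# --- Illustrative aside (not part of the original source) --------------------
# Sanity check of the zones built above: get_all_E_gt_func splits Trange into
# one affiliation zone per ground-truth event, with boundaries at the midpoints
# between consecutive events, and the outermost zones clipped to the ends of
# Trange. A minimal sketch, using the same import path as the rest of the repo:
from metrics.affiliation._affiliation_zone import get_all_E_gt_func

Js = [(1, 2), (4, 5), (8, 9)]             # three ground-truth events
zones = get_all_E_gt_func(Js, (0, 10))    # expected: [(0.0, 3.0), (3.0, 6.5), (6.5, 10.0)]
assert zones[1] == (3.0, 6.5)             # boundary = midpoint between events (1,2) and (4,5)
# ------------------------------------------------------------------------------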
77 | """ 78 | out = [None] * len(E_gt) 79 | for j in range(len(E_gt)): 80 | E_gt_j = E_gt[j] 81 | discarded_idx_before = [I[1] < E_gt_j[0] for I in Is] # end point of predicted I is before the begin of E 82 | discarded_idx_after = [I[0] > E_gt_j[1] for I in Is] # start of predicted I is after the end of E 83 | kept_index = [not(a or b) for a, b in zip(discarded_idx_before, discarded_idx_after)] 84 | Is_j = [x for x, y in zip(Is, kept_index)] 85 | out[j] = [interval_intersection(I, E_gt[j]) for I in Is_j] 86 | return(out) 87 | -------------------------------------------------------------------------------- /metrics/affiliation/_integral_interval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import math 4 | from metrics.affiliation.generics import _sum_wo_nan 5 | """ 6 | In order to shorten the length of the variables, 7 | the general convention in this file is to let: 8 | - I for a predicted event (start, stop), 9 | - Is for a list of predicted events, 10 | - J for a ground truth event, 11 | - Js for a list of ground truth events. 12 | """ 13 | 14 | def interval_length(J = (1,2)): 15 | """ 16 | Length of an interval 17 | 18 | :param J: couple representating the start and stop of an interval, or None 19 | :return: length of the interval, and 0 for a None interval 20 | """ 21 | if J is None: 22 | return(0) 23 | return(J[1] - J[0]) 24 | 25 | def sum_interval_lengths(Is = [(1,2),(3,4),(5,6)]): 26 | """ 27 | Sum of length of the intervals 28 | 29 | :param Is: list of intervals represented by starts and stops 30 | :return: sum of the interval length 31 | """ 32 | return(sum([interval_length(I) for I in Is])) 33 | 34 | def interval_intersection(I = (1, 3), J = (2, 4)): 35 | """ 36 | Intersection between two intervals I and J 37 | I and J should be either empty or represent a positive interval (no point) 38 | 39 | :param I: an interval represented by start and stop 40 | :param J: a second interval of the same form 41 | :return: an interval representing the start and stop of the intersection (or None if empty) 42 | """ 43 | if I is None: 44 | return(None) 45 | if J is None: 46 | return(None) 47 | 48 | I_inter_J = (max(I[0], J[0]), min(I[1], J[1])) 49 | if I_inter_J[0] >= I_inter_J[1]: 50 | return(None) 51 | else: 52 | return(I_inter_J) 53 | 54 | def interval_subset(I = (1, 3), J = (0, 6)): 55 | """ 56 | Checks whether I is a subset of J 57 | 58 | :param I: an non empty interval represented by start and stop 59 | :param J: a second non empty interval of the same form 60 | :return: True if I is a subset of J 61 | """ 62 | if (I[0] >= J[0]) and (I[1] <= J[1]): 63 | return True 64 | else: 65 | return False 66 | 67 | def cut_into_three_func(I, J): 68 | """ 69 | Cut an interval I into a partition of 3 subsets: 70 | the elements before J, 71 | the elements belonging to J, 72 | and the elements after J 73 | 74 | :param I: an interval represented by start and stop, or None for an empty one 75 | :param J: a non empty interval 76 | :return: a triplet of three intervals, each represented by either (start, stop) or None 77 | """ 78 | if I is None: 79 | return((None, None, None)) 80 | 81 | I_inter_J = interval_intersection(I, J) 82 | if I == I_inter_J: 83 | I_before = None 84 | I_after = None 85 | elif I[1] <= J[0]: 86 | I_before = I 87 | I_after = None 88 | elif I[0] >= J[1]: 89 | I_before = None 90 | I_after = I 91 | elif (I[0] <= J[0]) and (I[1] >= J[1]): 92 | I_before = (I[0], I_inter_J[0]) 93 | I_after = 
(I_inter_J[1], I[1]) 94 | elif I[0] <= J[0]: 95 | I_before = (I[0], I_inter_J[0]) 96 | I_after = None 97 | elif I[1] >= J[1]: 98 | I_before = None 99 | I_after = (I_inter_J[1], I[1]) 100 | else: 101 | raise ValueError('unexpected unconsidered case') 102 | return(I_before, I_inter_J, I_after) 103 | 104 | def get_pivot_j(I, J): 105 | """ 106 | Get the single point of J that is the closest to I, called 'pivot' here, 107 | with the requirement that I should be outside J 108 | 109 | :param I: a non empty interval (start, stop) 110 | :param J: another non empty interval, with empty intersection with I 111 | :return: the element j of J that is the closest to I 112 | """ 113 | if interval_intersection(I, J) is not None: 114 | raise ValueError('I and J should have a void intersection') 115 | 116 | j_pivot = None # j_pivot is a border of J 117 | if max(I) <= min(J): 118 | j_pivot = min(J) 119 | elif min(I) >= max(J): 120 | j_pivot = max(J) 121 | else: 122 | raise ValueError('I should be outside J') 123 | return(j_pivot) 124 | 125 | def integral_mini_interval(I, J): 126 | """ 127 | In the specific case where interval I is located outside J, 128 | integral of distance from x to J over the interval x \in I. 129 | This is the *integral* i.e. the sum. 130 | It's not the mean (not divided by the length of I yet) 131 | 132 | :param I: a interval (start, stop), or None 133 | :param J: a non empty interval, with empty intersection with I 134 | :return: the integral of distances d(x, J) over x \in I 135 | """ 136 | if I is None: 137 | return(0) 138 | 139 | j_pivot = get_pivot_j(I, J) 140 | a = min(I) 141 | b = max(I) 142 | return((b-a)*abs((j_pivot - (a+b)/2))) 143 | 144 | def integral_interval_distance(I, J): 145 | """ 146 | For any non empty intervals I, J, compute the 147 | integral of distance from x to J over the interval x \in I. 148 | This is the *integral* i.e. the sum. 
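# --- Illustrative aside (not part of the original source) --------------------
# A small worked case for the helpers above. For I = (4, 6) entirely to the
# right of J = (1, 2), the pivot is max(J) = 2 and the integral of d(x, J) over
# x in I is (b - a) * |pivot - (a + b)/2| = 2 * |2 - 5| = 6, which matches
# integrating (x - 2) from 4 to 6 by hand. Sketch, with the import path used
# elsewhere in this repo:
from metrics.affiliation._integral_interval import (
    interval_intersection, cut_into_three_func, integral_mini_interval)

assert interval_intersection((1, 3), (2, 4)) == (2, 3)
assert cut_into_three_func((0, 10), (3, 5)) == ((0, 3), (3, 5), (5, 10))
assert integral_mini_interval((4, 6), (1, 2)) == 6
# ------------------------------------------------------------------------------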
149 | It's not the mean (not divided by the length of I yet) 150 | The interval I can intersect J or not 151 | 152 | :param I: a interval (start, stop), or None 153 | :param J: a non empty interval 154 | :return: the integral of distances d(x, J) over x \in I 155 | """ 156 | # I and J are single intervals (not generic sets) 157 | # I is a predicted interval in the range of affiliation of J 158 | 159 | def f(I_cut): 160 | return(integral_mini_interval(I_cut, J)) 161 | # If I_middle is fully included into J, it is 162 | # the distance to J is always 0 163 | def f0(I_middle): 164 | return(0) 165 | 166 | cut_into_three = cut_into_three_func(I, J) 167 | # Distance for now, not the mean: 168 | # Distance left: Between cut_into_three[0] and the point min(J) 169 | d_left = f(cut_into_three[0]) 170 | # Distance middle: Between cut_into_three[1] = I inter J, and J 171 | d_middle = f0(cut_into_three[1]) 172 | # Distance right: Between cut_into_three[2] and the point max(J) 173 | d_right = f(cut_into_three[2]) 174 | # It's an integral so summable 175 | return(d_left + d_middle + d_right) 176 | 177 | def integral_mini_interval_P_CDFmethod__min_piece(I, J, E): 178 | """ 179 | Helper of `integral_mini_interval_Pprecision_CDFmethod` 180 | In the specific case where interval I is located outside J, 181 | compute the integral $\int_{d_min}^{d_max} \min(m, x) dx$, with: 182 | - m the smallest distance from J to E, 183 | - d_min the smallest distance d(x, J) from x \in I to J 184 | - d_max the largest distance d(x, J) from x \in I to J 185 | 186 | :param I: a single predicted interval, a non empty interval (start, stop) 187 | :param J: ground truth interval, a non empty interval, with empty intersection with I 188 | :param E: the affiliation/influence zone for J, represented as a couple (start, stop) 189 | :return: the integral $\int_{d_min}^{d_max} \min(m, x) dx$ 190 | """ 191 | if interval_intersection(I, J) is not None: 192 | raise ValueError('I and J should have a void intersection') 193 | if not interval_subset(J, E): 194 | raise ValueError('J should be included in E') 195 | if not interval_subset(I, E): 196 | raise ValueError('I should be included in E') 197 | 198 | e_min = min(E) 199 | j_min = min(J) 200 | j_max = max(J) 201 | e_max = max(E) 202 | i_min = min(I) 203 | i_max = max(I) 204 | 205 | d_min = max(i_min - j_max, j_min - i_max) 206 | d_max = max(i_max - j_max, j_min - i_min) 207 | m = min(j_min - e_min, e_max - j_max) 208 | A = min(d_max, m)**2 - min(d_min, m)**2 209 | B = max(d_max, m) - max(d_min, m) 210 | C = (1/2)*A + m*B 211 | return(C) 212 | 213 | def integral_mini_interval_Pprecision_CDFmethod(I, J, E): 214 | """ 215 | Integral of the probability of distances over the interval I. 216 | In the specific case where interval I is located outside J, 217 | compute the integral $\int_{x \in I} Fbar(dist(x,J)) dx$. 218 | This is the *integral* i.e. 
the sum (not the mean) 219 | 220 | :param I: a single predicted interval, a non empty interval (start, stop) 221 | :param J: ground truth interval, a non empty interval, with empty intersection with I 222 | :param E: the affiliation/influence zone for J, represented as a couple (start, stop) 223 | :return: the integral $\int_{x \in I} Fbar(dist(x,J)) dx$ 224 | """ 225 | integral_min_piece = integral_mini_interval_P_CDFmethod__min_piece(I, J, E) 226 | 227 | e_min = min(E) 228 | j_min = min(J) 229 | j_max = max(J) 230 | e_max = max(E) 231 | i_min = min(I) 232 | i_max = max(I) 233 | d_min = max(i_min - j_max, j_min - i_max) 234 | d_max = max(i_max - j_max, j_min - i_min) 235 | integral_linear_piece = (1/2)*(d_max**2 - d_min**2) 236 | integral_remaining_piece = (j_max - j_min)*(i_max - i_min) 237 | 238 | DeltaI = i_max - i_min 239 | DeltaE = e_max - e_min 240 | 241 | output = DeltaI - (1/DeltaE)*(integral_min_piece + integral_linear_piece + integral_remaining_piece) 242 | return(output) 243 | 244 | def integral_interval_probaCDF_precision(I, J, E): 245 | """ 246 | Integral of the probability of distances over the interval I. 247 | Compute the integral $\int_{x \in I} Fbar(dist(x,J)) dx$. 248 | This is the *integral* i.e. the sum (not the mean) 249 | 250 | :param I: a single (non empty) predicted interval in the zone of affiliation of J 251 | :param J: ground truth interval 252 | :param E: affiliation/influence zone for J 253 | :return: the integral $\int_{x \in I} Fbar(dist(x,J)) dx$ 254 | """ 255 | # I and J are single intervals (not generic sets) 256 | def f(I_cut): 257 | if I_cut is None: 258 | return(0) 259 | else: 260 | return(integral_mini_interval_Pprecision_CDFmethod(I_cut, J, E)) 261 | 262 | # If I_middle is fully included into J, it is 263 | # integral of 1 on the interval I_middle, so it's |I_middle| 264 | def f0(I_middle): 265 | if I_middle is None: 266 | return(0) 267 | else: 268 | return(max(I_middle) - min(I_middle)) 269 | 270 | cut_into_three = cut_into_three_func(I, J) 271 | # Distance for now, not the mean: 272 | # Distance left: Between cut_into_three[0] and the point min(J) 273 | d_left = f(cut_into_three[0]) 274 | # Distance middle: Between cut_into_three[1] = I inter J, and J 275 | d_middle = f0(cut_into_three[1]) 276 | # Distance right: Between cut_into_three[2] and the point max(J) 277 | d_right = f(cut_into_three[2]) 278 | # It's an integral so summable 279 | return(d_left + d_middle + d_right) 280 | 281 | def cut_J_based_on_mean_func(J, e_mean): 282 | """ 283 | Helper function for the recall. 284 | Partition J into two intervals: before and after e_mean 285 | (e_mean represents the center element of E the zone of affiliation) 286 | 287 | :param J: ground truth interval 288 | :param e_mean: a float number (center value of E) 289 | :return: a couple partitionning J into (J_before, J_after) 290 | """ 291 | if J is None: 292 | J_before = None 293 | J_after = None 294 | elif e_mean >= max(J): 295 | J_before = J 296 | J_after = None 297 | elif e_mean <= min(J): 298 | J_before = None 299 | J_after = J 300 | else: # e_mean is across J 301 | J_before = (min(J), e_mean) 302 | J_after = (e_mean, max(J)) 303 | 304 | return((J_before, J_after)) 305 | 306 | def integral_mini_interval_Precall_CDFmethod(I, J, E): 307 | """ 308 | Integral of the probability of distances over the interval J. 309 | In the specific case where interval J is located outside I, 310 | compute the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$. 311 | This is the *integral* i.e. 
the sum (not the mean) 312 | 313 | :param I: a single (non empty) predicted interval 314 | :param J: ground truth (non empty) interval, with empty intersection with I 315 | :param E: the affiliation/influence zone for J, represented as a couple (start, stop) 316 | :return: the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$ 317 | """ 318 | # The interval J should be located outside I 319 | # (so it's either the left piece or the right piece w.r.t I) 320 | i_pivot = get_pivot_j(J, I) 321 | e_min = min(E) 322 | e_max = max(E) 323 | e_mean = (e_min + e_max) / 2 324 | 325 | # If i_pivot is outside E (it's possible), then 326 | # the distance is worst that any random element within E, 327 | # so we set the recall to 0 328 | if i_pivot <= min(E): 329 | return(0) 330 | elif i_pivot >= max(E): 331 | return(0) 332 | # Otherwise, we have at least i_pivot in E and so d < M so min(d,M)=d 333 | 334 | cut_J_based_on_e_mean = cut_J_based_on_mean_func(J, e_mean) 335 | J_before = cut_J_based_on_e_mean[0] 336 | J_after = cut_J_based_on_e_mean[1] 337 | 338 | iemin_mean = (e_min + i_pivot)/2 339 | cut_Jbefore_based_on_iemin_mean = cut_J_based_on_mean_func(J_before, iemin_mean) 340 | J_before_closeE = cut_Jbefore_based_on_iemin_mean[0] # before e_mean and closer to e_min than i_pivot ~ J_before_before 341 | J_before_closeI = cut_Jbefore_based_on_iemin_mean[1] # before e_mean and closer to i_pivot than e_min ~ J_before_after 342 | 343 | iemax_mean = (e_max + i_pivot)/2 344 | cut_Jafter_based_on_iemax_mean = cut_J_based_on_mean_func(J_after, iemax_mean) 345 | J_after_closeI = cut_Jafter_based_on_iemax_mean[0] # after e_mean and closer to i_pivot than e_max ~ J_after_before 346 | J_after_closeE = cut_Jafter_based_on_iemax_mean[1] # after e_mean and closer to e_max than i_pivot ~ J_after_after 347 | 348 | if J_before_closeE is not None: 349 | j_before_before_min = min(J_before_closeE) # == min(J) 350 | j_before_before_max = max(J_before_closeE) 351 | else: 352 | j_before_before_min = math.nan 353 | j_before_before_max = math.nan 354 | 355 | if J_before_closeI is not None: 356 | j_before_after_min = min(J_before_closeI) # == j_before_before_max if existing 357 | j_before_after_max = max(J_before_closeI) # == max(J_before) 358 | else: 359 | j_before_after_min = math.nan 360 | j_before_after_max = math.nan 361 | 362 | if J_after_closeI is not None: 363 | j_after_before_min = min(J_after_closeI) # == min(J_after) 364 | j_after_before_max = max(J_after_closeI) 365 | else: 366 | j_after_before_min = math.nan 367 | j_after_before_max = math.nan 368 | 369 | if J_after_closeE is not None: 370 | j_after_after_min = min(J_after_closeE) # == j_after_before_max if existing 371 | j_after_after_max = max(J_after_closeE) # == max(J) 372 | else: 373 | j_after_after_min = math.nan 374 | j_after_after_max = math.nan 375 | 376 | # <-- J_before_closeE --> <-- J_before_closeI --> <-- J_after_closeI --> <-- J_after_closeE --> 377 | # j_bb_min j_bb_max j_ba_min j_ba_max j_ab_min j_ab_max j_aa_min j_aa_max 378 | # (with `b` for before and `a` for after in the previous variable names) 379 | 380 | # vs e_mean m = min(t-e_min, e_max-t) d=|i_pivot-t| min(d,m) \int min(d,m)dt \int d dt \int_(min(d,m)+d)dt \int_{t \in J}(min(d,m)+d)dt 381 | # Case J_before_closeE & i_pivot after J before t-e_min i_pivot-t min(i_pivot-t,t-e_min) = t-e_min t^2/2-e_min*t i_pivot*t-t^2/2 t^2/2-e_min*t+i_pivot*t-t^2/2 = (i_pivot-e_min)*t (i_pivot-e_min)*tB - (i_pivot-e_min)*tA = (i_pivot-e_min)*(tB-tA) 382 | # Case J_before_closeI & i_pivot after J before t-e_min 
i_pivot-t min(i_pivot-t,t-e_min) = i_pivot-t i_pivot*t-t^2/2 i_pivot*t-t^2/2 i_pivot*t-t^2/2+i_pivot*t-t^2/2 = 2*i_pivot*t-t^2 2*i_pivot*tB-tB^2 - 2*i_pivot*tA + tA^2 = 2*i_pivot*(tB-tA) - (tB^2 - tA^2) 383 | # Case J_after_closeI & i_pivot after J after e_max-t i_pivot-t min(i_pivot-t,e_max-t) = i_pivot-t i_pivot*t-t^2/2 i_pivot*t-t^2/2 i_pivot*t-t^2/2+i_pivot*t-t^2/2 = 2*i_pivot*t-t^2 2*i_pivot*tB-tB^2 - 2*i_pivot*tA + tA^2 = 2*i_pivot*(tB-tA) - (tB^2 - tA^2) 384 | # Case J_after_closeE & i_pivot after J after e_max-t i_pivot-t min(i_pivot-t,e_max-t) = e_max-t e_max*t-t^2/2 i_pivot*t-t^2/2 e_max*t-t^2/2+i_pivot*t-t^2/2 = (e_max+i_pivot)*t-t^2 (e_max+i_pivot)*tB-tB^2 - (e_max+i_pivot)*tA + tA^2 = (e_max+i_pivot)*(tB-tA) - (tB^2 - tA^2) 385 | # 386 | # Case J_before_closeE & i_pivot before J before t-e_min t-i_pivot min(t-i_pivot,t-e_min) = t-e_min t^2/2-e_min*t t^2/2-i_pivot*t t^2/2-e_min*t+t^2/2-i_pivot*t = t^2-(e_min+i_pivot)*t tB^2-(e_min+i_pivot)*tB - tA^2 + (e_min+i_pivot)*tA = (tB^2 - tA^2) - (e_min+i_pivot)*(tB-tA) 387 | # Case J_before_closeI & i_pivot before J before t-e_min t-i_pivot min(t-i_pivot,t-e_min) = t-i_pivot t^2/2-i_pivot*t t^2/2-i_pivot*t t^2/2-i_pivot*t+t^2/2-i_pivot*t = t^2-2*i_pivot*t tB^2-2*i_pivot*tB - tA^2 + 2*i_pivot*tA = (tB^2 - tA^2) - 2*i_pivot*(tB-tA) 388 | # Case J_after_closeI & i_pivot before J after e_max-t t-i_pivot min(t-i_pivot,e_max-t) = t-i_pivot t^2/2-i_pivot*t t^2/2-i_pivot*t t^2/2-i_pivot*t+t^2/2-i_pivot*t = t^2-2*i_pivot*t tB^2-2*i_pivot*tB - tA^2 + 2*i_pivot*tA = (tB^2 - tA^2) - 2*i_pivot*(tB-tA) 389 | # Case J_after_closeE & i_pivot before J after e_max-t t-i_pivot min(t-i_pivot,e_max-t) = e_max-t e_max*t-t^2/2 t^2/2-i_pivot*t e_max*t-t^2/2+t^2/2-i_pivot*t = (e_max-i_pivot)*t (e_max-i_pivot)*tB - (e_max-i_pivot)*tA = (e_max-i_pivot)*(tB-tA) 390 | 391 | if i_pivot >= max(J): 392 | part1_before_closeE = (i_pivot-e_min)*(j_before_before_max - j_before_before_min) # (i_pivot-e_min)*(tB-tA) # j_before_before_max - j_before_before_min 393 | part2_before_closeI = 2*i_pivot*(j_before_after_max-j_before_after_min) - (j_before_after_max**2 - j_before_after_min**2) # 2*i_pivot*(tB-tA) - (tB^2 - tA^2) # j_before_after_max - j_before_after_min 394 | part3_after_closeI = 2*i_pivot*(j_after_before_max-j_after_before_min) - (j_after_before_max**2 - j_after_before_min**2) # 2*i_pivot*(tB-tA) - (tB^2 - tA^2) # j_after_before_max - j_after_before_min 395 | part4_after_closeE = (e_max+i_pivot)*(j_after_after_max-j_after_after_min) - (j_after_after_max**2 - j_after_after_min**2) # (e_max+i_pivot)*(tB-tA) - (tB^2 - tA^2) # j_after_after_max - j_after_after_min 396 | out_parts = [part1_before_closeE, part2_before_closeI, part3_after_closeI, part4_after_closeE] 397 | elif i_pivot <= min(J): 398 | part1_before_closeE = (j_before_before_max**2 - j_before_before_min**2) - (e_min+i_pivot)*(j_before_before_max-j_before_before_min) # (tB^2 - tA^2) - (e_min+i_pivot)*(tB-tA) # j_before_before_max - j_before_before_min 399 | part2_before_closeI = (j_before_after_max**2 - j_before_after_min**2) - 2*i_pivot*(j_before_after_max-j_before_after_min) # (tB^2 - tA^2) - 2*i_pivot*(tB-tA) # j_before_after_max - j_before_after_min 400 | part3_after_closeI = (j_after_before_max**2 - j_after_before_min**2) - 2*i_pivot*(j_after_before_max - j_after_before_min) # (tB^2 - tA^2) - 2*i_pivot*(tB-tA) # j_after_before_max - j_after_before_min 401 | part4_after_closeE = (e_max-i_pivot)*(j_after_after_max - j_after_after_min) # (e_max-i_pivot)*(tB-tA) # j_after_after_max - j_after_after_min 402 
| out_parts = [part1_before_closeE, part2_before_closeI, part3_after_closeI, part4_after_closeE] 403 | else: 404 | raise ValueError('The i_pivot should be outside J') 405 | 406 | out_integral_min_dm_plus_d = _sum_wo_nan(out_parts) # integral on all J, i.e. sum of the disjoint parts 407 | 408 | # We have for each point t of J: 409 | # \bar{F}_{t, recall}(d) = 1 - (1/|E|) * (min(d,m) + d) 410 | # Since t is a single-point here, and we are in the case where i_pivot is inside E. 411 | # The integral is then given by: 412 | # C = \int_{t \in J} \bar{F}_{t, recall}(D(t)) dt 413 | # = \int_{t \in J} 1 - (1/|E|) * (min(d,m) + d) dt 414 | # = |J| - (1/|E|) * [\int_{t \in J} (min(d,m) + d) dt] 415 | # = |J| - (1/|E|) * out_integral_min_dm_plus_d 416 | DeltaJ = max(J) - min(J) 417 | DeltaE = max(E) - min(E) 418 | C = DeltaJ - (1/DeltaE) * out_integral_min_dm_plus_d 419 | 420 | return(C) 421 | 422 | def integral_interval_probaCDF_recall(I, J, E): 423 | """ 424 | Integral of the probability of distances over the interval J. 425 | Compute the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$. 426 | This is the *integral* i.e. the sum (not the mean) 427 | 428 | :param I: a single (non empty) predicted interval 429 | :param J: ground truth (non empty) interval 430 | :param E: the affiliation/influence zone for J 431 | :return: the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$ 432 | """ 433 | # I and J are single intervals (not generic sets) 434 | # E is the outside affiliation interval of J (even for recall!) 435 | # (in particular J \subset E) 436 | # 437 | # J is the portion of the ground truth affiliated to I 438 | # I is a predicted interval (can be outside E possibly since it's recall) 439 | def f(J_cut): 440 | if J_cut is None: 441 | return(0) 442 | else: 443 | return integral_mini_interval_Precall_CDFmethod(I, J_cut, E) 444 | 445 | # If J_middle is fully included into I, it is 446 | # integral of 1 on the interval J_middle, so it's |J_middle| 447 | def f0(J_middle): 448 | if J_middle is None: 449 | return(0) 450 | else: 451 | return(max(J_middle) - min(J_middle)) 452 | 453 | cut_into_three = cut_into_three_func(J, I) # it's J that we cut into 3, depending on the position w.r.t I 454 | # since we integrate over J this time. 
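# Illustrative example (hypothetical values, only to make the three parts below concrete):
# with I = (4, 6) and J = (2, 8), cut_into_three_func(J, I) gives
#   (2, 4) -> portion of J strictly before I, handled by f (the CDF-based integral)
#   (4, 6) -> portion of J inside I, where the distance is 0 and Fbar is 1, so f0 returns its length
#   (6, 8) -> portion of J strictly after I, handled by f
# and the function returns the sum of these three contributions.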
455 | # 456 | # Distance for now, not the mean: 457 | # Distance left: Between cut_into_three[0] and the point min(I) 458 | d_left = f(cut_into_three[0]) 459 | # Distance middle: Between cut_into_three[1] = J inter I, and I 460 | d_middle = f0(cut_into_three[1]) 461 | # Distance right: Between cut_into_three[2] and the point max(I) 462 | d_right = f(cut_into_three[2]) 463 | # It's an integral so summable 464 | return(d_left + d_middle + d_right) 465 | -------------------------------------------------------------------------------- /metrics/affiliation/_single_ground_truth_event.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import math 4 | from metrics.affiliation._affiliation_zone import ( 5 | get_all_E_gt_func, 6 | affiliation_partition) 7 | from metrics.affiliation._integral_interval import ( 8 | integral_interval_distance, 9 | integral_interval_probaCDF_precision, 10 | integral_interval_probaCDF_recall, 11 | interval_length, 12 | sum_interval_lengths) 13 | 14 | def affiliation_precision_distance(Is = [(1,2),(3,4),(5,6)], J = (2,5.5)): 15 | """ 16 | Compute the individual average distance from Is to a single ground truth J 17 | 18 | :param Is: list of predicted events within the affiliation zone of J 19 | :param J: couple representating the start and stop of a ground truth interval 20 | :return: individual average precision directed distance number 21 | """ 22 | if all([I is None for I in Is]): # no prediction in the current area 23 | return(math.nan) # undefined 24 | return(sum([integral_interval_distance(I, J) for I in Is]) / sum_interval_lengths(Is)) 25 | 26 | def affiliation_precision_proba(Is = [(1,2),(3,4),(5,6)], J = (2,5.5), E = (0,8)): 27 | """ 28 | Compute the individual precision probability from Is to a single ground truth J 29 | 30 | :param Is: list of predicted events within the affiliation zone of J 31 | :param J: couple representating the start and stop of a ground truth interval 32 | :param E: couple representing the start and stop of the zone of affiliation of J 33 | :return: individual precision probability in [0, 1], or math.nan if undefined 34 | """ 35 | if all([I is None for I in Is]): # no prediction in the current area 36 | return(math.nan) # undefined 37 | return(sum([integral_interval_probaCDF_precision(I, J, E) for I in Is]) / sum_interval_lengths(Is)) 38 | 39 | def affiliation_recall_distance(Is = [(1,2),(3,4),(5,6)], J = (2,5.5)): 40 | """ 41 | Compute the individual average distance from a single J to the predictions Is 42 | 43 | :param Is: list of predicted events within the affiliation zone of J 44 | :param J: couple representating the start and stop of a ground truth interval 45 | :return: individual average recall directed distance number 46 | """ 47 | Is = [I for I in Is if I is not None] # filter possible None in Is 48 | if len(Is) == 0: # there is no prediction in the current area 49 | return(math.inf) 50 | E_gt_recall = get_all_E_gt_func(Is, (-math.inf, math.inf)) # here from the point of view of the predictions 51 | Js = affiliation_partition([J], E_gt_recall) # partition of J depending of proximity with Is 52 | return(sum([integral_interval_distance(J[0], I) for I, J in zip(Is, Js)]) / interval_length(J)) 53 | 54 | def affiliation_recall_proba(Is = [(1,2),(3,4),(5,6)], J = (2,5.5), E = (0,8)): 55 | """ 56 | Compute the individual recall probability from a single ground truth J to Is 57 | 58 | :param Is: list of predicted events within the affiliation 
zone of J 59 | :param J: couple representating the start and stop of a ground truth interval 60 | :param E: couple representing the start and stop of the zone of affiliation of J 61 | :return: individual recall probability in [0, 1] 62 | """ 63 | Is = [I for I in Is if I is not None] # filter possible None in Is 64 | if len(Is) == 0: # there is no prediction in the current area 65 | return(0) 66 | E_gt_recall = get_all_E_gt_func(Is, E) # here from the point of view of the predictions 67 | Js = affiliation_partition([J], E_gt_recall) # partition of J depending of proximity with Is 68 | return(sum([integral_interval_probaCDF_recall(I, J[0], E) for I, J in zip(Is, Js)]) / interval_length(J)) 69 | -------------------------------------------------------------------------------- /metrics/affiliation/generics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from itertools import groupby 4 | from operator import itemgetter 5 | import math 6 | import gzip 7 | import glob 8 | import os 9 | 10 | def convert_vector_to_events(vector = [0, 1, 1, 0, 0, 1, 0]): 11 | """ 12 | Convert a binary vector (indicating 1 for the anomalous instances) 13 | to a list of events. The events are considered as durations, 14 | i.e. setting 1 at index i corresponds to an anomalous interval [i, i+1). 15 | 16 | :param vector: a list of elements belonging to {0, 1} 17 | :return: a list of couples, each couple representing the start and stop of 18 | each event 19 | """ 20 | positive_indexes = [idx for idx, val in enumerate(vector) if val > 0] 21 | events = [] 22 | for k, g in groupby(enumerate(positive_indexes), lambda ix : ix[0] - ix[1]): 23 | cur_cut = list(map(itemgetter(1), g)) 24 | events.append((cur_cut[0], cur_cut[-1])) 25 | 26 | # Consistent conversion in case of range anomalies (for indexes): 27 | # A positive index i is considered as the interval [i, i+1), 28 | # so the last index should be moved by 1 29 | events = [(x, y+1) for (x,y) in events] 30 | 31 | return(events) 32 | 33 | def infer_Trange(events_pred, events_gt): 34 | """ 35 | Given the list of events events_pred and events_gt, get the 36 | smallest possible Trange corresponding to the start and stop indexes 37 | of the whole series. 38 | Trange will not influence the measure of distances, but will impact the 39 | measures of probabilities. 40 | 41 | :param events_pred: a list of couples corresponding to predicted events 42 | :param events_gt: a list of couples corresponding to ground truth events 43 | :return: a couple corresponding to the smallest range containing the events 44 | """ 45 | if len(events_gt) == 0: 46 | raise ValueError('The gt events should contain at least one event') 47 | if len(events_pred) == 0: 48 | # empty prediction, base Trange only on events_gt (which is non empty) 49 | return(infer_Trange(events_gt, events_gt)) 50 | 51 | min_pred = min([x[0] for x in events_pred]) 52 | min_gt = min([x[0] for x in events_gt]) 53 | max_pred = max([x[1] for x in events_pred]) 54 | max_gt = max([x[1] for x in events_gt]) 55 | Trange = (min(min_pred, min_gt), max(max_pred, max_gt)) 56 | return(Trange) 57 | 58 | def has_point_anomalies(events): 59 | """ 60 | Checking whether events contain point anomalies, i.e. 61 | events starting and stopping at the same time. 
62 | 63 | :param events: a list of couples corresponding to predicted events 64 | :return: True is the events have any point anomalies, False otherwise 65 | """ 66 | if len(events) == 0: 67 | return(False) 68 | return(min([x[1] - x[0] for x in events]) == 0) 69 | 70 | def _sum_wo_nan(vec): 71 | """ 72 | Sum of elements, ignoring math.isnan ones 73 | 74 | :param vec: vector of floating numbers 75 | :return: sum of the elements, ignoring math.isnan ones 76 | """ 77 | vec_wo_nan = [e for e in vec if not math.isnan(e)] 78 | return(sum(vec_wo_nan)) 79 | 80 | def _len_wo_nan(vec): 81 | """ 82 | Count of elements, ignoring math.isnan ones 83 | 84 | :param vec: vector of floating numbers 85 | :return: count of the elements, ignoring math.isnan ones 86 | """ 87 | vec_wo_nan = [e for e in vec if not math.isnan(e)] 88 | return(len(vec_wo_nan)) 89 | 90 | def read_gz_data(filename = 'data/machinetemp_groundtruth.gz'): 91 | """ 92 | Load a file compressed with gz, such that each line of the 93 | file is either 0 (representing a normal instance) or 1 (representing) 94 | an anomalous instance. 95 | :param filename: file path to the gz compressed file 96 | :return: list of integers with either 0 or 1 97 | """ 98 | with gzip.open(filename, 'rb') as f: 99 | content = f.read().splitlines() 100 | content = [int(x) for x in content] 101 | return(content) 102 | 103 | def read_all_as_events(): 104 | """ 105 | Load the files contained in the folder `data/` and convert 106 | to events. The length of the series is kept. 107 | The convention for the file name is: `dataset_algorithm.gz` 108 | :return: two dictionaries: 109 | - the first containing the list of events for each dataset and algorithm, 110 | - the second containing the range of the series for each dataset 111 | """ 112 | filepaths = glob.glob('data/*.gz') 113 | datasets = dict() 114 | Tranges = dict() 115 | for filepath in filepaths: 116 | vector = read_gz_data(filepath) 117 | events = convert_vector_to_events(vector) 118 | # ad hoc cut for those files 119 | cut_filepath = (os.path.split(filepath)[1]).split('_') 120 | data_name = cut_filepath[0] 121 | algo_name = (cut_filepath[1]).split('.')[0] 122 | if not data_name in datasets: 123 | datasets[data_name] = dict() 124 | Tranges[data_name] = (0, len(vector)) 125 | datasets[data_name][algo_name] = events 126 | return(datasets, Tranges) 127 | 128 | def f1_func(p, r): 129 | """ 130 | Compute the f1 function 131 | :param p: precision numeric value 132 | :param r: recall numeric value 133 | :return: f1 numeric value 134 | """ 135 | return(2*p*r/(p+r)) 136 | -------------------------------------------------------------------------------- /metrics/affiliation/metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from metrics.affiliation.generics import ( 4 | infer_Trange, 5 | has_point_anomalies, 6 | _len_wo_nan, 7 | _sum_wo_nan, 8 | read_all_as_events) 9 | from metrics.affiliation._affiliation_zone import ( 10 | get_all_E_gt_func, 11 | affiliation_partition) 12 | from metrics.affiliation._single_ground_truth_event import ( 13 | affiliation_precision_distance, 14 | affiliation_recall_distance, 15 | affiliation_precision_proba, 16 | affiliation_recall_proba) 17 | 18 | def test_events(events): 19 | """ 20 | Verify the validity of the input events 21 | :param events: list of events, each represented by a couple (start, stop) 22 | :return: None. 
Raise an error for incorrect formed or non ordered events 23 | """ 24 | if type(events) is not list: 25 | raise TypeError('Input `events` should be a list of couples') 26 | if not all([type(x) is tuple for x in events]): 27 | raise TypeError('Input `events` should be a list of tuples') 28 | if not all([len(x) == 2 for x in events]): 29 | raise ValueError('Input `events` should be a list of couples (start, stop)') 30 | if not all([x[0] <= x[1] for x in events]): 31 | raise ValueError('Input `events` should be a list of couples (start, stop) with start <= stop') 32 | if not all([events[i][1] < events[i+1][0] for i in range(len(events) - 1)]): 33 | raise ValueError('Couples of input `events` should be disjoint and ordered') 34 | 35 | def pr_from_events(events_pred, events_gt, Trange): 36 | """ 37 | Compute the affiliation metrics including the precision/recall in [0,1], 38 | along with the individual precision/recall distances and probabilities 39 | 40 | :param events_pred: list of predicted events, each represented by a couple 41 | indicating the start and the stop of the event 42 | :param events_gt: list of ground truth events, each represented by a couple 43 | indicating the start and the stop of the event 44 | :param Trange: range of the series where events_pred and events_gt are included, 45 | represented as a couple (start, stop) 46 | :return: dictionary with precision, recall, and the individual metrics 47 | """ 48 | # testing the inputs 49 | test_events(events_pred) 50 | test_events(events_gt) 51 | 52 | # other tests 53 | minimal_Trange = infer_Trange(events_pred, events_gt) 54 | if not Trange[0] <= minimal_Trange[0]: 55 | raise ValueError('`Trange` should include all the events') 56 | if not minimal_Trange[1] <= Trange[1]: 57 | raise ValueError('`Trange` should include all the events') 58 | 59 | if len(events_gt) == 0: 60 | raise ValueError('Input `events_gt` should have at least one event') 61 | 62 | if has_point_anomalies(events_pred) or has_point_anomalies(events_gt): 63 | raise ValueError('Cannot manage point anomalies currently') 64 | 65 | if Trange is None: 66 | # Set as default, but Trange should be indicated if probabilities are used 67 | raise ValueError('Trange should be indicated (or inferred with the `infer_Trange` function') 68 | 69 | E_gt = get_all_E_gt_func(events_gt, Trange) 70 | aff_partition = affiliation_partition(events_pred, E_gt) 71 | 72 | # Computing precision distance 73 | d_precision = [affiliation_precision_distance(Is, J) for Is, J in zip(aff_partition, events_gt)] 74 | 75 | # Computing recall distance 76 | d_recall = [affiliation_recall_distance(Is, J) for Is, J in zip(aff_partition, events_gt)] 77 | 78 | # Computing precision 79 | p_precision = [affiliation_precision_proba(Is, J, E) for Is, J, E in zip(aff_partition, events_gt, E_gt)] 80 | 81 | # Computing recall 82 | p_recall = [affiliation_recall_proba(Is, J, E) for Is, J, E in zip(aff_partition, events_gt, E_gt)] 83 | 84 | if _len_wo_nan(p_precision) > 0: 85 | p_precision_average = _sum_wo_nan(p_precision) / _len_wo_nan(p_precision) 86 | else: 87 | p_precision_average = p_precision[0] # math.nan 88 | p_recall_average = sum(p_recall) / len(p_recall) 89 | 90 | dict_out = dict({'precision': p_precision_average, 91 | 'recall': p_recall_average, 92 | 'individual_precision_probabilities': p_precision, 93 | 'individual_recall_probabilities': p_recall, 94 | 'individual_precision_distances': d_precision, 95 | 'individual_recall_distances': d_recall}) 96 | return(dict_out) 97 | 98 | def produce_all_results(): 99 
| """ 100 | Produce the affiliation precision/recall for all files 101 | contained in the `data` repository 102 | :return: a dictionary indexed by data names, each containing a dictionary 103 | indexed by algorithm names, each containing the results of the affiliation 104 | metrics (precision, recall, individual probabilities and distances) 105 | """ 106 | datasets, Tranges = read_all_as_events() # read all the events in folder `data` 107 | results = dict() 108 | for data_name in datasets.keys(): 109 | results_data = dict() 110 | for algo_name in datasets[data_name].keys(): 111 | if algo_name != 'groundtruth': 112 | results_data[algo_name] = pr_from_events(datasets[data_name][algo_name], 113 | datasets[data_name]['groundtruth'], 114 | Tranges[data_name]) 115 | results[data_name] = results_data 116 | return(results) 117 | -------------------------------------------------------------------------------- /metrics/combine_all_scores.py: -------------------------------------------------------------------------------- 1 | from f1_score_f1_pa import * 2 | from fc_score import * 3 | from precision_at_k import * 4 | from customizable_f1_score import * 5 | from AUC import * 6 | from Matthews_correlation_coefficient import * 7 | from affiliation.generics import convert_vector_to_events 8 | from affiliation.metrics import pr_from_events 9 | from vus.models.feature import Window 10 | from vus.metrics import get_range_vus_roc 11 | 12 | 13 | 14 | def combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores): 15 | events_pred = convert_vector_to_events(y_test) # [(4, 5), (8, 9)] 16 | events_gt = convert_vector_to_events(pred_labels) # [(3, 4), (7, 10)] 17 | Trange = (0, len(y_test)) 18 | affiliation = pr_from_events(events_pred, events_gt, Trange) 19 | true_events = get_events(y_test) 20 | _, _, _, f1_score_ori, f05_score_ori = get_accuracy_precision_recall_fscore(y_test, pred_labels) 21 | f1_score_pa = get_point_adjust_scores(y_test, pred_labels, true_events)[5] 22 | pa_accuracy, pa_precision, pa_recall, pa_f_score = get_adjust_F1PA(y_test, pred_labels) 23 | range_f_score = customizable_f1_score(y_test, pred_labels) 24 | _, _, f1_score_c = get_composite_fscore_raw(y_test, pred_labels, true_events, return_prec_rec=True) 25 | precision_k = precision_at_k(y_test, anomaly_scores, pred_labels) 26 | point_auc = point_wise_AUC(pred_labels, y_test) 27 | range_auc = Range_AUC(pred_labels, y_test) 28 | MCC_score = MCC(y_test, pred_labels) 29 | results = get_range_vus_roc(y_test, pred_labels, 100) # slidingWindow = 100 default 30 | 31 | 32 | score_list = {"f1_score_ori": f1_score_ori, 33 | "f05_score_ori" : f05_score_ori, 34 | "f1_score_pa": f1_score_pa, 35 | "pa_accuracy":pa_accuracy, 36 | "pa_precision":pa_precision, 37 | "pa_recall":pa_recall, 38 | "pa_f_score":pa_f_score, 39 | "range_f_score": range_f_score, 40 | "f1_score_c": f1_score_c, 41 | "precision_k": precision_k, 42 | "point_auc": point_auc, 43 | "range_auc": range_auc, 44 | "MCC_score":MCC_score, 45 | "Affiliation precision": affiliation['precision'], 46 | "Affiliation recall": affiliation['recall'], 47 | "R_AUC_ROC": results["R_AUC_ROC"], 48 | "R_AUC_PR": results["R_AUC_PR"], 49 | "VUS_ROC": results["VUS_ROC"], 50 | "VUS_PR": results["VUS_PR"]} 51 | 52 | return score_list 53 | 54 | 55 | def main(): 56 | y_test = np.zeros(100) 57 | y_test[10:20] = 1 58 | y_test[50:60] = 1 59 | pred_labels = np.zeros(100) 60 | pred_labels[15:17] = 1 61 | pred_labels[55:62] = 1 62 | anomaly_scores = np.zeros(100) 63 | anomaly_scores[15:17] = 0.7 64 | 
anomaly_scores[55:62] = 0.6 65 | pred_labels[51:55] = 1 66 | true_events = get_events(y_test) 67 | scores = combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores) 68 | # scores = test(y_test, pred_labels) 69 | for key,value in scores.items(): 70 | print(key,' : ',value) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() -------------------------------------------------------------------------------- /metrics/customizable_f1_score.py: -------------------------------------------------------------------------------- 1 | # used by paper: Exathlon: A Benchmark for Explainable Anomaly Detection over Time Series_VLDB 2021 2 | # github: https://github.com/exathlonbenchmark/exathlon 3 | import numpy as np 4 | from metrics.evaluate_utils import range_convers_new 5 | 6 | # the existence reward on the bias 7 | def b(bias, i, length): 8 | if bias == 'flat': 9 | return 1 10 | elif bias == 'front-end bias': 11 | return length - i + 1 12 | elif bias == 'back-end bias': 13 | return i 14 | else: 15 | if i <= length / 2: 16 | return i 17 | else: 18 | return length - i + 1 19 | 20 | 21 | def w(AnomalyRange, p): 22 | MyValue = 0 23 | MaxValue = 0 24 | start = AnomalyRange[0] 25 | AnomalyLength = AnomalyRange[1] - AnomalyRange[0] + 1 26 | # flat/'front-end bias'/'back-end bias' 27 | bias = 'flat' 28 | for i in range(start, start + AnomalyLength): 29 | bi = b(bias, i, AnomalyLength) 30 | MaxValue += bi 31 | if i in p: 32 | MyValue += bi 33 | return MyValue / MaxValue 34 | 35 | 36 | def Cardinality_factor(Anomolyrange, Prange): 37 | score = 0 38 | start = Anomolyrange[0] 39 | end = Anomolyrange[1] 40 | for i in Prange: 41 | if start <= i[0] <= end: 42 | score += 1 43 | elif i[0] <= start <= i[1]: 44 | score += 1 45 | elif i[0] <= end <= i[1]: 46 | score += 1 47 | elif start >= i[0] and end <= i[1]: 48 | score += 1 49 | if score == 0: 50 | return 0 51 | else: 52 | return 1 / score 53 | 54 | 55 | def existence_reward(labels, preds): 56 | ''' 57 | labels: list of ordered pair 58 | preds predicted data 59 | ''' 60 | 61 | score = 0 62 | for i in labels: 63 | if np.sum(np.multiply(preds <= i[1], preds >= i[0])) > 0: 64 | score += 1 65 | return score 66 | 67 | 68 | def range_recall_new(labels, preds, alpha): 69 | p = np.where(preds == 1)[0] # positions of predicted label==1 70 | range_pred = range_convers_new(preds) 71 | range_label = range_convers_new(labels) 72 | 73 | Nr = len(range_label) # total # of real anomaly segments 74 | 75 | ExistenceReward = existence_reward(range_label, p) 76 | 77 | OverlapReward = 0 78 | for i in range_label: 79 | OverlapReward += w(i, p) * Cardinality_factor(i, range_pred) 80 | 81 | score = alpha * ExistenceReward + (1 - alpha) * OverlapReward 82 | if Nr != 0: 83 | return score / Nr, ExistenceReward / Nr, OverlapReward / Nr 84 | else: 85 | return 0, 0, 0 86 | 87 | 88 | def customizable_f1_score(y_test, pred_labels, alpha=0.2): 89 | label = y_test 90 | preds = pred_labels 91 | Rrecall, ExistenceReward, OverlapReward = range_recall_new(label, preds, alpha) 92 | Rprecision = range_recall_new(preds, label, 0)[0] 93 | 94 | if Rprecision + Rrecall == 0: 95 | Rf = 0 96 | else: 97 | Rf = 2 * Rrecall * Rprecision / (Rprecision + Rrecall) 98 | return Rf 99 | 100 | 101 | def main(): 102 | y_test = np.zeros(100) 103 | y_test[10:20] = 1 104 | y_test[50:60] = 1 105 | pred_labels = np.zeros(100) 106 | pred_labels[15:19] = 1 107 | pred_labels[55:62] = 1 108 | # pred_labels[51:55] = 1 109 | # true_events = get_events(y_test) 110 | Rf = customizable_f1_score(y_test, pred_labels) 111 | 
print("Rf: {}".format(Rf)) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() -------------------------------------------------------------------------------- /metrics/evaluate_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from statsmodels.tsa.stattools import acf 3 | from scipy.signal import argrelextrema 4 | 5 | 6 | def get_composite_fscore_from_scores(score_t_test, thres, true_events, prec_t, return_prec_rec=False): 7 | pred_labels = score_t_test > thres 8 | tp = np.sum([pred_labels[start:end + 1].any() for start, end in true_events.values()]) 9 | fn = len(true_events) - tp 10 | rec_e = tp / (tp + fn) 11 | fscore_c = 2 * rec_e * prec_t / (rec_e + prec_t) 12 | if prec_t == 0 and rec_e == 0: 13 | fscore_c = 0 14 | if return_prec_rec: 15 | return prec_t, rec_e, fscore_c 16 | return fscore_c 17 | 18 | 19 | class NptConfig: 20 | def __init__(self, config_dict): 21 | for k, v in config_dict.items(): 22 | setattr(self, k, v) 23 | 24 | def find_length(data): 25 | if len(data.shape) > 1: 26 | return 0 27 | data = data[:min(20000, len(data))] 28 | 29 | base = 3 30 | auto_corr = acf(data, nlags=400, fft=True)[base:] 31 | 32 | local_max = argrelextrema(auto_corr, np.greater)[0] 33 | try: 34 | max_local_max = np.argmax([auto_corr[lcm] for lcm in local_max]) 35 | if local_max[max_local_max] < 3 or local_max[max_local_max] > 300: 36 | return 125 37 | return local_max[max_local_max] + base 38 | except: 39 | return 125 40 | 41 | 42 | def range_convers_new(label): 43 | ''' 44 | input: arrays of binary values 45 | output: list of ordered pair [[a0,b0], [a1,b1]... ] of the inputs 46 | ''' 47 | L = [] 48 | i = 0 49 | j = 0 50 | while j < len(label): 51 | while label[i] == 0: 52 | i += 1 53 | if i >= len(label): 54 | break 55 | j = i + 1 56 | if j >= len(label): 57 | if j == len(label): 58 | L.append((i, j - 1)) 59 | break 60 | while label[j] != 0: 61 | j += 1 62 | if j >= len(label): 63 | L.append((i, j - 1)) 64 | break 65 | if j >= len(label): 66 | break 67 | L.append((i, j - 1)) 68 | i = j 69 | return L -------------------------------------------------------------------------------- /metrics/f1_score_f1_pa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, roc_auc_score, precision_score, recall_score, \ 3 | accuracy_score, fbeta_score, average_precision_score 4 | 5 | 6 | # function: calculate the point-adjust f-scores(whether top k) 7 | def get_point_adjust_scores(y_test, pred_labels, true_events, thereshold_k=0, whether_top_k=False): 8 | tp = 0 9 | fn = 0 10 | for true_event in true_events.keys(): 11 | true_start, true_end = true_events[true_event] 12 | if whether_top_k is False: 13 | if pred_labels[true_start:true_end].sum() > 0: 14 | tp += (true_end - true_start) 15 | else: 16 | fn += (true_end - true_start) 17 | else: 18 | if pred_labels[true_start:true_end].sum() > thereshold_k: 19 | tp += (true_end - true_start) 20 | else: 21 | fn += (true_end - true_start) 22 | fp = np.sum(pred_labels) - np.sum(pred_labels * y_test) 23 | 24 | prec, rec, fscore = get_prec_rec_fscore(tp, fp, fn) 25 | return fp, fn, tp, prec, rec, fscore 26 | 27 | def get_adjust_F1PA(pred, gt): 28 | anomaly_state = False 29 | for i in range(len(gt)): 30 | if gt[i] == 1 and pred[i] == 1 and not anomaly_state: 31 | anomaly_state = True 32 | for j in range(i, 0, -1): 33 | if gt[j] == 0: 34 | break 35 | else: 36 | if pred[j] == 0: 37 
| pred[j] = 1 38 | for j in range(i, len(gt)): 39 | if gt[j] == 0: 40 | break 41 | else: 42 | if pred[j] == 0: 43 | pred[j] = 1 44 | elif gt[i] == 0: 45 | anomaly_state = False 46 | if anomaly_state: 47 | pred[i] = 1 48 | 49 | from sklearn.metrics import precision_recall_fscore_support 50 | from sklearn.metrics import accuracy_score 51 | 52 | accuracy = accuracy_score(gt, pred) 53 | precision, recall, f_score, support = precision_recall_fscore_support(gt, pred, 54 | average='binary') 55 | return accuracy, precision, recall, f_score 56 | 57 | 58 | # calculate the point-adjusted f-score 59 | def get_prec_rec_fscore(tp, fp, fn): 60 | if tp == 0: 61 | precision = 0 62 | recall = 0 63 | else: 64 | precision = tp / (tp + fp) 65 | recall = tp / (tp + fn) 66 | fscore = get_f_score(precision, recall) 67 | return precision, recall, fscore 68 | 69 | 70 | def get_f_score(prec, rec): 71 | if prec == 0 and rec == 0: 72 | f_score = 0 73 | else: 74 | f_score = 2 * (prec * rec) / (prec + rec) 75 | return f_score 76 | 77 | 78 | # function: calculate the normal edition f-scores 79 | def get_accuracy_precision_recall_fscore(y_true: list, y_pred: list): 80 | accuracy = accuracy_score(y_true, y_pred) 81 | # warn_for=() avoids log warnings for any result being zero 82 | # precision, recall, f_score, _ = prf(y_true, y_pred, average='binary', warn_for=()) 83 | precision = precision_score(y_true, y_pred) 84 | recall = recall_score(y_true, y_pred) 85 | f_score = (2 * precision * recall) / (precision + recall) 86 | if precision == 0 and recall == 0: 87 | f05_score = 0 88 | else: 89 | f05_score = fbeta_score(y_true, y_pred, average='binary', beta=0.5) 90 | return accuracy, precision, recall, f_score, f05_score 91 | 92 | 93 | -------------------------------------------------------------------------------- /metrics/f1_series.py: -------------------------------------------------------------------------------- 1 | from fc_score import * 2 | from f1_score_f1_pa import * 3 | from evaluate_utils import * 4 | 5 | default_thres_config = {"top_k_time": {}, 6 | "best_f1_test": {"exact_pt_adj": True}, 7 | "thresholded_score": {}, 8 | "tail_prob": {"tail_prob": 2}, 9 | "tail_prob_1": {"tail_prob": 1}, 10 | "tail_prob_2": {"tail_prob": 2}, 11 | "tail_prob_3": {"tail_prob": 3}, 12 | "tail_prob_4": {"tail_prob": 4}, 13 | "tail_prob_5": {"tail_prob": 5}, 14 | "dyn_gauss": {"long_window": 10000, "short_window": 1, "kernel_sigma": 10}, 15 | "nasa_npt": {"batch_size": 70, "window_size": 30, "telem_only": True, 16 | "smoothing_perc": 0.005, "l_s": 250, "error_buffer": 5, "p": 0.05}} 17 | 18 | 19 | def threshold_and_predict(score_t_test, y_test, true_events, logger, test_anom_frac, thres_method="top_k_time", 20 | point_adjust=False, score_t_train=None, thres_config_dict=dict(), return_auc=False, 21 | composite_best_f1=False): 22 | if thres_method in thres_config_dict.keys(): 23 | config = thres_config_dict[thres_method] 24 | else: 25 | config = default_thres_config[thres_method] 26 | # test_anom_frac = (np.sum(y_test)) / len(y_test) 27 | auroc = None 28 | avg_prec = None 29 | if thres_method == "thresholded_score": 30 | opt_thres = 0.5 31 | if set(score_t_test) - {0, 1}: 32 | logger.error("Score_t_test isn't binary. 
Predicting all as non-anomalous") 33 | pred_labels = np.zeros(len(score_t_test)) 34 | else: 35 | pred_labels = score_t_test 36 | 37 | elif thres_method == "best_f1_test" and point_adjust: 38 | prec, rec, thresholds = precision_recall_curve(y_test, score_t_test, pos_label=1) 39 | if not config["exact_pt_adj"]: 40 | fscore_best_time = [get_f_score(precision, recall) for precision, recall in zip(prec, rec)] 41 | opt_num = np.squeeze(np.argmax(fscore_best_time)) 42 | opt_thres = thresholds[opt_num] 43 | thresholds = np.random.choice(thresholds, size=5000) + [opt_thres] 44 | fscores = [] 45 | for thres in thresholds: 46 | _, _, _, _, _, fscore = get_point_adjust_scores(y_test, score_t_test > thres, true_events) 47 | fscores.append(fscore) 48 | opt_thres = thresholds[np.argmax(fscores)] 49 | pred_labels = score_t_test > opt_thres 50 | 51 | elif thres_method == "best_f1_test" and composite_best_f1: 52 | prec, rec, thresholds = precision_recall_curve(y_test, score_t_test, pos_label=1) 53 | precs_t = prec 54 | fscores_c = [get_composite_fscore_from_scores(score_t_test, thres, true_events, prec_t) for thres, prec_t in 55 | zip(thresholds, precs_t)] 56 | try: 57 | opt_thres = thresholds[np.nanargmax(fscores_c)] 58 | except: 59 | opt_thres = 0.0 60 | pred_labels = score_t_test > opt_thres 61 | 62 | elif thres_method == "top_k_time": 63 | opt_thres = np.nanpercentile(score_t_test, 100 * (1 - test_anom_frac), interpolation='higher') 64 | pred_labels = np.where(score_t_test > opt_thres, 1, 0) 65 | 66 | elif thres_method == "best_f1_test": 67 | prec, rec, thres = precision_recall_curve(y_test, score_t_test, pos_label=1) 68 | fscore = [get_f_score(precision, recall) for precision, recall in zip(prec, rec)] 69 | opt_num = np.squeeze(np.argmax(fscore)) 70 | opt_thres = thres[opt_num] 71 | pred_labels = np.where(score_t_test > opt_thres, 1, 0) 72 | 73 | elif "tail_prob" in thres_method: 74 | tail_neg_log_prob = config["tail_prob"] 75 | opt_thres = tail_neg_log_prob 76 | pred_labels = np.where(score_t_test > opt_thres, 1, 0) 77 | 78 | elif thres_method == "nasa_npt": 79 | opt_thres = 0.5 80 | pred_labels = get_npt_labels(score_t_test, y_test, config) 81 | else: 82 | logger.error("Thresholding method {} not in [top_k_time, best_f1_test, tail_prob]".format(thres_method)) 83 | return None, None 84 | if return_auc: 85 | avg_prec = average_precision_score(y_test, score_t_test) 86 | auroc = roc_auc_score(y_test, score_t_test) 87 | return opt_thres, pred_labels, avg_prec, auroc 88 | return opt_thres, pred_labels 89 | 90 | 91 | # most-top funcion 92 | def evaluate_predicted_labels(pred_labels, y_test, true_events, logger, eval_method="time-wise", breaks=[], 93 | point_adjust=False): 94 | """ 95 | Computes evaluation metrics for the binary classifications given the true and predicted labels 96 | :param point_adjust: used to judge whether is pa 97 | :param pred_labels: array of predicted labels 98 | :param y_test: array of true labels 99 | :param eval_method: string that indicates whether we evaluate the classification time point-wise or event-wise 100 | :param breaks: array of discontinuities in the time series, relevant only if you look at event-wise 101 | :param return_raw: Boolean that indicates whether we want to return tp, fp and fn or prec, recall and f1 102 | :return: tuple of evaluation metrics 103 | """ 104 | 105 | if eval_method == "time-wise": 106 | # point-adjust fscore 107 | if point_adjust: 108 | fp, fn, tp, prec, rec, fscore = get_point_adjust_scores(y_test, pred_labels, true_events) 109 | # normal 
fscore 110 | else: 111 | _, prec, rec, fscore, _ = get_accuracy_precision_recall_fscore(y_test, pred_labels) 112 | tp = np.sum(pred_labels * y_test) 113 | fp = np.sum(pred_labels) - tp 114 | fn = np.sum(y_test) - tp 115 | # event-wise 116 | else: 117 | logger.error("Evaluation method {} not in [time-wise, event-wise]".format(eval_method)) 118 | return 0, 0, 0 119 | 120 | return tp, fp, fn, prec, rec, fscore 121 | -------------------------------------------------------------------------------- /metrics/fc_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import precision_score 3 | 4 | 5 | def get_events(y_test, outlier=1, normal=0): 6 | events = dict() 7 | label_prev = normal 8 | event = 0 # corresponds to no event 9 | event_start = 0 10 | for tim, label in enumerate(y_test): 11 | if label == outlier: 12 | if label_prev == normal: 13 | event += 1 14 | event_start = tim 15 | else: 16 | if label_prev == outlier: 17 | event_end = tim - 1 18 | events[event] = (event_start, event_end) 19 | label_prev = label 20 | 21 | if label_prev == outlier: 22 | event_end = tim - 1 23 | events[event] = (event_start, event_end) 24 | return events 25 | 26 | 27 | def get_composite_fscore_raw(y_test, pred_labels, true_events, return_prec_rec=False): 28 | tp = np.sum([pred_labels[start:end + 1].any() for start, end in true_events.values()]) 29 | fn = len(true_events) - tp 30 | rec_e = tp / (tp + fn) 31 | prec_t = precision_score(y_test, pred_labels) 32 | fscore_c = 2 * rec_e * prec_t / (rec_e + prec_t) 33 | if prec_t == 0 and rec_e == 0: 34 | fscore_c = 0 35 | if return_prec_rec: 36 | return prec_t, rec_e, fscore_c 37 | return fscore_c 38 | 39 | 40 | def main(): 41 | y_test = np.zeros(100) 42 | y_test[10:20] = 1 43 | y_test[50:60] = 1 44 | pred_labels = np.zeros(100) 45 | pred_labels[15:17] = 1 46 | pred_labels[55:62] = 1 47 | # pred_labels[51:55] = 1 48 | # true_events = get_events(y_test) 49 | prec_t, rec_e, fscore_c = get_composite_fscore_raw(pred_labels, y_test, return_prec_rec=True) 50 | # print("Prec_t: {}, rec_e: {}, fscore_c: {}".format(prec_t, rec_e, fscore_c)) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /metrics/metrics.py: -------------------------------------------------------------------------------- 1 | from metrics.f1_score_f1_pa import * 2 | from metrics.fc_score import * 3 | from metrics.precision_at_k import * 4 | from metrics.customizable_f1_score import * 5 | from metrics.AUC import * 6 | from metrics.Matthews_correlation_coefficient import * 7 | from metrics.affiliation.generics import convert_vector_to_events 8 | from metrics.affiliation.metrics import pr_from_events 9 | from metrics.vus.models.feature import Window 10 | from metrics.vus.metrics import get_range_vus_roc 11 | import numpy as np 12 | 13 | def combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores): 14 | events_pred = convert_vector_to_events(y_test) 15 | events_gt = convert_vector_to_events(pred_labels) 16 | Trange = (0, len(y_test)) 17 | affiliation = pr_from_events(events_pred, events_gt, Trange) 18 | true_events = get_events(y_test) 19 | pa_accuracy, pa_precision, pa_recall, pa_f_score = get_adjust_F1PA(y_test, pred_labels) 20 | MCC_score = MCC(y_test, pred_labels) 21 | vus_results = get_range_vus_roc(y_test, pred_labels, 100) # default slidingWindow = 100 22 | 23 | score_list_simple = { 24 | "pa_accuracy":pa_accuracy, 25 | 
"pa_precision":pa_precision, 26 | "pa_recall":pa_recall, 27 | "pa_f_score":pa_f_score, 28 | "MCC_score":MCC_score, 29 | "Affiliation precision": affiliation['precision'], 30 | "Affiliation recall": affiliation['recall'], 31 | "R_AUC_ROC": vus_results["R_AUC_ROC"], 32 | "R_AUC_PR": vus_results["R_AUC_PR"], 33 | "VUS_ROC": vus_results["VUS_ROC"], 34 | "VUS_PR": vus_results["VUS_PR"] 35 | } 36 | 37 | # return score_list, score_list_simple 38 | return score_list_simple 39 | 40 | 41 | if __name__ == '__main__': 42 | y_test = np.load("data/events_pred_MSL.npy")+0 43 | pred_labels = np.load("data/events_gt_MSL.npy")+0 44 | anomaly_scores = np.load("data/events_scores_MSL.npy") 45 | print(len(y_test), max(anomaly_scores), min(anomaly_scores)) 46 | score_list_simple = combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores) 47 | 48 | for key, value in score_list_simple.items(): 49 | print('{0:21} :{1:10f}'.format(key, value)) -------------------------------------------------------------------------------- /metrics/precision_at_k.py: -------------------------------------------------------------------------------- 1 | # k is defined as the number of anomalies 2 | # only calculate the range top k not the whole set 3 | import numpy as np 4 | 5 | 6 | def precision_at_k(y_test, score_t_test, pred_labels): 7 | # top-k 8 | k = int(np.sum(y_test)) 9 | threshold = np.percentile(score_t_test, 100 * (1 - k / len(y_test))) 10 | 11 | # precision_at_k = metrics.top_k_accuracy_score(label, score, k) 12 | p_at_k = np.where(pred_labels > threshold)[0] 13 | TP_at_k = sum(y_test[p_at_k]) 14 | precision_at_k = TP_at_k / k 15 | return precision_at_k 16 | -------------------------------------------------------------------------------- /metrics/vus/analysis/robustness_eval.py: -------------------------------------------------------------------------------- 1 | from random import shuffle 2 | import numpy as np 3 | import math 4 | import matplotlib.pyplot as plt 5 | from matplotlib import cm 6 | import pandas as pd 7 | from tqdm import tqdm as tqdm 8 | import time 9 | from sklearn.preprocessing import MinMaxScaler 10 | import random 11 | 12 | 13 | import os 14 | import sys 15 | module_path = os.path.abspath(os.path.join('../..')) 16 | if module_path not in sys.path: 17 | sys.path.append(module_path) 18 | 19 | from metrics.vus.utils.slidingWindows import find_length 20 | from metrics.vus.utils.metrics import metricor 21 | 22 | from metrics.vus.models.distance import Fourier 23 | from metrics.vus.models.feature import Window 24 | 25 | 26 | def generate_new_label(label,lag): 27 | if lag < 0: 28 | return np.array(list(label[-lag:]) + [0]*(-lag)) 29 | elif lag > 0: 30 | return np.array([0]*lag + list(label[:-lag])) 31 | elif lag == 0: 32 | return label 33 | 34 | def compute_anomaly_acc_lag(methods_scores,label,slidingWindow,methods_keys): 35 | 36 | lag_range = list(range(-slidingWindow//4,slidingWindow//4,5)) 37 | methods_acc = {} 38 | for i,methods_score in enumerate(tqdm(methods_keys)): 39 | dict_acc = { 40 | 'R_AUC_ROC': [], 41 | 'AUC_ROC': [], 42 | 'R_AUC_PR': [], 43 | 'AUC_PR': [], 44 | 'VUS_ROC': [], 45 | 'VUS_PR': [], 46 | 'Precision': [], 47 | 'Recall': [], 48 | 'F': [], 49 | 'ExistenceReward':[], 50 | 'OverlapReward': [], 51 | 'Precision@k': [], 52 | 'Rprecision': [], 53 | 'Rrecall': [], 54 | 'RF': []} 55 | 56 | for lag in tqdm(lag_range): 57 | new_label = generate_new_label(label,lag) 58 | 59 | grader = metricor() 60 | 61 | R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=new_label, 
score=methods_scores[methods_score], window=slidingWindow, plot_ROC=True) 62 | L, fpr, tpr= grader.metric_new(new_label, methods_scores[methods_score], plot_ROC=True) 63 | precision, recall, AP = grader.metric_PR(new_label, methods_scores[methods_score]) 64 | Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d = generate_curve(new_label,methods_scores[methods_score],2*slidingWindow) 65 | L1 = [ elem for elem in L] 66 | 67 | dict_acc['R_AUC_ROC'] +=[R_AUC] 68 | dict_acc['AUC_ROC'] +=[L1[0]] 69 | dict_acc['R_AUC_PR'] +=[R_AP] 70 | dict_acc['AUC_PR'] +=[AP] 71 | dict_acc['VUS_ROC'] +=[avg_auc_3d] 72 | dict_acc['VUS_PR'] +=[avg_ap_3d] 73 | dict_acc['Precision'] +=[L1[1]] 74 | dict_acc['Recall'] +=[L1[2]] 75 | dict_acc['F'] +=[L1[3]] 76 | dict_acc['ExistenceReward']+=[L1[5]] 77 | dict_acc['OverlapReward'] +=[L1[6]] 78 | dict_acc['Precision@k'] +=[L1[9]] 79 | dict_acc['Rprecision'] +=[L1[7]] 80 | dict_acc['Rrecall'] +=[L1[4]] 81 | dict_acc['RF'] +=[L1[8]] 82 | 83 | methods_acc[methods_score] = dict_acc 84 | return methods_acc 85 | 86 | 87 | def compute_anomaly_acc_percentage(methods_scores,label,slidingWindow,methods_keys,pos_first_anom): 88 | 89 | 90 | list_pos = [] 91 | step_a = max(0,(len(label) - pos_first_anom-200))//20 92 | step_b = max(0,pos_first_anom-200)//20 93 | pos_a = min(len(label),pos_first_anom + 200) 94 | pos_b = max(0,pos_first_anom - 200) 95 | list_pos.append((pos_b,pos_a)) 96 | for pos_iter in range(20): 97 | pos_a = min(len(label),pos_a + step_a) 98 | pos_b = max(0,pos_b - step_b) 99 | list_pos.append((pos_b,pos_a)) 100 | methods_acc = {} 101 | print(list_pos) 102 | for i,methods_score in enumerate(tqdm(methods_keys)): 103 | dict_acc = { 104 | 'R_AUC_ROC': [], 105 | 'AUC_ROC': [], 106 | 'R_AUC_PR': [], 107 | 'AUC_PR': [], 108 | 'VUS_ROC': [], 109 | 'VUS_PR': [], 110 | 'Precision': [], 111 | 'Recall': [], 112 | 'F': [], 113 | 'ExistenceReward':[], 114 | 'OverlapReward': [], 115 | 'Precision@k': [], 116 | 'Rprecision': [], 117 | 'Rrecall': [], 118 | 'RF': []} 119 | 120 | for end_pos in tqdm(list_pos): 121 | new_label = label[end_pos[0]:end_pos[1]] 122 | new_score = np.array(methods_scores[methods_score])[end_pos[0]:end_pos[1]] 123 | grader = metricor() 124 | 125 | R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=new_label, score=new_score, window=slidingWindow, plot_ROC=True) 126 | L, fpr, tpr= grader.metric_new(new_label, new_score, plot_ROC=True) 127 | precision, recall, AP = grader.metric_PR(new_label, new_score) 128 | Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d = generate_curve(new_label,new_score,2*slidingWindow) 129 | L1 = [ elem for elem in L] 130 | 131 | dict_acc['R_AUC_ROC'] +=[R_AUC] 132 | dict_acc['AUC_ROC'] +=[L1[0]] 133 | dict_acc['R_AUC_PR'] +=[R_AP] 134 | dict_acc['AUC_PR'] +=[AP] 135 | dict_acc['VUS_ROC'] +=[avg_auc_3d] 136 | dict_acc['VUS_PR'] +=[avg_ap_3d] 137 | dict_acc['Precision'] +=[L1[1]] 138 | dict_acc['Recall'] +=[L1[2]] 139 | dict_acc['F'] +=[L1[3]] 140 | dict_acc['ExistenceReward']+=[L1[5]] 141 | dict_acc['OverlapReward'] +=[L1[6]] 142 | dict_acc['Precision@k'] +=[L1[9]] 143 | dict_acc['Rprecision'] +=[L1[7]] 144 | dict_acc['Rrecall'] +=[L1[4]] 145 | dict_acc['RF'] +=[L1[8]] 146 | 147 | methods_acc[methods_score] = dict_acc 148 | return methods_acc 149 | 150 | def compute_anomaly_acc_noise(methods_scores,label,slidingWindow,methods_keys): 151 | 152 | lag_range = list(range(-slidingWindow//2,slidingWindow//2,10)) 153 | methods_acc = {} 154 | for i,methods_score in enumerate(tqdm(methods_keys)): 155 | dict_acc = { 156 | 'R_AUC_ROC': [], 157 | 
'AUC_ROC': [], 158 | 'R_AUC_PR': [], 159 | 'AUC_PR': [], 160 | 'VUS_ROC': [], 161 | 'VUS_PR': [], 162 | 'Precision': [], 163 | 'Recall': [], 164 | 'F': [], 165 | 'ExistenceReward':[], 166 | 'OverlapReward': [], 167 | 'Precision@k': [], 168 | 'Rprecision': [], 169 | 'Rrecall': [], 170 | 'RF': []} 171 | 172 | for lag in tqdm(lag_range): 173 | new_label = label 174 | 175 | grader = metricor() 176 | 177 | noise = np.random.normal(-0.1,0.1,len(methods_scores[methods_score])) 178 | 179 | new_score = np.array(methods_scores[methods_score]) + noise 180 | new_score = (new_score - min(new_score))/(max(new_score) - min(new_score)) 181 | 182 | R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=new_label, score=new_score, window=slidingWindow, plot_ROC=True) 183 | L, fpr, tpr= grader.metric_new(new_label, new_score, plot_ROC=True) 184 | precision, recall, AP = grader.metric_PR(new_label, new_score) 185 | Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d = generate_curve(new_label,new_score,2*slidingWindow) 186 | L1 = [ elem for elem in L] 187 | 188 | dict_acc['R_AUC_ROC'] +=[R_AUC] 189 | dict_acc['AUC_ROC'] +=[L1[0]] 190 | dict_acc['R_AUC_PR'] +=[R_AP] 191 | dict_acc['AUC_PR'] +=[AP] 192 | dict_acc['VUS_ROC'] +=[avg_auc_3d] 193 | dict_acc['VUS_PR'] +=[avg_ap_3d] 194 | dict_acc['Precision'] +=[L1[1]] 195 | dict_acc['Recall'] +=[L1[2]] 196 | dict_acc['F'] +=[L1[3]] 197 | dict_acc['ExistenceReward']+=[L1[5]] 198 | dict_acc['OverlapReward'] +=[L1[6]] 199 | dict_acc['Precision@k'] +=[L1[9]] 200 | dict_acc['Rprecision'] +=[L1[7]] 201 | dict_acc['Rrecall'] +=[L1[4]] 202 | dict_acc['RF'] +=[L1[8]] 203 | 204 | methods_acc[methods_score] = dict_acc 205 | return methods_acc 206 | 207 | 208 | def compute_anomaly_acc_pairwise(methods_scores,label,slidingWindow,method1,method2): 209 | 210 | lag_range = list(range(-slidingWindow//4,slidingWindow//4,5)) 211 | methods_acc = {} 212 | method_key = [method1] 213 | if method2 is not None: 214 | method_key = [method1,method2] 215 | for i,methods_score in enumerate(tqdm(method_key)): 216 | dict_acc = { 217 | 'R_AUC_ROC': [], 218 | 'AUC_ROC': [], 219 | 'R_AUC_PR': [], 220 | 'AUC_PR': [], 221 | 'VUS_ROC': [], 222 | 'VUS_PR': [], 223 | 'Precision': [], 224 | 'Recall': [], 225 | 'F': [], 226 | 'ExistenceReward':[], 227 | 'OverlapReward': [], 228 | 'Precision@k': [], 229 | 'Rprecision': [], 230 | 'Rrecall': [], 231 | 'RF': []} 232 | 233 | for lag in tqdm(range(60)): 234 | new_lag = random.randint(-slidingWindow//4,slidingWindow//4) 235 | new_label = generate_new_label(label,new_lag) 236 | 237 | noise = np.random.normal(-0.1,0.1,len(methods_scores[methods_score])) 238 | new_score = np.array(methods_scores[methods_score]) + noise 239 | new_score = (new_score - min(new_score))/(max(new_score) - min(new_score)) 240 | 241 | grader = metricor() 242 | 243 | R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=new_label, score=new_score, window=slidingWindow, plot_ROC=True) 244 | L, fpr, tpr= grader.metric_new(new_label, new_score, plot_ROC=True) 245 | precision, recall, AP = grader.metric_PR(new_label, new_score) 246 | #range_anomaly = grader.range_convers_new(new_label) 247 | Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d = generate_curve(new_label,new_score,2*slidingWindow) 248 | L1 = [ elem for elem in L] 249 | 250 | dict_acc['R_AUC_ROC'] +=[R_AUC] 251 | dict_acc['AUC_ROC'] +=[L1[0]] 252 | dict_acc['R_AUC_PR'] +=[R_AP] 253 | dict_acc['AUC_PR'] +=[AP] 254 | dict_acc['VUS_ROC'] +=[avg_auc_3d] 255 | dict_acc['VUS_PR'] +=[avg_ap_3d] 256 | dict_acc['Precision'] +=[L1[1]] 257 | 
dict_acc['Recall'] +=[L1[2]] 258 | dict_acc['F'] +=[L1[3]] 259 | dict_acc['ExistenceReward']+=[L1[5]] 260 | dict_acc['OverlapReward'] +=[L1[6]] 261 | dict_acc['Precision@k'] +=[L1[9]] 262 | dict_acc['Rprecision'] +=[L1[7]] 263 | dict_acc['Rrecall'] +=[L1[4]] 264 | dict_acc['RF'] +=[L1[8]] 265 | 266 | methods_acc[methods_score] = dict_acc 267 | return methods_acc 268 | 269 | 270 | def normalize_dict_exp(methods_acc_lag,methods_keys): 271 | key_metrics = [ 272 | 'VUS_ROC', 273 | 'VUS_PR', 274 | 'R_AUC_ROC', 275 | 'R_AUC_PR', 276 | 'AUC_ROC', 277 | 'AUC_PR', 278 | 'Rprecision', 279 | 'Rrecall', 280 | 'RF', 281 | 'Precision', 282 | 'Recall', 283 | 'F', 284 | 'Precision@k' 285 | ][::-1] 286 | 287 | norm_methods_acc_lag = {} 288 | for key in methods_keys: 289 | norm_methods_acc_lag[key] = {} 290 | for key_metric in key_metrics: 291 | ts = methods_acc_lag[key][key_metric] 292 | new_ts = list(np.array(ts) - np.mean(ts)) 293 | norm_methods_acc_lag[key][key_metric] = new_ts 294 | return norm_methods_acc_lag 295 | 296 | def group_dict(methods_acc_lag,methods_keys): 297 | key_metrics = [ 298 | 'VUS_ROC', 299 | 'VUS_PR', 300 | 'R_AUC_ROC', 301 | 'R_AUC_PR', 302 | 'AUC_ROC', 303 | 'AUC_PR', 304 | 'Rprecision', 305 | 'Rrecall', 306 | 'RF', 307 | 'Precision', 308 | 'Recall', 309 | 'F', 310 | 'Precision@k' 311 | ][::-1] 312 | 313 | norm_methods_acc_lag = {key:[] for key in key_metrics} 314 | for key in methods_keys: 315 | for key_metric in key_metrics: 316 | ts = list(methods_acc_lag[key][key_metric]) 317 | new_ts = list(np.array(ts) - np.mean(ts)) 318 | norm_methods_acc_lag[key_metric] += new_ts 319 | return norm_methods_acc_lag 320 | 321 | 322 | def generate_curve(label,score,slidingWindow): 323 | tpr_3d, fpr_3d, prec_3d, window_3d, avg_auc_3d, avg_ap_3d = metricor().RangeAUC_volume(labels_original=label, score=score, windowSize=1*slidingWindow) 324 | 325 | X = np.array(tpr_3d).reshape(1,-1).ravel() 326 | X_ap = np.array(tpr_3d)[:,:-1].reshape(1,-1).ravel() 327 | Y = np.array(fpr_3d).reshape(1,-1).ravel() 328 | W = np.array(prec_3d).reshape(1,-1).ravel() 329 | Z = np.repeat(window_3d, len(tpr_3d[0])) 330 | Z_ap = np.repeat(window_3d, len(tpr_3d[0])-1) 331 | 332 | return Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d 333 | 334 | def box_plot(data, edge_color, fill_color): 335 | bp = ax.boxplot(data, patch_artist=True) 336 | 337 | for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']: 338 | plt.setp(bp[element], color=edge_color) 339 | 340 | for patch in bp['boxes']: 341 | patch.set(facecolor=fill_color) 342 | 343 | return bp 344 | -------------------------------------------------------------------------------- /metrics/vus/analysis/score_computation.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import math 4 | import matplotlib.pyplot as plt 5 | from matplotlib import cm 6 | import pandas as pd 7 | from tqdm import tqdm as tqdm 8 | import time 9 | from sklearn.preprocessing import MinMaxScaler 10 | import random 11 | 12 | 13 | import os 14 | import sys 15 | module_path = os.path.abspath(os.path.join('../..')) 16 | if module_path not in sys.path: 17 | sys.path.append(module_path) 18 | 19 | from metrics.vus.utils.slidingWindows import find_length 20 | from metrics.vus.utils.metrics import metricor 21 | 22 | from metrics.vus.models.distance import Fourier 23 | from metrics.vus.models.feature import Window 24 | from metrics.vus.models.cnn import cnn 25 | from metrics.vus.models.AE_mlp2 import AE_MLP2 26 | from 
metrics.vus.models.lstm import lstm 27 | from metrics.vus.models.ocsvm import OCSVM 28 | from metrics.vus.models.poly import POLY 29 | from metrics.vus.models.pca import PCA 30 | from metrics.vus.models.norma import NORMA 31 | from metrics.vus.models.matrix_profile import MatrixProfile 32 | from metrics.vus.models.lof import LOF 33 | from metrics.vus.models.iforest import IForest 34 | 35 | def find_section_length(label,length): 36 | best_i = None 37 | best_sum = None 38 | current_subseq = False 39 | for i in range(len(label)): 40 | changed = False 41 | if label[i] == 1: 42 | if current_subseq == False: 43 | current_subseq = True 44 | if best_i is None: 45 | changed = True 46 | best_i = i 47 | best_sum = np.sum(label[max(0,i-200):min(len(label),i+9800)]) 48 | else: 49 | if np.sum(label[max(0,i-200):min(len(label),i+9800)]) < best_sum: 50 | changed = True 51 | best_i = i 52 | best_sum = np.sum(label[max(0,i-200):min(len(label),i+9800)]) 53 | else: 54 | changed = False 55 | if changed: 56 | diff = i+9800 - len(label) 57 | 58 | pos1 = max(0,i-200 - max(0,diff)) 59 | pos2 = min(i+9800,len(label)) 60 | else: 61 | current_subseq = False 62 | if best_i is not None: 63 | return best_i-pos1,(pos1,pos2) 64 | else: 65 | return None,None 66 | 67 | def generate_data(filepath,init_pos,max_length): 68 | 69 | df = pd.read_csv(filepath, header=None).to_numpy() 70 | name = filepath.split('/')[-1] 71 | #max_length = 30000 72 | data = df[init_pos:init_pos+max_length,0].astype(float) 73 | label = df[init_pos:init_pos+max_length,1] 74 | 75 | pos_first_anom,pos = find_section_length(label,max_length) 76 | 77 | data = df[pos[0]:pos[1],0].astype(float) 78 | label = df[pos[0]:pos[1],1] 79 | 80 | slidingWindow = find_length(data) 81 | #slidingWindow = 70 82 | X_data = Window(window = slidingWindow).convert(data).to_numpy() 83 | 84 | data_train = data[:int(0.1*len(data))] 85 | data_test = data 86 | 87 | X_train = Window(window = slidingWindow).convert(data_train).to_numpy() 88 | X_test = Window(window = slidingWindow).convert(data_test).to_numpy() 89 | 90 | return pos_first_anom,slidingWindow,data,X_data,data_train,data_test,X_train,X_test,label 91 | 92 | def compute_score(methods,slidingWindow,data,X_data,data_train,data_test,X_train,X_test): 93 | 94 | methods_scores = {} 95 | for method in methods: 96 | start_time = time.time() 97 | if method == 'IForest': 98 | clf = IForest(n_jobs=1) 99 | x = X_data 100 | clf.fit(x) 101 | score = clf.decision_scores_ 102 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 103 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 104 | 105 | elif method == 'LOF': 106 | clf = LOF(n_neighbors=20, n_jobs=1) 107 | x = X_data 108 | clf.fit(x) 109 | score = clf.decision_scores_ 110 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 111 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 112 | 113 | elif method == 'MatrixProfile': 114 | clf = MatrixProfile(window = slidingWindow) 115 | x = data 116 | clf.fit(x) 117 | score = clf.decision_scores_ 118 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 119 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 120 | 121 | elif method == 'NormA': 122 | clf = NORMA(pattern_length = slidingWindow, nm_size=3*slidingWindow) 123 | x = data 124 | clf.fit(x) 125 | score 
= clf.decision_scores_ 126 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 127 | score = np.array([score[0]]*((slidingWindow-1)//2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 128 | 129 | elif method == 'PCA': 130 | clf = PCA() 131 | x = X_data 132 | clf.fit(x) 133 | score = clf.decision_scores_ 134 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 135 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 136 | 137 | elif method == 'POLY': 138 | clf = POLY(power=3, window = slidingWindow) 139 | x = data 140 | clf.fit(x) 141 | measure = Fourier() 142 | measure.detector = clf 143 | measure.set_param() 144 | clf.decision_function(measure=measure) 145 | score = clf.decision_scores_ 146 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 147 | 148 | elif method == 'OCSVM': 149 | X_train_ = MinMaxScaler(feature_range=(0,1)).fit_transform(X_train.T).T 150 | X_test_ = MinMaxScaler(feature_range=(0,1)).fit_transform(X_test.T).T 151 | clf = OCSVM(nu=0.05) 152 | clf.fit(X_train_, X_test_) 153 | score = clf.decision_scores_ 154 | score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2)) 155 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 156 | 157 | elif method == 'LSTM': 158 | clf = lstm(slidingwindow = slidingWindow, predict_time_steps=1, epochs = 50, patience = 5, verbose=0) 159 | clf.fit(data_train, data_test) 160 | measure = Fourier() 161 | measure.detector = clf 162 | measure.set_param() 163 | clf.decision_function(measure=measure) 164 | score = clf.decision_scores_ 165 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 166 | 167 | elif method == 'AE': 168 | clf = AE_MLP2(slidingWindow = slidingWindow, epochs=100, verbose=0) 169 | clf.fit(data_train, data_test) 170 | score = clf.decision_scores_ 171 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 172 | 173 | elif method == 'CNN': 174 | clf = cnn(slidingwindow = slidingWindow, predict_time_steps=1, epochs = 100, patience = 5, verbose=0) 175 | clf.fit(data_train, data_test) 176 | measure = Fourier() 177 | measure.detector = clf 178 | measure.set_param() 179 | clf.decision_function(measure=measure) 180 | score = clf.decision_scores_ 181 | score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel() 182 | 183 | #end_time = time.time() 184 | #time_exec = end_time - start_time 185 | #print(method,"\t time: {}".format(time_exec)) 186 | methods_scores[method] = score 187 | 188 | return methods_scores 189 | 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /metrics/vus/metrics.py: -------------------------------------------------------------------------------- 1 | from .utils.metrics import metricor 2 | from .analysis.robustness_eval import generate_curve 3 | 4 | 5 | def get_range_vus_roc(score, labels, slidingWindow): 6 | grader = metricor() 7 | R_AUC_ROC, R_AUC_PR, _, _, _ = grader.RangeAUC(labels=labels, score=score, window=slidingWindow, plot_ROC=True) 8 | _, _, _, _, _, _,VUS_ROC, VUS_PR = generate_curve(labels, score, 2*slidingWindow) 9 | metrics = {'R_AUC_ROC': R_AUC_ROC, 'R_AUC_PR': R_AUC_PR, 'VUS_ROC': VUS_ROC, 'VUS_PR': VUS_PR} 10 | 11 | return metrics 12 | 
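A minimal usage sketch for get_range_vus_roc defined just above: the toy series, the window length of 100, and the expectation that the vus utilities accept plain NumPy arrays are illustrative assumptions; the call itself mirrors the signature get_range_vus_roc(score, labels, slidingWindow) and the import path metrics.vus.metrics used elsewhere in this repository.

import numpy as np
from metrics.vus.metrics import get_range_vus_roc

# Toy data: 1000 time steps with one labelled anomaly segment.
labels = np.zeros(1000)
labels[400:430] = 1

# A continuous anomaly score that is elevated around the labelled segment.
score = np.random.rand(1000) * 0.1
score[395:435] += 0.8

# slidingWindow = 100 matches the default used in combine_all_evaluation_scores.
vus_results = get_range_vus_roc(score, labels, 100)
print(vus_results)  # dict with keys R_AUC_ROC, R_AUC_PR, VUS_ROC, VUS_PR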
-------------------------------------------------------------------------------- /metrics/vus/models/feature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Classes of feature mapping for model type B 3 | """ 4 | 5 | import numpy as np 6 | # import matplotlib.pyplot as plt 7 | # import random 8 | # from arch import arch_model 9 | import pandas as pd 10 | import math 11 | # import pmdarima as pm 12 | # from pmdarima import model_selection 13 | # import os 14 | # import dis 15 | # import statistics 16 | # from sklearn import metrics 17 | # import sklearn 18 | from tsfresh import extract_features 19 | 20 | from statsmodels.tsa.seasonal import seasonal_decompose 21 | 22 | # import itertools 23 | # import functools 24 | import warnings 25 | from builtins import range 26 | # from collections import defaultdict 27 | 28 | 29 | from numpy.linalg import LinAlgError 30 | # from scipy.signal import cwt, find_peaks_cwt, ricker, welch 31 | # from scipy.stats import linregress 32 | # from statsmodels.tools.sm_exceptions import MissingDataError 33 | 34 | with warnings.catch_warnings(): 35 | # Ignore warnings of the patsy package 36 | warnings.simplefilter("ignore", DeprecationWarning) 37 | 38 | from statsmodels.tsa.ar_model import AR 39 | # from statsmodels.tsa.stattools import acf, adfuller, pacf 40 | 41 | from hurst import compute_Hc 42 | 43 | class Window: 44 | """ The class for rolling window feature mapping. 45 | The mapping converts the original timeseries X into a matrix. 46 | The matrix consists of rows of sliding windows of original X. 47 | """ 48 | 49 | def __init__(self, window = 100): 50 | self.window = window 51 | self.detector = None 52 | def convert(self, X): 53 | n = self.window 54 | X = pd.Series(X) 55 | L = [] 56 | if n == 0: 57 | df = X 58 | else: 59 | for i in range(n): 60 | L.append(X.shift(i)) 61 | df = pd.concat(L, axis = 1) 62 | df = df.iloc[n-1:] 63 | return df 64 | 65 | class tf_Stat: 66 | '''statisitc feature extraction using the tf_feature package. 67 | It calculates 763 features in total so it might be over complicated for some models. 68 | Recommend to use for methods like Isolation Forest which randomly picks a feature 69 | and then perform the classification. To use for other distance-based model like KNN, 70 | LOF, CBLOF, etc, first train to pass a function that give weights to individual features so that 71 | inconsequential features won't cloud the important ones (mean, variance, kurtosis, etc). 
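    Note (added for clarity): convert() returns an array with (len(X) - window) rows;
    column 0 holds the raw series values and the remaining columns hold the tsfresh
    features extracted from each sliding window.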
72 | 73 | ''' 74 | def __init__(self, window = 100, step = 25): 75 | self.window = window 76 | self.step = step 77 | self.detector = None 78 | def convert(self, X): 79 | window = self.window 80 | step = self.step 81 | pos = math.ceil(window/2) 82 | #step <= window 83 | 84 | length = X.shape[0] 85 | 86 | Xd = pd.DataFrame(X) 87 | Xd.columns = pd.Index(['x'], dtype='object') 88 | Xd['id'] = 1 89 | Xd['time'] = Xd.index 90 | 91 | test = np.array(extract_features(Xd.iloc[0+pos-math.ceil(window/2):0+pos + math.floor(window/2)], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0)) 92 | M = np.zeros((length - window, test.shape[1]+1 )) 93 | 94 | 95 | i = 0 96 | while i + window <= M.shape[0]: 97 | M[i:i+step, 0]= X[pos + i: pos + i + step] 98 | vector = np.array(extract_features(Xd.iloc[i+pos-math.ceil(window/2):i+pos + math.floor(window/2)], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0)) 99 | 100 | M[i:i+step, 1:] = vector 101 | i+= step 102 | num = M.shape[0] 103 | if i < num: 104 | M[i: num, 0]= X[pos + i: pos + num] 105 | M[i: num, 1:] = np.array(extract_features(Xd.iloc[i+pos-math.ceil(window/2):], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0)) 106 | return M 107 | 108 | class Stat: 109 | '''statisitc feature extraction. 110 | Features include [mean, variance, skewness, kurtosis, autocorrelation, maximum, 111 | minimum, entropy, seasonality, hurst component, AR coef] 112 | 113 | ''' 114 | def __init__(self, window = 100, data_step = 10, param = [{"coeff": 0, "k": 5}], lag = 1, freq = 720): 115 | self.window = window 116 | self.data_step = data_step 117 | self.detector = None 118 | self.param = param 119 | self.lag = lag 120 | self.freq =freq 121 | if data_step > int(window/2): 122 | raise ValueError('value step shoudm\'t be greater than half of the window') 123 | 124 | 125 | def convert(self, X): 126 | freq = self.freq 127 | n = self.window 128 | data_step = self.data_step 129 | X = pd.Series(X) 130 | L = [] 131 | if n == 0: 132 | df = X 133 | raise ValueError('window lenght is set to zero') 134 | else: 135 | for i in range(n): 136 | L.append(X.shift(i)) 137 | df = pd.concat(L, axis = 1) 138 | df = df.iloc[n:] 139 | df2 = pd.concat(L[:data_step], axis = 1) 140 | 141 | 142 | 143 | df = df.reset_index() 144 | #value 145 | x0 = df2[math.ceil(n/2) : - math.floor(n/2)].reset_index() 146 | #mean 147 | x1 = (df.mean(axis=1)) 148 | #variance 149 | x2 = df.var(axis=1) 150 | #AR-coef 151 | self.ar_function = lambda x: self.ar_coefficient(x) 152 | x3 = df.apply(self.ar_function, axis =1, result_type='expand' ) 153 | #autocorrelation 154 | self.auto_function = lambda x: self.autocorrelation(x) 155 | x4 = df.apply(self.auto_function, axis =1, result_type='expand' ) 156 | #kurtosis 157 | x5 = (df.kurtosis(axis=1)) 158 | #skewness 159 | x6 = (df.skew(axis=1)) 160 | #maximum 161 | x7 = (df.max(axis=1)) 162 | #minimum 163 | x8 = (df.min(axis=1)) 164 | #entropy 165 | self.entropy_function = lambda x: self.sample_entropy(x) 166 | x9 = df.apply(self.entropy_function, axis =1, result_type='expand') 167 | 168 | #seasonality 169 | result = seasonal_decompose(X, model='additive', freq = freq, extrapolate_trend='freq') 170 | #seasonal 171 | x10 = pd.Series(np.array(result.seasonal[math.ceil(n/2) : - math.floor(n/2)])) 172 | #trend 173 | x11 = pd.Series(np.array(result.trend[math.ceil(n/2) : - math.floor(n/2)])) 174 | #resid 175 | x12 = pd.Series(np.array(result.resid[math.ceil(n/2) : - math.floor(n/2)])) 176 | 
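        # (Clarifying comment, added: x0 above is the raw value window and x1-x12 are the
        #  per-window statistics and seasonal components; together with the Hurst features
        #  below they are concatenated column-wise into the feature matrix M.)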
177 | #Hurst component 178 | self.hurst_function = lambda x: self.hurst_f(x) 179 | x13 = df.apply(self.hurst_function, axis =1, result_type='expand') 180 | 181 | L = [x0, x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12, x13] 182 | M = pd.concat(L, axis = 1) 183 | M = M.drop(columns=['index']) 184 | 185 | return M 186 | def ar_coefficient(self, x): 187 | """ 188 | This feature calculator fits the unconditional maximum likelihood 189 | of an autoregressive AR(k) process. 190 | The k parameter is the maximum lag of the process 191 | 192 | .. math:: 193 | 194 | X_{t}=\\varphi_0 +\\sum _{{i=1}}^{k}\\varphi_{i}X_{{t-i}}+\\varepsilon_{t} 195 | 196 | For the configurations from param which should contain the maxlag "k" and such an AR process is calculated. Then 197 | the coefficients :math:`\\varphi_{i}` whose index :math:`i` contained from "coeff" are returned. 198 | 199 | :param x: the time series to calculate the feature of 200 | :type x: numpy.ndarray 201 | :param param: contains dictionaries {"coeff": x, "k": y} with x,y int 202 | :type param: list 203 | :return x: the different feature values 204 | :return type: pandas.Series 205 | """ 206 | calculated_ar_params = {} 207 | param = self.param 208 | x_as_list = list(x) 209 | 210 | res = {} 211 | 212 | for parameter_combination in param: 213 | k = parameter_combination["k"] 214 | p = parameter_combination["coeff"] 215 | 216 | column_name = "coeff_{}__k_{}".format(p, k) 217 | 218 | if k not in calculated_ar_params: 219 | try: 220 | calculated_AR = AR(x_as_list) 221 | calculated_ar_params[k] = calculated_AR.fit(maxlag=k, solver="mle").params 222 | except (LinAlgError, ValueError): 223 | calculated_ar_params[k] = [np.NaN] * k 224 | 225 | mod = calculated_ar_params[k] 226 | 227 | if p <= k: 228 | try: 229 | res[column_name] = mod[p] 230 | except IndexError: 231 | res[column_name] = 0 232 | else: 233 | res[column_name] = np.NaN 234 | 235 | L = [(key, value) for key, value in res.items()] 236 | L0 = [] 237 | for item in L: 238 | L0.append(item[1]) 239 | return L0 240 | 241 | def autocorrelation(self, x): 242 | """ 243 | Calculates the autocorrelation of the specified lag, according to the formula [1] 244 | 245 | .. math:: 246 | 247 | \\frac{1}{(n-l)\\sigma^{2}} \\sum_{t=1}^{n-l}(X_{t}-\\mu )(X_{t+l}-\\mu) 248 | 249 | where :math:`n` is the length of the time series :math:`X_i`, :math:`\\sigma^2` its variance and :math:`\\mu` its 250 | mean. `l` denotes the lag. 251 | 252 | .. rubric:: References 253 | 254 | [1] https://en.wikipedia.org/wiki/Autocorrelation#Estimation 255 | 256 | :param x: the time series to calculate the feature of 257 | :type x: numpy.ndarray 258 | :param lag: the lag 259 | :type lag: int 260 | :return: the value of this feature 261 | :return type: float 262 | """ 263 | lag = self.lag 264 | # This is important: If a series is passed, the product below is calculated 265 | # based on the index, which corresponds to squaring the series. 
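        # (Worked example, added for clarity: with x = [1, 2, 3, 4] and lag = 1,
        #  y1 = [1, 2, 3], y2 = [2, 3, 4], x_mean = 2.5 and np.var(x) = 1.25, so the
        #  estimator below returns 1.25 / ((4 - 1) * 1.25) = 1/3.)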
266 | if isinstance(x, pd.Series): 267 | x = x.values 268 | if len(x) < lag: 269 | return np.nan 270 | # Slice the relevant subseries based on the lag 271 | y1 = x[:(len(x) - lag)] 272 | y2 = x[lag:] 273 | # Subtract the mean of the whole series x 274 | x_mean = np.mean(x) 275 | # The result is sometimes referred to as "covariation" 276 | sum_product = np.sum((y1 - x_mean) * (y2 - x_mean)) 277 | # Return the normalized unbiased covariance 278 | v = np.var(x) 279 | if np.isclose(v, 0): 280 | return np.NaN 281 | else: 282 | return sum_product / ((len(x) - lag) * v) 283 | def _into_subchunks(self, x, subchunk_length, every_n=1): 284 | """ 285 | Split the time series x into subwindows of length "subchunk_length", starting every "every_n". 286 | 287 | For example, the input data if [0, 1, 2, 3, 4, 5, 6] will be turned into a matrix 288 | 289 | 0 2 4 290 | 1 3 5 291 | 2 4 6 292 | 293 | with the settings subchunk_length = 3 and every_n = 2 294 | """ 295 | len_x = len(x) 296 | 297 | assert subchunk_length > 1 298 | assert every_n > 0 299 | 300 | # how often can we shift a window of size subchunk_length over the input? 301 | num_shifts = (len_x - subchunk_length) // every_n + 1 302 | shift_starts = every_n * np.arange(num_shifts) 303 | indices = np.arange(subchunk_length) 304 | 305 | indexer = np.expand_dims(indices, axis=0) + np.expand_dims(shift_starts, axis=1) 306 | return np.asarray(x)[indexer] 307 | def sample_entropy(self, x): 308 | """ 309 | Calculate and return sample entropy of x. 310 | 311 | .. rubric:: References 312 | 313 | | [1] http://en.wikipedia.org/wiki/Sample_Entropy 314 | | [2] https://www.ncbi.nlm.nih.gov/pubmed/10843903?dopt=Abstract 315 | 316 | :param x: the time series to calculate the feature of 317 | :type x: numpy.ndarray 318 | 319 | :return: the value of this feature 320 | :return type: float 321 | """ 322 | x = np.array(x) 323 | 324 | # if one of the values is NaN, we can not compute anything meaningful 325 | if np.isnan(x).any(): 326 | return np.nan 327 | 328 | m = 2 # common value for m, according to wikipedia... 329 | tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia... 330 | 331 | # Split time series and save all templates of length m 332 | # Basically we turn [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4] 333 | xm = self._into_subchunks(x, m) 334 | 335 | # Now calculate the maximum distance between each of those pairs 336 | # np.abs(xmi - xm).max(axis=1) 337 | # and check how many are below the tolerance. 338 | # For speed reasons, we are not doing this in a nested for loop, 339 | # but with numpy magic. 340 | # Example: 341 | # if x = [1, 2, 3] 342 | # then xm = [[1, 2], [2, 3]] 343 | # so we will substract xm from [1, 2] => [[0, 0], [-1, -1]] 344 | # and from [2, 3] => [[1, 1], [0, 0]] 345 | # taking the abs and max gives us: 346 | # [0, 1] and [1, 0] 347 | # as the diagonal elements are always 0, we substract 1. 
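        # (Clarifying comment, added: B counts, over all m-length templates, the other
        #  templates within the tolerance under the Chebyshev (max) distance; A does the
        #  same for the (m+1)-length templates; the "- 1" terms drop the self-matches,
        #  and the sample entropy returned below is -log(A / B).)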
348 | B = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm]) 349 | 350 | # Similar for computing A 351 | xmp1 = self._into_subchunks(x, m + 1) 352 | 353 | A = np.sum([np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1]) 354 | 355 | # Return SampEn 356 | return -np.log(A / B) 357 | def hurst_f(self, x): 358 | H,c, M = compute_Hc(x) 359 | return [H, c] -------------------------------------------------------------------------------- /metrics/vus/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import numpy as np 3 | import math 4 | # import matplotlib.pyplot as plt 5 | 6 | class metricor: 7 | def __init__(self, a = 1, probability = True, bias = 'flat', ): 8 | self.a = a 9 | self.probability = probability 10 | self.bias = bias 11 | 12 | def detect_model(self, model, label, contamination = 0.1, window = 100, is_A = False, is_threshold = True): 13 | if is_threshold: 14 | score = self.scale_threshold(model.decision_scores_, model._mu, model._sigma) 15 | else: 16 | score = self.scale_contamination(model.decision_scores_, contamination = contamination) 17 | if is_A is False: 18 | scoreX = np.zeros(len(score)+window) 19 | scoreX[math.ceil(window/2): len(score)+window - math.floor(window/2)] = score 20 | else: 21 | scoreX = score 22 | 23 | self.score_=scoreX 24 | L = self.metric(label, scoreX) 25 | return L 26 | 27 | 28 | def labels_conv(self, preds): 29 | '''return indices of predicted anomaly 30 | ''' 31 | 32 | # p = np.zeros(len(preds)) 33 | index = np.where(preds >= 0.5) 34 | return index[0] 35 | 36 | def labels_conv_binary(self, preds): 37 | '''return predicted label 38 | ''' 39 | p = np.zeros(len(preds)) 40 | index = np.where(preds >= 0.5) 41 | p[index[0]] = 1 42 | return p 43 | 44 | 45 | def w(self, AnomalyRange, p): 46 | MyValue = 0 47 | MaxValue = 0 48 | start = AnomalyRange[0] 49 | AnomalyLength = AnomalyRange[1] - AnomalyRange[0] + 1 50 | for i in range(start, start +AnomalyLength): 51 | bi = self.b(i, AnomalyLength) 52 | MaxValue += bi 53 | if i in p: 54 | MyValue += bi 55 | return MyValue/MaxValue 56 | 57 | def Cardinality_factor(self, Anomolyrange, Prange): 58 | score = 0 59 | start = Anomolyrange[0] 60 | end = Anomolyrange[1] 61 | for i in Prange: 62 | if i[0] >= start and i[0] <= end: 63 | score +=1 64 | elif start >= i[0] and start <= i[1]: 65 | score += 1 66 | elif end >= i[0] and end <= i[1]: 67 | score += 1 68 | elif start >= i[0] and end <= i[1]: 69 | score += 1 70 | if score == 0: 71 | return 0 72 | else: 73 | return 1/score 74 | 75 | def b(self, i, length): 76 | bias = self.bias 77 | if bias == 'flat': 78 | return 1 79 | elif bias == 'front-end bias': 80 | return length - i + 1 81 | elif bias == 'back-end bias': 82 | return i 83 | else: 84 | if i <= length/2: 85 | return i 86 | else: 87 | return length - i + 1 88 | 89 | 90 | def scale_threshold(self, score, score_mu, score_sigma): 91 | return (score >= (score_mu + 3*score_sigma)).astype(int) 92 | 93 | 94 | def metric_new(self, label, score, plot_ROC=False, alpha=0.2,coeff=3): 95 | '''input: 96 | Real labels and anomaly score in prediction 97 | 98 | output: 99 | AUC, 100 | Precision, 101 | Recall, 102 | F-score, 103 | Range-precision, 104 | Range-recall, 105 | Range-Fscore, 106 | Precison@k, 107 | 108 | k is chosen to be # of outliers in real labels 109 | ''' 110 | if np.sum(label) == 0: 111 | print('All labels are 0. 
Label must have groud truth value for calculating AUC score.') 112 | return None 113 | 114 | if np.isnan(score).any() or score is None: 115 | print('Score must not be none.') 116 | return None 117 | 118 | #area under curve 119 | auc = metrics.roc_auc_score(label, score) 120 | # plor ROC curve 121 | if plot_ROC: 122 | fpr, tpr, thresholds = metrics.roc_curve(label, score) 123 | # display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc) 124 | # display.plot() 125 | 126 | #precision, recall, F 127 | 128 | preds = score > (np.mean(score)+coeff*np.std(score)) 129 | if np.sum(preds) == 0: 130 | preds = score > (np.mean(score)+2*np.std(score)) 131 | if np.sum(preds) == 0: 132 | preds = score > (np.mean(score)+1*np.std(score)) 133 | Precision, Recall, F, Support = metrics.precision_recall_fscore_support(label, preds, zero_division=0) 134 | precision = Precision[1] 135 | recall = Recall[1] 136 | f = F[1] 137 | 138 | #range anomaly 139 | Rrecall, ExistenceReward, OverlapReward = self.range_recall_new(label, preds, alpha) 140 | Rprecision = self.range_recall_new(preds, label, 0)[0] 141 | 142 | if Rprecision + Rrecall==0: 143 | Rf=0 144 | else: 145 | Rf = 2 * Rrecall * Rprecision / (Rprecision + Rrecall) 146 | 147 | # top-k 148 | k = int(np.sum(label)) 149 | threshold = np.percentile(score, 100 * (1-k/len(label))) 150 | 151 | # precision_at_k = metrics.top_k_accuracy_score(label, score, k) 152 | p_at_k = np.where(preds > threshold)[0] 153 | TP_at_k = sum(label[p_at_k]) 154 | precision_at_k = TP_at_k/k 155 | 156 | L = [auc, precision, recall, f, Rrecall, ExistenceReward, OverlapReward, Rprecision, Rf, precision_at_k] 157 | if plot_ROC: 158 | return L, fpr, tpr 159 | return L 160 | 161 | def metric_PR(self, label, score): 162 | precision, recall, thresholds = metrics.precision_recall_curve(label, score) 163 | # plt.figure() 164 | # disp = metrics.PrecisionRecallDisplay(precision=precision, recall=recall) 165 | # disp.plot() 166 | AP = metrics.auc(recall, precision) 167 | #AP = metrics.average_precision_score(label, score) 168 | return precision, recall, AP 169 | 170 | def range_recall_new(self, labels, preds, alpha): 171 | 172 | 173 | p = np.where(preds == 1)[0] # positions of predicted label==1 174 | range_pred = self.range_convers_new(preds) 175 | range_label = self.range_convers_new(labels) 176 | 177 | Nr = len(range_label) # total # of real anomaly segments 178 | 179 | ExistenceReward = self.existence_reward(range_label, p) 180 | 181 | 182 | OverlapReward = 0 183 | for i in range_label: 184 | OverlapReward += self.w(i, p) * self.Cardinality_factor(i, range_pred) 185 | 186 | 187 | score = alpha * ExistenceReward + (1-alpha) * OverlapReward 188 | if Nr != 0: 189 | return score/Nr, ExistenceReward/Nr, OverlapReward/Nr 190 | else: 191 | return 0,0,0 192 | 193 | def range_convers_new(self, label): 194 | ''' 195 | input: arrays of binary values 196 | output: list of ordered pair [[a0,b0], [a1,b1]... 
] of the inputs 197 | ''' 198 | L = [] 199 | i = 0 200 | j = 0 201 | while j < len(label): 202 | # print(i) 203 | while label[i] == 0: 204 | i+=1 205 | if i >= len(label): 206 | break 207 | j = i+1 208 | # print('j'+str(j)) 209 | if j >= len(label): 210 | if j==len(label): 211 | L.append((i,j-1)) 212 | 213 | break 214 | while label[j] != 0: 215 | j+=1 216 | if j >= len(label): 217 | L.append((i,j-1)) 218 | break 219 | if j >= len(label): 220 | break 221 | L.append((i, j-1)) 222 | i = j 223 | return L 224 | 225 | def existence_reward(self, labels, preds): 226 | ''' 227 | labels: list of ordered pair 228 | preds predicted data 229 | ''' 230 | 231 | score = 0 232 | for i in labels: 233 | if np.sum(np.multiply(preds <= i[1], preds >= i[0])) > 0: 234 | score += 1 235 | return score 236 | 237 | def num_nonzero_segments(self, x): 238 | count=0 239 | if x[0]>0: 240 | count+=1 241 | for i in range(1, len(x)): 242 | if x[i]>0 and x[i-1]==0: 243 | count+=1 244 | return count 245 | 246 | def extend_postive_range(self, x, window=5): 247 | label = x.copy().astype(float) 248 | L = self.range_convers_new(label) # index of non-zero segments 249 | length = len(label) 250 | for k in range(len(L)): 251 | s = L[k][0] 252 | e = L[k][1] 253 | 254 | 255 | x1 = np.arange(e,min(e+window//2,length)) 256 | label[x1] += np.sqrt(1 - (x1-e)/(window)) 257 | 258 | x2 = np.arange(max(s-window//2,0),s) 259 | label[x2] += np.sqrt(1 - (s-x2)/(window)) 260 | 261 | label = np.minimum(np.ones(length), label) 262 | return label 263 | 264 | def extend_postive_range_individual(self, x, percentage=0.2): 265 | label = x.copy().astype(float) 266 | L = self.range_convers_new(label) # index of non-zero segments 267 | length = len(label) 268 | for k in range(len(L)): 269 | s = L[k][0] 270 | e = L[k][1] 271 | 272 | l0 = int((e-s+1)*percentage) 273 | 274 | x1 = np.arange(e,min(e+l0,length)) 275 | label[x1] += np.sqrt(1 - (x1-e)/(2*l0)) 276 | 277 | x2 = np.arange(max(s-l0,0),s) 278 | label[x2] += np.sqrt(1 - (s-x2)/(2*l0)) 279 | 280 | label = np.minimum(np.ones(length), label) 281 | return label 282 | 283 | def TPR_FPR_RangeAUC(self, labels, pred, P, L): 284 | product = labels * pred 285 | 286 | TP = np.sum(product) 287 | 288 | # recall = min(TP/P,1) 289 | P_new = (P+np.sum(labels))/2 # so TPR is neither large nor small 290 | # P_new = np.sum(labels) 291 | recall = min(TP/P_new,1) 292 | # recall = TP/np.sum(labels) 293 | # print('recall '+str(recall)) 294 | 295 | 296 | existence = 0 297 | for seg in L: 298 | if np.sum(product[seg[0]:(seg[1]+1)])>0: 299 | existence += 1 300 | 301 | existence_ratio = existence/len(L) 302 | # print(existence_ratio) 303 | 304 | # TPR_RangeAUC = np.sqrt(recall*existence_ratio) 305 | # print(existence_ratio) 306 | TPR_RangeAUC = recall*existence_ratio 307 | 308 | FP = np.sum(pred) - TP 309 | # TN = np.sum((1-pred) * (1-labels)) 310 | 311 | # FPR_RangeAUC = FP/(FP+TN) 312 | N_new = len(labels) - P_new 313 | FPR_RangeAUC = FP/N_new 314 | 315 | Precision_RangeAUC = TP/np.sum(pred) 316 | 317 | return TPR_RangeAUC, FPR_RangeAUC, Precision_RangeAUC 318 | 319 | def RangeAUC(self, labels, score, window=0, percentage=0, plot_ROC=False, AUC_type='window'): 320 | # AUC_type='window'/'percentage' 321 | score_sorted = -np.sort(-score) 322 | 323 | P = np.sum(labels) 324 | # print(np.sum(labels)) 325 | if AUC_type=='window': 326 | labels = self.extend_postive_range(labels, window=window) 327 | else: 328 | labels = self.extend_postive_range_individual(labels, percentage=percentage) 329 | 330 | # print(np.sum(labels)) 331 | L = 
self.range_convers_new(labels) 332 | TPR_list = [0] 333 | FPR_list = [0] 334 | Precision_list = [1] 335 | 336 | for i in np.linspace(0, len(score)-1, 250).astype(int): 337 | threshold = score_sorted[i] 338 | # print('thre='+str(threshold)) 339 | pred = score>= threshold 340 | TPR, FPR, Precision = self.TPR_FPR_RangeAUC(labels, pred, P,L) 341 | 342 | TPR_list.append(TPR) 343 | FPR_list.append(FPR) 344 | Precision_list.append(Precision) 345 | 346 | TPR_list.append(1) 347 | FPR_list.append(1) # otherwise, range-AUC will stop earlier than (1,1) 348 | 349 | tpr = np.array(TPR_list) 350 | fpr = np.array(FPR_list) 351 | prec = np.array(Precision_list) 352 | 353 | width = fpr[1:] - fpr[:-1] 354 | height = (tpr[1:] + tpr[:-1])/2 355 | AUC_range = np.sum(width*height) 356 | 357 | width_PR = tpr[1:-1] - tpr[:-2] 358 | height_PR = (prec[1:] + prec[:-1])/2 359 | AP_range = np.sum(width_PR*height_PR) 360 | 361 | if plot_ROC: 362 | return AUC_range, AP_range, fpr, tpr, prec 363 | 364 | return AUC_range 365 | 366 | 367 | # TPR_FPR_window 368 | def RangeAUC_volume(self, labels_original, score, windowSize): 369 | score_sorted = -np.sort(-score) 370 | 371 | tpr_3d=[] 372 | fpr_3d=[] 373 | prec_3d=[] 374 | 375 | auc_3d=[] 376 | ap_3d=[] 377 | 378 | window_3d = np.arange(0, windowSize+1, 1) 379 | P = np.sum(labels_original) 380 | 381 | for window in window_3d: 382 | labels = self.extend_postive_range(labels_original, window) 383 | 384 | # print(np.sum(labels)) 385 | L = self.range_convers_new(labels) 386 | TPR_list = [0] 387 | FPR_list = [0] 388 | Precision_list = [1] 389 | 390 | for i in np.linspace(0, len(score)-1, 250).astype(int): 391 | threshold = score_sorted[i] 392 | # print('thre='+str(threshold)) 393 | pred = score>= threshold 394 | TPR, FPR, Precision = self.TPR_FPR_RangeAUC(labels, pred, P,L) 395 | 396 | TPR_list.append(TPR) 397 | FPR_list.append(FPR) 398 | Precision_list.append(Precision) 399 | 400 | TPR_list.append(1) 401 | FPR_list.append(1) # otherwise, range-AUC will stop earlier than (1,1) 402 | 403 | 404 | tpr = np.array(TPR_list) 405 | fpr = np.array(FPR_list) 406 | prec = np.array(Precision_list) 407 | 408 | tpr_3d.append(tpr) 409 | fpr_3d.append(fpr) 410 | prec_3d.append(prec) 411 | 412 | width = fpr[1:] - fpr[:-1] 413 | height = (tpr[1:] + tpr[:-1])/2 414 | AUC_range = np.sum(width*height) 415 | auc_3d.append(AUC_range) 416 | 417 | width_PR = tpr[1:-1] - tpr[:-2] 418 | height_PR = (prec[1:] + prec[:-1])/2 419 | AP_range = np.sum(width_PR*height_PR) 420 | ap_3d.append(AP_range) 421 | 422 | 423 | return tpr_3d, fpr_3d, prec_3d, window_3d, sum(auc_3d)/len(window_3d), sum(ap_3d)/len(window_3d) 424 | 425 | 426 | 427 | 428 | def generate_curve(label,score,slidingWindow): 429 | tpr_3d, fpr_3d, prec_3d, window_3d, avg_auc_3d, avg_ap_3d = metricor().RangeAUC_volume(labels_original=label, score=score, windowSize=1*slidingWindow) 430 | 431 | X = np.array(tpr_3d).reshape(1,-1).ravel() 432 | X_ap = np.array(tpr_3d)[:,:-1].reshape(1,-1).ravel() 433 | Y = np.array(fpr_3d).reshape(1,-1).ravel() 434 | W = np.array(prec_3d).reshape(1,-1).ravel() 435 | Z = np.repeat(window_3d, len(tpr_3d[0])) 436 | Z_ap = np.repeat(window_3d, len(tpr_3d[0])-1) 437 | 438 | return Y, Z, X, X_ap, W, Z_ap,avg_auc_3d, avg_ap_3d 439 | 440 | -------------------------------------------------------------------------------- /metrics/vus/utils/slidingWindows.py: -------------------------------------------------------------------------------- 1 | from statsmodels.tsa.stattools import acf 2 | from scipy.signal import argrelextrema 3 | 
import numpy as np 4 | 5 | import matplotlib.patches as mpatches 6 | import matplotlib.pyplot as plt 7 | # determine sliding window (period) based on ACF 8 | def find_length(data): 9 | if len(data.shape)>1: 10 | return 0 11 | data = data[:min(20000, len(data))] 12 | 13 | base = 3 14 | auto_corr = acf(data, nlags=400, fft=True)[base:] 15 | 16 | 17 | local_max = argrelextrema(auto_corr, np.greater)[0] 18 | try: 19 | max_local_max = np.argmax([auto_corr[lcm] for lcm in local_max]) 20 | if local_max[max_local_max]<3 or local_max[max_local_max]>300: 21 | return 125 22 | return local_max[max_local_max]+base 23 | except: 24 | return 125 -------------------------------------------------------------------------------- /model/DCdetector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from einops import rearrange 5 | from .attn import DAC_structure, AttentionLayer 6 | from .embed import DataEmbedding, TokenEmbedding 7 | from .RevIN import RevIN 8 | from tkinter import _flatten 9 | 10 | 11 | class Encoder(nn.Module): 12 | def __init__(self, attn_layers, norm_layer=None): 13 | super(Encoder, self).__init__() 14 | self.attn_layers = nn.ModuleList(attn_layers) 15 | self.norm = norm_layer 16 | 17 | def forward(self, x_patch_size, x_patch_num, x_ori, patch_index, attn_mask=None): 18 | series_list = [] 19 | prior_list = [] 20 | for attn_layer in self.attn_layers: 21 | series, prior = attn_layer(x_patch_size, x_patch_num, x_ori, patch_index, attn_mask=attn_mask) 22 | series_list.append(series) 23 | prior_list.append(prior) 24 | return series_list, prior_list 25 | 26 | 27 | 28 | class DCdetector(nn.Module): 29 | def __init__(self, win_size, enc_in, c_out, n_heads=1, d_model=256, e_layers=3, patch_size=[3,5,7], channel=55, d_ff=512, dropout=0.0, activation='gelu', output_attention=True): 30 | super(DCdetector, self).__init__() 31 | self.output_attention = output_attention 32 | self.patch_size = patch_size 33 | self.channel = channel 34 | self.win_size = win_size 35 | 36 | # Patching List 37 | self.embedding_patch_size = nn.ModuleList() 38 | self.embedding_patch_num = nn.ModuleList() 39 | for i, patchsize in enumerate(self.patch_size): 40 | self.embedding_patch_size.append(DataEmbedding(patchsize, d_model, dropout)) 41 | self.embedding_patch_num.append(DataEmbedding(self.win_size//patchsize, d_model, dropout)) 42 | 43 | self.embedding_window_size = DataEmbedding(enc_in, d_model, dropout) 44 | 45 | # Dual Attention Encoder 46 | self.encoder = Encoder( 47 | [ 48 | AttentionLayer( 49 | DAC_structure(win_size, patch_size, channel, False, attention_dropout=dropout, output_attention=output_attention), 50 | d_model, patch_size, channel, n_heads, win_size)for l in range(e_layers) 51 | ], 52 | norm_layer=torch.nn.LayerNorm(d_model) 53 | ) 54 | 55 | self.projection = nn.Linear(d_model, c_out, bias=True) 56 | 57 | 58 | def forward(self, x): 59 | B, L, M = x.shape #Batch win_size channel 60 | series_patch_mean = [] 61 | prior_patch_mean = [] 62 | revin_layer = RevIN(num_features=M) 63 | 64 | # Instance Normalization Operation 65 | x = revin_layer(x, 'norm') 66 | x_ori = self.embedding_window_size(x) 67 | 68 | # Mutil-scale Patching Operation 69 | for patch_index, patchsize in enumerate(self.patch_size): 70 | x_patch_size, x_patch_num = x, x 71 | x_patch_size = rearrange(x_patch_size, 'b l m -> b m l') #Batch channel win_size 72 | x_patch_num = rearrange(x_patch_num, 'b l m -> b m l') #Batch channel 
win_size 73 | 74 | x_patch_size = rearrange(x_patch_size, 'b m (n p) -> (b m) n p', p = patchsize) 75 | x_patch_size = self.embedding_patch_size[patch_index](x_patch_size) 76 | x_patch_num = rearrange(x_patch_num, 'b m (p n) -> (b m) p n', p = patchsize) 77 | x_patch_num = self.embedding_patch_num[patch_index](x_patch_num) 78 | 79 | series, prior = self.encoder(x_patch_size, x_patch_num, x_ori, patch_index) 80 | series_patch_mean.append(series), prior_patch_mean.append(prior) 81 | 82 | series_patch_mean = list(_flatten(series_patch_mean)) 83 | prior_patch_mean = list(_flatten(prior_patch_mean)) 84 | 85 | if self.output_attention: 86 | return series_patch_mean, prior_patch_mean 87 | else: 88 | return None 89 | 90 | 91 | -------------------------------------------------------------------------------- /model/RevIN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class RevIN(nn.Module): 5 | def __init__(self, num_features: int, eps=1e-5, affine=True): 6 | """ 7 | :param num_features: the number of features or channels 8 | :param eps: a value added for numerical stability 9 | :param affine: if True, RevIN has learnable affine parameters 10 | """ 11 | super(RevIN, self).__init__() 12 | self.num_features = num_features 13 | self.eps = eps 14 | self.affine = affine 15 | if self.affine: 16 | self._init_params() 17 | 18 | def forward(self, x, mode:str): 19 | if mode == 'norm': 20 | self._get_statistics(x) 21 | x = self._normalize(x) 22 | elif mode == 'denorm': 23 | x = self._denormalize(x) 24 | else: raise NotImplementedError 25 | return x 26 | 27 | def _init_params(self): 28 | # initialize RevIN params: (C,) 29 | self.affine_weight = torch.ones(self.num_features) 30 | self.affine_bias = torch.zeros(self.num_features) 31 | self.affine_weight=self.affine_weight.to(device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')) 32 | self.affine_bias=self.affine_bias.to(device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')) 33 | 34 | 35 | def _get_statistics(self, x): 36 | dim2reduce = tuple(range(1, x.ndim-1)) 37 | self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() 38 | self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() 39 | 40 | 41 | def _normalize(self, x): 42 | x = x - self.mean 43 | x = x / self.stdev 44 | if self.affine: 45 | x = x * self.affine_weight 46 | x = x + self.affine_bias 47 | return x 48 | 49 | def _denormalize(self, x): 50 | if self.affine: 51 | x = x - self.affine_bias 52 | x = x / (self.affine_weight + self.eps*self.eps) 53 | x = x * self.stdev 54 | x = x + self.mean 55 | return x 56 | -------------------------------------------------------------------------------- /model/attn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import math 6 | from math import sqrt 7 | import os 8 | from einops import rearrange, reduce, repeat 9 | 10 | 11 | class DAC_structure(nn.Module): 12 | def __init__(self, win_size, patch_size, channel, mask_flag=True, scale=None, attention_dropout=0.05, output_attention=False): 13 | super(DAC_structure, self).__init__() 14 | self.scale = scale 15 | self.mask_flag = mask_flag 16 | self.output_attention = output_attention 17 | self.dropout = nn.Dropout(attention_dropout) 18 | self.window_size = win_size 19 | self.patch_size = patch_size 20 | 
self.channel = channel 21 | 22 | def forward(self, queries_patch_size, queries_patch_num, keys_patch_size, keys_patch_num, values, patch_index, attn_mask): 23 | 24 | # Patch-wise Representation 25 | B, L, H, E = queries_patch_size.shape #batch_size*channel, patch_num, n_head, d_model/n_head 26 | scale_patch_size = self.scale or 1. / sqrt(E) 27 | scores_patch_size = torch.einsum("blhe,bshe->bhls", queries_patch_size, keys_patch_size) #batch*ch, nheads, p_num, p_num 28 | attn_patch_size = scale_patch_size * scores_patch_size 29 | series_patch_size = self.dropout(torch.softmax(attn_patch_size, dim=-1)) # B*D_model H N N 30 | 31 | # In-patch Representation 32 | B, L, H, E = queries_patch_num.shape #batch_size*channel, patch_size, n_head, d_model/n_head 33 | scale_patch_num = self.scale or 1. / sqrt(E) 34 | scores_patch_num = torch.einsum("blhe,bshe->bhls", queries_patch_num, keys_patch_num) #batch*ch, nheads, p_size, p_size 35 | attn_patch_num = scale_patch_num * scores_patch_num 36 | series_patch_num = self.dropout(torch.softmax(attn_patch_num, dim=-1)) # B*D_model H S S 37 | 38 | # Upsampling 39 | series_patch_size = repeat(series_patch_size, 'b l m n -> b l (m repeat_m) (n repeat_n)', repeat_m=self.patch_size[patch_index], repeat_n=self.patch_size[patch_index]) 40 | series_patch_num = series_patch_num.repeat(1,1,self.window_size//self.patch_size[patch_index],self.window_size//self.patch_size[patch_index]) 41 | series_patch_size = reduce(series_patch_size, '(b reduce_b) l m n-> b l m n', 'mean', reduce_b=self.channel) 42 | series_patch_num = reduce(series_patch_num, '(b reduce_b) l m n-> b l m n', 'mean', reduce_b=self.channel) 43 | 44 | 45 | if self.output_attention: 46 | return series_patch_size, series_patch_num 47 | else: 48 | return (None) 49 | 50 | 51 | 52 | class AttentionLayer(nn.Module): 53 | def __init__(self, attention, d_model, patch_size, channel, n_heads, win_size, d_keys=None, d_values=None): 54 | super(AttentionLayer, self).__init__() 55 | 56 | d_keys = d_keys or (d_model // n_heads) 57 | d_values = d_values or (d_model // n_heads) 58 | self.norm = nn.LayerNorm(d_model) 59 | self.inner_attention = attention 60 | self.patch_size = patch_size 61 | self.channel = channel 62 | self.window_size = win_size 63 | self.n_heads = n_heads 64 | 65 | self.patch_query_projection = nn.Linear(d_model, d_keys * n_heads) 66 | self.patch_key_projection = nn.Linear(d_model, d_keys * n_heads) 67 | self.out_projection = nn.Linear(d_values * n_heads, d_model) 68 | self.value_projection = nn.Linear(d_model, d_values * n_heads) 69 | 70 | def forward(self, x_patch_size, x_patch_num, x_ori, patch_index, attn_mask): 71 | 72 | # patch_size 73 | B, L, M = x_patch_size.shape 74 | H = self.n_heads 75 | queries_patch_size, keys_patch_size = x_patch_size, x_patch_size 76 | queries_patch_size = self.patch_query_projection(queries_patch_size).view(B, L, H, -1) 77 | keys_patch_size = self.patch_key_projection(keys_patch_size).view(B, L, H, -1) 78 | 79 | # patch_num 80 | B, L, M = x_patch_num.shape 81 | queries_patch_num, keys_patch_num = x_patch_num, x_patch_num 82 | queries_patch_num = self.patch_query_projection(queries_patch_num).view(B, L, H, -1) 83 | keys_patch_num = self.patch_key_projection(keys_patch_num).view(B, L, H, -1) 84 | 85 | # x_ori 86 | B, L, _ = x_ori.shape 87 | values = self.value_projection(x_ori).view(B, L, H, -1) 88 | 89 | series, prior = self.inner_attention( 90 | queries_patch_size, queries_patch_num, 91 | keys_patch_size, keys_patch_num, 92 | values, patch_index, 93 | attn_mask 94 | ) 95 
| 96 | return series, prior 97 | -------------------------------------------------------------------------------- /model/embed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.utils import weight_norm 5 | import math 6 | 7 | 8 | class PositionalEmbedding(nn.Module): 9 | def __init__(self, d_model, max_len=5000): 10 | super(PositionalEmbedding, self).__init__() 11 | # Compute the positional encodings once in log space. 12 | pe = torch.zeros(max_len, d_model).float() 13 | pe.require_grad = False 14 | 15 | position = torch.arange(0, max_len).float().unsqueeze(1) 16 | div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() 17 | 18 | pe[:, 0::2] = torch.sin(position * div_term) 19 | pe[:, 1::2] = torch.cos(position * div_term) 20 | 21 | pe = pe.unsqueeze(0) 22 | self.register_buffer('pe', pe) 23 | 24 | def forward(self, x): 25 | return self.pe[:, :x.size(1)] 26 | 27 | 28 | class TokenEmbedding(nn.Module): 29 | def __init__(self, c_in, d_model): 30 | super(TokenEmbedding, self).__init__() 31 | padding = 1 if torch.__version__ >= '1.5.0' else 2 32 | self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, 33 | kernel_size=3, padding=padding, padding_mode='circular', bias=False) 34 | for m in self.modules(): 35 | if isinstance(m, nn.Conv1d): 36 | nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu') 37 | 38 | def forward(self, x): 39 | x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) 40 | return x 41 | 42 | 43 | class DataEmbedding(nn.Module): 44 | def __init__(self, c_in, d_model, dropout=0.05): 45 | super(DataEmbedding, self).__init__() 46 | 47 | self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) 48 | self.position_embedding = PositionalEmbedding(d_model=d_model) 49 | 50 | self.dropout = nn.Dropout(p=dropout) 51 | 52 | def forward(self, x): 53 | x = self.value_embedding(x) + self.position_embedding(x) 54 | return self.dropout(x) 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # DCdetector (KDD 2023 research track paper) 2 | 3 | **DCdetector: Dual Attention Contrastive Representation Learning for Time Series Anomaly Detection** 4 | [[Paper]](https://arxiv.org/abs/2306.10347) 5 | 6 | 7 | 8 | The most fundamental challenge for time series anomaly detection is to learn a representation map that enables effective discrimination of anomalies. Reconstruction-based methods still dominate, but the representation learning with anomalies might hurt the performance with its large abnormal loss. In this paper, we propose DCdetector, a multi-scale dual attention contrastive representation learning model. 9 | 10 | - **Overall**: DCdetector utilizes a novel dual attention asymmetric design to create the permutated environment and pure contrastive loss to guide the learning process, thus learning a permutation invariant representation with superior discrimination abilities. 11 | 12 | - **Architecture**: A contrastive learning-based dual-branch attention structure is designed to learn a permutation invariant representation that enlarges the representation differences between normal points and anomalies. 13 | 14 | - **Architecture**: Two additional structures are designed for time series. 
Channel independence patching is proposed to enhance local semantic information in time series. A multi-scale design is proposed in the attention module to reduce information loss during patching. 15 | 16 | - **Optimization**: An effective and robust loss function is designed based on the similarity of two branches. Note that the model is trained purely contrastively without reconstruction loss, which reduces distractions from anomalies. 17 | 18 | - **Performance & Justification**: DCdetector achieves performance comparable to or better than SOTA methods on eight time series anomaly detection benchmark datasets. We also provide a justification discussion to explain how our model avoids collapse without negative samples. 19 | 20 | |![Figure1](img/art-compare.png)| 21 | |:--:| 22 | | *Figure 1. Architecture comparison of three approaches.* | 23 | 24 | |![Figure2](img/workflow.png)| 25 | |:--:| 26 | | *Figure 2. The workflow of the DCdetector framework.* | 27 | 28 | 29 | ## Main Result 30 | We compare our model with 26 baselines based on various evaluation criteria. Extensive experiments show that DCdetector achieves the best or comparable performance on eight benchmark datasets compared to various state-of-the-art algorithms. 31 | 32 | |![Figure1](img/DCdetector.jpg)| 33 | |:--:| 34 | | *Table 1. Overall results on real-world multivariate datasets.* | 35 | 36 | |![image](img/result_2.png) | ![image](img/result_count.jpg) 37 | |:--:|:--:| 38 | | *Table 2. Overall results on NIPS-TS datasets.* | *Table 3. Overall results on the univariate dataset.* | 39 | 40 | |![Figure4](img/result_4.png)| 41 | |:--:| 42 | | *Table 4. Multi-metrics results on NIPS-TS datasets.* | 43 | 44 | 45 | ## Code Description 46 | The source is organized into the following files/folders. 47 | 48 | - data_factory: The preprocessing folder/file. All dataset preprocessing code is here. 49 | - dataset: The dataset folder; you can download all datasets [here](https://drive.google.com/drive/folders/1RaIJQ8esoWuhyphhmMaH-VCDh-WIluRR?usp=sharing). 50 | - main.py: The main Python file. You can adjust all parameters there. 51 | - metrics: The evaluation metrics code folder, which includes VUS, the affiliation precision/recall pair, and other common metrics. The details correspond to Section 4.2 of the paper. 52 | - model: The DCdetector model folder. The details correspond to Section 3 of the paper. 53 | - result: The results and training logs of our code demo are automatically saved in this folder. 54 | - scripts: Scripts for all datasets and ablation experiments. You can reproduce the experimental results as shown in the Get Start section. 55 | - solver.py: The training, validation, and testing procedures are all implemented here. 56 | - utils: Other functions for data processing and model building. 57 | - img: Images needed in readme.md. 58 | - requirements.txt: Python packages needed to run this repo. 59 | 60 | 61 | ## Get Start 62 | 1. Install Python 3.6 and PyTorch >= 1.4.0. 63 | 2. Download data. You can obtain all benchmarks from [Google Cloud](https://drive.google.com/drive/folders/1RaIJQ8esoWuhyphhmMaH-VCDh-WIluRR?usp=sharing). All the datasets are well pre-processed. 64 | 3. Train and evaluate. We provide the experiment scripts of all benchmarks under the folder ```./scripts```.
You can reproduce the experiment results as follows: 65 | 66 | ```bash 67 | bash ./scripts/SMD.sh 68 | bash ./scripts/MSL.sh 69 | bash ./scripts/SMAP.sh 70 | bash ./scripts/PSM.sh 71 | bash ./scripts/SWAT.sh 72 | bash ./scripts/NIPS_TS_Swan.sh 73 | bash ./scripts/NIPS_TS_Water.sh 74 | bash ./scripts/UCR.sh 75 | ``` 76 | 77 | Also, some scripts of ablation experiments. 78 | 79 | ```bash 80 | bash ./scripts/Ablation_attention_head.sh 81 | bash ./scripts/Ablation_encoder_layer.sh 82 | bash ./scripts/Ablation_Multiscale.sh 83 | bash ./scripts/Ablation_Window_Size.sh 84 | ``` 85 | 86 | ## Citation 87 | If you find this repo useful, please cite our paper. 88 | 89 | ``` 90 | @inproceedings{yang2023dcdetector, 91 | title={DCdetector: Dual Attention Contrastive Representation Learning for Time Series Anomaly Detection}, 92 | author={Yiyuan Yang and Chaoli Zhang and Tian Zhou and Qingsong Wen and Liang Sun}, 93 | booktitle={Proc. 29th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD 2023)}, 94 | location = {Long Beach, CA}, 95 | pages={3033–3045}, 96 | year={2023} 97 | } 98 | ``` 99 | 100 | ## Contact 101 | If you have any question or want to use the code, please contact yiyuan.yang@cs.ox.ac.uk, chaoli.zcl@alibaba-inc.com,tian.zt@alibaba-inc.com,qingsong.wen@alibaba-inc.com. 102 | 103 | ## Acknowledgement 104 | We appreciate the following github repos a lot for their valuable code: 105 | 106 | https://github.com/thuml/Anomaly-Transformer 107 | 108 | https://github.com/ahstat/affiliation-metrics-py 109 | 110 | https://github.com/TheDatumOrg/VUS 111 | 112 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | arch==6.1.0 2 | einops==0.6.1 3 | matplotlib==3.7.0 4 | numpy==1.23.5 5 | pandas==1.5.3 6 | Pillow==9.4.0 7 | scikit_learn==1.2.2 8 | scipy==1.8.1 9 | statsmodels==0.14.0 10 | torch==1.13.0 11 | tqdm==4.65.0 12 | tsfresh==0.20.1 13 | -------------------------------------------------------------------------------- /result_count.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DAMO-DI-ML/KDD2023-DCdetector/9d107cae518ef24ee2eb2ad1b92107cdcd09ea10/result_count.jpg -------------------------------------------------------------------------------- /scripts/Ablation_Multiscale.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | #MSL 4 | for j in {1,3,5,13,15,35,135}; 5 | do 6 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size $j 7 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size $j 8 | done 9 | 10 | 11 | #PSM 12 | for j in {1,3,5,13,15,35,135}; 13 | do 14 | python main.py --anormly_ratio 1 --num_epochs 5 --batch_size 128 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size $j 15 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size $j 16 | done 17 | 18 | 19 | # SMAP 20 | for j in {1,3,5,13,15,35,135}; 21 | do 22 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 128 --mode train --dataset SMAP --data_path SMAP 
--input_c 25 --output_c 25 --win_size 60 --patch_size $j 23 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 128 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size $j 24 | done 25 | -------------------------------------------------------------------------------- /scripts/Ablation_Window_Size.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | # #MSL 4 | for i in {30,45,60,75,90,105,120,135,150,175,195,210}; 5 | do 6 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size $i --patch_size 35 7 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size $i --patch_size 35 8 | done 9 | 10 | 11 | #SMAP 12 | for i in {30,45,60,75,90,105,120,135,150,175,195,210}; 13 | do 14 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 128 --mode train --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size $i --patch_size 35 15 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 128 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size $i --patch_size 35 16 | done 17 | 18 | 19 | #PSM 20 | for i in {30,45,60,75,90,105,120,135,150,175,195,210}; 21 | do 22 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size $i --patch_size 35 23 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size $i --patch_size 35 24 | done 25 | 26 | -------------------------------------------------------------------------------- /scripts/Ablation_attention_head.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | #MSL 4 | for i in {1,2,4,8}; 5 | do 6 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size 35 --n_heads $i 7 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size 35 --n_heads $i 8 | done 9 | 10 | 11 | #PSM 12 | for i in {1,2,4,8}; 13 | do 14 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --n_heads $i 15 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --n_heads $i 16 | done 17 | 18 | 19 | #SMAP 20 | for i in {1,2,4,8}; 21 | do 22 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 128 --mode train --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --n_heads $i 23 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 128 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --n_heads $i 24 | done 25 | -------------------------------------------------------------------------------- /scripts/Ablation_encoder_layer.sh: -------------------------------------------------------------------------------- 1 | export 
CUDA_VISIBLE_DEVICES=0 2 | 3 | #MSL 4 | for i in {1,2,3,4,5}; 5 | do 6 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size 35 --e_layer $i 7 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 60 --patch_size 35 --e_layer $i 8 | done 9 | 10 | #SMAP 11 | for i in {1,2,3,4,5}; 12 | do 13 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 128 --mode train --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --e_layer $i 14 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 128 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --e_layer $i 15 | done 16 | 17 | #PSM 18 | for i in {1,2,3,4,5}; 19 | do 20 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --e_layer $i 21 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --win_size 60 --patch_size 35 --e_layer $i 22 | done 23 | -------------------------------------------------------------------------------- /scripts/MSL.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 64 --mode train --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 90 --patch_size 35 4 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 64 --mode test --dataset MSL --data_path MSL --input_c 55 --output_c 55 --win_size 90 --patch_size 35 -------------------------------------------------------------------------------- /scripts/NIPS_TS_Swan.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=1 2 | 3 | python main.py --anormly_ratio 0.9 --num_epochs 3 --batch_size 128 --mode train --dataset NIPS_TS_Swan --data_path NIPS_TS_Swan --input_c 38 --output_c 38 --loss_fuc MSE --win_size 36 --patch_size 13 4 | python main.py --anormly_ratio 0.9 --num_epochs 10 --batch_size 128 --mode test --dataset NIPS_TS_Swan --data_path NIPS_TS_Swan --input_c 38 --output_c 38 --loss_fuc MSE --win_size 36 --patch_size 13 5 | -------------------------------------------------------------------------------- /scripts/NIPS_TS_Water.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 256 --mode train --dataset NIPS_TS_Water --data_path NIPS_TS_Water --input_c 9 --output_c 9 --loss_fuc MSE --patch_size 135 --win_size 90 4 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 256 --mode test --dataset NIPS_TS_Water --data_path NIPS_TS_Water --input_c 9 --output_c 9 --loss_fuc MSE --patch_size 135 --win_size 90 5 | 6 | 7 | -------------------------------------------------------------------------------- /scripts/PSM.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 256 --mode train --dataset PSM --data_path PSM --input_c 25 --output_c 25 --loss_fuc MSE --win_size 60 --patch_size 135 4 | 
python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 256 --mode test --dataset PSM --data_path PSM --input_c 25 --output_c 25 --loss_fuc MSE --win_size 60 --patch_size 135 -------------------------------------------------------------------------------- /scripts/SMAP.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 0.85 --num_epochs 3 --batch_size 256 --mode train --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --loss_fuc MSE --patch_size 357 --win_size 105 4 | python main.py --anormly_ratio 0.85 --num_epochs 10 --batch_size 256 --mode test --dataset SMAP --data_path SMAP --input_c 25 --output_c 25 --loss_fuc MSE --patch_size 357 --win_size 105 5 | -------------------------------------------------------------------------------- /scripts/SMD.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=3 2 | 3 | python main.py --anormly_ratio 0.6 --num_epochs 2 --batch_size 256 --mode train --dataset SMD --data_path SMD --input_c 38 --output_c 38 --loss_fuc MSE --win_size 105 --patch_size 57 4 | python main.py --anormly_ratio 0.6 --num_epochs 10 --batch_size 256 --mode test --dataset SMD --data_path SMD --input_c 38 --output_c 38 --loss_fuc MSE --win_size 105 --patch_size 57 -------------------------------------------------------------------------------- /scripts/SWAT.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | python main.py --anormly_ratio 1 --num_epochs 3 --batch_size 128 --mode train --dataset SWAT --data_path SWAT --input_c 51 --output_c 51 --loss_fuc MSE --patch_size 357 --win_size 105 4 | python main.py --anormly_ratio 1 --num_epochs 10 --batch_size 128 --mode test --dataset SWAT --data_path SWAT --input_c 51 --output_c 51 --loss_fuc MSE --patch_size 357 --win_size 105 5 | -------------------------------------------------------------------------------- /scripts/UCR.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=1 2 | 3 | for i in {1..250}; 4 | do 5 | 6 | python main.py --anormly_ratio 0.5 --num_epochs 3 --batch_size 128 --mode train --dataset UCR --data_path UCR --input_c 1 --output 1 --index $i --win_size 105 --patch_size 357 7 | python main.py --anormly_ratio 0.5 --num_epochs 10 --batch_size 128 --mode test --dataset UCR --data_path UCR --input_c 1 --output 1 --index $i --win_size 105 --patch_size 357 8 | 9 | done 10 | 11 | -------------------------------------------------------------------------------- /scripts/UCR_AUG.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | for i in {1..247}; 4 | do 5 | 6 | python main.py --anormly_ratio 0.5 --num_epochs 3 --batch_size 128 --mode train --dataset UCR_AUG --data_path UCR_AUG --input_c 1 --output 1 --index $i --win_size 60 --patch_size 35 7 | python main.py --anormly_ratio 0.5 --num_epochs 10 --batch_size 128 --mode test --dataset UCR_AUG --data_path UCR_AUG --input_c 1 --output 1 --index $i --win_size 60 --patch_size 35 8 | 9 | done -------------------------------------------------------------------------------- /solver.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | import time 
7 | from utils.utils import * 8 | from model.DCdetector import DCdetector 9 | from data_factory.data_loader import get_loader_segment 10 | from einops import rearrange 11 | from metrics.metrics import * 12 | import warnings 13 | warnings.filterwarnings('ignore') 14 | 15 | def my_kl_loss(p, q): 16 | res = p * (torch.log(p + 0.0001) - torch.log(q + 0.0001)) 17 | return torch.mean(torch.sum(res, dim=-1), dim=1) 18 | 19 | def adjust_learning_rate(optimizer, epoch, lr_): 20 | lr_adjust = {epoch: lr_ * (0.5 ** ((epoch - 1) // 1))} 21 | if epoch in lr_adjust.keys(): 22 | lr = lr_adjust[epoch] 23 | for param_group in optimizer.param_groups: 24 | param_group['lr'] = lr 25 | 26 | class EarlyStopping: 27 | def __init__(self, patience=7, verbose=False, dataset_name='', delta=0): 28 | self.patience = patience 29 | self.verbose = verbose 30 | self.counter = 0 31 | self.best_score = None 32 | self.best_score2 = None 33 | self.early_stop = False 34 | self.val_loss_min = np.Inf 35 | self.val_loss2_min = np.Inf 36 | self.delta = delta 37 | self.dataset = dataset_name 38 | 39 | def __call__(self, val_loss, val_loss2, model, path): 40 | score = -val_loss 41 | score2 = -val_loss2 42 | if self.best_score is None: 43 | self.best_score = score 44 | self.best_score2 = score2 45 | self.save_checkpoint(val_loss, val_loss2, model, path) 46 | elif score < self.best_score + self.delta or score2 < self.best_score2 + self.delta: 47 | self.counter += 1 48 | if self.counter >= self.patience: 49 | self.early_stop = True 50 | else: 51 | self.best_score = score 52 | self.best_score2 = score2 53 | self.save_checkpoint(val_loss, val_loss2, model, path) 54 | self.counter = 0 55 | 56 | def save_checkpoint(self, val_loss, val_loss2, model, path): 57 | torch.save(model.state_dict(), os.path.join(path, str(self.dataset) + '_checkpoint.pth')) 58 | self.val_loss_min = val_loss 59 | self.val_loss2_min = val_loss2 60 | 61 | 62 | class Solver(object): 63 | DEFAULTS = {} 64 | 65 | def __init__(self, config): 66 | 67 | self.__dict__.update(Solver.DEFAULTS, **config) 68 | 69 | self.train_loader = get_loader_segment(self.index, 'dataset/'+self.data_path, batch_size=self.batch_size, win_size=self.win_size, mode='train', dataset=self.dataset, ) 70 | self.vali_loader = get_loader_segment(self.index, 'dataset/'+self.data_path, batch_size=self.batch_size, win_size=self.win_size, mode='val', dataset=self.dataset) 71 | self.test_loader = get_loader_segment(self.index, 'dataset/'+self.data_path, batch_size=self.batch_size, win_size=self.win_size, mode='test', dataset=self.dataset) 72 | self.thre_loader = get_loader_segment(self.index, 'dataset/'+self.data_path, batch_size=self.batch_size, win_size=self.win_size, mode='thre', dataset=self.dataset) 73 | 74 | self.build_model() 75 | 76 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 77 | 78 | if self.loss_fuc == 'MAE': 79 | self.criterion = nn.L1Loss() 80 | elif self.loss_fuc == 'MSE': 81 | self.criterion = nn.MSELoss() 82 | 83 | 84 | def build_model(self): 85 | self.model = DCdetector(win_size=self.win_size, enc_in=self.input_c, c_out=self.output_c, n_heads=self.n_heads, d_model=self.d_model, e_layers=self.e_layers, patch_size=self.patch_size, channel=self.input_c) 86 | 87 | if torch.cuda.is_available(): 88 | self.model.cuda() 89 | 90 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr) 91 | 92 | 93 | def vali(self, vali_loader): 94 | self.model.eval() 95 | loss_1 = [] 96 | loss_2 = [] 97 | for i, (input_data, _) in enumerate(vali_loader): 98 | input 
= input_data.float().to(self.device) 99 | series, prior = self.model(input) 100 | series_loss = 0.0 101 | prior_loss = 0.0 102 | for u in range(len(prior)): 103 | series_loss += (torch.mean(my_kl_loss(series[u], ( 104 | prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 105 | self.win_size)).detach())) + torch.mean( 106 | my_kl_loss( 107 | (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 108 | self.win_size)).detach(), 109 | series[u]))) 110 | prior_loss += (torch.mean( 111 | my_kl_loss((prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 112 | self.win_size)), 113 | series[u].detach())) + torch.mean( 114 | my_kl_loss(series[u].detach(), 115 | (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 116 | self.win_size))))) 117 | 118 | series_loss = series_loss / len(prior) 119 | prior_loss = prior_loss / len(prior) 120 | 121 | loss_1.append((prior_loss - series_loss).item()) 122 | 123 | return np.average(loss_1), np.average(loss_2) 124 | 125 | 126 | def train(self): 127 | 128 | time_now = time.time() 129 | path = self.model_save_path 130 | if not os.path.exists(path): 131 | os.makedirs(path) 132 | early_stopping = EarlyStopping(patience=5, verbose=True, dataset_name=self.data_path) 133 | train_steps = len(self.train_loader) 134 | 135 | for epoch in range(self.num_epochs): 136 | iter_count = 0 137 | 138 | epoch_time = time.time() 139 | self.model.train() 140 | for i, (input_data, labels) in enumerate(self.train_loader): 141 | 142 | self.optimizer.zero_grad() 143 | iter_count += 1 144 | input = input_data.float().to(self.device) 145 | series, prior = self.model(input) 146 | 147 | series_loss = 0.0 148 | prior_loss = 0.0 149 | 150 | for u in range(len(prior)): 151 | series_loss += (torch.mean(my_kl_loss(series[u], ( 152 | prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 153 | self.win_size)).detach())) + torch.mean( 154 | my_kl_loss((prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 155 | self.win_size)).detach(), 156 | series[u]))) 157 | prior_loss += (torch.mean(my_kl_loss( 158 | (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 159 | self.win_size)), 160 | series[u].detach())) + torch.mean( 161 | my_kl_loss(series[u].detach(), ( 162 | prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1, 163 | self.win_size))))) 164 | 165 | series_loss = series_loss / len(prior) 166 | prior_loss = prior_loss / len(prior) 167 | 168 | loss = prior_loss - series_loss 169 | 170 | if (i + 1) % 100 == 0: 171 | speed = (time.time() - time_now) / iter_count 172 | left_time = speed * ((self.num_epochs - epoch) * train_steps - i) 173 | print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time)) 174 | iter_count = 0 175 | time_now = time.time() 176 | 177 | loss.backward() 178 | self.optimizer.step() 179 | 180 | vali_loss1, vali_loss2 = self.vali(self.test_loader) 181 | 182 | print( 183 | "Epoch: {0}, Cost time: {1:.3f}s ".format( 184 | epoch + 1, time.time() - epoch_time)) 185 | early_stopping(vali_loss1, vali_loss2, self.model, path) 186 | if early_stopping.early_stop: 187 | break 188 | adjust_learning_rate(self.optimizer, epoch + 1, self.lr) 189 | 190 | 191 | def test(self): 192 | self.model.load_state_dict( 193 | torch.load( 194 | os.path.join(str(self.model_save_path), str(self.data_path) + '_checkpoint.pth'))) 195 | self.model.eval() 196 | temperature = 50 197 | 198 | 
# (1) statistics on the train set
199 |         attens_energy = []
200 |         for i, (input_data, labels) in enumerate(self.train_loader):
201 |             input = input_data.float().to(self.device)
202 |             series, prior = self.model(input)
203 |             series_loss = 0.0
204 |             prior_loss = 0.0
205 |             for u in range(len(prior)):
206 |                 if u == 0:
207 |                     series_loss = my_kl_loss(series[u], (
208 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
209 |                                                                                                    self.win_size)).detach()) * temperature
210 |                     prior_loss = my_kl_loss(
211 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
212 |                                                                                                 self.win_size)),
213 |                         series[u].detach()) * temperature
214 |                 else:
215 |                     series_loss += my_kl_loss(series[u], (
216 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
217 |                                                                                                    self.win_size)).detach()) * temperature
218 |                     prior_loss += my_kl_loss(
219 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
220 |                                                                                                 self.win_size)),
221 |                         series[u].detach()) * temperature
222 | 
223 |             metric = torch.softmax((-series_loss - prior_loss), dim=-1)
224 |             cri = metric.detach().cpu().numpy()
225 |             attens_energy.append(cri)
226 | 
227 |         attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1)
228 |         train_energy = np.array(attens_energy)
229 | 
230 |         # (2) find the threshold
231 |         attens_energy = []
232 |         for i, (input_data, labels) in enumerate(self.thre_loader):
233 |             input = input_data.float().to(self.device)
234 |             series, prior = self.model(input)
235 |             series_loss = 0.0
236 |             prior_loss = 0.0
237 |             for u in range(len(prior)):
238 |                 if u == 0:
239 |                     series_loss = my_kl_loss(series[u], (
240 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
241 |                                                                                                    self.win_size)).detach()) * temperature
242 |                     prior_loss = my_kl_loss(
243 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
244 |                                                                                                 self.win_size)),
245 |                         series[u].detach()) * temperature
246 |                 else:
247 |                     series_loss += my_kl_loss(series[u], (
248 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
249 |                                                                                                    self.win_size)).detach()) * temperature
250 |                     prior_loss += my_kl_loss(
251 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
252 |                                                                                                 self.win_size)),
253 |                         series[u].detach()) * temperature
254 | 
255 |             metric = torch.softmax((-series_loss - prior_loss), dim=-1)
256 |             cri = metric.detach().cpu().numpy()
257 |             attens_energy.append(cri)
258 | 
259 |         attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1)
260 |         test_energy = np.array(attens_energy)
261 |         combined_energy = np.concatenate([train_energy, test_energy], axis=0)
262 |         thresh = np.percentile(combined_energy, 100 - self.anormly_ratio)
263 |         print("Threshold :", thresh)
264 | 
265 |         # (3) evaluation on the test set
266 |         test_labels = []
267 |         attens_energy = []
268 |         for i, (input_data, labels) in enumerate(self.thre_loader):
269 |             input = input_data.float().to(self.device)
270 |             series, prior = self.model(input)
271 |             series_loss = 0.0
272 |             prior_loss = 0.0
273 |             for u in range(len(prior)):
274 |                 if u == 0:
275 |                     series_loss = my_kl_loss(series[u], (
276 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
277 |                                                                                                    self.win_size)).detach()) * temperature
278 |                     prior_loss = my_kl_loss(
279 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
280 |                                                                                                 self.win_size)),
281 |                         series[u].detach()) * temperature
282 |                 else:
283 |                     series_loss +=
my_kl_loss(series[u], (
284 |                             prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
285 |                                                                                                    self.win_size)).detach()) * temperature
286 |                     prior_loss += my_kl_loss(
287 |                         (prior[u] / torch.unsqueeze(torch.sum(prior[u], dim=-1), dim=-1).repeat(1, 1, 1,
288 |                                                                                                 self.win_size)),
289 |                         series[u].detach()) * temperature
290 |             metric = torch.softmax((-series_loss - prior_loss), dim=-1)
291 |             cri = metric.detach().cpu().numpy()
292 |             attens_energy.append(cri)
293 |             test_labels.append(labels)
294 | 
295 |         attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1)
296 |         test_labels = np.concatenate(test_labels, axis=0).reshape(-1)
297 |         test_energy = np.array(attens_energy)
298 |         test_labels = np.array(test_labels)
299 | 
300 |         pred = (test_energy > thresh).astype(int)
301 |         gt = test_labels.astype(int)
302 | 
303 |         matrix = [self.index]
304 |         scores_simple = combine_all_evaluation_scores(pred, gt, test_energy)
305 |         for key, value in scores_simple.items():
306 |             matrix.append(value)
307 |             print('{0:21} : {1:0.4f}'.format(key, value))
308 | 
309 |         anomaly_state = False
310 |         for i in range(len(gt)):
311 |             if gt[i] == 1 and pred[i] == 1 and not anomaly_state:
312 |                 anomaly_state = True
313 |                 for j in range(i, 0, -1):
314 |                     if gt[j] == 0:
315 |                         break
316 |                     else:
317 |                         if pred[j] == 0:
318 |                             pred[j] = 1
319 |                 for j in range(i, len(gt)):
320 |                     if gt[j] == 0:
321 |                         break
322 |                     else:
323 |                         if pred[j] == 0:
324 |                             pred[j] = 1
325 |             elif gt[i] == 0:
326 |                 anomaly_state = False
327 |             if anomaly_state:
328 |                 pred[i] = 1
329 | 
330 |         pred = np.array(pred)
331 |         gt = np.array(gt)
332 | 
333 |         from sklearn.metrics import precision_recall_fscore_support
334 |         from sklearn.metrics import accuracy_score
335 | 
336 |         accuracy = accuracy_score(gt, pred)
337 |         precision, recall, f_score, support = precision_recall_fscore_support(gt, pred, average='binary')
338 |         print("Accuracy : {:0.4f}, Precision : {:0.4f}, Recall : {:0.4f}, F-score : {:0.4f} ".format(accuracy, precision, recall, f_score))
339 | 
340 |         if self.data_path == 'UCR' or self.data_path == 'UCR_AUG':
341 |             import csv
342 |             with open('result/'+self.data_path+'.csv', 'a+') as f:
343 |                 writer = csv.writer(f)
344 |                 writer.writerow(matrix)
345 | 
346 |         return accuracy, precision, recall, f_score
347 | 
--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
1 | # Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514
2 | import tensorflow as tf
3 | import numpy as np
4 | import scipy.misc
5 | 
6 | try:
7 |     from StringIO import StringIO  # Python 2.7
8 | except ImportError:
9 |     from io import BytesIO  # Python 3.5+
10 | 
11 | 
12 | class Logger(object):
13 | 
14 |     def __init__(self, log_dir):
15 |         """Create a summary writer logging to log_dir."""
16 |         self.writer = tf.summary.FileWriter(log_dir)
17 | 
18 |     def scalar_summary(self, tag, value, step):
19 |         """Log a scalar variable."""
20 |         summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
21 |         self.writer.add_summary(summary, step)
22 | 
23 |     def image_summary(self, tag, images, step):
24 |         """Log a list of images."""
25 | 
26 |         img_summaries = []
27 |         for i, img in enumerate(images):
28 |             # Write the image to a string
29 |             try:
30 |                 s = StringIO()
31 |             except:
32 |                 s = BytesIO()
33 |             scipy.misc.toimage(img).save(s, format="png")
34 | 
35 |             # Create an Image object
36 |             img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(),
37 |
height=img.shape[0], 38 | width=img.shape[1]) 39 | # Create a Summary value 40 | img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) 41 | 42 | # Create and write Summary 43 | summary = tf.Summary(value=img_summaries) 44 | self.writer.add_summary(summary, step) 45 | 46 | def histo_summary(self, tag, values, step, bins=1000): 47 | """Log a histogram of the tensor of values.""" 48 | 49 | # Create a histogram using numpy 50 | counts, bin_edges = np.histogram(values, bins=bins) 51 | 52 | # Fill the fields of the histogram proto 53 | hist = tf.HistogramProto() 54 | hist.min = float(np.min(values)) 55 | hist.max = float(np.max(values)) 56 | hist.num = int(np.prod(values.shape)) 57 | hist.sum = float(np.sum(values)) 58 | hist.sum_squares = float(np.sum(values ** 2)) 59 | 60 | # Drop the start of the first bin 61 | bin_edges = bin_edges[1:] 62 | 63 | # Add bin edges and counts 64 | for edge in bin_edges: 65 | hist.bucket_limit.append(edge) 66 | for c in counts: 67 | hist.bucket.append(c) 68 | 69 | # Create and write Summary 70 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) 71 | self.writer.add_summary(summary, step) 72 | self.writer.flush() 73 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import numpy as np 7 | 8 | 9 | def to_var(x, volatile=False): 10 | if torch.cuda.is_available(): 11 | x = x.cuda() 12 | return Variable(x, volatile=volatile) 13 | 14 | 15 | def mkdir(directory): 16 | if not os.path.exists(directory): 17 | os.makedirs(directory) 18 | --------------------------------------------------------------------------------
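
Solver.test() builds its anomaly energy from the discrepancy between the two attention branches returned by the model: for every layer it accumulates a smoothed KL term between the patch-wise representation (`series`) and the normalized in-patch representation (`prior`), scales it by a temperature of 50, softmax-normalizes the negated sum over each window, and then flags the points whose energy exceeds the (100 - anormly_ratio)-th percentile of the energies gathered from the train and threshold loaders. The following is a minimal, self-contained sketch of that scoring path, not code from the repository: the helper name window_scores, the toy tensor shapes, and the random stand-ins for the model outputs are illustrative only.

import torch
import numpy as np

def my_kl_loss(p, q):
    # Same smoothed KL term as solver.py: sum over the attention dimension,
    # then average over heads, leaving one value per (window, time step).
    res = p * (torch.log(p + 0.0001) - torch.log(q + 0.0001))
    return torch.mean(torch.sum(res, dim=-1), dim=1)

def window_scores(series, prior, temperature=50):
    # Illustrative helper (not in the repo): accumulate the pair of KL terms
    # over layers, mirroring the test-time loop in Solver.test().
    series_loss, prior_loss = 0.0, 0.0
    for u in range(len(prior)):
        # keepdim broadcasting is equivalent to the unsqueeze(...).repeat(...)
        # normalization written out in solver.py.
        prior_norm = prior[u] / torch.sum(prior[u], dim=-1, keepdim=True)
        series_loss = series_loss + my_kl_loss(series[u], prior_norm.detach()) * temperature
        prior_loss = prior_loss + my_kl_loss(prior_norm, series[u].detach()) * temperature
    # The negated, temperature-scaled discrepancy is softmax-normalized over
    # each window; this matches `metric` in solver.py.
    return torch.softmax(-series_loss - prior_loss, dim=-1)

# Toy shapes standing in for model outputs: 2 windows, 1 head, win_size = 60.
B, H, W = 2, 1, 60
series = [torch.softmax(torch.randn(B, H, W, W), dim=-1)]
prior = [torch.softmax(torch.randn(B, H, W, W), dim=-1)]

energy = window_scores(series, prior).reshape(-1).numpy()

# Threshold at the (100 - anormly_ratio)-th percentile, as test() does on the
# concatenated train/threshold energies (most scripts pass --anormly_ratio 1).
anormly_ratio = 1.0
thresh = np.percentile(energy, 100 - anormly_ratio)
pred = (energy > thresh).astype(int)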
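
Before accuracy, precision, recall, and F-score are computed, test() also applies the common point-adjustment convention: as soon as one point inside a labelled anomaly segment is predicted anomalous, every point of that segment is counted as detected. That is what the anomaly_state loop over gt and pred does in place. A compact stand-alone version of that loop is sketched below; the function name point_adjust and the toy arrays are illustrative, not part of the repository.

import numpy as np

def point_adjust(pred, gt):
    # Expand detections to whole ground-truth segments, mirroring the in-place
    # adjustment loop in Solver.test().
    pred = pred.copy()
    anomaly_state = False
    for i in range(len(gt)):
        if gt[i] == 1 and pred[i] == 1 and not anomaly_state:
            anomaly_state = True
            # Walk backwards and forwards to the segment boundaries.
            for j in range(i, 0, -1):
                if gt[j] == 0:
                    break
                pred[j] = 1
            for j in range(i, len(gt)):
                if gt[j] == 0:
                    break
                pred[j] = 1
        elif gt[i] == 0:
            anomaly_state = False
        if anomaly_state:
            pred[i] = 1
    return pred

# Example: detecting a single point of the segment at indices 2..5 marks the
# whole segment as detected.
gt   = np.array([0, 0, 1, 1, 1, 1, 0, 0])
pred = np.array([0, 0, 0, 1, 0, 0, 0, 0])
print(point_adjust(pred, gt))  # -> [0 0 1 1 1 1 0 0]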