# Repository layout:
#
# ├── MAWI.py
# ├── MLE.py
# ├── MOMspot.py
# ├── README.md
# ├── bspot.py
# ├── drif_spot.py
# ├── edf_stocks.csv
# ├── espot.py
# ├── mawi_170812_50_50.csv
# ├── mawi_180812_50_50.csv
# ├── middle_spot.py
# ├── physic.py
# ├── physics.dat
# ├── pic
# │   ├── 1.png
# │   ├── 2.png
# │   └── middle_3.png
# ├── rain.dat
# ├── rain.py
# ├── spot.py
# └── stock.py
#
# /MAWI.py:
# --------------------------------------------------------------------------------
# Script: calibrate a SPOT-family detector on the tail of the 2017 MAWI trace,
# run it over the 2018 trace, print the elapsed time and plot the result.
import numpy as np
import matplotlib.pyplot as plt
#from bspot import bidSPOT
from spot import biSPOT
from drif_spot import DRSPOT
from middle_spot import MISPOT
from MOMspot import momSPOT
import pandas as pd
import time
#no label
f17 = './mawi_170812_50_50.csv'
f18 = './mawi_180812_50_50.csv'

# DataFrame.from_csv no longer exists in pandas >= 1.0; read_csv with these
# arguments is its documented equivalent (first column as a parsed-date index).
P17 = pd.read_csv(f17, index_col=0, parse_dates=True)
P18 = pd.read_csv(f18, index_col=0, parse_dates=True)

X17 = P17['rSYN'].values
X18 = P18['rSYN'].values

n_init = 1000
init_data = X17[-n_init:]  # initial batch
data = X18  # stream

q = 1e-4  # risk parameter

# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
start = time.perf_counter()

# Alternative detectors, kept for quick switching:
#s = momSPOT(q)
#s = biSPOT(q)  # SPOT object
#s = DRSPOT(q)
s = MISPOT(q)
s.fit(init_data,data)  # data import
s.initialize()  # initialization step
results = s.run()  # run

end = time.perf_counter()
t=end-start
print("Runtime is:",t)

s.plot(results)  # plot

# --------------------------------------------------------------------------------
# /MLE.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Maximum likelihood estimation (and a method-of-moments alternative) of the
# generalized Pareto tail used to compute the extreme quantile z_q.
import numpy as np
from scipy.optimize import root

# Values closer to zero than this are treated as zero.
_EPS = 0.00000001


class CALError(Exception):
    """Error type reserved for calculations that cannot be carried out."""
    pass


def Gamma(x, Yi):
    """Return the shape estimate ``mean(log(1 + x*Yi))`` for a root ``x``.

    Excesses for which ``1 + x*Yi <= 0`` have no logarithm; they add nothing
    to the sum but are still counted in the divisor ``len(Yi)``.

    Raises
    ------
    ZeroDivisionError
        If ``Yi`` is empty.
    """
    terms = 1 + x * np.asarray(Yi, dtype=float)
    total = float(np.log(terms[terms > 0]).sum())
    return total / len(Yi)


def Delta(x, gamma):
    """Return the scale estimate ``gamma / x``.

    Returns ``None`` when ``x < 1e-8`` (this includes every negative ``x``),
    because the ratio is not used for such roots.
    """
    if x < _EPS:
        return None
    return gamma / x


def zq(q, gamma, delta, n, Nt, t):
    """Return the extreme quantile for risk ``q``.

    ``z_q = t + (delta/gamma) * ((q*n/Nt)**(-gamma) - 1)``

    Parameters
    ----------
    q : float
        Risk level.
    gamma, delta : float
        Shape and scale of the fitted tail.
    n : int
        Number of observed values.
    Nt : int
        Number of peaks (excesses over ``t``).
    t : float
        Initial threshold.
    """
    r = q * n / Nt
    if abs(gamma) < _EPS:
        # Limit of the formula above for gamma -> 0 (exponential tail);
        # evaluating the general expression here would divide by ~0.
        return t - delta * np.log(r)
    return t + (delta / gamma) * (r ** (-gamma) - 1)


def f(x, Yi):
    """Residual ``u(x)*v(x) - 1`` whose roots are the likelihood candidates.

    ``u = mean(1/(1 + x*Yi))`` and ``v = 1 + mean(log(1 + x*Yi))``.
    ``x`` is a length-1 sequence (the form ``scipy.optimize.root`` passes in).
    Outside the domain (some ``1 + x*Yi < 1e-8``) the residual is fixed to 1.
    The result is always a one-element list.
    """
    x = float(x[0])
    Nt = len(Yi)
    terms = 1 + x * np.asarray(Yi, dtype=float)
    if np.any(terms < _EPS):
        return [1.0]
    ux = float(np.sum(1.0 / terms)) / Nt
    vx = float(np.sum(np.log(terms))) / Nt + 1
    return [ux * vx - 1]


def choose_zq(Yi, t, zq_lst):
    """Return the candidate in ``zq_lst`` closest to ``median(Yi + t)``.

    The first candidate wins ties.

    Raises
    ------
    IndexError
        If ``zq_lst`` is empty (no root produced a usable quantile).
    """
    if len(zq_lst) == 0:
        raise IndexError('choose_zq needs at least one candidate quantile')
    m = np.median([y + t for y in Yi])
    best = zq_lst[0]
    # Start from +inf rather than a fixed large number so that the nearest
    # candidate is found whatever the scale of the data.
    best_dist = float('inf')
    for cand in zq_lst:
        dist = abs(cand - m)
        if dist < best_dist:
            best_dist = dist
            best = cand
    return best


def MLE_get_zq(Yi, q, n, t):
    """Estimate z_q by maximum likelihood.

    Five starting points spread over ``[low, high)`` are handed to
    ``scipy.optimize.root``; every converged root that yields a usable
    ``(gamma, delta)`` pair contributes one candidate quantile, and
    ``choose_zq`` picks among them.

    Raises
    ------
    IndexError
        If no starting point produced a candidate.
    """
    Nt = len(Yi)
    Ym = np.max(Yi)
    low = -1.0 / Ym
    ym = np.min(Yi) + 0.1
    high = 2 * ((np.mean(Yi) - ym) / ym ** 2)
    zq_lst = []
    for i in range(5):
        guess = float(low + i * (high - low) / 5)
        sol = root(f, guess, args=(Yi,))
        if not sol['success']:
            continue
        x = sol['x'][0]
        gamma = Gamma(x, Yi)
        delta = Delta(x, gamma)
        if delta is None:
            # Root too close to zero (or negative): no scale estimate.
            continue
        try:
            zq_lst.append(zq(q, gamma, delta, n, Nt, t))
        except ArithmeticError:
            # Overflow or division by zero for this candidate: skip it.
            continue
    return choose_zq(Yi, t, zq_lst)


def MOM_get_zq(Yi, q, n, t):
    """Estimate z_q with the method of moments.

    For a generalized Pareto tail, ``mean**2 / var = 1 - 2*gamma`` and
    ``mean = delta / (1 - gamma)``, hence

    ``gamma = 0.5 * (1 - mean**2/var)``, ``delta = 0.5 * mean * (1 + mean**2/var)``.

    ``var`` is the population variance (``np.var``); it must be non-zero.
    """
    avg = np.mean(Yi)
    var = np.var(Yi)
    Nt = len(Yi)
    ratio = avg ** 2 / var
    gamma = 0.5 * (1 - ratio)
    delta = 0.5 * avg * (ratio + 1)
    return zq(q, gamma, delta, n, Nt, t)


# Small demonstration, executed when the module is loaded.
Yi = [5,10,2,4,8,100,102,3,4,8,100,102,3]
q = 0.96
n = 1000
t = 100  # t is the q-quantile of the Y series
print(MLE_get_zq(Yi, q, n, t))
print(MOM_get_zq(Yi, q, n, t))

# --------------------------------------------------------------------------------
# /MOMspot.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log,floor
import tqdm
from scipy.optimize import minimize
# colors for plot
deep_saffron = '#FF9933'
air_force_blue = '#5D8AA8'


def backMean(X,d):
    """Return the mean of every window of ``d`` consecutive values of ``X``.

    Entry ``k`` of the result is ``mean(X[k:k+d])``; the result therefore has
    ``len(X) - d + 1`` entries. The window sum is updated incrementally.

    Parameters
    ----------
    X : array-like
        One-dimensional series.
    d : int
        Window length.
    """
    X = np.asarray(X)
    M = []
    w = X[:d].sum()
    M.append(w/d)
    for i in range(d,len(X)):
        # slide the window by one: drop the oldest value, add the newest
        w = w - X[i-d] + X[i]
        M.append(w/d)
    return np.array(M)


class momSPOT:
    """
    This class allows to run the momSPOT variant of the biSPOT algorithm on
    univariate dataset (upper and lower bounds)

    Attributes
    ----------
    proba : float
        Detection level (risk), chosen by the user

    extreme_quantile : dict
        current thresholds (bound between normal and abnormal events),
        keys 'up' and 'down'

    data : numpy.array
        stream

    init_data : numpy.array
        initial batch of observations (for the calibration/initialization step)

    init_threshold : dict
        initial thresholds computed during the calibration step,
        keys 'up' and 'down'

    peaks : dict
        arrays of peaks (excesses beyond the initial thresholds),
        keys 'up' and 'down'

    gamma : dict
        keys 'up' and 'down', both None after construction

    sigma : dict
        keys 'up' and 'down', both None after construction

    n : int
        number of observed values

    Nt : dict
        number of observed peaks, keys 'up' and 'down'
    """
    def __init__(self, q = 1e-4):
        """
        Constructor
        Parameters
        ----------
        q
            Detection level (risk)

        Returns
        ----------
        momSPOT object
        """
        self.proba = q
        self.data = None
        self.init_data = None
        self.n = 0

        # one independent dict per attribute, so that they never share state
        self.extreme_quantile = {'up':None,'down':None}
        self.init_threshold = {'up':None,'down':None}
        self.peaks = {'up':None,'down':None}
        self.gamma = {'up':None,'down':None}
        self.sigma = {'up':None,'down':None}
        self.Nt = {'up':0,'down':0}

75 | 76 | def __str__(self): 77 | s = '' 78 | s += 'Streaming Peaks-Over-Threshold Object\n' 79 | s += 'Detection level q = %s\n' % self.proba 80 | if self.data is not None: 81 | s += 'Data imported : Yes\n' 82 | s += '\t initialization : %s values\n' % self.init_data.size 83 | s += '\t stream : %s values\n' % self.data.size 84 | else: 85 | s += 'Data imported : No\n' 86 | return s 87 | 88 | if self.n == 0: 89 | s += 'Algorithm initialized : No\n' 90 | else: 91 | s += 'Algorithm initialized : Yes\n' 92 | s += '\t initial threshold : %s\n' % self.init_threshold 93 | 94 | r = self.n-self.init_data.size 95 | if r > 0: 96 | s += 'Algorithm run : Yes\n' 97 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 98 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 99 | else: 100 | s += '\t number of peaks : %s\n' % self.Nt 101 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 102 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 103 | s += 'Algorithm run : No\n' 104 | return s 105 | 106 | 107 | def fit(self,init_data,data): 108 | """ 109 | Import data to biSPOT object 110 | 111 | Parameters 112 | ---------- 113 | init_data : list, numpy.array or pandas.Series 114 | initial batch to calibrate the algorithm () 115 | 116 | data : numpy.array 117 | data for the run (list, np.array or pd.series) 118 | 119 | """ 120 | if isinstance(data,list): 121 | self.data = np.array(data) 122 | elif isinstance(data,np.ndarray): 123 | self.data = data 124 | elif isinstance(data,pd.Series): 125 | self.data = data.values 126 | else: 127 | print('This data format (%s) is not supported' % type(data)) 128 | return 129 | 130 | if isinstance(init_data,list): 131 | self.init_data = np.array(init_data) 132 | elif isinstance(init_data,np.ndarray): 133 | self.init_data = init_data 134 | elif isinstance(init_data,pd.Series): 135 | self.init_data = init_data.values 136 | elif 
isinstance(init_data,int): 137 | self.init_data = self.data[:init_data] 138 | self.data = self.data[init_data:] 139 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 140 | r = int(init_data*data.size) 141 | self.init_data = self.data[:r] 142 | self.data = self.data[r:] 143 | else: 144 | print('The initial data cannot be set') 145 | return 146 | 147 | def add(self,data): 148 | """ 149 | This function allows to append data to the already fitted data 150 | 151 | Parameters 152 | ---------- 153 | data : list, numpy.array, pandas.Series 154 | data to append 155 | """ 156 | if isinstance(data,list): 157 | data = np.array(data) 158 | elif isinstance(data,np.ndarray): 159 | data = data 160 | elif isinstance(data,pd.Series): 161 | data = data.values 162 | else: 163 | print('This data format (%s) is not supported' % type(data)) 164 | return 165 | 166 | self.data = np.append(self.data,data) 167 | return 168 | 169 | def initialize(self, verbose = True): 170 | """ 171 | Run the calibration (initialization) step 172 | 173 | Parameters 174 | ---------- 175 | verbose : bool 176 | (default = True) If True, gives details about the batch initialization 177 | """ 178 | n_init = self.init_data.size 179 | 180 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 181 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 182 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 183 | 184 | # initial peaks 185 | self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up'] 186 | self.peaks['down'] = -(self.init_data[self.init_dataself.init_data.size): 282 | print('Warning : the algorithm seems to have already been run, you \ 283 | should initialize before running again') 284 | return {} 285 | 286 | # list of the thresholds 287 | thup = [] 288 | thdown = [] 289 | alarm = [] 290 | # Loop over the stream 291 | for i in 
tqdm.tqdm(range(self.data.size)): 292 | 293 | # If the observed value exceeds the current threshold (alarm case) 294 | if self.data[i]>self.extreme_quantile['up'] : 295 | # if we want to alarm, we put it in the alarm list 296 | if with_alarm: 297 | alarm.append(i) 298 | # otherwise we add it in the peaks 299 | else: 300 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 301 | self.Nt['up'] += 1 302 | self.n += 1 303 | # and we update the thresholds 304 | 305 | g,s,l = self._MOM('up') 306 | self.extreme_quantile['up'] = self._quantile('up',g,s) 307 | 308 | # case where the value exceeds the initial threshold but not the alarm ones 309 | elif self.data[i]>self.init_threshold['up']: 310 | # we add it in the peaks 311 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 312 | self.Nt['up'] += 1 313 | self.n += 1 314 | # and we update the thresholds 315 | 316 | g,s,l = self._MOM('up') 317 | self.extreme_quantile['up'] = self._quantile('up',g,s) 318 | 319 | elif self.data[i] 0: 90 | s += 'Algorithm run : Yes\n' 91 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 92 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 93 | else: 94 | s += '\t number of peaks : %s\n' % self.Nt 95 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 96 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 97 | s += 'Algorithm run : No\n' 98 | return s 99 | 100 | 101 | def fit(self,init_data,data): 102 | """ 103 | Import data to biDSPOT object 104 | 105 | Parameters 106 | ---------- 107 | init_data : list, numpy.array or pandas.Series 108 | initial batch to calibrate the algorithm 109 | 110 | data : numpy.array 111 | data for the run (list, np.array or pd.series) 112 | 113 | """ 114 | if isinstance(data,list): 115 | self.data = np.array(data) 116 | elif isinstance(data,np.ndarray): 117 | self.data = data 118 | elif 
isinstance(data,pd.Series): 119 | self.data = data.values 120 | else: 121 | print('This data format (%s) is not supported' % type(data)) 122 | return 123 | 124 | if isinstance(init_data,list): 125 | self.init_data = np.array(init_data) 126 | elif isinstance(init_data,np.ndarray): 127 | self.init_data = init_data 128 | elif isinstance(init_data,pd.Series): 129 | self.init_data = init_data.values 130 | elif isinstance(init_data,int): 131 | self.init_data = self.data[:init_data] 132 | self.data = self.data[init_data:] 133 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 134 | r = int(init_data*data.size) 135 | self.init_data = self.data[:r] 136 | self.data = self.data[r:] 137 | else: 138 | print('The initial data cannot be set') 139 | return 140 | 141 | def add(self,data): 142 | """ 143 | This function allows to append data to the already fitted data 144 | 145 | Parameters 146 | ---------- 147 | data : list, numpy.array, pandas.Series 148 | data to append 149 | """ 150 | if isinstance(data,list): 151 | data = np.array(data) 152 | elif isinstance(data,np.ndarray): 153 | data = data 154 | elif isinstance(data,pd.Series): 155 | data = data.values 156 | else: 157 | print('This data format (%s) is not supported' % type(data)) 158 | return 159 | 160 | self.data = np.append(self.data,data) 161 | return 162 | 163 | def initialize(self, verbose = True): 164 | """ 165 | Run the calibration (initialization) step 166 | 167 | Parameters 168 | ---------- 169 | verbose : bool 170 | (default = True) If True, gives details about the batch initialization 171 | """ 172 | n_init = self.init_data.size - self.depth 173 | 174 | M = backMean(self.init_data,self.depth) 175 | T = self.init_data[self.depth:]-M[:-1] # new variable 176 | 177 | S = np.sort(T) # we sort T to get the empirical quantile 178 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 179 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole 
algorithm 180 | 181 | # initial peaks 182 | self.peaks['up'] = T[T>self.init_threshold['up']]-self.init_threshold['up'] 183 | self.peaks['down'] = -( T[ T0) 278 | Returns 279 | ---------- 280 | float 281 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 282 | """ 283 | n = Y.size 284 | if gamma != 0: 285 | tau = gamma/sigma 286 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 287 | else: 288 | L = n * ( 1 + log(Y.mean()) ) 289 | return L 290 | 291 | 292 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 8): 293 | """ 294 | Compute the GPD parameters estimation with the Grimshaw's trick 295 | 296 | Parameters 297 | ---------- 298 | epsilon : float 299 | numerical parameter to perform (default : 1e-8) 300 | n_points : int 301 | maximum number of candidates for maximum likelihood (default : 10) 302 | Returns 303 | ---------- 304 | gamma_best,sigma_best,ll_best 305 | gamma estimates, sigma estimates and corresponding log-likelihood 306 | """ 307 | def u(s): 308 | return 1 + np.log(s).mean() 309 | 310 | def v(s): 311 | return np.mean(1/s) 312 | 313 | def w(Y,t): 314 | s = 1+t*Y 315 | us = u(s) 316 | vs = v(s) 317 | return us*vs-1 318 | 319 | def jac_w(Y,t): 320 | s = 1+t*Y 321 | us = u(s) 322 | vs = v(s) 323 | jac_us = (1/t)*(1-vs) 324 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 325 | return us*jac_vs+vs*jac_us 326 | 327 | 328 | Ym = self.peaks[side].min() 329 | YM = self.peaks[side].max() 330 | Ymean = self.peaks[side].mean() 331 | 332 | 333 | a = -1/YM 334 | if abs(a)<2*epsilon: 335 | epsilon = abs(a)/n_points 336 | 337 | a = a + epsilon 338 | b = 2*(Ymean-Ym)/(Ymean*Ym) 339 | c = 2*(Ymean-Ym)/(Ym**2) 340 | 341 | # We look for possible roots 342 | left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t), 343 | lambda t: jac_w(self.peaks[side],t), 344 | (a+epsilon,-epsilon), 345 | n_points,'regular') 346 | 347 | right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t), 348 | lambda t: jac_w(self.peaks[side],t), 349 | (b,c), 
350 | n_points,'regular') 351 | 352 | # all the possible roots 353 | zeros = np.concatenate((left_zeros,right_zeros)) 354 | 355 | # 0 is always a solution so we initialize with it 356 | gamma_best = 0 357 | sigma_best = Ymean 358 | ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best) 359 | 360 | # we look for better candidates 361 | for z in zeros: 362 | gamma = u(1+z*self.peaks[side])-1 363 | sigma = gamma/z 364 | ll = self._log_likelihood(self.peaks[side],gamma,sigma) 365 | if ll>ll_best: 366 | gamma_best = gamma 367 | sigma_best = sigma 368 | ll_best = ll 369 | 370 | return gamma_best,sigma_best,ll_best 371 | 372 | 373 | 374 | def _quantile(self,side,gamma,sigma): 375 | """ 376 | Compute the quantile at level 1-q for a given side 377 | 378 | Parameters 379 | ---------- 380 | side : str 381 | 'up' or 'down' 382 | gamma : float 383 | GPD parameter 384 | sigma : float 385 | GPD parameter 386 | Returns 387 | ---------- 388 | float 389 | quantile at level 1-q for the GPD(γ,σ,μ=0) 390 | """ 391 | if side == 'up': 392 | r = self.n * self.proba / self.Nt[side] 393 | if gamma != 0: 394 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 395 | else: 396 | return self.init_threshold['up'] - sigma*log(r) 397 | elif side == 'down': 398 | r = self.n * self.proba / self.Nt[side] 399 | if gamma != 0: 400 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 401 | else: 402 | return self.init_threshold['down'] + sigma*log(r) 403 | else: 404 | print('error : the side is not right') 405 | 406 | 407 | def run(self, with_alarm = True, plot = True): 408 | """ 409 | Run biDSPOT on the stream 410 | 411 | Parameters 412 | ---------- 413 | with_alarm : bool 414 | (default = True) If False, SPOT will adapt the threshold assuming \ 415 | there is no abnormal values 416 | Returns 417 | ---------- 418 | dict 419 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 420 | 421 | '***-thresholds' contains the extreme quantiles and 
'alarms' contains \ 422 | the indexes of the values which have triggered alarms 423 | 424 | """ 425 | if (self.n>self.init_data.size): 426 | print('Warning : the algorithm seems to have already been run, you \ 427 | should initialize before running again') 428 | return {} 429 | 430 | # actual normal window 431 | W = self.init_data[-self.depth:] 432 | 433 | # list of the thresholds 434 | thup = [] 435 | thdown = [] 436 | alarm = [] 437 | # Loop over the stream 438 | for i in tqdm.tqdm(range(self.data.size)): 439 | Mi = W.mean() 440 | Ni = self.data[i]-Mi 441 | # If the observed value exceeds the current threshold (alarm case) 442 | if Ni>self.extreme_quantile['up'] : 443 | # if we want to alarm, we put it in the alarm list 444 | if with_alarm: 445 | alarm.append(i) 446 | # otherwise we add it in the peaks 447 | else: 448 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 449 | self.Nt['up'] += 1 450 | self.n += 1 451 | # and we update the thresholds 452 | 453 | g,s,l = self._grimshaw('up') 454 | self.extreme_quantile['up'] = self._quantile('up',g,s) 455 | W = np.append(W[1:],self.data[i]) 456 | 457 | # case where the value exceeds the initial threshold but not the alarm ones 458 | elif Ni>self.init_threshold['up']: 459 | # we add it in the peaks 460 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 461 | self.Nt['up'] += 1 462 | self.n += 1 463 | # and we update the thresholds 464 | g,s,l = self._grimshaw('up') 465 | self.extreme_quantile['up'] = self._quantile('up',g,s) 466 | W = np.append(W[1:],self.data[i]) 467 | 468 | elif Ni0: 540 | al_fig = plt.scatter(alarm,self.data[alarm],color='red') 541 | fig.append(al_fig) 542 | 543 | plt.xlim((0,self.data.size)) 544 | plt.show() 545 | 546 | return fig 547 | 548 | -------------------------------------------------------------------------------- /drif_spot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 
| import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | from math import log,floor 6 | import tqdm 7 | from scipy.optimize import minimize 8 | # colors for plot 9 | deep_saffron = '#FF9933' 10 | air_force_blue = '#5D8AA8' 11 | 12 | def backMean(X,d): 13 | M = [] 14 | w = X[:d].sum() 15 | M.append(w/d) 16 | for i in range(d,len(X)): 17 | w = w - X[i-d] + X[i] 18 | M.append(w/d) 19 | return np.array(M) 20 | class DRSPOT: 21 | """ 22 | This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds) 23 | 24 | Attributes 25 | ---------- 26 | proba : float 27 | Detection level (risk), chosen by the user 28 | 29 | extreme_quantile : float 30 | current threshold (bound between normal and abnormal events) 31 | 32 | data : numpy.array 33 | stream 34 | 35 | init_data : numpy.array 36 | initial batch of observations (for the calibration/initialization step) 37 | 38 | init_threshold : float ------------t 39 | initial threshold computed during the calibration step 40 | 41 | peaks : numpy.array 42 | array of peaks (excesses above the initial threshold) 43 | 44 | n : int 45 | number of observed values 46 | 47 | Nt : int 48 | number of observed peaks 49 | """ 50 | def __init__(self, q = 1e-4): 51 | """ 52 | Constructor 53 | Parameters 54 | ---------- 55 | q 56 | Detection level (risk) 57 | 58 | Returns 59 | ---------- 60 | biSPOT object 61 | """ 62 | self.proba = q 63 | self.data = None 64 | self.init_data = None 65 | self.update_number = 0 66 | self.n = 0 67 | nonedict = {'up':None,'down':None} 68 | 69 | self.extreme_quantile = dict.copy(nonedict) 70 | self.init_threshold = dict.copy(nonedict) 71 | self.peaks = dict.copy(nonedict) 72 | self.gamma = dict.copy(nonedict) 73 | self.sigma = dict.copy(nonedict) 74 | self.Nt = {'up':0,'down':0} 75 | 76 | 77 | def __str__(self): 78 | s = '' 79 | s += 'Streaming Peaks-Over-Threshold Object\n' 80 | s += 'Detection level q = %s\n' % self.proba 81 | if self.data is not None: 82 | s += 
'Data imported : Yes\n' 83 | s += '\t initialization : %s values\n' % self.init_data.size 84 | s += '\t stream : %s values\n' % self.data.size 85 | else: 86 | s += 'Data imported : No\n' 87 | return s 88 | 89 | if self.n == 0: 90 | s += 'Algorithm initialized : No\n' 91 | else: 92 | s += 'Algorithm initialized : Yes\n' 93 | s += '\t initial threshold : %s\n' % self.init_threshold 94 | 95 | r = self.n-self.init_data.size 96 | if r > 0: 97 | s += 'Algorithm run : Yes\n' 98 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 99 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 100 | else: 101 | s += '\t number of peaks : %s\n' % self.Nt 102 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 103 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 104 | s += 'Algorithm run : No\n' 105 | return s 106 | 107 | 108 | def fit(self,init_data,data): 109 | """ 110 | Import data to biSPOT object 111 | 112 | Parameters 113 | ---------- 114 | init_data : list, numpy.array or pandas.Series 115 | initial batch to calibrate the algorithm () 116 | 117 | data : numpy.array 118 | data for the run (list, np.array or pd.series) 119 | 120 | """ 121 | if isinstance(data,list): 122 | self.data = np.array(data) 123 | elif isinstance(data,np.ndarray): 124 | self.data = data 125 | elif isinstance(data,pd.Series): 126 | self.data = data.values 127 | else: 128 | print('This data format (%s) is not supported' % type(data)) 129 | return 130 | 131 | if isinstance(init_data,list): 132 | self.init_data = np.array(init_data) 133 | self.update_number = len(self.init_data) 134 | elif isinstance(init_data,np.ndarray): 135 | self.init_data = init_data 136 | self.update_number = len(self.init_data) 137 | elif isinstance(init_data,pd.Series): 138 | self.init_data = init_data.values 139 | self.update_number = len(self.init_data) 140 | elif isinstance(init_data,int): 141 | self.init_data = 
self.data[:init_data] 142 | self.data = self.data[init_data:] 143 | self.update_number = init_data 144 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 145 | r = int(init_data*data.size) 146 | self.init_data = self.data[:r] 147 | self.data = self.data[r:] 148 | else: 149 | print('The initial data cannot be set') 150 | return 151 | 152 | def add(self,data): 153 | """ 154 | This function allows to append data to the already fitted data 155 | 156 | Parameters 157 | ---------- 158 | data : list, numpy.array, pandas.Series 159 | data to append 160 | """ 161 | if isinstance(data,list): 162 | data = np.array(data) 163 | elif isinstance(data,np.ndarray): 164 | data = data 165 | elif isinstance(data,pd.Series): 166 | data = data.values 167 | else: 168 | print('This data format (%s) is not supported' % type(data)) 169 | return 170 | 171 | self.data = np.append(self.data,data) 172 | return 173 | 174 | def initialize(self, verbose = True): 175 | """ 176 | Run the calibration (initialization) step 177 | 178 | Parameters 179 | ---------- 180 | verbose : bool 181 | (default = True) If True, gives details about the batch initialization 182 | """ 183 | n_init = self.init_data.size 184 | 185 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 186 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 187 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 188 | 189 | # initial peaks 190 | self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up'] 191 | self.peaks['down'] = -(self.init_data[self.init_data0) 285 | Returns 286 | ---------- 287 | float 288 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 289 | """ 290 | n = Y.size 291 | if gamma != 0: 292 | tau = gamma/sigma 293 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 294 | else: 295 | L = n * ( 1 + log(Y.mean()) ) 296 | return L 297 | 298 | 
299 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 10): 300 | """ 301 | Compute the GPD parameters estimation with the Grimshaw's trick 302 | 303 | Parameters 304 | ---------- 305 | epsilon : float 306 | numerical parameter to perform (default : 1e-8) 307 | n_points : int 308 | maximum number of candidates for maximum likelihood (default : 10) 309 | Returns 310 | ---------- 311 | gamma_best,sigma_best,ll_best 312 | gamma estimates, sigma estimates and corresponding log-likelihood 313 | """ 314 | def u(s): 315 | return 1 + np.log(s).mean() 316 | 317 | def v(s): 318 | return np.mean(1/s) 319 | 320 | def w(Y,t): 321 | s = 1+t*Y 322 | us = u(s) 323 | vs = v(s) 324 | return us*vs-1 325 | 326 | def jac_w(Y,t): 327 | s = 1+t*Y 328 | us = u(s) 329 | vs = v(s) 330 | jac_us = (1/t)*(1-vs) 331 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 332 | return us*jac_vs+vs*jac_us 333 | 334 | 335 | Ym = self.peaks[side].min() 336 | YM = self.peaks[side].max() 337 | Ymean = self.peaks[side].mean() 338 | 339 | 340 | a = -1/YM 341 | if abs(a)<2*epsilon: 342 | epsilon = abs(a)/n_points 343 | 344 | a = a + epsilon 345 | b = 2*(Ymean-Ym)/(Ymean*Ym) 346 | c = 2*(Ymean-Ym)/(Ym**2) 347 | 348 | # We look for possible roots 349 | left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t), 350 | lambda t: jac_w(self.peaks[side],t), 351 | (a+epsilon,-epsilon), 352 | n_points,'regular') 353 | 354 | right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t), 355 | lambda t: jac_w(self.peaks[side],t), 356 | (b,c), 357 | n_points,'regular') 358 | 359 | # all the possible roots 360 | zeros = np.concatenate((left_zeros,right_zeros)) 361 | 362 | # 0 is always a solution so we initialize with it 363 | gamma_best = 0 364 | sigma_best = Ymean 365 | ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best) 366 | 367 | # we look for better candidates 368 | for z in zeros: 369 | gamma = u(1+z*self.peaks[side])-1 370 | sigma = gamma/z 371 | ll = 
self._log_likelihood(self.peaks[side],gamma,sigma) 372 | if ll>ll_best: 373 | gamma_best = gamma 374 | sigma_best = sigma 375 | ll_best = ll 376 | 377 | return gamma_best,sigma_best,ll_best 378 | 379 | 380 | 381 | def _quantile(self,side,gamma,sigma): 382 | """ 383 | Compute the quantile at level 1-q for a given side 384 | 385 | Parameters 386 | ---------- 387 | side : str 388 | 'up' or 'down' 389 | gamma : float 390 | GPD parameter 391 | sigma : float 392 | GPD parameter 393 | Returns 394 | ---------- 395 | float 396 | quantile at level 1-q for the GPD(γ,σ,μ=0) 397 | """ 398 | if side == 'up': 399 | r = self.n * self.proba / self.Nt[side] 400 | if gamma != 0: 401 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 402 | else: 403 | return self.init_threshold['up'] - sigma*log(r) 404 | elif side == 'down': 405 | r = self.n * self.proba / self.Nt[side] 406 | if gamma != 0: 407 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 408 | else: 409 | return self.init_threshold['down'] + sigma*log(r) 410 | else: 411 | print('error : the side is not right') 412 | 413 | 414 | def run(self, with_alarm = True): 415 | """ 416 | Run biSPOT on the stream 417 | 418 | Parameters 419 | ---------- 420 | with_alarm : bool 421 | (default = True) If False, SPOT will adapt the threshold assuming \ 422 | there is no abnormal values 423 | Returns 424 | ---------- 425 | dict 426 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 427 | 428 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 429 | the indexes of the values which have triggered alarms 430 | 431 | """ 432 | if (self.n>self.init_data.size): 433 | print('Warning : the algorithm seems to have already been run, you \ 434 | should initialize before running again') 435 | return {} 436 | 437 | # list of the thresholds 438 | thup = [] 439 | thdown = [] 440 | alarm = [] 441 | # Loop over the stream 442 | for i in tqdm.tqdm(range(self.data.size)): 443 | 444 | # If the 
observed value exceeds the current threshold (alarm case) 445 | if self.data[i]>self.extreme_quantile['up'] : 446 | # if we want to alarm, we put it in the alarm list 447 | if with_alarm: 448 | alarm.append(i) 449 | # otherwise we add it in the peaks 450 | else: 451 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 452 | self.Nt['up'] += 1 453 | self.n += 1 454 | # and we update the thresholds 455 | 456 | g,s,l = self._grimshaw('up') 457 | self.extreme_quantile['up'] = self._quantile('up',g,s) 458 | 459 | # case where the value exceeds the initial threshold but not the alarm ones 460 | elif self.data[i]>self.init_threshold['up']: 461 | # we add it in the peaks 462 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 463 | self.Nt['up'] += 1 464 | self.n += 1 465 | # and we update the thresholds 466 | 467 | g,s,l = self._grimshaw('up') 468 | self.extreme_quantile['up'] = self._quantile('up',g,s) 469 | 470 | elif self.data[i]self.init_threshold['up']]-self.init_threshold['up'] 506 | self.peaks['down'] = -(up_data[up_data 0: 90 | s += 'Algorithm run : Yes\n' 91 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 92 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 93 | else: 94 | s += '\t number of peaks : %s\n' % self.Nt 95 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 96 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 97 | s += 'Algorithm run : No\n' 98 | return s 99 | 100 | 101 | def fit(self,init_data,data): 102 | """ 103 | Import data to ESPOT object 104 | 105 | Parameters 106 | ---------- 107 | init_data : list, numpy.array or pandas.Series 108 | initial batch to calibrate the algorithm 109 | 110 | data : numpy.array 111 | data for the run (list, np.array or pd.series) 112 | 113 | """ 114 | if isinstance(data,list): 115 | self.data = np.array(data) 116 | elif 
isinstance(data,np.ndarray): 117 | self.data = data 118 | elif isinstance(data,pd.Series): 119 | self.data = data.values 120 | else: 121 | print('This data format (%s) is not supported' % type(data)) 122 | return 123 | 124 | if isinstance(init_data,list): 125 | self.init_data = np.array(init_data) 126 | elif isinstance(init_data,np.ndarray): 127 | self.init_data = init_data 128 | elif isinstance(init_data,pd.Series): 129 | self.init_data = init_data.values 130 | elif isinstance(init_data,int): 131 | self.init_data = self.data[:init_data] 132 | self.data = self.data[init_data:] 133 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 134 | r = int(init_data*data.size) 135 | self.init_data = self.data[:r] 136 | self.data = self.data[r:] 137 | else: 138 | print('The initial data cannot be set') 139 | return 140 | 141 | def add(self,data): 142 | """ 143 | This function allows to append data to the already fitted data 144 | 145 | Parameters 146 | ---------- 147 | data : list, numpy.array, pandas.Series 148 | data to append 149 | """ 150 | if isinstance(data,list): 151 | data = np.array(data) 152 | elif isinstance(data,np.ndarray): 153 | data = data 154 | elif isinstance(data,pd.Series): 155 | data = data.values 156 | else: 157 | print('This data format (%s) is not supported' % type(data)) 158 | return 159 | 160 | self.data = np.append(self.data,data) 161 | return 162 | 163 | def initialize(self, verbose = True): 164 | """ 165 | Run the calibration (initialization) step 166 | 167 | Parameters 168 | ---------- 169 | verbose : bool 170 | (default = True) If True, gives details about the batch initialization 171 | """ 172 | n_init = self.init_data.size - self.depth 173 | 174 | M = backMean(self.init_data,self.depth) 175 | T = self.init_data[self.depth:]-M[:-1] # new variable 176 | 177 | S = np.sort(T) # we sort T to get the empirical quantile 178 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 179 | 
self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 180 | 181 | # initial peaks 182 | self.peaks['up'] = T[T>self.init_threshold['up']]-self.init_threshold['up'] 183 | self.peaks['down'] = -( T[ T0) 278 | Returns 279 | ---------- 280 | float 281 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 282 | """ 283 | n = Y.size 284 | if gamma != 0: 285 | tau = gamma/sigma 286 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 287 | else: 288 | L = n * ( 1 + log(Y.mean()) ) 289 | return L 290 | 291 | 292 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 8): 293 | """ 294 | Compute the GPD parameters estimation with the Grimshaw's trick 295 | 296 | Parameters 297 | ---------- 298 | epsilon : float 299 | numerical parameter to perform (default : 1e-8) 300 | n_points : int 301 | maximum number of candidates for maximum likelihood (default : 10) 302 | Returns 303 | ---------- 304 | gamma_best,sigma_best,ll_best 305 | gamma estimates, sigma estimates and corresponding log-likelihood 306 | """ 307 | def u(s): 308 | return 1 + np.log(s).mean() 309 | 310 | def v(s): 311 | return np.mean(1/s) 312 | 313 | def w(Y,t): 314 | s = 1+t*Y 315 | us = u(s) 316 | vs = v(s) 317 | return us*vs-1 318 | 319 | def jac_w(Y,t): 320 | s = 1+t*Y 321 | us = u(s) 322 | vs = v(s) 323 | jac_us = (1/t)*(1-vs) 324 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 325 | return us*jac_vs+vs*jac_us 326 | 327 | 328 | Ym = self.peaks[side].min() 329 | YM = self.peaks[side].max() 330 | Ymean = self.peaks[side].mean() 331 | 332 | 333 | a = -1/YM 334 | if abs(a)<2*epsilon: 335 | epsilon = abs(a)/n_points 336 | 337 | a = a + epsilon 338 | b = 2*(Ymean-Ym)/(Ymean*Ym) 339 | c = 2*(Ymean-Ym)/(Ym**2) 340 | 341 | # We look for possible roots 342 | left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t), 343 | lambda t: jac_w(self.peaks[side],t), 344 | (a+epsilon,-epsilon), 345 | n_points,'regular') 346 | 347 | right_zeros = self._rootsFinder(lambda t: 
w(self.peaks[side],t), 348 | lambda t: jac_w(self.peaks[side],t), 349 | (b,c), 350 | n_points,'regular') 351 | 352 | # all the possible roots 353 | zeros = np.concatenate((left_zeros,right_zeros)) 354 | 355 | # 0 is always a solution so we initialize with it 356 | gamma_best = 0 357 | sigma_best = Ymean 358 | ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best) 359 | 360 | # we look for better candidates 361 | for z in zeros: 362 | gamma = u(1+z*self.peaks[side])-1 363 | sigma = gamma/z 364 | ll = self._log_likelihood(self.peaks[side],gamma,sigma) 365 | if ll>ll_best: 366 | gamma_best = gamma 367 | sigma_best = sigma 368 | ll_best = ll 369 | 370 | return gamma_best,sigma_best,ll_best 371 | 372 | 373 | 374 | def _quantile(self,side,gamma,sigma): 375 | """ 376 | Compute the quantile at level 1-q for a given side 377 | 378 | Parameters 379 | ---------- 380 | side : str 381 | 'up' or 'down' 382 | gamma : float 383 | GPD parameter 384 | sigma : float 385 | GPD parameter 386 | Returns 387 | ---------- 388 | float 389 | quantile at level 1-q for the GPD(γ,σ,μ=0) 390 | """ 391 | if side == 'up': 392 | r = self.n * self.proba / self.Nt[side] 393 | if gamma != 0: 394 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 395 | else: 396 | return self.init_threshold['up'] - sigma*log(r) 397 | elif side == 'down': 398 | r = self.n * self.proba / self.Nt[side] 399 | if gamma != 0: 400 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 401 | else: 402 | return self.init_threshold['down'] + sigma*log(r) 403 | else: 404 | print('error : the side is not right') 405 | 406 | def ewma(self, X, alpha = 0.1): 407 | s = [X[0]] 408 | for i in range(1, len(X)): 409 | temp = alpha * X[i] + (1 - alpha) * s[-1] 410 | s.append(temp) 411 | return s[-1] 412 | 413 | def run(self, with_alarm = True, plot = True): 414 | """ 415 | Run ESPOT on the stream 416 | 417 | Parameters 418 | ---------- 419 | with_alarm : bool 420 | (default = True) If 
False, SPOT will adapt the threshold assuming \ 421 | there is no abnormal values 422 | Returns 423 | ---------- 424 | dict 425 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 426 | 427 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 428 | the indexes of the values which have triggered alarms 429 | 430 | """ 431 | if (self.n>self.init_data.size): 432 | print('Warning : the algorithm seems to have already been run, you \ 433 | should initialize before running again') 434 | return {} 435 | 436 | # actual normal window 437 | W = self.init_data[-self.depth:] 438 | 439 | # list of the thresholds 440 | thup = [] 441 | thdown = [] 442 | alarm = [] 443 | # Loop over the stream 444 | for i in tqdm.tqdm(range(self.data.size)): 445 | Mi = self.ewma(W)# DAWNSON IN YOUR AREA HAHA HAHA 446 | Ni = self.data[i]-Mi 447 | # If the observed value exceeds the current threshold (alarm case) 448 | if Ni>self.extreme_quantile['up'] : 449 | # if we want to alarm, we put it in the alarm list 450 | if with_alarm: 451 | alarm.append(i) 452 | # otherwise we add it in the peaks 453 | else: 454 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 455 | self.Nt['up'] += 1 456 | self.n += 1 457 | # and we update the thresholds 458 | 459 | g,s,l = self._grimshaw('up') 460 | self.extreme_quantile['up'] = self._quantile('up',g,s) 461 | W = np.append(W[1:],self.data[i]) 462 | 463 | # case where the value exceeds the initial threshold but not the alarm ones 464 | elif Ni>self.init_threshold['up']: 465 | # we add it in the peaks 466 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 467 | self.Nt['up'] += 1 468 | self.n += 1 469 | # and we update the thresholds 470 | g,s,l = self._grimshaw('up') 471 | self.extreme_quantile['up'] = self._quantile('up',g,s) 472 | W = np.append(W[1:],self.data[i]) 473 | 474 | elif Ni0: 546 | al_fig = plt.scatter(alarm,self.data[alarm],color='red') 547 | fig.append(al_fig) 548 | 549 
| plt.xlim((0,self.data.size)) 550 | plt.show() 551 | 552 | return fig 553 | 554 | -------------------------------------------------------------------------------- /middle_spot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | from math import log,floor 6 | import tqdm 7 | from scipy.optimize import minimize 8 | # colors for plot 9 | deep_saffron = '#FF9933' 10 | air_force_blue = '#5D8AA8' 11 | 12 | def backMean(X,d): 13 | M = [] 14 | w = X[:d].sum() 15 | M.append(w/d) 16 | for i in range(d,len(X)): 17 | w = w - X[i-d] + X[i] 18 | M.append(w/d) 19 | return np.array(M) 20 | class MISPOT: 21 | """ 22 | This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds) 23 | 24 | Attributes 25 | ---------- 26 | proba : float 27 | Detection level (risk), chosen by the user 28 | 29 | extreme_quantile : float 30 | current threshold (bound between normal and abnormal events) 31 | 32 | data : numpy.array 33 | stream 34 | 35 | init_data : numpy.array 36 | initial batch of observations (for the calibration/initialization step) 37 | 38 | init_threshold : float ------------t 39 | initial threshold computed during the calibration step 40 | 41 | peaks : numpy.array 42 | array of peaks (excesses above the initial threshold) 43 | 44 | n : int 45 | number of observed values 46 | 47 | Nt : int 48 | number of observed peaks 49 | """ 50 | def __init__(self, q = 1e-4): 51 | """ 52 | Constructor 53 | Parameters 54 | ---------- 55 | q 56 | Detection level (risk) 57 | 58 | Returns 59 | ---------- 60 | biSPOT object 61 | """ 62 | self.proba = q 63 | self.data = None 64 | self.init_data = None 65 | self.update_number = 0 66 | self.n = 0 67 | nonedict = {'up':None,'down':None} 68 | 69 | self.extreme_quantile = dict.copy(nonedict) 70 | self.init_threshold = dict.copy(nonedict) 71 | self.peaks = dict.copy(nonedict) 72 | 
self.gamma = dict.copy(nonedict) 73 | self.sigma = dict.copy(nonedict) 74 | self.Nt = {'up':0,'down':0} 75 | 76 | 77 | def __str__(self): 78 | s = '' 79 | s += 'Streaming Peaks-Over-Threshold Object\n' 80 | s += 'Detection level q = %s\n' % self.proba 81 | if self.data is not None: 82 | s += 'Data imported : Yes\n' 83 | s += '\t initialization : %s values\n' % self.init_data.size 84 | s += '\t stream : %s values\n' % self.data.size 85 | else: 86 | s += 'Data imported : No\n' 87 | return s 88 | 89 | if self.n == 0: 90 | s += 'Algorithm initialized : No\n' 91 | else: 92 | s += 'Algorithm initialized : Yes\n' 93 | s += '\t initial threshold : %s\n' % self.init_threshold 94 | 95 | r = self.n-self.init_data.size 96 | if r > 0: 97 | s += 'Algorithm run : Yes\n' 98 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 99 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 100 | else: 101 | s += '\t number of peaks : %s\n' % self.Nt 102 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 103 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 104 | s += 'Algorithm run : No\n' 105 | return s 106 | 107 | 108 | def fit(self,init_data,data): 109 | """ 110 | Import data to biSPOT object 111 | 112 | Parameters 113 | ---------- 114 | init_data : list, numpy.array or pandas.Series 115 | initial batch to calibrate the algorithm () 116 | 117 | data : numpy.array 118 | data for the run (list, np.array or pd.series) 119 | 120 | """ 121 | if isinstance(data,list): 122 | self.data = np.array(data) 123 | elif isinstance(data,np.ndarray): 124 | self.data = data 125 | elif isinstance(data,pd.Series): 126 | self.data = data.values 127 | else: 128 | print('This data format (%s) is not supported' % type(data)) 129 | return 130 | 131 | if isinstance(init_data,list): 132 | self.init_data = np.array(init_data) 133 | self.update_number = len(self.init_data) 134 | elif 
isinstance(init_data,np.ndarray): 135 | self.init_data = init_data 136 | self.update_number = len(self.init_data) 137 | elif isinstance(init_data,pd.Series): 138 | self.init_data = init_data.values 139 | self.update_number = len(self.init_data) 140 | elif isinstance(init_data,int): 141 | self.init_data = self.data[:init_data] 142 | self.data = self.data[init_data:] 143 | self.update_number = init_data 144 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 145 | r = int(init_data*data.size) 146 | self.init_data = self.data[:r] 147 | self.data = self.data[r:] 148 | else: 149 | print('The initial data cannot be set') 150 | return 151 | 152 | def add(self,data): 153 | """ 154 | This function allows to append data to the already fitted data 155 | 156 | Parameters 157 | ---------- 158 | data : list, numpy.array, pandas.Series 159 | data to append 160 | """ 161 | if isinstance(data,list): 162 | data = np.array(data) 163 | elif isinstance(data,np.ndarray): 164 | data = data 165 | elif isinstance(data,pd.Series): 166 | data = data.values 167 | else: 168 | print('This data format (%s) is not supported' % type(data)) 169 | return 170 | 171 | self.data = np.append(self.data,data) 172 | return 173 | 174 | def initialize(self, verbose = True): 175 | """ 176 | Run the calibration (initialization) step 177 | 178 | Parameters 179 | ---------- 180 | verbose : bool 181 | (default = True) If True, gives details about the batch initialization 182 | """ 183 | n_init = self.init_data.size 184 | 185 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 186 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 187 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 188 | 189 | # initial peaks 190 | self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up'] 191 | self.peaks['down'] = -(self.init_data[self.init_data0) 285 | Returns 286 | 
---------- 287 | float 288 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 289 | """ 290 | n = Y.size 291 | if gamma != 0: 292 | tau = gamma/sigma 293 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 294 | else: 295 | L = n * ( 1 + log(Y.mean()) ) 296 | return L 297 | 298 | 299 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 10): 300 | """ 301 | Compute the GPD parameters estimation with the Grimshaw's trick 302 | 303 | Parameters 304 | ---------- 305 | epsilon : float 306 | numerical parameter to perform (default : 1e-8) 307 | n_points : int 308 | maximum number of candidates for maximum likelihood (default : 10) 309 | Returns 310 | ---------- 311 | gamma_best,sigma_best,ll_best 312 | gamma estimates, sigma estimates and corresponding log-likelihood 313 | """ 314 | def u(s): 315 | return 1 + np.log(s).mean() 316 | 317 | def v(s): 318 | return np.mean(1/s) 319 | 320 | def w(Y,t): 321 | s = 1+t*Y 322 | us = u(s) 323 | vs = v(s) 324 | return us*vs-1 325 | 326 | def jac_w(Y,t): 327 | s = 1+t*Y 328 | us = u(s) 329 | vs = v(s) 330 | jac_us = (1/t)*(1-vs) 331 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 332 | return us*jac_vs+vs*jac_us 333 | 334 | 335 | Ym = self.peaks[side].min() 336 | YM = self.peaks[side].max() 337 | Ymean = self.peaks[side].mean() 338 | 339 | 340 | a = -1/YM 341 | if abs(a)<2*epsilon: 342 | epsilon = abs(a)/n_points 343 | 344 | a = a + epsilon 345 | b = 2*(Ymean-Ym)/(Ymean*Ym) 346 | c = 2*(Ymean-Ym)/(Ym**2) 347 | 348 | # We look for possible roots 349 | left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t), 350 | lambda t: jac_w(self.peaks[side],t), 351 | (a+epsilon,-epsilon), 352 | n_points,'regular') 353 | 354 | right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t), 355 | lambda t: jac_w(self.peaks[side],t), 356 | (b,c), 357 | n_points,'regular') 358 | 359 | # all the possible roots 360 | zeros = np.concatenate((left_zeros,right_zeros)) 361 | 362 | # 0 is always a solution so we initialize 
with it 363 | gamma_best = 0 364 | sigma_best = Ymean 365 | ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best) 366 | 367 | # we look for better candidates 368 | for z in zeros: 369 | gamma = u(1+z*self.peaks[side])-1 370 | sigma = gamma/z 371 | ll = self._log_likelihood(self.peaks[side],gamma,sigma) 372 | if ll>ll_best: 373 | gamma_best = gamma 374 | sigma_best = sigma 375 | ll_best = ll 376 | 377 | return gamma_best,sigma_best,ll_best 378 | 379 | 380 | 381 | def _quantile(self,side,gamma,sigma): 382 | """ 383 | Compute the quantile at level 1-q for a given side 384 | 385 | Parameters 386 | ---------- 387 | side : str 388 | 'up' or 'down' 389 | gamma : float 390 | GPD parameter 391 | sigma : float 392 | GPD parameter 393 | Returns 394 | ---------- 395 | float 396 | quantile at level 1-q for the GPD(γ,σ,μ=0) 397 | """ 398 | if side == 'up': 399 | r = self.n * self.proba / self.Nt[side] 400 | if gamma != 0: 401 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 402 | else: 403 | return self.init_threshold['up'] - sigma*log(r) 404 | elif side == 'down': 405 | r = self.n * self.proba / self.Nt[side] 406 | if gamma != 0: 407 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 408 | else: 409 | return self.init_threshold['down'] + sigma*log(r) 410 | else: 411 | print('error : the side is not right') 412 | 413 | 414 | def run(self, with_alarm = True): 415 | """ 416 | Run biSPOT on the stream 417 | 418 | Parameters 419 | ---------- 420 | with_alarm : bool 421 | (default = True) If False, SPOT will adapt the threshold assuming \ 422 | there is no abnormal values 423 | Returns 424 | ---------- 425 | dict 426 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 427 | 428 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 429 | the indexes of the values which have triggered alarms 430 | 431 | """ 432 | if (self.n>self.init_data.size): 433 | print('Warning : the algorithm seems to have 
already been run, you \ 434 | should initialize before running again') 435 | return {} 436 | 437 | # list of the thresholds 438 | thup = [] 439 | thdown = [] 440 | alarm = [] 441 | # Loop over the stream 442 | for i in tqdm.tqdm(range(self.data.size)): 443 | 444 | # If the observed value exceeds the current threshold (alarm case) 445 | if self.data[i]>self.extreme_quantile['up'] : 446 | # if we want to alarm, we put it in the alarm list 447 | if with_alarm: 448 | alarm.append(i) 449 | # otherwise we add it in the peaks 450 | else: 451 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 452 | self.Nt['up'] += 1 453 | self.n += 1 454 | # and we update the thresholds 455 | 456 | g,s,l = self._grimshaw('up') 457 | self.extreme_quantile['up'] = self._quantile('up',g,s) 458 | 459 | # case where the value exceeds the initial threshold but not the alarm ones 460 | elif self.data[i]>self.init_threshold['up']: 461 | # we add it in the peaks 462 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 463 | self.Nt['up'] += 1 464 | self.n += 1 465 | # and we update the thresholds 466 | 467 | g,s,l = self._grimshaw('up') 468 | self.extreme_quantile['up'] = self._quantile('up',g,s) 469 | 470 | elif self.data[i]self.init_threshold['up']]-self.init_threshold['up'] 508 | self.peaks['down'] = -(up_data[up_data 0: 96 | s += 'Algorithm run : Yes\n' 97 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 98 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 99 | else: 100 | s += '\t number of peaks : %s\n' % self.Nt 101 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 102 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 103 | s += 'Algorithm run : No\n' 104 | return s 105 | 106 | 107 | def fit(self,init_data,data): 108 | """ 109 | Import data to biSPOT object 110 | 111 | Parameters 112 | ---------- 113 | 
init_data : list, numpy.array or pandas.Series 114 | initial batch to calibrate the algorithm () 115 | 116 | data : numpy.array 117 | data for the run (list, np.array or pd.series) 118 | 119 | """ 120 | if isinstance(data,list): 121 | self.data = np.array(data) 122 | elif isinstance(data,np.ndarray): 123 | self.data = data 124 | elif isinstance(data,pd.Series): 125 | self.data = data.values 126 | else: 127 | print('This data format (%s) is not supported' % type(data)) 128 | return 129 | 130 | if isinstance(init_data,list): 131 | self.init_data = np.array(init_data) 132 | elif isinstance(init_data,np.ndarray): 133 | self.init_data = init_data 134 | elif isinstance(init_data,pd.Series): 135 | self.init_data = init_data.values 136 | elif isinstance(init_data,int): 137 | self.init_data = self.data[:init_data] 138 | self.data = self.data[init_data:] 139 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 140 | r = int(init_data*data.size) 141 | self.init_data = self.data[:r] 142 | self.data = self.data[r:] 143 | else: 144 | print('The initial data cannot be set') 145 | return 146 | 147 | def add(self,data): 148 | """ 149 | This function allows to append data to the already fitted data 150 | 151 | Parameters 152 | ---------- 153 | data : list, numpy.array, pandas.Series 154 | data to append 155 | """ 156 | if isinstance(data,list): 157 | data = np.array(data) 158 | elif isinstance(data,np.ndarray): 159 | data = data 160 | elif isinstance(data,pd.Series): 161 | data = data.values 162 | else: 163 | print('This data format (%s) is not supported' % type(data)) 164 | return 165 | 166 | self.data = np.append(self.data,data) 167 | return 168 | 169 | def initialize(self, verbose = True): 170 | """ 171 | Run the calibration (initialization) step 172 | 173 | Parameters 174 | ---------- 175 | verbose : bool 176 | (default = True) If True, gives details about the batch initialization 177 | """ 178 | n_init = self.init_data.size 179 | 180 | S = np.sort(self.init_data) 
# we sort X to get the empirical quantile 181 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 182 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 183 | 184 | # initial peaks 185 | self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up'] 186 | self.peaks['down'] = -(self.init_data[self.init_data0) 280 | Returns 281 | ---------- 282 | float 283 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 284 | """ 285 | n = Y.size 286 | if gamma != 0: 287 | tau = gamma/sigma 288 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 289 | else: 290 | L = n * ( 1 + log(Y.mean()) ) 291 | return L 292 | 293 | 294 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 10): 295 | """ 296 | Compute the GPD parameters estimation with the Grimshaw's trick 297 | 298 | Parameters 299 | ---------- 300 | epsilon : float 301 | numerical parameter to perform (default : 1e-8) 302 | n_points : int 303 | maximum number of candidates for maximum likelihood (default : 10) 304 | Returns 305 | ---------- 306 | gamma_best,sigma_best,ll_best 307 | gamma estimates, sigma estimates and corresponding log-likelihood 308 | """ 309 | def u(s): 310 | return 1 + np.log(s).mean() 311 | 312 | def v(s): 313 | return np.mean(1/s) 314 | 315 | def w(Y,t): 316 | s = 1+t*Y 317 | us = u(s) 318 | vs = v(s) 319 | return us*vs-1 320 | 321 | def jac_w(Y,t): 322 | s = 1+t*Y 323 | us = u(s) 324 | vs = v(s) 325 | jac_us = (1/t)*(1-vs) 326 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 327 | return us*jac_vs+vs*jac_us 328 | 329 | 330 | Ym = self.peaks[side].min() 331 | YM = self.peaks[side].max() 332 | Ymean = self.peaks[side].mean() 333 | 334 | 335 | a = -1/YM 336 | if abs(a)<2*epsilon: 337 | epsilon = abs(a)/n_points 338 | 339 | a = a + epsilon 340 | b = 2*(Ymean-Ym)/(Ymean*Ym) 341 | c = 2*(Ymean-Ym)/(Ym**2) 342 | 343 | # We look for possible roots 344 | left_zeros = 
self._rootsFinder(lambda t: w(self.peaks[side],t), 345 | lambda t: jac_w(self.peaks[side],t), 346 | (a+epsilon,-epsilon), 347 | n_points,'regular') 348 | 349 | right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t), 350 | lambda t: jac_w(self.peaks[side],t), 351 | (b,c), 352 | n_points,'regular') 353 | 354 | # all the possible roots 355 | zeros = np.concatenate((left_zeros,right_zeros)) 356 | 357 | # 0 is always a solution so we initialize with it 358 | gamma_best = 0 359 | sigma_best = Ymean 360 | ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best) 361 | 362 | # we look for better candidates 363 | for z in zeros: 364 | gamma = u(1+z*self.peaks[side])-1 365 | sigma = gamma/z 366 | ll = self._log_likelihood(self.peaks[side],gamma,sigma) 367 | if ll>ll_best: 368 | gamma_best = gamma 369 | sigma_best = sigma 370 | ll_best = ll 371 | 372 | return gamma_best,sigma_best,ll_best 373 | 374 | 375 | 376 | def _quantile(self,side,gamma,sigma): 377 | """ 378 | Compute the quantile at level 1-q for a given side 379 | 380 | Parameters 381 | ---------- 382 | side : str 383 | 'up' or 'down' 384 | gamma : float 385 | GPD parameter 386 | sigma : float 387 | GPD parameter 388 | Returns 389 | ---------- 390 | float 391 | quantile at level 1-q for the GPD(γ,σ,μ=0) 392 | """ 393 | if side == 'up': 394 | r = self.n * self.proba / self.Nt[side] 395 | if gamma != 0: 396 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 397 | else: 398 | return self.init_threshold['up'] - sigma*log(r) 399 | elif side == 'down': 400 | r = self.n * self.proba / self.Nt[side] 401 | if gamma != 0: 402 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 403 | else: 404 | return self.init_threshold['down'] + sigma*log(r) 405 | else: 406 | print('error : the side is not right') 407 | 408 | 409 | def run(self, with_alarm = True): 410 | """ 411 | Run biSPOT on the stream 412 | 413 | Parameters 414 | ---------- 415 | with_alarm : bool 416 | (default = 
True) If False, SPOT will adapt the threshold assuming \ 417 | there is no abnormal values 418 | Returns 419 | ---------- 420 | dict 421 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 422 | 423 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 424 | the indexes of the values which have triggered alarms 425 | 426 | """ 427 | if (self.n>self.init_data.size): 428 | print('Warning : the algorithm seems to have already been run, you \ 429 | should initialize before running again') 430 | return {} 431 | 432 | # list of the thresholds 433 | thup = [] 434 | thdown = [] 435 | alarm = [] 436 | # Loop over the stream 437 | for i in tqdm.tqdm(range(self.data.size)): 438 | 439 | # If the observed value exceeds the current threshold (alarm case) 440 | if self.data[i]>self.extreme_quantile['up'] : 441 | # if we want to alarm, we put it in the alarm list 442 | if with_alarm: 443 | alarm.append(i) 444 | # otherwise we add it in the peaks 445 | else: 446 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 447 | self.Nt['up'] += 1 448 | self.n += 1 449 | # and we update the thresholds 450 | 451 | g,s,l = self._grimshaw('up') 452 | self.extreme_quantile['up'] = self._quantile('up',g,s) 453 | 454 | # case where the value exceeds the initial threshold but not the alarm ones 455 | elif self.data[i]>self.init_threshold['up']: 456 | # we add it in the peaks 457 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 458 | self.Nt['up'] += 1 459 | self.n += 1 460 | # and we update the thresholds 461 | 462 | g,s,l = self._grimshaw('up') 463 | self.extreme_quantile['up'] = self._quantile('up',g,s) 464 | 465 | elif self.data[i]