├── MAWI.py
├── MLE.py
├── MOMspot.py
├── README.md
├── bspot.py
├── drif_spot.py
├── edf_stocks.csv
├── espot.py
├── mawi_170812_50_50.csv
├── mawi_180812_50_50.csv
├── middle_spot.py
├── physic.py
├── physics.dat
├── pic
    ├── 1.png
    ├── 2.png
    └── middle_3.png
├── rain.dat
├── rain.py
├── spot.py
└── stock.py


/MAWI.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | #from bspot import bidSPOT
 4 | from spot import biSPOT
 5 | from drif_spot import DRSPOT
 6 | from middle_spot import MISPOT
 7 | from MOMspot import momSPOT
 8 | import pandas as pd
 9 | import time
10 | #no label
11 | f17 = './mawi_170812_50_50.csv'
12 | f18 = './mawi_180812_50_50.csv'
13 | 
14 | P17 = pd.DataFrame.from_csv(f17)
15 | P18 = pd.DataFrame.from_csv(f18)
16 | 
17 | X17 = P17['rSYN'].values
18 | X18 = P18['rSYN'].values
19 | 
20 | n_init = 1000
21 | init_data = X17[-n_init:]     # initial batch
22 | data = X18                # stream
23 | 
24 | q = 1e-4             # risk parameter
25 | 
26 | start = time.clock()
27 | 
28 | #s = momSPOT(q)
29 | #s = biSPOT(q)         # SPOT object
30 | #s = DRSPOT(q)
31 | s = MISPOT(q)
32 | s.fit(init_data,data)     # data import
33 | s.initialize()         # initialization step
34 | results = s.run()     # run
35 | 
36 | end = time.clock()
37 | t=end-start
38 | print("Runtime is:",t) 
39 | 
40 | s.plot(results)     # plot


--------------------------------------------------------------------------------
/MLE.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: UTF-8 -*-
  3 | #Maximum likelihood estimation
  4 | import numpy as np
  5 | from scipy.optimize import root
  6 | class CALError(Exception):
  7 |     pass
  8 | def Gamma(x, Yi):
  9 |     vx = 0
 10 |     Nt = len(Yi)
 11 |     for i in range(1, Nt+1):
 12 |         try:
 13 |             if 1 + x*Yi[i-1] <=0:
 14 |                 raise CALError()
 15 |             vx = vx + np.log(1 + x*Yi[i-1])
 16 |         except CALError:
 17 |             pass
 18 |     vx = vx/Nt
 19 |     return vx
 20 | 
 21 | def Delta(x, gamma):
 22 |     try:
 23 |         if x < 0.00000001:
 24 |             raise CALError()
 25 |         return gamma/x
 26 |     except CALError:
 27 |             pass
 28 | def zq(q, gamma,delta,n, Nt,t):
 29 |     try:
 30 |         if gamma < 0.00000001:
 31 |             raise CALError()
 32 |     except CALError:
 33 |             pass
 34 |     zq = t
 35 |     tmp = (q*n/Nt)**(-gamma)-1
 36 |     zq += tmp*(delta/gamma)
 37 |     return zq
 38 | def f(x, Yi):
 39 |     x = float(x[0])
 40 |     Nt = len(Yi)
 41 |     ux = 0
 42 |     for i in range(1, Nt+1):
 43 |         if (1+ x*Yi[i-1]) < 0.00000001:
 44 |             return 1
 45 |         ux = ux + 1.0/(1+ x*Yi[i-1])
 46 |     ux = ux/Nt
 47 |     vx = 0
 48 |     for i in range(1, Nt+1):
 49 |         if 1 + x*Yi[i-1] <= 0:
 50 |             return 1
 51 |         vx = vx + np.log(1 + x*Yi[i-1])
 52 |     vx = vx/Nt
 53 |     vx = vx + 1
 54 |     return [
 55 |         ux * vx -1
 56 |         ]
 57 | 
 58 | def choose_zq(Yi, t, zq_lst):
 59 |     l = [i + t for i in Yi]
 60 |     m = np.median(l)
 61 |     minus = 1000000
 62 |     ans = zq_lst[0]
 63 |     for zq in zq_lst:
 64 |         tmp = abs( zq - m)
 65 |         if tmp < minus:
 66 |             minus = tmp
 67 |             ans = zq
 68 |     return ans
 69 | 
 70 | def MLE_get_zq(Yi, q, n, t):
 71 |     Nt = len(Yi)
 72 |     Ym = np.max(Yi)
 73 |     low = -1.0/Ym
 74 |     ym = np.min(Yi)+0.1
 75 |     high = 2*((np.mean(Yi) - ym)/ym**2)
 76 |     #print [low, high]
 77 |     zq_lst = []
 78 |     for i in range(5):
 79 |         guess = low + i*(high-low)/5 
 80 |         guess = float(guess)
 81 |         #print "guess", guess
 82 |         sol = root(f, guess, Yi)
 83 |         #print sol
 84 |         if sol['success'] == True:
 85 |             x =  sol['x'][0]
 86 |             try:
 87 |                 gamma =  Gamma(x, Yi)
 88 |                 delta = Delta(x, gamma)
 89 |                 #print gamma, delta
 90 |                 zq_lst.append( zq(q, gamma,delta,n, Nt,t) )
 91 |             except:
 92 |                 pass
 93 |         #print i, " ********** is over"
 94 |     return choose_zq(Yi, t, zq_lst) 
 95 | 
 96 | def MOM_get_zq(Yi, q, n, t):
 97 |     avg = np.mean(Yi)
 98 |     var = np.var(Yi)
 99 |     Nt = len(Yi)
100 |     gamma = 0.5*(avg**2/var + 1)
101 |     delta = 0.5*avg*(avg**2/var +1)
102 |     return zq(q, gamma,delta,n, Nt,t)
103 | 
# Sample inputs for a quick run of both estimators.
Yi = [5,10,2,4,8,100,102,3,4,8,100,102,3]
q = 0.96
n = 1000
t = 100 # t is the q-quantile of the Y series
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(MLE_get_zq(Yi, q, n, t))
print(MOM_get_zq(Yi, q, n, t))


--------------------------------------------------------------------------------
/MOMspot.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | from math import log,floor
  6 | import tqdm
  7 | from scipy.optimize import minimize
  8 | # colors for plot
  9 | deep_saffron = '#FF9933'
 10 | air_force_blue = '#5D8AA8'
 11 | 
 12 | def backMean(X,d):
 13 |     M = []
 14 |     w = X[:d].sum()
 15 |     M.append(w/d)
 16 |     for i in range(d,len(X)):
 17 |         w = w - X[i-d] + X[i]
 18 |         M.append(w/d)
 19 |     return np.array(M)
 20 | class momSPOT:
 21 |     """
 22 |     This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds)
 23 |     
 24 |     Attributes
 25 |     ----------
 26 |     proba : float
 27 |         Detection level (risk), chosen by the user
 28 |         
 29 |     extreme_quantile : float
 30 |         current threshold (bound between normal and abnormal events)
 31 |         
 32 |     data : numpy.array
 33 |         stream
 34 |     
 35 |     init_data : numpy.array
 36 |         initial batch of observations (for the calibration/initialization step)
 37 |     
 38 |     init_threshold : float
 39 |         initial threshold computed during the calibration step
 40 |     
 41 |     peaks : numpy.array
 42 |         array of peaks (excesses above the initial threshold)
 43 |     
 44 |     n : int
 45 |         number of observed values
 46 |     
 47 |     Nt : int
 48 |         number of observed peaks
 49 |     """
 50 |     def __init__(self, q = 1e-4):
 51 |         """
 52 |         Constructor
 53 |         Parameters
 54 |         ----------
 55 |         q
 56 |             Detection level (risk)
 57 |     
 58 |         Returns
 59 |         ----------
 60 |         biSPOT object
 61 |         """
 62 |         self.proba = q
 63 |         self.data = None
 64 |         self.init_data = None
 65 |         self.n = 0
 66 |         nonedict =  {'up':None,'down':None}
 67 |         
 68 |         self.extreme_quantile = dict.copy(nonedict)
 69 |         self.init_threshold = dict.copy(nonedict)
 70 |         self.peaks = dict.copy(nonedict)
 71 |         self.gamma = dict.copy(nonedict)
 72 |         self.sigma = dict.copy(nonedict)
 73 |         self.Nt = {'up':0,'down':0}
 74 |         
 75 |         
 76 |     def __str__(self):
 77 |         s = ''
 78 |         s += 'Streaming Peaks-Over-Threshold Object\n'
 79 |         s += 'Detection level q = %s\n' % self.proba
 80 |         if self.data is not None:
 81 |             s += 'Data imported : Yes\n'
 82 |             s += '\t initialization  : %s values\n' % self.init_data.size
 83 |             s += '\t stream : %s values\n' % self.data.size
 84 |         else:
 85 |             s += 'Data imported : No\n'
 86 |             return s
 87 |             
 88 |         if self.n == 0:
 89 |             s += 'Algorithm initialized : No\n'
 90 |         else:
 91 |             s += 'Algorithm initialized : Yes\n'
 92 |             s += '\t initial threshold : %s\n' % self.init_threshold
 93 |             
 94 |             r = self.n-self.init_data.size
 95 |             if r > 0:
 96 |                 s += 'Algorithm run : Yes\n'
 97 |                 s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n)
 98 |                 s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n)
 99 |             else:
100 |                 s += '\t number of peaks  : %s\n' % self.Nt
101 |                 s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up']
102 |                 s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down']
103 |                 s += 'Algorithm run : No\n'
104 |         return s
105 |     
106 |     
107 |     def fit(self,init_data,data):
108 |         """
109 |         Import data to biSPOT object
110 |         
111 |         Parameters
112 |         ----------
113 |         init_data : list, numpy.array or pandas.Series
114 |             initial batch to calibrate the algorithm ()
115 |             
116 |         data : numpy.array
117 |             data for the run (list, np.array or pd.series)
118 |     
119 |         """
120 |         if isinstance(data,list):
121 |             self.data = np.array(data)
122 |         elif isinstance(data,np.ndarray):
123 |             self.data = data
124 |         elif isinstance(data,pd.Series):
125 |             self.data = data.values
126 |         else:
127 |             print('This data format (%s) is not supported' % type(data))
128 |             return
129 |             
130 |         if isinstance(init_data,list):
131 |             self.init_data = np.array(init_data)
132 |         elif isinstance(init_data,np.ndarray):
133 |             self.init_data = init_data
134 |         elif isinstance(init_data,pd.Series):
135 |             self.init_data = init_data.values
136 |         elif isinstance(init_data,int):
137 |             self.init_data = self.data[:init_data]
138 |             self.data = self.data[init_data:]
139 |         elif isinstance(init_data,float) & (init_data<1) & (init_data>0):
140 |             r = int(init_data*data.size)
141 |             self.init_data = self.data[:r]
142 |             self.data = self.data[r:]
143 |         else:
144 |             print('The initial data cannot be set')
145 |             return
146 |         
147 |     def add(self,data):
148 |         """
149 |         This function allows to append data to the already fitted data
150 |         
151 |         Parameters
152 |         ----------
153 |         data : list, numpy.array, pandas.Series
154 |             data to append
155 |         """
156 |         if isinstance(data,list):
157 |             data = np.array(data)
158 |         elif isinstance(data,np.ndarray):
159 |             data = data
160 |         elif isinstance(data,pd.Series):
161 |             data = data.values
162 |         else:
163 |             print('This data format (%s) is not supported' % type(data))
164 |             return
165 |         
166 |         self.data = np.append(self.data,data)
167 |         return
168 | 
169 |     def initialize(self, verbose = True):
170 |         """
171 |         Run the calibration (initialization) step
172 |         
173 |         Parameters
174 |         ----------
175 |         verbose : bool
176 |             (default = True) If True, gives details about the batch initialization
177 |         """
178 |         n_init = self.init_data.size
179 |         
180 |         S = np.sort(self.init_data)     # we sort X to get the empirical quantile
181 |         self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm
182 |         self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm
183 | 
184 |         # initial peaks
185 |         self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up']
186 |         self.peaks['down'] = -(self.init_data[self.init_data<self.init_threshold['down']]-self.init_threshold['down'])
187 |         self.Nt['up'] = self.peaks['up'].size
188 |         self.Nt['down'] = self.peaks['down'].size
189 |         self.n = n_init
190 |         
191 |         if verbose:
192 |             print('Initial threshold : %s' % self.init_threshold)
193 |             print('Number of peaks : %s' % self.Nt)
194 |             #print('Grimshaw maximum log-likelihood estimation ... ', end = '')
195 |             
196 |         l = {'up':None,'down':None}
197 |         for side in ['up','down']:
198 |             g,s,l[side] = self._MOM(side)
199 |             self.extreme_quantile[side] = self._quantile(side,g,s)
200 |             self.gamma[side] = g
201 |             self.sigma[side] = s
202 |         
203 |         ltab = 20
204 |         form = ('\t'+'%20s' + '%20.2f' + '%20.2f')
205 |         '''
206 |         if verbose:
207 |             print('[done]')
208 |             print('\t' + 'Parameters'.rjust(ltab) + 'Upper'.rjust(ltab) + 'Lower'.rjust(ltab))
209 |             print('\t' + '-'*ltab*3)
210 |             print(form % (chr(0x03B3),self.gamma['up'],self.gamma['down']))
211 |             print(form % (chr(0x03C3),self.sigma['up'],self.sigma['down']))
212 |             print(form % ('likelihood',l['up'],l['down']))
213 |             print(form % ('Extreme quantile',self.extreme_quantile['up'],self.extreme_quantile['down']))
214 |             print('\t' + '-'*ltab*3)
215 |         '''
216 |         return 
217 |     
218 | 
219 |     def _MOM(self,side,epsilon = 1e-8, n_points = 10):
220 |         Yi = self.peaks[side]
221 |         avg = np.mean(Yi)
222 |         var = np.var(Yi)
223 |         sigma = 0.5*avg*(avg**2/var + 1)
224 |         gamma = 0.5*(avg**2/var - 1)
225 |         print gamma, sigma
226 |         return gamma,sigma,100
227 | 
228 |     
229 | 
230 |     def _quantile(self,side,gamma,sigma):
231 |         """
232 |         Compute the quantile at level 1-q for a given side
233 |         
234 |         Parameters
235 |         ----------
236 |         side : str
237 |             'up' or 'down'
238 |         gamma : float
239 |             GPD parameter
240 |         sigma : float
241 |             GPD parameter
242 |         Returns
243 |         ----------
244 |         float
245 |             quantile at level 1-q for the GPD(γ,σ,μ=0)
246 |         """
247 |         if side == 'up':
248 |             r = self.n * self.proba / self.Nt[side]
249 |             if gamma != 0:
250 |                 return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1)
251 |             else:
252 |                 return self.init_threshold['up'] - sigma*log(r)
253 |         elif side == 'down':
254 |             r = self.n * self.proba / self.Nt[side]
255 |             if gamma != 0:
256 |                 return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1)
257 |             else:
258 |                 return self.init_threshold['down'] + sigma*log(r)
259 |         else:
260 |             print('error : the side is not right')
261 | 
262 |         
263 |     def run(self, with_alarm = True):
264 |         """
265 |         Run biSPOT on the stream
266 |         
267 |         Parameters
268 |         ----------
269 |         with_alarm : bool
270 |             (default = True) If False, SPOT will adapt the threshold assuming \
271 |             there is no abnormal values
272 |         Returns
273 |         ----------
274 |         dict
275 |             keys : 'upper_thresholds', 'lower_thresholds' and 'alarms'
276 |             
277 |             '***-thresholds' contains the extreme quantiles and 'alarms' contains \
278 |             the indexes of the values which have triggered alarms
279 |             
280 |         """
281 |         if (self.n>self.init_data.size):
282 |             print('Warning : the algorithm seems to have already been run, you \
283 |             should initialize before running again')
284 |             return {}
285 |         
286 |         # list of the thresholds
287 |         thup = []
288 |         thdown = []
289 |         alarm = []
290 |         # Loop over the stream
291 |         for i in tqdm.tqdm(range(self.data.size)):
292 |     
293 |             # If the observed value exceeds the current threshold (alarm case)
294 |             if self.data[i]>self.extreme_quantile['up'] :
295 |                 # if we want to alarm, we put it in the alarm list
296 |                 if with_alarm:
297 |                     alarm.append(i)
298 |                 # otherwise we add it in the peaks
299 |                 else:
300 |                     self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up'])
301 |                     self.Nt['up'] += 1
302 |                     self.n += 1
303 |                     # and we update the thresholds
304 | 
305 |                     g,s,l = self._MOM('up')
306 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
307 | 
308 |             # case where the value exceeds the initial threshold but not the alarm ones
309 |             elif self.data[i]>self.init_threshold['up']:
310 |                     # we add it in the peaks
311 |                     self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up'])
312 |                     self.Nt['up'] += 1
313 |                     self.n += 1
314 |                     # and we update the thresholds
315 | 
316 |                     g,s,l = self._MOM('up')
317 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
318 |                     
319 |             elif self.data[i]<self.extreme_quantile['down'] :
320 |                 # if we want to alarm, we put it in the alarm list
321 |                 if with_alarm:
322 |                     alarm.append(i)
323 |                 # otherwise we add it in the peaks
324 |                 else:
325 |                     self.peaks['down'] = np.append(self.peaks['down'],-(self.data[i]-self.init_threshold['down']))
326 |                     self.Nt['down'] += 1
327 |                     self.n += 1
328 |                     # and we update the thresholds
329 | 
330 |                     g,s,l = self._MOM('down')
331 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
332 | 
333 |             # case where the value exceeds the initial threshold but not the alarm ones
334 |             elif self.data[i]<self.init_threshold['down']:
335 |                     # we add it in the peaks
336 |                     self.peaks['down'] = np.append(self.peaks['down'],-(self.data[i]-self.init_threshold['down']))
337 |                     self.Nt['down'] += 1
338 |                     self.n += 1
339 |                     # and we update the thresholds
340 | 
341 |                     g,s,l = self._MOM('down')
342 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
343 |             else:
344 |                 self.n += 1
345 | 
346 |                 
347 |             thup.append(self.extreme_quantile['up']) # thresholds record
348 |             thdown.append(self.extreme_quantile['down']) # thresholds record
349 |         
350 |         return {'upper_thresholds' : thup,'lower_thresholds' : thdown, 'alarms': alarm}
351 |     
352 |     def plot(self,run_results,with_alarm = True):
353 |         """
354 |         Plot the results of given by the run
355 |         
356 |         Parameters
357 |         ----------
358 |         run_results : dict
359 |             results given by the 'run' method
360 |         with_alarm : bool
361 |             (default = True) If True, alarms are plotted.
362 |         Returns
363 |         ----------
364 |         list
365 |             list of the plots
366 |             
367 |         """
368 |         x = range(self.data.size)
369 |         K = run_results.keys()
370 |         
371 |         ts_fig, = plt.plot(x,self.data,color=air_force_blue)
372 |         fig = [ts_fig]
373 |         
374 |         if 'upper_thresholds' in K:
375 |             thup = run_results['upper_thresholds']
376 |             uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed')
377 |             fig.append(uth_fig)
378 |             
379 |         if 'lower_thresholds' in K:
380 |             thdown = run_results['lower_thresholds']
381 |             lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed')
382 |             fig.append(lth_fig)
383 |         
384 |         if with_alarm and ('alarms' in K):
385 |             alarm = run_results['alarms']
386 |             al_fig = plt.scatter(alarm,self.data[alarm],color='red')
387 |             fig.append(al_fig)
388 |             
389 |         plt.xlim((0,self.data.size))
390 |         plt.show()
391 |         
392 |         return fig
393 | 
394 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # EVT
 2 | 使用极端值理论(Extreme Value Theory)实现阈值动态自动化设置
 3 | # 介绍
 4 | 我们的工作建立在2017 KDD "Anomaly Detection in Streams with Extreme Value Theory"论文的基础上，做了如下改进：
 5 | * 引入矩估计算法，加速计算。该算法比极大似然估计快100多倍
 6 | * 提出了更高层，更抽象的基于预测残差的算法框架，而DSPOT算法是我们提出框架的一种具体算法
 7 | * 我们强调了数据漂移对系统带来的影响，提出了批量更新的算法，有效应对数据漂移
 8 | # 应用
 9 | * 异常探测问题中，经常需要设置阈值，例如：内存的使用率大于90%时，判定为异常。这里阈值90%是人为设定的，需要用户有足够的使用经验，而且这种设定方式随机性很大，比如设置为89%或者91%似乎也是合理的。
10 | * 现实应用中，每条KPI都需要手动设置不同的阈值，这是一项十分复杂和庞大的工作，如果我们能够只设定概率值q而无需设定阈值，那么会免除巨大的工作量。
11 | ![应用实例](https://github.com/DawnsonLi/EVT/blob/master/pic/1.png)
12 | * 使用我们的方法只需定义异常事件发生的概率，而无需设置成百上千的阈值，以不变应万变
13 | ![应用实例](https://github.com/DawnsonLi/EVT/blob/master/pic/2.png)
14 | * 使用示例:
15 | 这里给出一个应对数据漂移的算法运行结果示意图，上下黄色虚线分别对应算法自动设置的上下阈值。
16 | ![应用实例](https://github.com/DawnsonLi/EVT/blob/master/pic/middle_3.png)
17 | 
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/bspot.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | from math import log,floor
  6 | import tqdm
  7 | from scipy.optimize import minimize
  8 | # colors for plot
  9 | deep_saffron = '#FF9933'
 10 | air_force_blue = '#5D8AA8'
 11 | 
 12 | def backMean(X,d):
 13 |     M = []
 14 |     w = X[:d].sum()
 15 |     M.append(w/d)
 16 |     for i in range(d,len(X)):
 17 |         w = w - X[i-d] + X[i]
 18 |         M.append(w/d)
 19 |     return np.array(M)
 20 | class bidSPOT:
 21 |     """
 22 |     This class allows to run DSPOT algorithm on univariate dataset (upper and lower bounds)
 23 |     
 24 |     Attributes
 25 |     ----------
 26 |     proba : float
 27 |         Detection level (risk), chosen by the user
 28 |         
 29 |     depth : int
 30 |         Number of observations to compute the moving average
 31 |         
 32 |     extreme_quantile : float
 33 |         current threshold (bound between normal and abnormal events)
 34 |         
 35 |     data : numpy.array
 36 |         stream
 37 |     
 38 |     init_data : numpy.array
 39 |         initial batch of observations (for the calibration/initialization step)
 40 |     
 41 |     init_threshold : float
 42 |         initial threshold computed during the calibration step
 43 |     
 44 |     peaks : numpy.array
 45 |         array of peaks (excesses above the initial threshold)
 46 |     
 47 |     n : int
 48 |         number of observed values
 49 |     
 50 |     Nt : int
 51 |         number of observed peaks
 52 |     """
 53 |     def __init__(self, q = 1e-4, depth = 10):
 54 |         self.proba = q
 55 |         self.data = None
 56 |         self.init_data = None
 57 |         self.n = 0
 58 |         self.depth = depth
 59 |         
 60 |         nonedict =  {'up':None,'down':None}
 61 |         
 62 |         self.extreme_quantile = dict.copy(nonedict)
 63 |         self.init_threshold = dict.copy(nonedict)
 64 |         self.peaks = dict.copy(nonedict)
 65 |         self.gamma = dict.copy(nonedict)
 66 |         self.sigma = dict.copy(nonedict)
 67 |         self.Nt = {'up':0,'down':0}
 68 |         
 69 |         
 70 |     def __str__(self):
 71 |         s = ''
 72 |         s += 'Streaming Peaks-Over-Threshold Object\n'
 73 |         s += 'Detection level q = %s\n' % self.proba
 74 |         if self.data is not None:
 75 |             s += 'Data imported : Yes\n'
 76 |             s += '\t initialization  : %s values\n' % self.init_data.size
 77 |             s += '\t stream : %s values\n' % self.data.size
 78 |         else:
 79 |             s += 'Data imported : No\n'
 80 |             return s
 81 |             
 82 |         if self.n == 0:
 83 |             s += 'Algorithm initialized : No\n'
 84 |         else:
 85 |             s += 'Algorithm initialized : Yes\n'
 86 |             s += '\t initial threshold : %s\n' % self.init_threshold
 87 |             
 88 |             r = self.n-self.init_data.size
 89 |             if r > 0:
 90 |                 s += 'Algorithm run : Yes\n'
 91 |                 s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n)
 92 |                 s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n)
 93 |             else:
 94 |                 s += '\t number of peaks  : %s\n' % self.Nt
 95 |                 s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up']
 96 |                 s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down']
 97 |                 s += 'Algorithm run : No\n'
 98 |         return s
 99 |     
100 |     
101 |     def fit(self,init_data,data):
102 |         """
103 |         Import data to biDSPOT object
104 |         
105 |         Parameters
106 |         ----------
107 |         init_data : list, numpy.array or pandas.Series
108 |             initial batch to calibrate the algorithm
109 |             
110 |         data : numpy.array
111 |             data for the run (list, np.array or pd.series)
112 |     
113 |         """
114 |         if isinstance(data,list):
115 |             self.data = np.array(data)
116 |         elif isinstance(data,np.ndarray):
117 |             self.data = data
118 |         elif isinstance(data,pd.Series):
119 |             self.data = data.values
120 |         else:
121 |             print('This data format (%s) is not supported' % type(data))
122 |             return
123 |             
124 |         if isinstance(init_data,list):
125 |             self.init_data = np.array(init_data)
126 |         elif isinstance(init_data,np.ndarray):
127 |             self.init_data = init_data
128 |         elif isinstance(init_data,pd.Series):
129 |             self.init_data = init_data.values
130 |         elif isinstance(init_data,int):
131 |             self.init_data = self.data[:init_data]
132 |             self.data = self.data[init_data:]
133 |         elif isinstance(init_data,float) & (init_data<1) & (init_data>0):
134 |             r = int(init_data*data.size)
135 |             self.init_data = self.data[:r]
136 |             self.data = self.data[r:]
137 |         else:
138 |             print('The initial data cannot be set')
139 |             return
140 |         
141 |     def add(self,data):
142 |         """
143 |         This function allows to append data to the already fitted data
144 |         
145 |         Parameters
146 |         ----------
147 |         data : list, numpy.array, pandas.Series
148 |             data to append
149 |         """
150 |         if isinstance(data,list):
151 |             data = np.array(data)
152 |         elif isinstance(data,np.ndarray):
153 |             data = data
154 |         elif isinstance(data,pd.Series):
155 |             data = data.values
156 |         else:
157 |             print('This data format (%s) is not supported' % type(data))
158 |             return
159 |         
160 |         self.data = np.append(self.data,data)
161 |         return
162 |     
163 |     def initialize(self, verbose = True):
164 |         """
165 |         Run the calibration (initialization) step
166 |         
167 |         Parameters
168 |         ----------
169 |         verbose : bool
170 |             (default = True) If True, gives details about the batch initialization
171 |         """
172 |         n_init = self.init_data.size - self.depth
173 |         
174 |         M = backMean(self.init_data,self.depth)
175 |         T = self.init_data[self.depth:]-M[:-1] # new variable
176 |         
177 |         S = np.sort(T)     # we sort T to get the empirical quantile
178 |         self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm
179 |         self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm
180 | 
181 |         # initial peaks
182 |         self.peaks['up'] = T[T>self.init_threshold['up']]-self.init_threshold['up']
183 |         self.peaks['down'] = -( T[ T<self.init_threshold['down'] ] - self.init_threshold['down'] )
184 |         self.Nt['up'] = self.peaks['up'].size
185 |         self.Nt['down'] = self.peaks['down'].size
186 |         self.n = n_init
187 |         
188 |         if verbose:
189 |             print('Initial threshold : %s' % self.init_threshold)
190 |             print('Number of peaks : %s' % self.Nt)
191 |             print('Grimshaw maximum log-likelihood estimation ... ')
192 |             
193 |         l = {'up':None,'down':None}
194 |         for side in ['up','down']:
195 |             g,s,l[side] = self._grimshaw(side)
196 |             self.extreme_quantile[side] = self._quantile(side,g,s)
197 |             self.gamma[side] = g
198 |             self.sigma[side] = s
199 |         
200 |         ltab = 20
201 |         '''
202 |         form = ('\t'+'%20s' + '%20.2f' + '%20.2f')
203 |         if verbose:
204 |             print('[done]')
205 |             print('\t' + 'Parameters'.rjust(ltab) + 'Upper'.rjust(ltab) + 'Lower'.rjust(ltab))
206 |             print('\t' + '-'*ltab*3)
207 |             print(form % (chr(0x03B3),self.gamma['up'],self.gamma['down']))
208 |             print(form % (chr(0x03C3),self.sigma['up'],self.sigma['down']))
209 |             print(form % ('likelihood',l['up'],l['down']))
210 |             print(form % ('Extreme quantile',self.extreme_quantile['up'],self.extreme_quantile['down']))
211 |             print('\t' + '-'*ltab*3)
212 |         '''
213 |         return 
214 |     
215 |     
216 |     
217 |     
218 |     def _rootsFinder(self, fun,jac,bounds,npoints,method):
219 |         """
220 |         Find possible roots of a scalar function
221 |         
222 |         Parameters
223 |         ----------
224 |         fun : function
225 |             scalar function 
226 |         jac : function
227 |             first order derivative of the function  
228 |         bounds : tuple
229 |             (min,max) interval for the roots search    
230 |         npoints : int
231 |             maximum number of roots to output      
232 |         method : str
233 |             'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval
234 |         
235 |         Returns
236 |         ----------
237 |         numpy.array
238 |             possible roots of the function
239 |         """
240 |         if method == 'regular':
241 |             step = (bounds[1]-bounds[0])/(npoints+1)
242 |             X0 = np.arange(bounds[0]+step,bounds[1],step)
243 |         elif method == 'random':
244 |             X0 = np.random.uniform(bounds[0],bounds[1],npoints)
245 |         
246 |         def objFun(X,f,jac):
247 |             g = 0
248 |             j = np.zeros(X.shape)
249 |             i = 0
250 |             for x in X:
251 |                 fx = f(x)
252 |                 g = g+fx**2
253 |                 j[i] = 2*fx*jac(x)
254 |                 i = i+1
255 |             return g,j
256 |         
257 |         opt = minimize(lambda X:objFun(X,fun,jac), X0, 
258 |                        method='L-BFGS-B', 
259 |                        jac=True, bounds=[bounds]*len(X0))
260 |         
261 |         X = opt.x
262 |         np.round(X,decimals = 5)
263 |         return np.unique(X)
264 |     
265 |     
266 |     def _log_likelihood(self, Y,gamma,sigma):
267 |         """
268 |         Compute the log-likelihood for the Generalized Pareto Distribution (μ=0)
269 |         
270 |         Parameters
271 |         ----------
272 |         Y : numpy.array
273 |             observations
274 |         gamma : float
275 |             GPD index parameter
276 |         sigma : float
277 |             GPD scale parameter (>0)   
278 |         Returns
279 |         ----------
280 |         float
281 |             log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0)
282 |         """
283 |         n = Y.size
284 |         if gamma != 0:
285 |             tau = gamma/sigma
286 |             L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum()
287 |         else:
288 |             L = n * ( 1 + log(Y.mean()) )
289 |         return L
290 | 
291 | 
292 |     def _grimshaw(self,side,epsilon = 1e-8, n_points = 8):
293 |         """
294 |         Compute the GPD parameters estimation with the Grimshaw's trick
295 |         
296 |         Parameters
297 |         ----------
298 |         epsilon : float
299 |             numerical parameter to perform (default : 1e-8)
300 |         n_points : int
301 |             maximum number of candidates for maximum likelihood (default : 10)
302 |         Returns
303 |         ----------
304 |         gamma_best,sigma_best,ll_best
305 |             gamma estimates, sigma estimates and corresponding log-likelihood
306 |         """
307 |         def u(s):
308 |             return 1 + np.log(s).mean()
309 |             
310 |         def v(s):
311 |             return np.mean(1/s)
312 |         
313 |         def w(Y,t):
314 |             s = 1+t*Y
315 |             us = u(s)
316 |             vs = v(s)
317 |             return us*vs-1
318 |         
319 |         def jac_w(Y,t):
320 |             s = 1+t*Y
321 |             us = u(s)
322 |             vs = v(s)
323 |             jac_us = (1/t)*(1-vs)
324 |             jac_vs = (1/t)*(-vs+np.mean(1/s**2))
325 |             return us*jac_vs+vs*jac_us
326 |             
327 |     
328 |         Ym = self.peaks[side].min()
329 |         YM = self.peaks[side].max()
330 |         Ymean = self.peaks[side].mean()
331 |         
332 |         
333 |         a = -1/YM
334 |         if abs(a)<2*epsilon:
335 |             epsilon = abs(a)/n_points
336 |         
337 |         a = a + epsilon
338 |         b = 2*(Ymean-Ym)/(Ymean*Ym)
339 |         c = 2*(Ymean-Ym)/(Ym**2)
340 |     
341 |         # We look for possible roots
342 |         left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
343 |                                  lambda t: jac_w(self.peaks[side],t),
344 |                                  (a+epsilon,-epsilon),
345 |                                  n_points,'regular')
346 |         
347 |         right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
348 |                                   lambda t: jac_w(self.peaks[side],t),
349 |                                   (b,c),
350 |                                   n_points,'regular')
351 |     
352 |         # all the possible roots
353 |         zeros = np.concatenate((left_zeros,right_zeros))
354 |         
355 |         # 0 is always a solution so we initialize with it
356 |         gamma_best = 0
357 |         sigma_best = Ymean
358 |         ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best)
359 |         
360 |         # we look for better candidates
361 |         for z in zeros:
362 |             gamma = u(1+z*self.peaks[side])-1
363 |             sigma = gamma/z
364 |             ll = self._log_likelihood(self.peaks[side],gamma,sigma)
365 |             if ll>ll_best:
366 |                 gamma_best = gamma
367 |                 sigma_best = sigma
368 |                 ll_best = ll
369 |     
370 |         return gamma_best,sigma_best,ll_best
371 | 
372 |     
373 | 
374 |     def _quantile(self,side,gamma,sigma):
375 |         """
376 |         Compute the quantile at level 1-q for a given side
377 |         
378 |         Parameters
379 |         ----------
380 |         side : str
381 |             'up' or 'down'
382 |         gamma : float
383 |             GPD parameter
384 |         sigma : float
385 |             GPD parameter
386 |         Returns
387 |         ----------
388 |         float
389 |             quantile at level 1-q for the GPD(γ,σ,μ=0)
390 |         """
391 |         if side == 'up':
392 |             r = self.n * self.proba / self.Nt[side]
393 |             if gamma != 0:
394 |                 return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1)
395 |             else:
396 |                 return self.init_threshold['up'] - sigma*log(r)
397 |         elif side == 'down':
398 |             r = self.n * self.proba / self.Nt[side]
399 |             if gamma != 0:
400 |                 return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1)
401 |             else:
402 |                 return self.init_threshold['down'] + sigma*log(r)
403 |         else:
404 |             print('error : the side is not right')
405 | 
406 |         
407 |     def run(self, with_alarm = True, plot = True):
408 |         """
409 |         Run biDSPOT on the stream
410 |         
411 |         Parameters
412 |         ----------
413 |         with_alarm : bool
414 |             (default = True) If False, SPOT will adapt the threshold assuming \
415 |             there is no abnormal values
416 |         Returns
417 |         ----------
418 |         dict
419 |             keys : 'upper_thresholds', 'lower_thresholds' and 'alarms'
420 |             
421 |             '***-thresholds' contains the extreme quantiles and 'alarms' contains \
422 |             the indexes of the values which have triggered alarms
423 |             
424 |         """
425 |         if (self.n>self.init_data.size):
426 |             print('Warning : the algorithm seems to have already been run, you \
427 |             should initialize before running again')
428 |             return {}
429 |         
430 |         # actual normal window
431 |         W = self.init_data[-self.depth:]
432 |         
433 |         # list of the thresholds
434 |         thup = []
435 |         thdown = []
436 |         alarm = []
437 |         # Loop over the stream
438 |         for i in tqdm.tqdm(range(self.data.size)):
439 |             Mi = W.mean()
440 |             Ni = self.data[i]-Mi
441 |             # If the observed value exceeds the current threshold (alarm case)
442 |             if Ni>self.extreme_quantile['up'] :
443 |                 # if we want to alarm, we put it in the alarm list
444 |                 if with_alarm:
445 |                     alarm.append(i)
446 |                 # otherwise we add it in the peaks
447 |                 else:
448 |                     self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up'])
449 |                     self.Nt['up'] += 1
450 |                     self.n += 1
451 |                     # and we update the thresholds
452 | 
453 |                     g,s,l = self._grimshaw('up')
454 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
455 |                     W = np.append(W[1:],self.data[i])
456 |                     
457 |             # case where the value exceeds the initial threshold but not the alarm ones
458 |             elif Ni>self.init_threshold['up']:
459 |                     # we add it in the peaks
460 |                     self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up'])
461 |                     self.Nt['up'] += 1
462 |                     self.n += 1
463 |                     # and we update the thresholds
464 |                     g,s,l = self._grimshaw('up')
465 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
466 |                     W = np.append(W[1:],self.data[i])
467 |                     
468 |             elif Ni<self.extreme_quantile['down'] :
469 |                 # if we want to alarm, we put it in the alarm list
470 |                 if with_alarm:
471 |                     alarm.append(i)
472 |                 # otherwise we add it in the peaks
473 |                 else:
474 |                     self.peaks['down'] = np.append(self.peaks['down'],-(Ni-self.init_threshold['down']))
475 |                     self.Nt['down'] += 1
476 |                     self.n += 1
477 |                     # and we update the thresholds
478 | 
479 |                     g,s,l = self._grimshaw('down')
480 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
481 |                     W = np.append(W[1:],self.data[i])
482 |                     
483 |             # case where the value exceeds the initial threshold but not the alarm ones
484 |             elif Ni<self.init_threshold['down']:
485 |                     # we add it in the peaks
486 |                     self.peaks['down'] = np.append(self.peaks['down'],-(Ni-self.init_threshold['down']))
487 |                     self.Nt['down'] += 1
488 |                     self.n += 1
489 |                     # and we update the thresholds
490 | 
491 |                     g,s,l = self._grimshaw('down')
492 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
493 |                     W = np.append(W[1:],self.data[i])
494 |             else:
495 |                 self.n += 1
496 |                 W = np.append(W[1:],self.data[i])
497 | 
498 |                 
499 |             thup.append(self.extreme_quantile['up']+Mi) # upper thresholds record
500 |             thdown.append(self.extreme_quantile['down']+Mi) # lower thresholds record
501 |         
502 |         return {'upper_thresholds' : thup,'lower_thresholds' : thdown, 'alarms': alarm}
503 |     
504 | 
505 |     def plot(self,run_results, with_alarm = True):
506 |         """
507 |         Plot the results given by the run
508 |         
509 |         Parameters
510 |         ----------
511 |         run_results : dict
512 |             results given by the 'run' method
513 |         with_alarm : bool
514 |             (default = True) If True, alarms are plotted.
515 |         Returns
516 |         ----------
517 |         list
518 |             list of the plots
519 |             
520 |         """
521 |         x = range(self.data.size)
522 |         K = run_results.keys()
523 |         
524 |         ts_fig, = plt.plot(x,self.data,color=air_force_blue)
525 |         fig = [ts_fig]
526 |         
527 |         if 'upper_thresholds' in K:
528 |             thup = run_results['upper_thresholds']
529 |             uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed')
530 |             fig.append(uth_fig)
531 |             
532 |         if 'lower_thresholds' in K:
533 |             thdown = run_results['lower_thresholds']
534 |             lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed')
535 |             fig.append(lth_fig)
536 |         
537 |         if with_alarm and ('alarms' in K):
538 |             alarm = run_results['alarms']
539 |             if len(alarm)>0:
540 |                 al_fig = plt.scatter(alarm,self.data[alarm],color='red')
541 |                 fig.append(al_fig)
542 |             
543 |         plt.xlim((0,self.data.size))
544 |         plt.show()
545 |         
546 |         return fig
547 | 
548 | 


--------------------------------------------------------------------------------
/drif_spot.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | from math import log,floor
  6 | import tqdm
  7 | from scipy.optimize import minimize
  8 | # colors for plot
  9 | deep_saffron = '#FF9933'
 10 | air_force_blue = '#5D8AA8'
 11 | 
 12 | def backMean(X,d):
 13 |     M = []
 14 |     w = X[:d].sum()
 15 |     M.append(w/d)
 16 |     for i in range(d,len(X)):
 17 |         w = w - X[i-d] + X[i]
 18 |         M.append(w/d)
 19 |     return np.array(M)
 20 | class DRSPOT:
 21 |     """
 22 |     This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds)
 23 |     
 24 |     Attributes
 25 |     ----------
 26 |     proba : float
 27 |         Detection level (risk), chosen by the user
 28 |         
 29 |     extreme_quantile : float
 30 |         current threshold (bound between normal and abnormal events)
 31 |         
 32 |     data : numpy.array
 33 |         stream
 34 |     
 35 |     init_data : numpy.array
 36 |         initial batch of observations (for the calibration/initialization step)
 37 |     
 38 |     init_threshold : float  ------------t
 39 |         initial threshold computed during the calibration step
 40 |     
 41 |     peaks : numpy.array
 42 |         array of peaks (excesses above the initial threshold)
 43 |     
 44 |     n : int
 45 |         number of observed values
 46 |     
 47 |     Nt : int
 48 |         number of observed peaks
 49 |     """
 50 |     def __init__(self, q = 1e-4):
 51 |         """
 52 |         Constructor
 53 |         Parameters
 54 |         ----------
 55 |         q
 56 |             Detection level (risk)
 57 |     
 58 |         Returns
 59 |         ----------
 60 |         biSPOT object
 61 |         """
 62 |         self.proba = q
 63 |         self.data = None
 64 |         self.init_data = None
 65 |         self.update_number = 0
 66 |         self.n = 0
 67 |         nonedict =  {'up':None,'down':None}
 68 |         
 69 |         self.extreme_quantile = dict.copy(nonedict)
 70 |         self.init_threshold = dict.copy(nonedict)
 71 |         self.peaks = dict.copy(nonedict)
 72 |         self.gamma = dict.copy(nonedict)
 73 |         self.sigma = dict.copy(nonedict)
 74 |         self.Nt = {'up':0,'down':0}
 75 |         
 76 |         
 77 |     def __str__(self):
 78 |         s = ''
 79 |         s += 'Streaming Peaks-Over-Threshold Object\n'
 80 |         s += 'Detection level q = %s\n' % self.proba
 81 |         if self.data is not None:
 82 |             s += 'Data imported : Yes\n'
 83 |             s += '\t initialization  : %s values\n' % self.init_data.size
 84 |             s += '\t stream : %s values\n' % self.data.size
 85 |         else:
 86 |             s += 'Data imported : No\n'
 87 |             return s
 88 |             
 89 |         if self.n == 0:
 90 |             s += 'Algorithm initialized : No\n'
 91 |         else:
 92 |             s += 'Algorithm initialized : Yes\n'
 93 |             s += '\t initial threshold : %s\n' % self.init_threshold
 94 |             
 95 |             r = self.n-self.init_data.size
 96 |             if r > 0:
 97 |                 s += 'Algorithm run : Yes\n'
 98 |                 s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n)
 99 |                 s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n)
100 |             else:
101 |                 s += '\t number of peaks  : %s\n' % self.Nt
102 |                 s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up']
103 |                 s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down']
104 |                 s += 'Algorithm run : No\n'
105 |         return s
106 |     
107 |     
108 |     def fit(self,init_data,data):
109 |         """
110 |         Import data to biSPOT object
111 |         
112 |         Parameters
113 |         ----------
114 |         init_data : list, numpy.array or pandas.Series
115 |             initial batch to calibrate the algorithm ()
116 |             
117 |         data : numpy.array
118 |             data for the run (list, np.array or pd.series)
119 |     
120 |         """
121 |         if isinstance(data,list):
122 |             self.data = np.array(data)
123 |         elif isinstance(data,np.ndarray):
124 |             self.data = data
125 |         elif isinstance(data,pd.Series):
126 |             self.data = data.values
127 |         else:
128 |             print('This data format (%s) is not supported' % type(data))
129 |             return
130 |             
131 |         if isinstance(init_data,list):
132 |             self.init_data = np.array(init_data)
133 |             self.update_number = len(self.init_data)
134 |         elif isinstance(init_data,np.ndarray):
135 |             self.init_data = init_data
136 |             self.update_number = len(self.init_data)
137 |         elif isinstance(init_data,pd.Series):
138 |             self.init_data = init_data.values
139 |             self.update_number = len(self.init_data)
140 |         elif isinstance(init_data,int):
141 |             self.init_data = self.data[:init_data]
142 |             self.data = self.data[init_data:]
143 |             self.update_number = init_data
144 |         elif isinstance(init_data,float) & (init_data<1) & (init_data>0):
145 |             r = int(init_data*data.size)
146 |             self.init_data = self.data[:r]
147 |             self.data = self.data[r:]
148 |         else:
149 |             print('The initial data cannot be set')
150 |             return
151 |         
152 |     def add(self,data):
153 |         """
154 |         This function allows to append data to the already fitted data
155 |         
156 |         Parameters
157 |         ----------
158 |         data : list, numpy.array, pandas.Series
159 |             data to append
160 |         """
161 |         if isinstance(data,list):
162 |             data = np.array(data)
163 |         elif isinstance(data,np.ndarray):
164 |             data = data
165 |         elif isinstance(data,pd.Series):
166 |             data = data.values
167 |         else:
168 |             print('This data format (%s) is not supported' % type(data))
169 |             return
170 |         
171 |         self.data = np.append(self.data,data)
172 |         return
173 | 
174 |     def initialize(self, verbose = True):
175 |         """
176 |         Run the calibration (initialization) step
177 |         
178 |         Parameters
179 |         ----------
180 |         verbose : bool
181 |             (default = True) If True, gives details about the batch initialization
182 |         """
183 |         n_init = self.init_data.size
184 |         
185 |         S = np.sort(self.init_data)     # we sort X to get the empirical quantile
186 |         self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm
187 |         self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm
188 | 
189 |         # initial peaks
190 |         self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up']
191 |         self.peaks['down'] = -(self.init_data[self.init_data<self.init_threshold['down']]-self.init_threshold['down'])
192 |         self.Nt['up'] = self.peaks['up'].size
193 |         self.Nt['down'] = self.peaks['down'].size
194 |         self.n = n_init
195 |         
196 |         if verbose:
197 |             print('Initial threshold : %s' % self.init_threshold)
198 |             print('Number of peaks : %s' % self.Nt)
199 |             #print('Grimshaw maximum log-likelihood estimation ... ', end = '')
200 |             
201 |         l = {'up':None,'down':None}
202 |         for side in ['up','down']:
203 |             g,s,l[side] = self._grimshaw(side)
204 |             self.extreme_quantile[side] = self._quantile(side,g,s)
205 |             self.gamma[side] = g
206 |             self.sigma[side] = s
207 |         
208 |         ltab = 20
209 |         form = ('\t'+'%20s' + '%20.2f' + '%20.2f')
210 |         '''
211 |         if verbose:
212 |             print('[done]')
213 |             print('\t' + 'Parameters'.rjust(ltab) + 'Upper'.rjust(ltab) + 'Lower'.rjust(ltab))
214 |             print('\t' + '-'*ltab*3)
215 |             print(form % (chr(0x03B3),self.gamma['up'],self.gamma['down']))
216 |             print(form % (chr(0x03C3),self.sigma['up'],self.sigma['down']))
217 |             print(form % ('likelihood',l['up'],l['down']))
218 |             print(form % ('Extreme quantile',self.extreme_quantile['up'],self.extreme_quantile['down']))
219 |             print('\t' + '-'*ltab*3)
220 |         '''
221 |         return 
222 |     
223 |     
224 |     
225 |     
226 |     def _rootsFinder(self, fun,jac,bounds,npoints,method):
227 |         """
228 |         Find possible roots of a scalar function
229 |         
230 |         Parameters
231 |         ----------
232 |         fun : function
233 |             scalar function 
234 |         jac : function
235 |             first order derivative of the function  
236 |         bounds : tuple
237 |             (min,max) interval for the roots search    
238 |         npoints : int
239 |             maximum number of roots to output      
240 |         method : str
241 |             'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval
242 |         
243 |         Returns
244 |         ----------
245 |         numpy.array
246 |             possible roots of the function
247 |         """
248 |         if method == 'regular':
249 |             step = (bounds[1]-bounds[0])/(npoints+1)
250 |             X0 = np.arange(bounds[0]+step,bounds[1],step)
251 |         elif method == 'random':
252 |             X0 = np.random.uniform(bounds[0],bounds[1],npoints)
253 |         
254 |         def objFun(X,f,jac):
255 |             g = 0
256 |             j = np.zeros(X.shape)
257 |             i = 0
258 |             for x in X:
259 |                 fx = f(x)
260 |                 g = g+fx**2
261 |                 j[i] = 2*fx*jac(x)
262 |                 i = i+1
263 |             return g,j
264 |         opt = minimize(lambda X:objFun(X,fun,jac), X0, 
265 |                        method='L-BFGS-B', 
266 |                        jac=True, bounds=[bounds]*len(X0))
267 |         
268 |         X = opt.x
269 |         np.round(X,decimals = 5)
270 |         return np.unique(X)
271 |     
272 |     
273 |     def _log_likelihood(self, Y,gamma,sigma):
274 |         """
275 |         Compute the log-likelihood for the Generalized Pareto Distribution (μ=0)
276 |         
277 |         Parameters
278 |         ----------
279 |         Y : numpy.array
280 |             observations
281 |         gamma : float
282 |             GPD index parameter
283 |         sigma : float
284 |             GPD scale parameter (>0)   
285 |         Returns
286 |         ----------
287 |         float
288 |             log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0)
289 |         """
290 |         n = Y.size
291 |         if gamma != 0:
292 |             tau = gamma/sigma
293 |             L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum()
294 |         else:
295 |             L = n * ( 1 + log(Y.mean()) )
296 |         return L
297 | 
298 | 
299 |     def _grimshaw(self,side,epsilon = 1e-8, n_points = 10):
300 |         """
301 |         Compute the GPD parameters estimation with the Grimshaw's trick
302 |         
303 |         Parameters
304 |         ----------
305 |         epsilon : float
306 |             numerical parameter to perform (default : 1e-8)
307 |         n_points : int
308 |             maximum number of candidates for maximum likelihood (default : 10)
309 |         Returns
310 |         ----------
311 |         gamma_best,sigma_best,ll_best
312 |             gamma estimates, sigma estimates and corresponding log-likelihood
313 |         """
314 |         def u(s):
315 |             return 1 + np.log(s).mean()
316 |             
317 |         def v(s):
318 |             return np.mean(1/s)
319 |         
320 |         def w(Y,t):
321 |             s = 1+t*Y
322 |             us = u(s)
323 |             vs = v(s)
324 |             return us*vs-1
325 |         
326 |         def jac_w(Y,t):
327 |             s = 1+t*Y
328 |             us = u(s)
329 |             vs = v(s)
330 |             jac_us = (1/t)*(1-vs)
331 |             jac_vs = (1/t)*(-vs+np.mean(1/s**2))
332 |             return us*jac_vs+vs*jac_us
333 |             
334 |     
335 |         Ym = self.peaks[side].min()
336 |         YM = self.peaks[side].max()
337 |         Ymean = self.peaks[side].mean()
338 |         
339 |         
340 |         a = -1/YM
341 |         if abs(a)<2*epsilon:
342 |             epsilon = abs(a)/n_points
343 |         
344 |         a = a + epsilon
345 |         b = 2*(Ymean-Ym)/(Ymean*Ym)
346 |         c = 2*(Ymean-Ym)/(Ym**2)
347 |     
348 |         # We look for possible roots
349 |         left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
350 |                                  lambda t: jac_w(self.peaks[side],t),
351 |                                  (a+epsilon,-epsilon),
352 |                                  n_points,'regular')
353 |         
354 |         right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
355 |                                   lambda t: jac_w(self.peaks[side],t),
356 |                                   (b,c),
357 |                                   n_points,'regular')
358 |     
359 |         # all the possible roots
360 |         zeros = np.concatenate((left_zeros,right_zeros))
361 |         
362 |         # 0 is always a solution so we initialize with it
363 |         gamma_best = 0
364 |         sigma_best = Ymean
365 |         ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best)
366 |         
367 |         # we look for better candidates
368 |         for z in zeros:
369 |             gamma = u(1+z*self.peaks[side])-1
370 |             sigma = gamma/z
371 |             ll = self._log_likelihood(self.peaks[side],gamma,sigma)
372 |             if ll>ll_best:
373 |                 gamma_best = gamma
374 |                 sigma_best = sigma
375 |                 ll_best = ll
376 |     
377 |         return gamma_best,sigma_best,ll_best
378 | 
379 |     
380 | 
381 |     def _quantile(self,side,gamma,sigma):
382 |         """
383 |         Compute the quantile at level 1-q for a given side
384 |         
385 |         Parameters
386 |         ----------
387 |         side : str
388 |             'up' or 'down'
389 |         gamma : float
390 |             GPD parameter
391 |         sigma : float
392 |             GPD parameter
393 |         Returns
394 |         ----------
395 |         float
396 |             quantile at level 1-q for the GPD(γ,σ,μ=0)
397 |         """
398 |         if side == 'up':
399 |             r = self.n * self.proba / self.Nt[side]
400 |             if gamma != 0:
401 |                 return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1)
402 |             else:
403 |                 return self.init_threshold['up'] - sigma*log(r)
404 |         elif side == 'down':
405 |             r = self.n * self.proba / self.Nt[side]
406 |             if gamma != 0:
407 |                 return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1)
408 |             else:
409 |                 return self.init_threshold['down'] + sigma*log(r)
410 |         else:
411 |             print('error : the side is not right')
412 | 
413 |         
414 |     def run(self, with_alarm = True):
415 |         """
416 |         Run biSPOT on the stream
417 |         
418 |         Parameters
419 |         ----------
420 |         with_alarm : bool
421 |             (default = True) If False, SPOT will adapt the threshold assuming \
422 |             there is no abnormal values
423 |         Returns
424 |         ----------
425 |         dict
426 |             keys : 'upper_thresholds', 'lower_thresholds' and 'alarms'
427 |             
428 |             '***-thresholds' contains the extreme quantiles and 'alarms' contains \
429 |             the indexes of the values which have triggered alarms
430 |             
431 |         """
432 |         if (self.n>self.init_data.size):
433 |             print('Warning : the algorithm seems to have already been run, you \
434 |             should initialize before running again')
435 |             return {}
436 |         
437 |         # list of the thresholds
438 |         thup = []
439 |         thdown = []
440 |         alarm = []
441 |         # Loop over the stream
442 |         for i in tqdm.tqdm(range(self.data.size)):
443 |     
444 |             # If the observed value exceeds the current threshold (alarm case)
445 |             if self.data[i]>self.extreme_quantile['up'] :
446 |                 # if we want to alarm, we put it in the alarm list
447 |                 if with_alarm:
448 |                     alarm.append(i)
449 |                 # otherwise we add it in the peaks
450 |                 else:
451 |                     self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up'])
452 |                     self.Nt['up'] += 1
453 |                     self.n += 1
454 |                     # and we update the thresholds
455 | 
456 |                     g,s,l = self._grimshaw('up')
457 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
458 | 
459 |             # case where the value exceeds the initial threshold but not the alarm ones
460 |             elif self.data[i]>self.init_threshold['up']:
461 |                     # we add it in the peaks
462 |                     self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up'])
463 |                     self.Nt['up'] += 1
464 |                     self.n += 1
465 |                     # and we update the thresholds
466 | 
467 |                     g,s,l = self._grimshaw('up')
468 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
469 |                     
470 |             elif self.data[i]<self.extreme_quantile['down'] :
471 |                 # if we want to alarm, we put it in the alarm list
472 |                 if with_alarm:
473 |                     alarm.append(i)
474 |                 # otherwise we add it in the peaks
475 |                 else:
476 |                     self.peaks['down'] = np.append(self.peaks['down'],-(self.data[i]-self.init_threshold['down']))
477 |                     self.Nt['down'] += 1
478 |                     self.n += 1
479 |                     # and we update the thresholds
480 | 
481 |                     g,s,l = self._grimshaw('down')
482 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
483 | 
484 |             # case where the value exceeds the initial threshold but not the alarm ones
485 |             elif self.data[i]<self.init_threshold['down']:
486 |                     # we add it in the peaks
487 |                     self.peaks['down'] = np.append(self.peaks['down'],-(self.data[i]-self.init_threshold['down']))
488 |                     self.Nt['down'] += 1
489 |                     self.n += 1
490 |                     # and we update the thresholds
491 | 
492 |                     g,s,l = self._grimshaw('down')
493 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
494 |             else:
495 |                 self.n += 1
496 |             if self.n % self.update_number == 0: #update 
497 |                 # update on time
498 |                 up_data =np.append(self.init_data, self.data[:i])
499 |                 print 'update at ',i
500 |                 print 'updating using data: ', len(up_data)
501 |                 S = np.sort(up_data)     # we sort X to get the empirical quantile
502 |                 self.init_threshold['up'] = S[int(0.98*len(S))] # t is fixed for the whole algorithm
503 |                 self.init_threshold['down'] = S[int(0.02*len(S))] # t is fixed for the whole algorithm
504 | 
505 |                 self.peaks['up'] = up_data[up_data>self.init_threshold['up']]-self.init_threshold['up']
506 |                 self.peaks['down'] = -(up_data[up_data<self.init_threshold['down']]-self.init_threshold['down'])
507 |                 self.Nt['up'] = self.peaks['up'].size
508 |                 self.Nt['down'] = self.peaks['down'].size
509 |                 
510 |                 l = {'up':None,'down':None}
511 |                 for side in ['up','down']:
512 |                     g,s,l[side] = self._grimshaw(side)
513 |                     self.extreme_quantile[side] = self._quantile(side,g,s)
514 |                     self.gamma[side] = g
515 |                     self.sigma[side] = s
516 |                 
517 |             thup.append(self.extreme_quantile['up']) # thresholds record
518 |             thdown.append(self.extreme_quantile['down']) # thresholds record
519 |         
520 |         return {'upper_thresholds' : thup,'lower_thresholds' : thdown, 'alarms': alarm}
521 |     
522 |     def plot(self,run_results,with_alarm = True):
523 |         """
524 |         Plot the results of given by the run
525 |         
526 |         Parameters
527 |         ----------
528 |         run_results : dict
529 |             results given by the 'run' method
530 |         with_alarm : bool
531 |             (default = True) If True, alarms are plotted.
532 |         Returns
533 |         ----------
534 |         list
535 |             list of the plots
536 |             
537 |         """
538 |         x = range(self.data.size)
539 |         K = run_results.keys()
540 |         
541 |         ts_fig, = plt.plot(x,self.data,color=air_force_blue)
542 |         fig = [ts_fig]
543 |         
544 |         if 'upper_thresholds' in K:
545 |             thup = run_results['upper_thresholds']
546 |             uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed')
547 |             fig.append(uth_fig)
548 |             
549 |         if 'lower_thresholds' in K:
550 |             thdown = run_results['lower_thresholds']
551 |             lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed')
552 |             fig.append(lth_fig)
553 |         
554 |         if with_alarm and ('alarms' in K):
555 |             alarm = run_results['alarms']
556 |             al_fig = plt.scatter(alarm,self.data[alarm],color='red')
557 |             fig.append(al_fig)
558 |             
559 |         plt.xlim((0,self.data.size))
560 |         plt.show()
561 |         
562 |         return fig
563 | 
564 | 


--------------------------------------------------------------------------------
/espot.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | from math import log,floor
  6 | import tqdm
  7 | from scipy.optimize import minimize
  8 | # colors for plot
  9 | deep_saffron = '#FF9933'
 10 | air_force_blue = '#5D8AA8'
 11 | 
 12 | def backMean(X,d):
 13 |     M = []
 14 |     w = X[:d].sum()
 15 |     M.append(w/d)
 16 |     for i in range(d,len(X)):
 17 |         w = w - X[i-d] + X[i]
 18 |         M.append(w/d)
 19 |     return np.array(M)
 20 | class ESPOT:
 21 |     """
 22 |     This class allows to run DSPOT algorithm on univariate dataset (upper and lower bounds)
 23 |     
 24 |     Attributes
 25 |     ----------
 26 |     proba : float
 27 |         Detection level (risk), chosen by the user
 28 |         
 29 |     depth : int
 30 |         Number of observations to compute the moving average
 31 |         
 32 |     extreme_quantile : float
 33 |         current threshold (bound between normal and abnormal events)
 34 |         
 35 |     data : numpy.array
 36 |         stream
 37 |     
 38 |     init_data : numpy.array
 39 |         initial batch of observations (for the calibration/initialization step)
 40 |     
 41 |     init_threshold : float
 42 |         initial threshold computed during the calibration step
 43 |     
 44 |     peaks : numpy.array
 45 |         array of peaks (excesses above the initial threshold)
 46 |     
 47 |     n : int
 48 |         number of observed values
 49 |     
 50 |     Nt : int
 51 |         number of observed peaks
 52 |     """
 53 |     def __init__(self, q = 1e-4, depth = 10):
 54 |         self.proba = q
 55 |         self.data = None
 56 |         self.init_data = None
 57 |         self.n = 0
 58 |         self.depth = depth
 59 |         
 60 |         nonedict =  {'up':None,'down':None}
 61 |         
 62 |         self.extreme_quantile = dict.copy(nonedict)
 63 |         self.init_threshold = dict.copy(nonedict)
 64 |         self.peaks = dict.copy(nonedict)
 65 |         self.gamma = dict.copy(nonedict)
 66 |         self.sigma = dict.copy(nonedict)
 67 |         self.Nt = {'up':0,'down':0}
 68 |         
 69 |         
 70 |     def __str__(self):
 71 |         s = ''
 72 |         s += 'Streaming Peaks-Over-Threshold Object\n'
 73 |         s += 'Detection level q = %s\n' % self.proba
 74 |         if self.data is not None:
 75 |             s += 'Data imported : Yes\n'
 76 |             s += '\t initialization  : %s values\n' % self.init_data.size
 77 |             s += '\t stream : %s values\n' % self.data.size
 78 |         else:
 79 |             s += 'Data imported : No\n'
 80 |             return s
 81 |             
 82 |         if self.n == 0:
 83 |             s += 'Algorithm initialized : No\n'
 84 |         else:
 85 |             s += 'Algorithm initialized : Yes\n'
 86 |             s += '\t initial threshold : %s\n' % self.init_threshold
 87 |             
 88 |             r = self.n-self.init_data.size
 89 |             if r > 0:
 90 |                 s += 'Algorithm run : Yes\n'
 91 |                 s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n)
 92 |                 s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n)
 93 |             else:
 94 |                 s += '\t number of peaks  : %s\n' % self.Nt
 95 |                 s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up']
 96 |                 s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down']
 97 |                 s += 'Algorithm run : No\n'
 98 |         return s
 99 |     
100 |     
101 |     def fit(self,init_data,data):
102 |         """
103 |         Import data to ESPOT object
104 |         
105 |         Parameters
106 |         ----------
107 |         init_data : list, numpy.array or pandas.Series
108 |             initial batch to calibrate the algorithm
109 |             
110 |         data : numpy.array
111 |             data for the run (list, np.array or pd.series)
112 |     
113 |         """
114 |         if isinstance(data,list):
115 |             self.data = np.array(data)
116 |         elif isinstance(data,np.ndarray):
117 |             self.data = data
118 |         elif isinstance(data,pd.Series):
119 |             self.data = data.values
120 |         else:
121 |             print('This data format (%s) is not supported' % type(data))
122 |             return
123 |             
124 |         if isinstance(init_data,list):
125 |             self.init_data = np.array(init_data)
126 |         elif isinstance(init_data,np.ndarray):
127 |             self.init_data = init_data
128 |         elif isinstance(init_data,pd.Series):
129 |             self.init_data = init_data.values
130 |         elif isinstance(init_data,int):
131 |             self.init_data = self.data[:init_data]
132 |             self.data = self.data[init_data:]
133 |         elif isinstance(init_data,float) & (init_data<1) & (init_data>0):
134 |             r = int(init_data*data.size)
135 |             self.init_data = self.data[:r]
136 |             self.data = self.data[r:]
137 |         else:
138 |             print('The initial data cannot be set')
139 |             return
140 |         
141 |     def add(self,data):
142 |         """
143 |         This function allows to append data to the already fitted data
144 |         
145 |         Parameters
146 |         ----------
147 |         data : list, numpy.array, pandas.Series
148 |             data to append
149 |         """
150 |         if isinstance(data,list):
151 |             data = np.array(data)
152 |         elif isinstance(data,np.ndarray):
153 |             data = data
154 |         elif isinstance(data,pd.Series):
155 |             data = data.values
156 |         else:
157 |             print('This data format (%s) is not supported' % type(data))
158 |             return
159 |         
160 |         self.data = np.append(self.data,data)
161 |         return
162 |     
163 |     def initialize(self, verbose = True):
164 |         """
165 |         Run the calibration (initialization) step
166 |         
167 |         Parameters
168 |         ----------
169 |         verbose : bool
170 |             (default = True) If True, gives details about the batch initialization
171 |         """
172 |         n_init = self.init_data.size - self.depth
173 |         
174 |         M = backMean(self.init_data,self.depth)
175 |         T = self.init_data[self.depth:]-M[:-1] # new variable
176 |         
177 |         S = np.sort(T)     # we sort T to get the empirical quantile
178 |         self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm
179 |         self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm
180 | 
181 |         # initial peaks
182 |         self.peaks['up'] = T[T>self.init_threshold['up']]-self.init_threshold['up']
183 |         self.peaks['down'] = -( T[ T<self.init_threshold['down'] ] - self.init_threshold['down'] )
184 |         self.Nt['up'] = self.peaks['up'].size
185 |         self.Nt['down'] = self.peaks['down'].size
186 |         self.n = n_init
187 |         
188 |         if verbose:
189 |             print('Initial threshold : %s' % self.init_threshold)
190 |             print('Number of peaks : %s' % self.Nt)
191 |             print('Grimshaw maximum log-likelihood estimation ... ')
192 |             
193 |         l = {'up':None,'down':None}
194 |         for side in ['up','down']:
195 |             g,s,l[side] = self._grimshaw(side)
196 |             self.extreme_quantile[side] = self._quantile(side,g,s)
197 |             self.gamma[side] = g
198 |             self.sigma[side] = s
199 |         
200 |         ltab = 20
201 |         '''
202 |         form = ('\t'+'%20s' + '%20.2f' + '%20.2f')
203 |         if verbose:
204 |             print('[done]')
205 |             print('\t' + 'Parameters'.rjust(ltab) + 'Upper'.rjust(ltab) + 'Lower'.rjust(ltab))
206 |             print('\t' + '-'*ltab*3)
207 |             print(form % (chr(0x03B3),self.gamma['up'],self.gamma['down']))
208 |             print(form % (chr(0x03C3),self.sigma['up'],self.sigma['down']))
209 |             print(form % ('likelihood',l['up'],l['down']))
210 |             print(form % ('Extreme quantile',self.extreme_quantile['up'],self.extreme_quantile['down']))
211 |             print('\t' + '-'*ltab*3)
212 |         '''
213 |         return 
214 |     
215 |     
216 |     
217 |     
218 |     def _rootsFinder(self, fun,jac,bounds,npoints,method):
219 |         """
220 |         Find possible roots of a scalar function
221 |         
222 |         Parameters
223 |         ----------
224 |         fun : function
225 |             scalar function 
226 |         jac : function
227 |             first order derivative of the function  
228 |         bounds : tuple
229 |             (min,max) interval for the roots search    
230 |         npoints : int
231 |             maximum number of roots to output      
232 |         method : str
233 |             'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval
234 |         
235 |         Returns
236 |         ----------
237 |         numpy.array
238 |             possible roots of the function
239 |         """
240 |         if method == 'regular':
241 |             step = (bounds[1]-bounds[0])/(npoints+1)
242 |             X0 = np.arange(bounds[0]+step,bounds[1],step)
243 |         elif method == 'random':
244 |             X0 = np.random.uniform(bounds[0],bounds[1],npoints)
245 |         
246 |         def objFun(X,f,jac):
247 |             g = 0
248 |             j = np.zeros(X.shape)
249 |             i = 0
250 |             for x in X:
251 |                 fx = f(x)
252 |                 g = g+fx**2
253 |                 j[i] = 2*fx*jac(x)
254 |                 i = i+1
255 |             return g,j
256 |         
257 |         opt = minimize(lambda X:objFun(X,fun,jac), X0, 
258 |                        method='L-BFGS-B', 
259 |                        jac=True, bounds=[bounds]*len(X0))
260 |         
261 |         X = opt.x
262 |         np.round(X,decimals = 5)
263 |         return np.unique(X)
264 |     
265 |     
266 |     def _log_likelihood(self, Y,gamma,sigma):
267 |         """
268 |         Compute the log-likelihood for the Generalized Pareto Distribution (μ=0)
269 |         
270 |         Parameters
271 |         ----------
272 |         Y : numpy.array
273 |             observations
274 |         gamma : float
275 |             GPD index parameter
276 |         sigma : float
277 |             GPD scale parameter (>0)   
278 |         Returns
279 |         ----------
280 |         float
281 |             log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0)
282 |         """
283 |         n = Y.size
284 |         if gamma != 0:
285 |             tau = gamma/sigma
286 |             L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum()
287 |         else:
288 |             L = n * ( 1 + log(Y.mean()) )
289 |         return L
290 | 
291 | 
292 |     def _grimshaw(self,side,epsilon = 1e-8, n_points = 8):
293 |         """
294 |         Compute the GPD parameters estimation with the Grimshaw's trick
295 |         
296 |         Parameters
297 |         ----------
298 |         epsilon : float
299 |             numerical parameter to perform (default : 1e-8)
300 |         n_points : int
301 |             maximum number of candidates for maximum likelihood (default : 10)
302 |         Returns
303 |         ----------
304 |         gamma_best,sigma_best,ll_best
305 |             gamma estimates, sigma estimates and corresponding log-likelihood
306 |         """
307 |         def u(s):
308 |             return 1 + np.log(s).mean()
309 |             
310 |         def v(s):
311 |             return np.mean(1/s)
312 |         
313 |         def w(Y,t):
314 |             s = 1+t*Y
315 |             us = u(s)
316 |             vs = v(s)
317 |             return us*vs-1
318 |         
319 |         def jac_w(Y,t):
320 |             s = 1+t*Y
321 |             us = u(s)
322 |             vs = v(s)
323 |             jac_us = (1/t)*(1-vs)
324 |             jac_vs = (1/t)*(-vs+np.mean(1/s**2))
325 |             return us*jac_vs+vs*jac_us
326 |             
327 |     
328 |         Ym = self.peaks[side].min()
329 |         YM = self.peaks[side].max()
330 |         Ymean = self.peaks[side].mean()
331 |         
332 |         
333 |         a = -1/YM
334 |         if abs(a)<2*epsilon:
335 |             epsilon = abs(a)/n_points
336 |         
337 |         a = a + epsilon
338 |         b = 2*(Ymean-Ym)/(Ymean*Ym)
339 |         c = 2*(Ymean-Ym)/(Ym**2)
340 |     
341 |         # We look for possible roots
342 |         left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
343 |                                  lambda t: jac_w(self.peaks[side],t),
344 |                                  (a+epsilon,-epsilon),
345 |                                  n_points,'regular')
346 |         
347 |         right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
348 |                                   lambda t: jac_w(self.peaks[side],t),
349 |                                   (b,c),
350 |                                   n_points,'regular')
351 |     
352 |         # all the possible roots
353 |         zeros = np.concatenate((left_zeros,right_zeros))
354 |         
355 |         # 0 is always a solution so we initialize with it
356 |         gamma_best = 0
357 |         sigma_best = Ymean
358 |         ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best)
359 |         
360 |         # we look for better candidates
361 |         for z in zeros:
362 |             gamma = u(1+z*self.peaks[side])-1
363 |             sigma = gamma/z
364 |             ll = self._log_likelihood(self.peaks[side],gamma,sigma)
365 |             if ll>ll_best:
366 |                 gamma_best = gamma
367 |                 sigma_best = sigma
368 |                 ll_best = ll
369 |     
370 |         return gamma_best,sigma_best,ll_best
371 | 
372 |     
373 | 
374 |     def _quantile(self,side,gamma,sigma):
375 |         """
376 |         Compute the quantile at level 1-q for a given side
377 |         
378 |         Parameters
379 |         ----------
380 |         side : str
381 |             'up' or 'down'
382 |         gamma : float
383 |             GPD parameter
384 |         sigma : float
385 |             GPD parameter
386 |         Returns
387 |         ----------
388 |         float
389 |             quantile at level 1-q for the GPD(γ,σ,μ=0)
390 |         """
391 |         if side == 'up':
392 |             r = self.n * self.proba / self.Nt[side]
393 |             if gamma != 0:
394 |                 return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1)
395 |             else:
396 |                 return self.init_threshold['up'] - sigma*log(r)
397 |         elif side == 'down':
398 |             r = self.n * self.proba / self.Nt[side]
399 |             if gamma != 0:
400 |                 return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1)
401 |             else:
402 |                 return self.init_threshold['down'] + sigma*log(r)
403 |         else:
404 |             print('error : the side is not right')
405 | 
406 |     def ewma(self, X, alpha = 0.1):
407 |         s = [X[0]]
408 |         for i in range(1, len(X)):
409 |             temp = alpha * X[i] + (1 - alpha) * s[-1]
410 |             s.append(temp)  
411 |         return s[-1]  
412 |     
413 |     def run(self, with_alarm = True, plot = True):
414 |         """
415 |         Run ESPOT on the stream
416 |         
417 |         Parameters
418 |         ----------
419 |         with_alarm : bool
420 |             (default = True) If False, SPOT will adapt the threshold assuming \
421 |             there is no abnormal values
422 |         Returns
423 |         ----------
424 |         dict
425 |             keys : 'upper_thresholds', 'lower_thresholds' and 'alarms'
426 |             
427 |             '***-thresholds' contains the extreme quantiles and 'alarms' contains \
428 |             the indexes of the values which have triggered alarms
429 |             
430 |         """
431 |         if (self.n>self.init_data.size):
432 |             print('Warning : the algorithm seems to have already been run, you \
433 |             should initialize before running again')
434 |             return {}
435 |         
436 |         # actual normal window
437 |         W = self.init_data[-self.depth:]
438 |         
439 |         # list of the thresholds
440 |         thup = []
441 |         thdown = []
442 |         alarm = []
443 |         # Loop over the stream
444 |         for i in tqdm.tqdm(range(self.data.size)):
445 |             Mi = self.ewma(W)# DAWNSON IN YOUR AREA HAHA HAHA
446 |             Ni = self.data[i]-Mi
447 |             # If the observed value exceeds the current threshold (alarm case)
448 |             if Ni>self.extreme_quantile['up'] :
449 |                 # if we want to alarm, we put it in the alarm list
450 |                 if with_alarm:
451 |                     alarm.append(i)
452 |                 # otherwise we add it in the peaks
453 |                 else:
454 |                     self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up'])
455 |                     self.Nt['up'] += 1
456 |                     self.n += 1
457 |                     # and we update the thresholds
458 | 
459 |                     g,s,l = self._grimshaw('up')
460 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
461 |                     W = np.append(W[1:],self.data[i])
462 |                     
463 |             # case where the value exceeds the initial threshold but not the alarm ones
464 |             elif Ni>self.init_threshold['up']:
465 |                     # we add it in the peaks
466 |                     self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up'])
467 |                     self.Nt['up'] += 1
468 |                     self.n += 1
469 |                     # and we update the thresholds
470 |                     g,s,l = self._grimshaw('up')
471 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
472 |                     W = np.append(W[1:],self.data[i])
473 |                     
474 |             elif Ni<self.extreme_quantile['down'] :
475 |                 # if we want to alarm, we put it in the alarm list
476 |                 if with_alarm:
477 |                     alarm.append(i)
478 |                 # otherwise we add it in the peaks
479 |                 else:
480 |                     self.peaks['down'] = np.append(self.peaks['down'],-(Ni-self.init_threshold['down']))
481 |                     self.Nt['down'] += 1
482 |                     self.n += 1
483 |                     # and we update the thresholds
484 | 
485 |                     g,s,l = self._grimshaw('down')
486 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
487 |                     W = np.append(W[1:],self.data[i])
488 |                     
489 |             # case where the value exceeds the initial threshold but not the alarm ones
490 |             elif Ni<self.init_threshold['down']:
491 |                     # we add it in the peaks
492 |                     self.peaks['down'] = np.append(self.peaks['down'],-(Ni-self.init_threshold['down']))
493 |                     self.Nt['down'] += 1
494 |                     self.n += 1
495 |                     # and we update the thresholds
496 | 
497 |                     g,s,l = self._grimshaw('down')
498 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
499 |                     W = np.append(W[1:],self.data[i])
500 |             else:
501 |                 self.n += 1
502 |                 W = np.append(W[1:],self.data[i])
503 | 
504 |                 
505 |             thup.append(self.extreme_quantile['up']+Mi) # upper thresholds record
506 |             thdown.append(self.extreme_quantile['down']+Mi) # lower thresholds record
507 |         
508 |         return {'upper_thresholds' : thup,'lower_thresholds' : thdown, 'alarms': alarm}
509 |     
510 | 
511 |     def plot(self,run_results, with_alarm = True):
512 |         """
513 |         Plot the results given by the run
514 |         
515 |         Parameters
516 |         ----------
517 |         run_results : dict
518 |             results given by the 'run' method
519 |         with_alarm : bool
520 |             (default = True) If True, alarms are plotted.
521 |         Returns
522 |         ----------
523 |         list
524 |             list of the plots
525 |             
526 |         """
527 |         x = range(self.data.size)
528 |         K = run_results.keys()
529 |         
530 |         ts_fig, = plt.plot(x,self.data,color=air_force_blue)
531 |         fig = [ts_fig]
532 |         
533 |         if 'upper_thresholds' in K:
534 |             thup = run_results['upper_thresholds']
535 |             uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed')
536 |             fig.append(uth_fig)
537 |             
538 |         if 'lower_thresholds' in K:
539 |             thdown = run_results['lower_thresholds']
540 |             lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed')
541 |             fig.append(lth_fig)
542 |         
543 |         if with_alarm and ('alarms' in K):
544 |             alarm = run_results['alarms']
545 |             if len(alarm)>0:
546 |                 al_fig = plt.scatter(alarm,self.data[alarm],color='red')
547 |                 fig.append(al_fig)
548 |             
549 |         plt.xlim((0,self.data.size))
550 |         plt.show()
551 |         
552 |         return fig
553 | 
554 | 


--------------------------------------------------------------------------------
/middle_spot.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | from math import log,floor
  6 | import tqdm
  7 | from scipy.optimize import minimize
  8 | # colors for plot
  9 | deep_saffron = '#FF9933'
 10 | air_force_blue = '#5D8AA8'
 11 | 
 12 | def backMean(X,d):
 13 |     M = []
 14 |     w = X[:d].sum()
 15 |     M.append(w/d)
 16 |     for i in range(d,len(X)):
 17 |         w = w - X[i-d] + X[i]
 18 |         M.append(w/d)
 19 |     return np.array(M)
 20 | class MISPOT:
 21 |     """
 22 |     This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds)
 23 |     
 24 |     Attributes
 25 |     ----------
 26 |     proba : float
 27 |         Detection level (risk), chosen by the user
 28 |         
 29 |     extreme_quantile : float
 30 |         current threshold (bound between normal and abnormal events)
 31 |         
 32 |     data : numpy.array
 33 |         stream
 34 |     
 35 |     init_data : numpy.array
 36 |         initial batch of observations (for the calibration/initialization step)
 37 |     
 38 |     init_threshold : float  ------------t
 39 |         initial threshold computed during the calibration step
 40 |     
 41 |     peaks : numpy.array
 42 |         array of peaks (excesses above the initial threshold)
 43 |     
 44 |     n : int
 45 |         number of observed values
 46 |     
 47 |     Nt : int
 48 |         number of observed peaks
 49 |     """
 50 |     def __init__(self, q = 1e-4):
 51 |         """
 52 |         Constructor
 53 |         Parameters
 54 |         ----------
 55 |         q
 56 |             Detection level (risk)
 57 |     
 58 |         Returns
 59 |         ----------
 60 |         biSPOT object
 61 |         """
 62 |         self.proba = q
 63 |         self.data = None
 64 |         self.init_data = None
 65 |         self.update_number = 0
 66 |         self.n = 0
 67 |         nonedict =  {'up':None,'down':None}
 68 |         
 69 |         self.extreme_quantile = dict.copy(nonedict)
 70 |         self.init_threshold = dict.copy(nonedict)
 71 |         self.peaks = dict.copy(nonedict)
 72 |         self.gamma = dict.copy(nonedict)
 73 |         self.sigma = dict.copy(nonedict)
 74 |         self.Nt = {'up':0,'down':0}
 75 |         
 76 |         
 77 |     def __str__(self):
 78 |         s = ''
 79 |         s += 'Streaming Peaks-Over-Threshold Object\n'
 80 |         s += 'Detection level q = %s\n' % self.proba
 81 |         if self.data is not None:
 82 |             s += 'Data imported : Yes\n'
 83 |             s += '\t initialization  : %s values\n' % self.init_data.size
 84 |             s += '\t stream : %s values\n' % self.data.size
 85 |         else:
 86 |             s += 'Data imported : No\n'
 87 |             return s
 88 |             
 89 |         if self.n == 0:
 90 |             s += 'Algorithm initialized : No\n'
 91 |         else:
 92 |             s += 'Algorithm initialized : Yes\n'
 93 |             s += '\t initial threshold : %s\n' % self.init_threshold
 94 |             
 95 |             r = self.n-self.init_data.size
 96 |             if r > 0:
 97 |                 s += 'Algorithm run : Yes\n'
 98 |                 s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n)
 99 |                 s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n)
100 |             else:
101 |                 s += '\t number of peaks  : %s\n' % self.Nt
102 |                 s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up']
103 |                 s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down']
104 |                 s += 'Algorithm run : No\n'
105 |         return s
106 |     
107 |     
108 |     def fit(self,init_data,data):
109 |         """
110 |         Import data to biSPOT object
111 |         
112 |         Parameters
113 |         ----------
114 |         init_data : list, numpy.array or pandas.Series
115 |             initial batch to calibrate the algorithm ()
116 |             
117 |         data : numpy.array
118 |             data for the run (list, np.array or pd.series)
119 |     
120 |         """
121 |         if isinstance(data,list):
122 |             self.data = np.array(data)
123 |         elif isinstance(data,np.ndarray):
124 |             self.data = data
125 |         elif isinstance(data,pd.Series):
126 |             self.data = data.values
127 |         else:
128 |             print('This data format (%s) is not supported' % type(data))
129 |             return
130 |             
131 |         if isinstance(init_data,list):
132 |             self.init_data = np.array(init_data)
133 |             self.update_number = len(self.init_data)
134 |         elif isinstance(init_data,np.ndarray):
135 |             self.init_data = init_data
136 |             self.update_number = len(self.init_data)
137 |         elif isinstance(init_data,pd.Series):
138 |             self.init_data = init_data.values
139 |             self.update_number = len(self.init_data)
140 |         elif isinstance(init_data,int):
141 |             self.init_data = self.data[:init_data]
142 |             self.data = self.data[init_data:]
143 |             self.update_number = init_data
144 |         elif isinstance(init_data,float) & (init_data<1) & (init_data>0):
145 |             r = int(init_data*data.size)
146 |             self.init_data = self.data[:r]
147 |             self.data = self.data[r:]
148 |         else:
149 |             print('The initial data cannot be set')
150 |             return
151 |         
152 |     def add(self,data):
153 |         """
154 |         This function allows to append data to the already fitted data
155 |         
156 |         Parameters
157 |         ----------
158 |         data : list, numpy.array, pandas.Series
159 |             data to append
160 |         """
161 |         if isinstance(data,list):
162 |             data = np.array(data)
163 |         elif isinstance(data,np.ndarray):
164 |             data = data
165 |         elif isinstance(data,pd.Series):
166 |             data = data.values
167 |         else:
168 |             print('This data format (%s) is not supported' % type(data))
169 |             return
170 |         
171 |         self.data = np.append(self.data,data)
172 |         return
173 | 
174 |     def initialize(self, verbose = True):
175 |         """
176 |         Run the calibration (initialization) step
177 |         
178 |         Parameters
179 |         ----------
180 |         verbose : bool
181 |             (default = True) If True, gives details about the batch initialization
182 |         """
183 |         n_init = self.init_data.size
184 |         
185 |         S = np.sort(self.init_data)     # we sort X to get the empirical quantile
186 |         self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm
187 |         self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm
188 | 
189 |         # initial peaks
190 |         self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up']
191 |         self.peaks['down'] = -(self.init_data[self.init_data<self.init_threshold['down']]-self.init_threshold['down'])
192 |         self.Nt['up'] = self.peaks['up'].size
193 |         self.Nt['down'] = self.peaks['down'].size
194 |         self.n = n_init
195 |         
196 |         if verbose:
197 |             print('Initial threshold : %s' % self.init_threshold)
198 |             print('Number of peaks : %s' % self.Nt)
199 |             #print('Grimshaw maximum log-likelihood estimation ... ', end = '')
200 |             
201 |         l = {'up':None,'down':None}
202 |         for side in ['up','down']:
203 |             g,s,l[side] = self._grimshaw(side)
204 |             self.extreme_quantile[side] = self._quantile(side,g,s)
205 |             self.gamma[side] = g
206 |             self.sigma[side] = s
207 |         
208 |         ltab = 20
209 |         form = ('\t'+'%20s' + '%20.2f' + '%20.2f')
210 |         '''
211 |         if verbose:
212 |             print('[done]')
213 |             print('\t' + 'Parameters'.rjust(ltab) + 'Upper'.rjust(ltab) + 'Lower'.rjust(ltab))
214 |             print('\t' + '-'*ltab*3)
215 |             print(form % (chr(0x03B3),self.gamma['up'],self.gamma['down']))
216 |             print(form % (chr(0x03C3),self.sigma['up'],self.sigma['down']))
217 |             print(form % ('likelihood',l['up'],l['down']))
218 |             print(form % ('Extreme quantile',self.extreme_quantile['up'],self.extreme_quantile['down']))
219 |             print('\t' + '-'*ltab*3)
220 |         '''
221 |         return 
222 |     
223 |     
224 |     
225 |     
226 |     def _rootsFinder(self, fun,jac,bounds,npoints,method):
227 |         """
228 |         Find possible roots of a scalar function
229 |         
230 |         Parameters
231 |         ----------
232 |         fun : function
233 |             scalar function 
234 |         jac : function
235 |             first order derivative of the function  
236 |         bounds : tuple
237 |             (min,max) interval for the roots search    
238 |         npoints : int
239 |             maximum number of roots to output      
240 |         method : str
241 |             'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval
242 |         
243 |         Returns
244 |         ----------
245 |         numpy.array
246 |             possible roots of the function
247 |         """
248 |         if method == 'regular':
249 |             step = (bounds[1]-bounds[0])/(npoints+1)
250 |             X0 = np.arange(bounds[0]+step,bounds[1],step)
251 |         elif method == 'random':
252 |             X0 = np.random.uniform(bounds[0],bounds[1],npoints)
253 |         
254 |         def objFun(X,f,jac):
255 |             g = 0
256 |             j = np.zeros(X.shape)
257 |             i = 0
258 |             for x in X:
259 |                 fx = f(x)
260 |                 g = g+fx**2
261 |                 j[i] = 2*fx*jac(x)
262 |                 i = i+1
263 |             return g,j
264 |         opt = minimize(lambda X:objFun(X,fun,jac), X0, 
265 |                        method='L-BFGS-B', 
266 |                        jac=True, bounds=[bounds]*len(X0))
267 |         
268 |         X = opt.x
269 |         np.round(X,decimals = 5)
270 |         return np.unique(X)
271 |     
272 |     
273 |     def _log_likelihood(self, Y,gamma,sigma):
274 |         """
275 |         Compute the log-likelihood for the Generalized Pareto Distribution (μ=0)
276 |         
277 |         Parameters
278 |         ----------
279 |         Y : numpy.array
280 |             observations
281 |         gamma : float
282 |             GPD index parameter
283 |         sigma : float
284 |             GPD scale parameter (>0)   
285 |         Returns
286 |         ----------
287 |         float
288 |             log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0)
289 |         """
290 |         n = Y.size
291 |         if gamma != 0:
292 |             tau = gamma/sigma
293 |             L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum()
294 |         else:
295 |             L = n * ( 1 + log(Y.mean()) )
296 |         return L
297 | 
298 | 
    def _grimshaw(self,side,epsilon = 1e-8, n_points = 10):
        """
        Compute the GPD parameters estimation with the Grimshaw's trick

        The two-parameter likelihood maximisation is reduced to a scalar
        root search on w(t) = u(1+t*Y)*v(1+t*Y) - 1, where Y are the peaks
        of the requested side. Each candidate root t gives a (gamma, sigma)
        pair; the pair with the highest log-likelihood is returned.

        Parameters
        ----------
        side : str
            key of ``self.peaks`` to fit ('up' or 'down' in the callers
            visible in this class)
        epsilon : float
            numerical parameter to perform (default : 1e-8); margin kept
            between the search intervals and their singular end points
        n_points : int
            maximum number of candidates for maximum likelihood (default : 10)
        Returns
        ----------
        gamma_best,sigma_best,ll_best
            gamma estimates, sigma estimates and corresponding log-likelihood
        """
        # u(s) = 1 + mean(log s)
        def u(s):
            return 1 + np.log(s).mean()
            
        # v(s) = mean(1/s)
        def v(s):
            return np.mean(1/s)
        
        # function whose roots are searched
        def w(Y,t):
            s = 1+t*Y
            us = u(s)
            vs = v(s)
            return us*vs-1
        
        # passed to _rootsFinder as the first order derivative of w in t
        def jac_w(Y,t):
            s = 1+t*Y
            us = u(s)
            vs = v(s)
            jac_us = (1/t)*(1-vs)
            jac_vs = (1/t)*(-vs+np.mean(1/s**2))
            return us*jac_vs+vs*jac_us
            
    
        # NOTE(review): min()/max() raise on an empty peaks array; callers
        # presumably guarantee at least one peak -- confirm
        Ym = self.peaks[side].min()
        YM = self.peaks[side].max()
        Ymean = self.peaks[side].mean()
        
        
        # 1+t*Y stays positive for every peak only if t > -1/YM
        a = -1/YM
        # shrink epsilon when the negative interval is narrower than 2*epsilon
        if abs(a)<2*epsilon:
            epsilon = abs(a)/n_points
        
        a = a + epsilon
        # end points of the positive search interval
        b = 2*(Ymean-Ym)/(Ymean*Ym)
        c = 2*(Ymean-Ym)/(Ym**2)
    
        # We look for possible roots
        # (a already includes one epsilon, so the left interval starts
        # 2*epsilon above -1/YM and stops epsilon below 0)
        left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
                                 lambda t: jac_w(self.peaks[side],t),
                                 (a+epsilon,-epsilon),
                                 n_points,'regular')
        
        right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
                                  lambda t: jac_w(self.peaks[side],t),
                                  (b,c),
                                  n_points,'regular')
    
        # all the possible roots
        zeros = np.concatenate((left_zeros,right_zeros))
        
        # 0 is always a solution so we initialize with it
        # (gamma = 0 with sigma equal to the mean of the peaks)
        gamma_best = 0
        sigma_best = Ymean
        ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best)
        
        # we look for better candidates
        for z in zeros:
            # each root z maps to gamma = u(1+z*Y)-1 and sigma = gamma/z
            gamma = u(1+z*self.peaks[side])-1
            sigma = gamma/z
            ll = self._log_likelihood(self.peaks[side],gamma,sigma)
            if ll>ll_best:
                gamma_best = gamma
                sigma_best = sigma
                ll_best = ll
    
        return gamma_best,sigma_best,ll_best
378 | 
379 |     
380 | 
381 |     def _quantile(self,side,gamma,sigma):
382 |         """
383 |         Compute the quantile at level 1-q for a given side
384 |         
385 |         Parameters
386 |         ----------
387 |         side : str
388 |             'up' or 'down'
389 |         gamma : float
390 |             GPD parameter
391 |         sigma : float
392 |             GPD parameter
393 |         Returns
394 |         ----------
395 |         float
396 |             quantile at level 1-q for the GPD(γ,σ,μ=0)
397 |         """
398 |         if side == 'up':
399 |             r = self.n * self.proba / self.Nt[side]
400 |             if gamma != 0:
401 |                 return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1)
402 |             else:
403 |                 return self.init_threshold['up'] - sigma*log(r)
404 |         elif side == 'down':
405 |             r = self.n * self.proba / self.Nt[side]
406 |             if gamma != 0:
407 |                 return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1)
408 |             else:
409 |                 return self.init_threshold['down'] + sigma*log(r)
410 |         else:
411 |             print('error : the side is not right')
412 | 
413 |         
414 |     def run(self, with_alarm = True):
415 |         """
416 |         Run biSPOT on the stream
417 |         
418 |         Parameters
419 |         ----------
420 |         with_alarm : bool
421 |             (default = True) If False, SPOT will adapt the threshold assuming \
422 |             there is no abnormal values
423 |         Returns
424 |         ----------
425 |         dict
426 |             keys : 'upper_thresholds', 'lower_thresholds' and 'alarms'
427 |             
428 |             '***-thresholds' contains the extreme quantiles and 'alarms' contains \
429 |             the indexes of the values which have triggered alarms
430 |             
431 |         """
432 |         if (self.n>self.init_data.size):
433 |             print('Warning : the algorithm seems to have already been run, you \
434 |             should initialize before running again')
435 |             return {}
436 |         
437 |         # list of the thresholds
438 |         thup = []
439 |         thdown = []
440 |         alarm = []
441 |         # Loop over the stream
442 |         for i in tqdm.tqdm(range(self.data.size)):
443 |     
444 |             # If the observed value exceeds the current threshold (alarm case)
445 |             if self.data[i]>self.extreme_quantile['up'] :
446 |                 # if we want to alarm, we put it in the alarm list
447 |                 if with_alarm:
448 |                     alarm.append(i)
449 |                 # otherwise we add it in the peaks
450 |                 else:
451 |                     self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up'])
452 |                     self.Nt['up'] += 1
453 |                     self.n += 1
454 |                     # and we update the thresholds
455 | 
456 |                     g,s,l = self._grimshaw('up')
457 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
458 | 
459 |             # case where the value exceeds the initial threshold but not the alarm ones
460 |             elif self.data[i]>self.init_threshold['up']:
461 |                     # we add it in the peaks
462 |                     self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up'])
463 |                     self.Nt['up'] += 1
464 |                     self.n += 1
465 |                     # and we update the thresholds
466 | 
467 |                     g,s,l = self._grimshaw('up')
468 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
469 |                     
470 |             elif self.data[i]<self.extreme_quantile['down'] :
471 |                 # if we want to alarm, we put it in the alarm list
472 |                 if with_alarm:
473 |                     alarm.append(i)
474 |                 # otherwise we add it in the peaks
475 |                 else:
476 |                     self.peaks['down'] = np.append(self.peaks['down'],-(self.data[i]-self.init_threshold['down']))
477 |                     self.Nt['down'] += 1
478 |                     self.n += 1
479 |                     # and we update the thresholds
480 | 
481 |                     g,s,l = self._grimshaw('down')
482 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
483 | 
484 |             # case where the value exceeds the initial threshold but not the alarm ones
485 |             elif self.data[i]<self.init_threshold['down']:
486 |                     # we add it in the peaks
487 |                     self.peaks['down'] = np.append(self.peaks['down'],-(self.data[i]-self.init_threshold['down']))
488 |                     self.Nt['down'] += 1
489 |                     self.n += 1
490 |                     # and we update the thresholds
491 | 
492 |                     g,s,l = self._grimshaw('down')
493 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
494 |             else:
495 |                 self.n += 1
496 |             counter = 3*self.update_number
497 |             if self.n  % counter == 0: #update 
498 |                 # update on time
499 |                 #up_data =np.append(self.init_data, self.data[:i])
500 |                 up_data =self.data[self.n- counter :self.n]
501 |                 print 'update at ',self.n
502 |                 print 'updating using data: ', len(up_data)
503 |                 S = np.sort(up_data)     # we sort X to get the empirical quantile
504 |                 self.init_threshold['up'] = S[int(0.98*len(S))] # t is fixed for the whole algorithm
505 |                 self.init_threshold['down'] = S[int(0.02*len(S))] # t is fixed for the whole algorithm
506 | 
507 |                 self.peaks['up'] = up_data[up_data>self.init_threshold['up']]-self.init_threshold['up']
508 |                 self.peaks['down'] = -(up_data[up_data<self.init_threshold['down']]-self.init_threshold['down'])
509 |                 self.Nt['up'] = self.peaks['up'].size
510 |                 self.Nt['down'] = self.peaks['down'].size
511 |                 
512 |                 l = {'up':None,'down':None}
513 |                 for side in ['up','down']:
514 |                     g,s,l[side] = self._grimshaw(side)
515 |                     self.extreme_quantile[side] = self._quantile(side,g,s)
516 |                     self.gamma[side] = g
517 |                     self.sigma[side] = s
518 |                 
519 |             thup.append(self.extreme_quantile['up']) # thresholds record
520 |             thdown.append(self.extreme_quantile['down']) # thresholds record
521 |         
522 |         return {'upper_thresholds' : thup,'lower_thresholds' : thdown, 'alarms': alarm}
523 |     
524 |     def plot(self,run_results,with_alarm = True):
525 |         """
526 |         Plot the results of given by the run
527 |         
528 |         Parameters
529 |         ----------
530 |         run_results : dict
531 |             results given by the 'run' method
532 |         with_alarm : bool
533 |             (default = True) If True, alarms are plotted.
534 |         Returns
535 |         ----------
536 |         list
537 |             list of the plots
538 |             
539 |         """
540 |         x = range(self.data.size)
541 |         K = run_results.keys()
542 |         
543 |         ts_fig, = plt.plot(x,self.data,color=air_force_blue)
544 |         fig = [ts_fig]
545 |         
546 |         if 'upper_thresholds' in K:
547 |             thup = run_results['upper_thresholds']
548 |             uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed')
549 |             fig.append(uth_fig)
550 |             
551 |         if 'lower_thresholds' in K:
552 |             thdown = run_results['lower_thresholds']
553 |             lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed')
554 |             fig.append(lth_fig)
555 |         
556 |         if with_alarm and ('alarms' in K):
557 |             alarm = run_results['alarms']
558 |             al_fig = plt.scatter(alarm,self.data[alarm],color='red')
559 |             fig.append(al_fig)
560 |             
561 |         plt.xlim((0,self.data.size))
562 |         plt.show()
563 |         
564 |         return fig
565 | 
566 | 


--------------------------------------------------------------------------------
/physic.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 | from espot import ESPOT
 6 | from MOMspot import momSPOT
 7 | from spot import biSPOT
 8 | from bspot import bidSPOT
 9 | f = './physics.dat'
10 | r = open(f,'r').read().split(',')
11 | X = np.array(list(map(float,r)))
12 | import time
13 | n_init = 2000
14 | init_data = X[:n_init]     # initial batch
15 | data = X[n_init:]          # stream
16 | 
17 | q = 1e-3                 # risk parameter
18 | d = 450                  # depth parameter
19 | start = time.clock()
20 | s = biSPOT(q)
21 | #s = ESPOT(q,d)         # biDSPOT object
22 | #s = momSPOT(q)
23 | #s = bidSPOT(q,d)
24 | s.fit(init_data,data)     # data import
25 | s.initialize()               # initialization step
26 | results = s.run()        # run
27 | end = time.clock()
28 | t=end-start
29 | print("Runtime is:",t) 
30 | s.plot(results)          # plot


--------------------------------------------------------------------------------
/pic/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DawnsonLi/EVT/a98b85e88fdb6a92d8d30dfa687df173fa46de7e/pic/1.png


--------------------------------------------------------------------------------
/pic/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DawnsonLi/EVT/a98b85e88fdb6a92d8d30dfa687df173fa46de7e/pic/2.png


--------------------------------------------------------------------------------
/pic/middle_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DawnsonLi/EVT/a98b85e88fdb6a92d8d30dfa687df173fa46de7e/pic/middle_3.png


--------------------------------------------------------------------------------
/rain.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | from spot import biSPOT
 4 | #from MOMspot import momSPOT
 5 | #no label
 6 | f = './rain.dat'
 7 | r = open(f,'r').read().split(',')
 8 | X = np.array(list(map(float,r)))
 9 | 
10 | n_init = 1000
11 | init_data = X[:n_init]     # initial batch
12 | data = X[n_init:]         # stream
13 | 
14 | q = 1e-07            # risk parameter
15 | s = biSPOT(q)          # biDSPOT object
16 | s.fit(init_data,data)     # data import
17 | s.initialize()         # initialization step
18 | results = s.run()     # run
19 | s.plot(results)     # plot


--------------------------------------------------------------------------------
/spot.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | from math import log,floor
  6 | import tqdm
  7 | from scipy.optimize import minimize
  8 | # colors for plot
  9 | deep_saffron = '#FF9933'
 10 | air_force_blue = '#5D8AA8'
 11 | 
 12 | def backMean(X,d):
 13 |     M = []
 14 |     w = X[:d].sum()
 15 |     M.append(w/d)
 16 |     for i in range(d,len(X)):
 17 |         w = w - X[i-d] + X[i]
 18 |         M.append(w/d)
 19 |     return np.array(M)
 20 | class biSPOT:
 21 |     """
 22 |     This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds)
 23 |     
 24 |     Attributes
 25 |     ----------
 26 |     proba : float
 27 |         Detection level (risk), chosen by the user
 28 |         
 29 |     extreme_quantile : float
 30 |         current threshold (bound between normal and abnormal events)
 31 |         
 32 |     data : numpy.array
 33 |         stream
 34 |     
 35 |     init_data : numpy.array
 36 |         initial batch of observations (for the calibration/initialization step)
 37 |     
 38 |     init_threshold : float  -
 39 |         initial threshold computed during the calibration step
 40 |     
 41 |     peaks : numpy.array
 42 |         array of peaks (excesses above the initial threshold)
 43 |     
 44 |     n : int
 45 |         number of observed values
 46 |     
 47 |     Nt : int
 48 |         number of observed peaks
 49 |     """
 50 |     def __init__(self, q = 1e-4):
 51 |         """
 52 |         Constructor
 53 |         Parameters
 54 |         ----------
 55 |         q
 56 |             Detection level (risk)
 57 |     
 58 |         Returns
 59 |         ----------
 60 |         biSPOT object
 61 |         """
 62 |         self.proba = q
 63 |         self.data = None
 64 |         self.init_data = None
 65 |         self.n = 0
 66 |         nonedict =  {'up':None,'down':None}
 67 |         
 68 |         self.extreme_quantile = dict.copy(nonedict)
 69 |         self.init_threshold = dict.copy(nonedict)
 70 |         self.peaks = dict.copy(nonedict)
 71 |         self.gamma = dict.copy(nonedict)
 72 |         self.sigma = dict.copy(nonedict)
 73 |         self.Nt = {'up':0,'down':0}
 74 |         
 75 |         
 76 |     def __str__(self):
 77 |         s = ''
 78 |         s += 'Streaming Peaks-Over-Threshold Object\n'
 79 |         s += 'Detection level q = %s\n' % self.proba
 80 |         if self.data is not None:
 81 |             s += 'Data imported : Yes\n'
 82 |             s += '\t initialization  : %s values\n' % self.init_data.size
 83 |             s += '\t stream : %s values\n' % self.data.size
 84 |         else:
 85 |             s += 'Data imported : No\n'
 86 |             return s
 87 |             
 88 |         if self.n == 0:
 89 |             s += 'Algorithm initialized : No\n'
 90 |         else:
 91 |             s += 'Algorithm initialized : Yes\n'
 92 |             s += '\t initial threshold : %s\n' % self.init_threshold
 93 |             
 94 |             r = self.n-self.init_data.size
 95 |             if r > 0:
 96 |                 s += 'Algorithm run : Yes\n'
 97 |                 s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n)
 98 |                 s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n)
 99 |             else:
100 |                 s += '\t number of peaks  : %s\n' % self.Nt
101 |                 s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up']
102 |                 s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down']
103 |                 s += 'Algorithm run : No\n'
104 |         return s
105 |     
106 |     
107 |     def fit(self,init_data,data):
108 |         """
109 |         Import data to biSPOT object
110 |         
111 |         Parameters
112 |         ----------
113 |         init_data : list, numpy.array or pandas.Series
114 |             initial batch to calibrate the algorithm ()
115 |             
116 |         data : numpy.array
117 |             data for the run (list, np.array or pd.series)
118 |     
119 |         """
120 |         if isinstance(data,list):
121 |             self.data = np.array(data)
122 |         elif isinstance(data,np.ndarray):
123 |             self.data = data
124 |         elif isinstance(data,pd.Series):
125 |             self.data = data.values
126 |         else:
127 |             print('This data format (%s) is not supported' % type(data))
128 |             return
129 |             
130 |         if isinstance(init_data,list):
131 |             self.init_data = np.array(init_data)
132 |         elif isinstance(init_data,np.ndarray):
133 |             self.init_data = init_data
134 |         elif isinstance(init_data,pd.Series):
135 |             self.init_data = init_data.values
136 |         elif isinstance(init_data,int):
137 |             self.init_data = self.data[:init_data]
138 |             self.data = self.data[init_data:]
139 |         elif isinstance(init_data,float) & (init_data<1) & (init_data>0):
140 |             r = int(init_data*data.size)
141 |             self.init_data = self.data[:r]
142 |             self.data = self.data[r:]
143 |         else:
144 |             print('The initial data cannot be set')
145 |             return
146 |         
147 |     def add(self,data):
148 |         """
149 |         This function allows to append data to the already fitted data
150 |         
151 |         Parameters
152 |         ----------
153 |         data : list, numpy.array, pandas.Series
154 |             data to append
155 |         """
156 |         if isinstance(data,list):
157 |             data = np.array(data)
158 |         elif isinstance(data,np.ndarray):
159 |             data = data
160 |         elif isinstance(data,pd.Series):
161 |             data = data.values
162 |         else:
163 |             print('This data format (%s) is not supported' % type(data))
164 |             return
165 |         
166 |         self.data = np.append(self.data,data)
167 |         return
168 | 
169 |     def initialize(self, verbose = True):
170 |         """
171 |         Run the calibration (initialization) step
172 |         
173 |         Parameters
174 |         ----------
175 |         verbose : bool
176 |             (default = True) If True, gives details about the batch initialization
177 |         """
178 |         n_init = self.init_data.size
179 |         
180 |         S = np.sort(self.init_data)     # we sort X to get the empirical quantile
181 |         self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm
182 |         self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm
183 | 
184 |         # initial peaks
185 |         self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up']
186 |         self.peaks['down'] = -(self.init_data[self.init_data<self.init_threshold['down']]-self.init_threshold['down'])
187 |         self.Nt['up'] = self.peaks['up'].size
188 |         self.Nt['down'] = self.peaks['down'].size
189 |         self.n = n_init
190 |         
191 |         if verbose:
192 |             print('Initial threshold : %s' % self.init_threshold)
193 |             print('Number of peaks : %s' % self.Nt)
194 |             #print('Grimshaw maximum log-likelihood estimation ... ', end = '')
195 |             
196 |         l = {'up':None,'down':None}
197 |         for side in ['up','down']:
198 |             g,s,l[side] = self._grimshaw(side)
199 |             self.extreme_quantile[side] = self._quantile(side,g,s)
200 |             self.gamma[side] = g
201 |             self.sigma[side] = s
202 |         
203 |         ltab = 20
204 |         form = ('\t'+'%20s' + '%20.2f' + '%20.2f')
205 |         '''
206 |         if verbose:
207 |             print('[done]')
208 |             print('\t' + 'Parameters'.rjust(ltab) + 'Upper'.rjust(ltab) + 'Lower'.rjust(ltab))
209 |             print('\t' + '-'*ltab*3)
210 |             print(form % (chr(0x03B3),self.gamma['up'],self.gamma['down']))
211 |             print(form % (chr(0x03C3),self.sigma['up'],self.sigma['down']))
212 |             print(form % ('likelihood',l['up'],l['down']))
213 |             print(form % ('Extreme quantile',self.extreme_quantile['up'],self.extreme_quantile['down']))
214 |             print('\t' + '-'*ltab*3)
215 |         '''
216 |         return 
217 |     
218 |     
219 |     
220 |     
221 |     def _rootsFinder(self, fun,jac,bounds,npoints,method):
222 |         """
223 |         Find possible roots of a scalar function
224 |         
225 |         Parameters
226 |         ----------
227 |         fun : function
228 |             scalar function 
229 |         jac : function
230 |             first order derivative of the function  
231 |         bounds : tuple
232 |             (min,max) interval for the roots search    
233 |         npoints : int
234 |             maximum number of roots to output      
235 |         method : str
236 |             'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval
237 |         
238 |         Returns
239 |         ----------
240 |         numpy.array
241 |             possible roots of the function
242 |         """
243 |         if method == 'regular':
244 |             step = (bounds[1]-bounds[0])/(npoints+1)
245 |             X0 = np.arange(bounds[0]+step,bounds[1],step)
246 |         elif method == 'random':
247 |             X0 = np.random.uniform(bounds[0],bounds[1],npoints)
248 |         
249 |         def objFun(X,f,jac):
250 |             g = 0
251 |             j = np.zeros(X.shape)
252 |             i = 0
253 |             for x in X:
254 |                 fx = f(x)
255 |                 g = g+fx**2
256 |                 j[i] = 2*fx*jac(x)
257 |                 i = i+1
258 |             return g,j
259 |         opt = minimize(lambda X:objFun(X,fun,jac), X0, 
260 |                        method='L-BFGS-B', 
261 |                        jac=True, bounds=[bounds]*len(X0))
262 |         
263 |         X = opt.x
264 |         np.round(X,decimals = 5)
265 |         return np.unique(X)
266 |     
267 |     
268 |     def _log_likelihood(self, Y,gamma,sigma):
269 |         """
270 |         Compute the log-likelihood for the Generalized Pareto Distribution (μ=0)
271 |         
272 |         Parameters
273 |         ----------
274 |         Y : numpy.array
275 |             observations
276 |         gamma : float
277 |             GPD index parameter
278 |         sigma : float
279 |             GPD scale parameter (>0)   
280 |         Returns
281 |         ----------
282 |         float
283 |             log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0)
284 |         """
285 |         n = Y.size
286 |         if gamma != 0:
287 |             tau = gamma/sigma
288 |             L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum()
289 |         else:
290 |             L = n * ( 1 + log(Y.mean()) )
291 |         return L
292 | 
293 | 
    def _grimshaw(self,side,epsilon = 1e-8, n_points = 10):
        """
        Compute the GPD parameters estimation with the Grimshaw's trick

        The two-parameter likelihood maximisation is reduced to a scalar
        root search on w(t) = u(1+t*Y)*v(1+t*Y) - 1, where Y are the peaks
        of the requested side. Each candidate root t gives a (gamma, sigma)
        pair; the pair with the highest log-likelihood is returned.

        Parameters
        ----------
        side : str
            key of ``self.peaks`` to fit ('up' or 'down' in the callers
            visible in this class)
        epsilon : float
            numerical parameter to perform (default : 1e-8); margin kept
            between the search intervals and their singular end points
        n_points : int
            maximum number of candidates for maximum likelihood (default : 10)
        Returns
        ----------
        gamma_best,sigma_best,ll_best
            gamma estimates, sigma estimates and corresponding log-likelihood
        """
        # u(s) = 1 + mean(log s)
        def u(s):
            return 1 + np.log(s).mean()
            
        # v(s) = mean(1/s)
        def v(s):
            return np.mean(1/s)
        
        # function whose roots are searched
        def w(Y,t):
            s = 1+t*Y
            us = u(s)
            vs = v(s)
            return us*vs-1
        
        # passed to _rootsFinder as the first order derivative of w in t
        def jac_w(Y,t):
            s = 1+t*Y
            us = u(s)
            vs = v(s)
            jac_us = (1/t)*(1-vs)
            jac_vs = (1/t)*(-vs+np.mean(1/s**2))
            return us*jac_vs+vs*jac_us
            
    
        # NOTE(review): min()/max() raise on an empty peaks array; callers
        # presumably guarantee at least one peak -- confirm
        Ym = self.peaks[side].min()
        YM = self.peaks[side].max()
        Ymean = self.peaks[side].mean()
        
        
        # 1+t*Y stays positive for every peak only if t > -1/YM
        a = -1/YM
        # shrink epsilon when the negative interval is narrower than 2*epsilon
        if abs(a)<2*epsilon:
            epsilon = abs(a)/n_points
        
        a = a + epsilon
        # end points of the positive search interval
        b = 2*(Ymean-Ym)/(Ymean*Ym)
        c = 2*(Ymean-Ym)/(Ym**2)
    
        # We look for possible roots
        # (a already includes one epsilon, so the left interval starts
        # 2*epsilon above -1/YM and stops epsilon below 0)
        left_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
                                 lambda t: jac_w(self.peaks[side],t),
                                 (a+epsilon,-epsilon),
                                 n_points,'regular')
        
        right_zeros = self._rootsFinder(lambda t: w(self.peaks[side],t),
                                  lambda t: jac_w(self.peaks[side],t),
                                  (b,c),
                                  n_points,'regular')
    
        # all the possible roots
        zeros = np.concatenate((left_zeros,right_zeros))
        
        # 0 is always a solution so we initialize with it
        # (gamma = 0 with sigma equal to the mean of the peaks)
        gamma_best = 0
        sigma_best = Ymean
        ll_best = self._log_likelihood(self.peaks[side],gamma_best,sigma_best)
        
        # we look for better candidates
        for z in zeros:
            # each root z maps to gamma = u(1+z*Y)-1 and sigma = gamma/z
            gamma = u(1+z*self.peaks[side])-1
            sigma = gamma/z
            ll = self._log_likelihood(self.peaks[side],gamma,sigma)
            if ll>ll_best:
                gamma_best = gamma
                sigma_best = sigma
                ll_best = ll
    
        return gamma_best,sigma_best,ll_best
373 | 
374 |     
375 | 
376 |     def _quantile(self,side,gamma,sigma):
377 |         """
378 |         Compute the quantile at level 1-q for a given side
379 |         
380 |         Parameters
381 |         ----------
382 |         side : str
383 |             'up' or 'down'
384 |         gamma : float
385 |             GPD parameter
386 |         sigma : float
387 |             GPD parameter
388 |         Returns
389 |         ----------
390 |         float
391 |             quantile at level 1-q for the GPD(γ,σ,μ=0)
392 |         """
393 |         if side == 'up':
394 |             r = self.n * self.proba / self.Nt[side]
395 |             if gamma != 0:
396 |                 return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1)
397 |             else:
398 |                 return self.init_threshold['up'] - sigma*log(r)
399 |         elif side == 'down':
400 |             r = self.n * self.proba / self.Nt[side]
401 |             if gamma != 0:
402 |                 return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1)
403 |             else:
404 |                 return self.init_threshold['down'] + sigma*log(r)
405 |         else:
406 |             print('error : the side is not right')
407 | 
408 |         
409 |     def run(self, with_alarm = True):
410 |         """
411 |         Run biSPOT on the stream
412 |         
413 |         Parameters
414 |         ----------
415 |         with_alarm : bool
416 |             (default = True) If False, SPOT will adapt the threshold assuming \
417 |             there is no abnormal values
418 |         Returns
419 |         ----------
420 |         dict
421 |             keys : 'upper_thresholds', 'lower_thresholds' and 'alarms'
422 |             
423 |             '***-thresholds' contains the extreme quantiles and 'alarms' contains \
424 |             the indexes of the values which have triggered alarms
425 |             
426 |         """
427 |         if (self.n>self.init_data.size):
428 |             print('Warning : the algorithm seems to have already been run, you \
429 |             should initialize before running again')
430 |             return {}
431 |         
432 |         # list of the thresholds
433 |         thup = []
434 |         thdown = []
435 |         alarm = []
436 |         # Loop over the stream
437 |         for i in tqdm.tqdm(range(self.data.size)):
438 |     
439 |             # If the observed value exceeds the current threshold (alarm case)
440 |             if self.data[i]>self.extreme_quantile['up'] :
441 |                 # if we want to alarm, we put it in the alarm list
442 |                 if with_alarm:
443 |                     alarm.append(i)
444 |                 # otherwise we add it in the peaks
445 |                 else:
446 |                     self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up'])
447 |                     self.Nt['up'] += 1
448 |                     self.n += 1
449 |                     # and we update the thresholds
450 | 
451 |                     g,s,l = self._grimshaw('up')
452 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
453 | 
454 |             # case where the value exceeds the initial threshold but not the alarm ones
455 |             elif self.data[i]>self.init_threshold['up']:
456 |                     # we add it in the peaks
457 |                     self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up'])
458 |                     self.Nt['up'] += 1
459 |                     self.n += 1
460 |                     # and we update the thresholds
461 | 
462 |                     g,s,l = self._grimshaw('up')
463 |                     self.extreme_quantile['up'] = self._quantile('up',g,s)
464 |                     
465 |             elif self.data[i]<self.extreme_quantile['down'] :
466 |                 # if we want to alarm, we put it in the alarm list
467 |                 if with_alarm:
468 |                     alarm.append(i)
469 |                 # otherwise we add it in the peaks
470 |                 else:
471 |                     self.peaks['down'] = np.append(self.peaks['down'],-(self.data[i]-self.init_threshold['down']))
472 |                     self.Nt['down'] += 1
473 |                     self.n += 1
474 |                     # and we update the thresholds
475 | 
476 |                     g,s,l = self._grimshaw('down')
477 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
478 | 
479 |             # case where the value exceeds the initial threshold but not the alarm ones
480 |             elif self.data[i]<self.init_threshold['down']:
481 |                     # we add it in the peaks
482 |                     self.peaks['down'] = np.append(self.peaks['down'],-(self.data[i]-self.init_threshold['down']))
483 |                     self.Nt['down'] += 1
484 |                     self.n += 1
485 |                     # and we update the thresholds
486 | 
487 |                     g,s,l = self._grimshaw('down')
488 |                     self.extreme_quantile['down'] = self._quantile('down',g,s)
489 |             else:
490 |                 self.n += 1
491 | 
492 |                 
493 |             thup.append(self.extreme_quantile['up']) # thresholds record
494 |             thdown.append(self.extreme_quantile['down']) # thresholds record
495 |         
496 |         return {'upper_thresholds' : thup,'lower_thresholds' : thdown, 'alarms': alarm}
497 |     
498 |     def plot(self,run_results,with_alarm = True):
499 |         """
500 |         Plot the results of given by the run
501 |         
502 |         Parameters
503 |         ----------
504 |         run_results : dict
505 |             results given by the 'run' method
506 |         with_alarm : bool
507 |             (default = True) If True, alarms are plotted.
508 |         Returns
509 |         ----------
510 |         list
511 |             list of the plots
512 |             
513 |         """
514 |         x = range(self.data.size)
515 |         K = run_results.keys()
516 |         
517 |         ts_fig, = plt.plot(x,self.data,color=air_force_blue)
518 |         fig = [ts_fig]
519 |         
520 |         if 'upper_thresholds' in K:
521 |             thup = run_results['upper_thresholds']
522 |             uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed')
523 |             fig.append(uth_fig)
524 |             
525 |         if 'lower_thresholds' in K:
526 |             thdown = run_results['lower_thresholds']
527 |             lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed')
528 |             fig.append(lth_fig)
529 |         
530 |         if with_alarm and ('alarms' in K):
531 |             alarm = run_results['alarms']
532 |             al_fig = plt.scatter(alarm,self.data[alarm],color='red')
533 |             fig.append(al_fig)
534 |             
535 |         plt.xlim((0,self.data.size))
536 |         plt.show()
537 |         
538 |         return fig
539 | 
540 | 


--------------------------------------------------------------------------------
/stock.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | from bspot import bidSPOT
 4 | from spot import biSPOT
 5 | from espot import ESPOT
 6 | from MOMspot import momSPOT
 7 | from drif_spot import DRSPOT
 8 | import pandas as pd
 9 | import time
10 | f = './edf_stocks.csv'
11 | 
12 | P = pd.DataFrame.from_csv(f)
13 | 
14 | # stream
15 | u_data = (P['DATE'] == '2017-02-09')
16 | data = P['LOW'][u_data].values
17 | 
18 | # initial batch
19 | u_init_data = (P['DATE'] == '2017-02-08') | (P['DATE'] == '2017-02-07') | (P['DATE'] == '2017-02-06')
20 | init_data = P['LOW'][u_init_data].values
21 | 
22 | 
23 | q = 1e-5             # risk parameter
24 | d = 10                # depth
25 | start = time.clock()
26 | #s = ESPOT(q,d)     # bidSPOT object
27 | #s = bidSPOT(q)  
28 | s = biSPOT(q)  
29 | #s =DRSPOT(q)
30 | s.fit(init_data,data)     # data import
31 | s.initialize()             # initialization step
32 | results = s.run()     # run
33 | end = time.clock()
34 | t=end-start
35 | print("Runtime is:",t) 
36 | #del results['upper_thresholds'] # we can delete the upper thresholds
37 | fig = s.plot(results)            # plot
38 | 


--------------------------------------------------------------------------------