├── EVT
│   ├── __init__.py
│   └── spot.py
├── shape
│   ├── __init__.py
│   └── RMDF.py
├── generator
│   ├── __init__.py
│   ├── trend_generator.py
│   ├── pattern.py
│   ├── pearson.py
│   ├── abstract_generator.py
│   ├── season_generator.py
│   ├── noise_generator.py
│   ├── additive_anomaly_generator.py
│   └── test.py
├── tsagen_visual
│   ├── __init__.py
│   └── visual.py
├── requirements.txt
├── setup.py
├── gen_data_for_correlation_analysis.py
├── meta_features.yaml
├── computational_weight_alalysis.py
├── README.md
├── TSAGen.py
├── gen.py
└── Assembler.py

/EVT/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/shape/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/generator/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tsagen_visual/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AprilCal/TSAGen/HEAD/requirements.txt
--------------------------------------------------------------------------------
/generator/trend_generator.py:
--------------------------------------------------------------------------------
#!python3.6

# Created by Chengyu on 2020/5/14.
# Trend generator.

import numpy as np
import math

class TrendGenerator():
    def __init__(self):
        pass

    def _expression(self, zeta, ba, x):
        # Linear trend: slope tan(zeta), intercept ba.
        k = math.tan(zeta)
        b = ba
        return k*x + b

    def _inject(self):
        pass

    def gen(self, ba, zeta, size):
        x = np.arange(size)
        trend = []
        for i in x:
            trend.append(self._expression(zeta, ba, i))
        label = np.zeros(size, dtype=int)
        return (np.array(trend), label)
--------------------------------------------------------------------------------
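Editor-added usage sketch (not part of the repository): gen() returns a linear trend of the requested length together with an all-zero anomaly label array.

# Editor-added sketch, assuming the package is installed via setup.py.
import numpy as np
from generator.trend_generator import TrendGenerator

tg = TrendGenerator()
trend, label = tg.gen(ba=15, zeta=0.01, size=1000)  # intercept 15, slope tan(0.01)
assert trend.shape == (1000,) and label.sum() == 0
print(trend[:3])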
/generator/pattern.py:
--------------------------------------------------------------------------------
#!python3.6

# Created by Chengyu on 2020/6/6.
# Additive anomaly patterns.

import numpy as np

def typeI(w1, w2, h):
    # exponential rise over w1 points, then exponential decay over w2 points.
    return list(map(lambda x: exprForTypeI(w1, w2, h, x), np.arange(w1+w2+1)))

def typeII(w, h1, h2):
    # linear ramp from height h1 to h2 over w points.
    k = (h2-h1)/(w-1)
    return list(map(lambda x: linear(k, h1, x), np.arange(w+1)))

def a(w1, h, x):
    return h*(np.e**((-np.log(1/1000)/w1)*(x-w1)))

def b(w1, w2, h, x):
    return h*(np.e**((np.log(1/1000)/w2)*(x-w1)))

# function expressions.
def linear(k, b, x):
    return k*x + b

def exprForTypeI(w1, w2, h, x):
    # piecewise: rising segment a() up to w1, decaying segment b() afterwards.
    if x <= w1:
        return a(w1, h, x)
    else:
        return b(w1, w2, h, x)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!python3.6

# Created by Chengyu on 2020/5/14.
# Usage: python setup.py install

from distutils.core import setup
setup(name="shape", version="1.0", description="RMDF", author="Chengyu", py_modules=['shape.RMDF'])
setup(name="generator", version="1.0", description="some generators", author="Chengyu", py_modules=['generator.additive_anomaly_generator','generator.pattern','generator.abstract_generator','generator.noise_generator','generator.trend_generator','generator.season_generator','generator.anomaly_generator'])
setup(name="EVT", version="1.0", description="Extreme Value Theory", author="Chengyu", py_modules=['EVT.spot'])
setup(name="visual", version="1.0", description="visualization tools", author="Chengyu", py_modules=['tsagen_visual.visual'])
--------------------------------------------------------------------------------
/generator/pearson.py:
--------------------------------------------------------------------------------
#!python3.6

# Created by Chengyu on 2020/5/15.
# Pearson Distribution System.

# Usage:
#   p = Pearson()
#   p.pearsrnd(mu, sigma, skew, kurt, size)
# pearsrnd returns a np.array

import matlab
import matlab.engine
import numpy as np

engine = matlab.engine.start_matlab()

class Pearson:
    def __init__(self):
        self.engine = engine  # attach to the MATLAB process started above
        # engine = matlab.engine.start_matlab("-desktop")  # start MATLAB with graphic UI
    def pearsrnd(self, mu, sigma, skew, kurt, size):
        result = self.engine.pearsrnd(matlab.double([mu]),
                                      matlab.double([sigma]),
                                      matlab.double([skew]),
                                      matlab.double([kurt]),
                                      matlab.double([1]),
                                      matlab.double([size]))[0]
        return np.array(result)
--------------------------------------------------------------------------------
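Editor-added sketch (not part of the repository): the two additive anomaly shapes from generator/pattern.py, plotted side by side.

# Editor-added sketch of the pattern module above.
import matplotlib.pyplot as plt
import generator.pattern as pt

spike = pt.typeI(w1=10, w2=20, h=5.0)   # exponential rise (10 pts) then decay (20 pts), peak 5.0
ramp = pt.typeII(w=20, h1=2.0, h2=4.0)  # linear ramp from 2.0 to 4.0 over 20 points
plt.plot(spike, label='type I')
plt.plot(ramp, label='type II')
plt.legend()
plt.show()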
/gen_data_for_correlation_analysis.py:
--------------------------------------------------------------------------------
#!python3.6

# Created by Chengyu on 2021/3/8.
# Generating data for correlation analysis.

import numpy as np
import Assembler as assem
import generator.trend_generator as tg
import generator.noise_generator as ng
import generator.season_generator as sg
import matplotlib.pyplot as plt

# season_generator = sg.SeasonGeneratorWithShapeDeformation(10,10,200,drift_a=0,drift_f=0,forking_depth=7)
season_generator = sg.NormalSeasonGenerator(10,10,200,drift_a=0,drift_f=0,forking_depth=7)
noise_generator = ng.Gaussian()
# noise_generator = ng.GaussianWithChangePoints()
trend_generator = tg.TrendGenerator()

season = [season_generator.gen_season() for x in range(1)]
length = len(season[0][0])
noise = noise_generator.gen(0, 0.5, length)
trend = trend_generator.gen(15, 0, length)

# assembler = assem.AbstractAssembler(season,noise,trend,'season')
assembler = assem.AssemblerWithAdditiveAnomalyInjector_v1(season, noise, trend, 'season', q=10e-7, a_type='type2')
assembler.assemble()
assembler.save(path='output/TSACorr')
--------------------------------------------------------------------------------
/meta_features.yaml:
--------------------------------------------------------------------------------
# Output path
OUT_PATH: './test_path'
# Total number of time series.
TOTAL_NUM: 5

# A list of 3 elements (e.g., [a, b, c], where a is the start value, b is the
# end value, and c is the step length) means generating this variable in a
# variable-control manner. Only one controlled variable is allowed in a
# configuration file, and the other variables must then be precise values
# (not meta features).

# A list of 5 elements (e.g., [a, b, c, d, e], i.e.,
# [min, lower_quartile, mid, upper_quartile, max]) means a meta feature.
# There can be multiple meta features in a configuration file.

# We give some examples below.

# Use meta features.
FEATURES:
  #TREND:
  theta1: [4,5,6,7,8]          # level
  theta2: 0.01                 # trend slope

  #SEASON:
  theta3: [2,4,5,6,10]         # amplitude
  theta4: 1000                 # cycle length, i.e., 1/frequency.
  theta5: 10                   # num_of_cycle
  d: 10                        # recursion depth
  d_hat: 8                     # forking depth
  k1: 0.2
  k2: 0.2

  #NOISE:
  theta6: 0                    # mean
  theta7: [0.1,0.2,0.3,0.4,1]  # std
  theta8: 0                    # skew
  theta9: 3                    # kurt

ANOMALY:
  type: spike                  # spike/deformation/vanish/typeI/typeII
  severity: 10
--------------------------------------------------------------------------------
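Editor-added sketch (not part of the repository, assuming PyYAML is available): how a config entry could be classified under the list conventions described in the comments above; the classify helper is hypothetical.

# Editor-added, hypothetical helper for the conventions documented above.
import yaml

def classify(value):
    if isinstance(value, list) and len(value) == 3:
        return 'controlled variable (start, end, step)'
    if isinstance(value, list) and len(value) == 5:
        return 'meta feature (min, q1, mid, q3, max)'
    return 'precise value'

cfg = yaml.safe_load(open('meta_features.yaml'))
for name, value in cfg['FEATURES'].items():
    print(name, '->', classify(value))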
/tsagen_visual/visual.py:
--------------------------------------------------------------------------------
#!python3.6

# Created by Chengyu on 2020/6/26
# visualization tools.

import matplotlib.pyplot as plt
import numpy as np

air_force_blue = '#5D8AA8'

def show(values, labels, title='default', a_color='red', dilated=True, figure_size=(8,6)):
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    # check length
    if len(values) != len(labels):
        print('length of values must equal length of labels')
    i = 0
    length = len(values)
    # adjacent labels of value 1 will be grouped into the same group
    groups = []
    while i < length:
        if labels[i] == 0:
            i += 1
            continue
        else:
            start = i
            while i < length and labels[i] == 1:
                i += 1
            groups.append((start, i))
--------------------------------------------------------------------------------
/generator/additive_anomaly_generator.py:
--------------------------------------------------------------------------------
import numpy as np
import random
import generator.pattern as pt

# partition a series of the given length into seg_num segments.
def partition(length, seg_num):
    if seg_num >= length:
        print("error: segment num > length.")
    sublen = int(length/seg_num)
    pos = 0
    segs = []
    for i in range(0, seg_num-1):
        segs.append([pos, pos+sublen])
        pos += sublen
    segs.append([pos, length])
    return segs

# insert spike anomaly
def insert_spike_anomaly(kpi, label, upt, dwt, pos_list):
    kpi = kpi.copy()
    length = len(kpi)
    for pos in pos_list:
        position = int(pos*length)
        degree = [upt[position]-kpi[position]]
        a = degree
        for i in np.arange(len(a)):
            kpi[position+i] = a[i]+kpi[position+i]
            label[position+i] = 1
    return (kpi, label)

def insert_beat_anomaly(kpi, label, upt, dwt, pos_list):
    kpi = kpi.copy()
    length = len(kpi)
    for pos in pos_list:
        position = int(pos*length)
        direction = 1  # np.random.choice([-1,1],1,p=[0.5,0.5])
        degree1 = upt[position]-kpi[position]
        degree2 = kpi[position+1]-upt[position+1]
        a = [degree1, degree2]
        for i in np.arange(len(a)):
            kpi[position+i] = a[i]+kpi[position+i]
            label[position+i] = 1
    return (kpi, label)

def insert_type1_anomaly(kpi, label, upt, dwt, pos_list):
    kpi = kpi.copy()
    length = len(kpi)
    for pos in pos_list:
        position = int(pos*length)
        # direction = 1  # np.random.choice([-1,1],1,p=[0.5,0.5])
        degree = upt[position]-kpi[position]
        a = pt.typeI(10, 20, degree)
        for i in np.arange(len(a)):
            kpi[position+i] = a[i]+kpi[position+i]
            label[position+i] = 1
    return (kpi, label)

def insert_type2_anomaly(kpi, label, upt, dwt, pos_list):
    kpi = kpi.copy()
    length = len(kpi)
    for pos in pos_list:
        position = int(pos*length)
        direction = 1  # np.random.choice([-1,1],1,p=[0.5,0.5])
        a_l = 20
        degree1 = upt[position]-kpi[position]
        degree2 = upt[position]-kpi[position+a_l]
        a = pt.typeII(20, degree1, degree2)
        for i in np.arange(len(a)):
            kpi[position+i] = a[i]+kpi[position+i]
            label[position+i] = 1
    return (kpi, label)

# def insert_fluctuate_anomaly(noise,label,degree,num,moms):
#     noise = noise.copy()
#     mu = moms[0]
#     sigma = moms[1]
#     skew = -10*moms[2]
#     kurt = moms[3]
#
#     segs = partition(len(noise),num)
#     for k in segs:
#         a_length = int(random.uniform(50,150))
#         pos = int(random.uniform(k[0],k[1]-a_length))
#
#         noiseGenerator = ng.NoiseGenerator()
#         a_noise = noiseGenerator.genNoise(moms[0],moms[1],moms[2],moms[3],150)
#         print('a_noise',a_noise)
#         for i in range(0,a_length):
#             noise[pos+i] = a_noise[i]
#             label[pos+i] = 1
#     return noise,label
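# Editor-added usage sketch (not part of this file): injecting a spike at 30%,
# 50% and 70% of a series, using constant thresholds in place of the
# SPOT-estimated ones that Assembler.py normally supplies.
#
# import numpy as np
# import generator.additive_anomaly_generator as ag
#
# kpi = np.sin(np.linspace(0, 20*np.pi, 2000))
# label = np.zeros(2000, dtype=int)
# upt = np.full(2000, 1.5)    # upper threshold
# dwt = np.full(2000, -1.5)   # lower threshold
# kpi2, label2 = ag.insert_spike_anomaly(kpi, label, upt, dwt, [0.3, 0.5, 0.7])
# print(label2.sum())         # 3 anomalous points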
# # insert null point.
# def insert_null_anomaly(kpi,label,num):
#     kpi = kpi.copy()
#     segs = partition(len(kpi),num)
#     for k in segs:
#         pos = int(random.uniform(k[0],k[1]-1))
#         kpi[pos]=0
#         label[pos-1]=1
#         label[pos]=1
#         label[pos+1]=1
#     return kpi,label

# # insert dip anomaly
# def insert_dip_anomaly(kpi,label,num, upt, dwt):
#     kpi = kpi.copy()
#     segs = partition(len(kpi),num)
#     for k in segs:
#         pos = int(random.uniform(k[0],k[1]-1))
#         label[pos-1]=1
#         label[pos]=1
#         label[pos+1]=1
#         degree = kpi[pos]-dwt[pos]
#         kpi[pos]+=degree
#     return kpi,label

# this anomaly type is defined in Microsoft's paper.
def insert_point_anomaly(kpi, label, num):
    segs = partition(len(kpi), num)
    for k in segs:
        pos = int(random.uniform(k[0], k[1]-1))

        local_mean = np.mean(kpi[:pos])
        mean = np.mean(kpi[pos-50:pos+50])
        var = np.var(kpi[pos-50:pos+50])
        r = np.random.normal(0, 1, 1)
        print(local_mean, mean, var, r)
        x = (local_mean+mean)*(1+var)*r+kpi[pos]

        kpi[pos] = x
        label[pos-1] = 1
        label[pos] = 1
        label[pos+1] = 1
    return kpi, label
--------------------------------------------------------------------------------
/shape/RMDF.py:
--------------------------------------------------------------------------------
import numpy as np
import math
import copy

class RMDF():
    def __init__(self, depth=10, ascent_rate=20, start=np.array([0,0]), end=np.array([1,0])):
        # one segment list per recursion level (0 .. depth).
        self.control_points = [[] for _ in range(depth+1)]
        self.control_points_copy = [[] for _ in range(depth+1)]
        self.anchor = [[] for _ in range(depth+1)]
        self.depth = depth
        self.start = start
        self.end = end
        self.ascent_rate = ascent_rate

    def gen_anchor(self):
        start = self.start
        end = self.end
        self.control_points[0].append([[start[0],end[0]],start,end])
        for d in range(self.depth):
            for e in self.control_points[d]:
                start = e[1]
                end = e[2]
                l = self.__length(start,end)
                pmid = self.__mid(start,end)
                # random midpoint displacement, scaled by segment length.
                h = np.random.normal(0,l/self.ascent_rate)

                zeta = math.atan(h/(l/2))
                l2 = math.sqrt(h*h+(l/2)*(l/2))
                # rotate the half-segment by zeta and stretch it to length l2.
                T = np.matrix([[math.cos(zeta),-math.sin(zeta)],[math.sin(zeta),math.cos(zeta)]])
                a = np.matrix([[pmid[0]-start[0]],[pmid[1]-start[1]]])
                b = np.matmul(T,a)*(l2/l*2)
                p = np.array([start[0]+b[0,0],start[1]+b[1,0]])

                self.control_points[d+1].append([[start[0],p[0]],start,p])
                self.control_points[d+1].append([[p[0],end[0]],p,end])
        self.anchor = self.control_points.copy()
        # self.__std_anchor()

    def clear_all(self):
        self.__clear(self.depth+1)
        self.gen_anchor()

    def gen(self, forking_depth, length):
        self.__clear(forking_depth)
        self.__forking(forking_depth)
        self.__std()
        x_ = np.arange(0,1,1/length)
        y = np.array([self.__expression(x,self.depth) for x in x_])
        return y

    def __std_anchor(self):
        point_list = list(map(lambda x:x[2],self.anchor[self.depth]))
        y_value_list = list(map(lambda x:x[1], point_list))
        max_y = np.max(y_value_list)
        min_y = np.min(y_value_list)
        height = max_y-min_y
        for i in range(len(self.anchor[self.depth])):
            self.anchor[self.depth][i][2][1] = self.anchor[self.depth][i][2][1]/height

    def __std(self):
        # normalize the deepest level to height 1.
        point_list = list(map(lambda x:x[2],self.control_points_copy[self.depth]))
        y_value_list = list(map(lambda x:x[1], point_list))
        max_y = np.max(y_value_list)
        min_y = np.min(y_value_list)
        height = max_y-min_y
        for i in range(len(self.control_points_copy[self.depth])):
            self.control_points_copy[self.depth][i][2][1] = self.control_points_copy[self.depth][i][2][1]/height
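Editor-added usage sketch (not part of the repository): one fixed anchor shape, then two variants that re-randomize only the last forking_depth recursion levels.

# Editor-added sketch of the RMDF class above.
import matplotlib.pyplot as plt
from shape.RMDF import RMDF

rmdf = RMDF(depth=10, ascent_rate=20)
rmdf.gen_anchor()                            # fix the random midpoint displacements
y1 = rmdf.gen(forking_depth=7, length=500)   # re-randomize the deepest 7 levels
y2 = rmdf.gen(forking_depth=7, length=500)   # same first 3 levels, different detail
plt.plot(y1)
plt.plot(y2)
plt.show()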
    def __expression(self,x,depth):
        # piecewise-linear interpolation over the segments at the given depth.
        expression = self.control_points_copy[depth]
        for e in expression:
            if x >= e[0][0] and x <= e[0][1]:
                p1 = e[1]
                p2 = e[2]
                k = (p2[1]-p1[1])/(p2[0]-p1[0])
                b = p1[1]-k*p1[0]
                return k*x+b

    def __forking(self,forking_depth):
        # regenerate the last forking_depth levels; the first
        # (depth - forking_depth) levels are shared with the anchor.
        shared_depth = self.depth - forking_depth
        for d in range(shared_depth,self.depth):
            for e in self.control_points[d]:
                start = e[1]
                end = e[2]
                l = self.__length(start,end)
                pmid = self.__mid(start,end)
                h = np.random.normal(0,l/self.ascent_rate)

                zeta = math.atan(h/(l/2))
                l2 = math.sqrt(h*h+(l/2)*(l/2))
                T = np.matrix([[math.cos(zeta),-math.sin(zeta)],[math.sin(zeta),math.cos(zeta)]])
                a = np.matrix([[pmid[0]-start[0]],[pmid[1]-start[1]]])
                b = np.matmul(T,a)*(l2/l*2)
                p = np.array([start[0]+b[0,0],start[1]+b[1,0]])

                self.control_points[d+1].append([[start[0],p[0]],start,p])
                self.control_points[d+1].append([[p[0],end[0]],p,end])
        self.control_points_copy = copy.deepcopy(self.control_points)

    def __clear(self,forking_depth):
        # clear the deepest forking_depth levels.
        shared_depth = self.depth - forking_depth
        for i in range(shared_depth,self.depth):
            self.control_points[i+1] = []

    def __length(self,p1,p2):
        # length of line (p1,p2), p = [x,y], L2-norm.
        return np.linalg.norm(p1-p2)

    def __mid(self,p1,p2):
        # midpoint of line (p1,p2).
        x = (p2[0]+p1[0])/2
        y = (p2[1]+p1[1])/2
        return np.array([x,y])
--------------------------------------------------------------------------------
/Assembler.py:
--------------------------------------------------------------------------------
#!python3.6

# Created by Chengyu on 2020/12/9.
# Assembler.

import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import generator.pattern as pattern
import generator.additive_anomaly_generator as ag
from tqdm import tqdm
from tsagen_visual.visual import show
from EVT.spot import bidSPOT

# This is an abstract class.
# The method assemble() is a template method.
# _inject() should be overridden according to your needs.
class AbstractAssembler():
    def __init__(self, season, noise, trend, control=None):
        self.season = season
        self.trend = trend
        self.noise = noise
        self.label = 0
        self.additive = 0
        self.results = []
        self.control = control

    # template method.
    def assemble(self):
        # invoke hook
        self._inject()
        if self.control == 'season':
            for s, l in self.season:
                label = np.bitwise_or(l, self.noise[1])
                label = np.bitwise_or(label, self.trend[1])
                self.results.append((s+self.noise[0]+self.trend[0], label))
        elif self.control == 'noise':
            for n, l in self.noise:
                label = np.bitwise_or(l, self.season[1])
                label = np.bitwise_or(label, self.trend[1])
                self.results.append((n+self.season[0]+self.trend[0], label))
        elif self.control == 'trend':
            for t, l in self.trend:
                label = np.bitwise_or(l, self.noise[1])
                label = np.bitwise_or(label, self.season[1])
                self.results.append((t+self.noise[0]+self.season[0], label))
        elif self.control == 'drift_f':
            pass
        else:
            for i in range(0, len(self.season)):
                label = np.bitwise_or(self.noise[i][1], self.season[i][1])
                label = np.bitwise_or(label, self.trend[i][1])
                self.results.append((self.noise[i][0]+self.season[i][0]+self.trend[i][0], label))
        self._post_inject()

    # hook.
    def _inject(self):
        pass

    # post hook.
    def _post_inject(self):
        pass

    def save(self, path='output', prefix='synthetic', plot=True, fig_size=(16,4)):
        idx = 0
        for r, l in tqdm(self.results):
            df = pd.DataFrame()
            df['timestamp'] = np.arange(len(r))
            df['value'] = r
            df['label'] = l

            if not os.path.exists(path+'/data'):
                os.makedirs(path+'/data')
            if not os.path.exists(path+'/fig'):
                os.makedirs(path+'/fig')
            filename = path+'/data/'+prefix+'_'+str(idx)
            figname = path+'/fig/'+prefix+'_'+str(idx)
            df.to_csv(filename + '.csv', index=None)
            if plot:
                sub = show(df['value'], df['label'], title=filename, figure_size=fig_size)
                sub.savefig(figname + '.jpg')
                # plt.show()
                plt.close()
            idx += 1

# Assembler with additive anomaly injector.
class AssemblerWithAdditiveAnomalyInjector(AbstractAssembler):
    def __init__(self, season, noise, trend, control=None, q=10e-5, init_portion=0.2):
        AbstractAssembler.__init__(self, season, noise, trend, control)
        self.q = q
        self.init_portion = init_portion

    def _post_inject(self):
        # establish the low-probability boundary.
        q = self.q
        d = 10
        init_portion = self.init_portion
        idx = 0
        for result, label in self.results:
            length = len(result)
            init_data = result[:int(length*init_portion)]
            s = bidSPOT(q, d)
            s.fit(init_data, result)
            s.initialize()
            r = s.run()
            s.plot(r)
            # plt.show()
            upper_thresholds = r['upper_thresholds']
            lower_thresholds = r['lower_thresholds']

            # r,l = ag.insert_type2_anomaly(result,label,upper_thresholds,lower_thresholds,[0.3,0.4,0.5,0.6,0.7])
            r, l = ag.insert_spike_anomaly(result, label, upper_thresholds, lower_thresholds, [0.3,0.4,0.5,0.6,0.7])
            self.results[idx] = (r, l)
            idx += 1

# Assembler with additive anomaly injector.
# The injected anomaly type is selected via a_type.
class AssemblerWithAdditiveAnomalyInjector_v1(AbstractAssembler):
    def __init__(self, season, noise, trend, control=None, q=10e-7, init_portion=0.2, a_type='spike'):
        AbstractAssembler.__init__(self, season, noise, trend, control)
        self.q = q
        self.a_type = a_type
        self.init_portion = init_portion

    def _post_inject(self):
        # establish the low-probability boundary.
        q = self.q
        d = 10
        init_portion = self.init_portion
        idx = 0
        for result, label in self.results:
            length = len(result)
            init_data = result[:int(length*init_portion)]
            s = bidSPOT(q, d)
            s.fit(init_data, result)
            s.initialize()
            r = s.run()
            s.plot(r)
            # plt.show()
            upper_thresholds = r['upper_thresholds']
            lower_thresholds = r['lower_thresholds']
            if self.a_type == 'spike':
                r, l = ag.insert_spike_anomaly(result, label, upper_thresholds, lower_thresholds, [0.3,0.4,0.5,0.6,0.7])
            elif self.a_type == 'beat':
                r, l = ag.insert_beat_anomaly(result, label, upper_thresholds, lower_thresholds, [0.3,0.4,0.5,0.6,0.7])
            elif self.a_type == 'type1':
                r, l = ag.insert_type1_anomaly(result, label, upper_thresholds, lower_thresholds, [0.3,0.4,0.5,0.6,0.7])
            elif self.a_type == 'type2':
                r, l = ag.insert_type2_anomaly(result, label, upper_thresholds, lower_thresholds, [0.6,0.9])
            else:
                print('a_type does not exist.')
            self.results[idx] = (r, l)
            idx += 1
--------------------------------------------------------------------------------
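Editor-added sketch (not part of the repository): AbstractAssembler follows the template-method pattern, so a custom assembler only overrides the hooks. Note that with control='season', season must be a list of (values, label) pairs while noise and trend are single (values, label) tuples, as in gen_data_for_correlation_analysis.py.

# Editor-added, hypothetical subclass illustrating the hook points.
from Assembler import AbstractAssembler

class LoggingAssembler(AbstractAssembler):
    def _inject(self):
        # runs before the season/noise/trend components are summed
        print('about to assemble %d seasonal components' % len(self.season))
    def _post_inject(self):
        # runs after the components are summed into self.results
        print('assembled %d series' % len(self.results))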
/generator/test.py:
--------------------------------------------------------------------------------
#!python3.6

# Created by Chengyu on 2020/5/13.
# season generator.

import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

def sine_p(size):
    # half-period sine.
    return np.sin(np.linspace(0, np.pi, size))

def sine(size):
    # full-period sine.
    return np.sin(np.linspace(0, 2*np.pi, size))

# std_size is the standard size of a cycle.
# cycle_num is the number of cycles.
# The overall length of the returned seasonal component is
# std_size * cycle_num, in the absence of drift.
# drift_a and drift_f are drift factors of amplitude and
# frequency, respectively (accepted but unused in this test version).
def sine_p_season(std_size, cycle_num, drift_a, drift_f):
    sines = [sine_p(std_size) for x in range(cycle_num)]
    return np.concatenate(sines)

def sine_season(std_size, cycle_num, drift_a, drift_f):
    sines = [sine(std_size)[1:] for x in range(cycle_num)]
    return np.concatenate(sines)
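# Editor-added example (in this file's commented-experiment style):
# s = sine_p_season(std_size=200, cycle_num=5, drift_a=0, drift_f=0)
# len(s) == 1000   # std_size * cycle_num, since drift is unused here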
# length of line (p1,p2)
# p = [x,y]
def length(p1, p2):
    # L2-norm
    return np.linalg.norm(p1-p2)

# midpoint of line (p1,p2)
def mid(p1, p2):
    x = (p2[0]+p1[0])/2
    y = (p2[1]+p1[1])/2
    return np.array([x, y])

# normalize the y-range of a segment list to height 1.
def std(expression):
    point_list = list(map(lambda x: x[2], expression))
    y_value_list = list(map(lambda x: x[1], point_list))
    max_y = np.max(y_value_list)
    min_y = np.min(y_value_list)
    height = max_y-min_y
    for i in range(len(expression)):
        expression[i][2][1] = expression[i][2][1]/height

def std10():
    point_list = list(map(lambda x: x[2], expression_[10]))
    y_value_list = list(map(lambda x: x[1], point_list))
    max_y = np.max(y_value_list)
    min_y = np.min(y_value_list)
    height = max_y-min_y
    for i in range(len(expression_[10])):
        expression_[10][i][2][1] = expression_[10][i][2][1]/height

# evaluate the curve of depth d at x.
# d starts from 0.
def func_of_d(x, depth):
    expression = expression_[depth]
    for e in expression:
        if x >= e[0][0] and x <= e[0][1]:
            p1 = e[1]
            p2 = e[2]
            k = (p2[1]-p1[1])/(p2[0]-p1[0])
            b = p1[1]-k*p1[0]
            return k*x+b

expression_ = [[],[],[],[],[],[],[],[],[],[],[]]

# RMDF loop version.
# (H and sigma are accepted but unused; the displacement std is l/8.)
def RMDF_loop(H, sigma, max_depth):
    start = np.array([0,0])
    end = np.array([1,0])
    expression_[0].append([[start[0],end[0]],start,end])
    for d in range(max_depth):
        for e in expression_[d]:
            start = e[1]
            end = e[2]
            l = length(start,end)
            pmid = mid(start,end)
            h = np.random.normal(0,l/8)

            zeta = math.atan(h/(l/2))
            l2 = math.sqrt(h*h+(l/2)*(l/2))
            T = np.matrix([[math.cos(zeta),-math.sin(zeta)],[math.sin(zeta),math.cos(zeta)]])
            a = np.matrix([[pmid[0]-start[0]],[pmid[1]-start[1]]])
            b = np.matmul(T,a)*(l2/l*2)
            p = np.array([start[0]+b[0,0],start[1]+b[1,0]])

            expression_[d+1].append([[start[0],p[0]],start,p])
            expression_[d+1].append([[p[0],end[0]],p,end])

# RMDF_loop(0.3,0.2,10)
# RMDF recursive version.
def RMDF(start, end, depth, H, sigma, max_depth):
    if depth >= max_depth:
        expression_[depth].append([[start[0],end[0]],start,end])
        return
    else:
        expression_[depth].append([[start[0],end[0]],start,end])

        l = length(start,end)
        pmid = mid(start,end)
        h = np.random.normal(0,l/8)

        zeta = math.atan(h/(l/2))
        l2 = math.sqrt(h*h+(l/2)*(l/2))
        T = np.matrix([[math.cos(zeta),-math.sin(zeta)],[math.sin(zeta),math.cos(zeta)]])
        a = np.matrix([[pmid[0]-start[0]],[pmid[1]-start[1]]])
        b = np.matmul(T,a)*(l2/l*2)
        p = np.array([start[0]+b[0,0],start[1]+b[1,0]])
        RMDF(start,p,depth+1,H,sigma,max_depth)
        RMDF(p,end,depth+1,H,sigma,max_depth)

# experiment code
# RMDF(np.array([0,0]),np.array([1,0]),0,0.3,0.2,5)

def draww():
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    air_force_blue = '#5D8AA8'
    sub1 = plt.subplot(151)
    sub2 = plt.subplot(152)
    sub3 = plt.subplot(153)
    plt.subplots_adjust(wspace=0, hspace=0)
    x = np.arange(0,1,1/1000)

    # std()
    y = [func_of_d(x,3) for x in np.arange(0,1,1/1000)]
    sub1.plot(x,y,linewidth=2,color='#5D8AA8')
    y = [func_of_d(x,4) for x in np.arange(0,1,1/1000)]
    sub2.plot(x,y,linewidth=2,color='#5D8AA8')
    y = [func_of_d(x,10) for x in np.arange(0,1,1/1000)]
    sub3.plot(x,y,linewidth=2,color='#5D8AA8')

    sub1.set_title("d = 3", y=-0.3, fontsize=25)
    sub1.set_yticks([])
    sub2.set_title("d = 4", y=-0.3, fontsize=25)
    sub2.set_yticks([])
    sub3.set_title("d = 10", y=-0.3, fontsize=25)
    sub3.set_yticks([])
    plt.show()

# draww()

# RMDF loop version (generates only the shared levels).
def RMDF_shared(shared_depth):
    start = np.array([0,0])
    end = np.array([1,0])
    expression_[0].append([[start[0],end[0]],start,end])
    for d in range(shared_depth):
        for e in expression_[d]:
            start = e[1]
            end = e[2]
            l = length(start,end)
            pmid = mid(start,end)
            h = np.random.normal(0,l/8)

            zeta = math.atan(h/(l/2))
            l2 = math.sqrt(h*h+(l/2)*(l/2))
            T = np.matrix([[math.cos(zeta),-math.sin(zeta)],[math.sin(zeta),math.cos(zeta)]])
            a = np.matrix([[pmid[0]-start[0]],[pmid[1]-start[1]]])
            b = np.matmul(T,a)*(l2/l*2)
            p = np.array([start[0]+b[0,0],start[1]+b[1,0]])

            expression_[d+1].append([[start[0],p[0]],start,p])
            expression_[d+1].append([[p[0],end[0]],p,end])

def clear(shared_depth, max_depth):
    for i in range(shared_depth, max_depth):
        expression_[i+1] = []

def RMDF_diverge(shared_depth, max_depth):
    for d in range(shared_depth, max_depth):
        for e in expression_[d]:
            start = e[1]
            end = e[2]
            l = length(start,end)
            pmid = mid(start,end)
            h = np.random.normal(0,l/8)

            zeta = math.atan(h/(l/2))
            l2 = math.sqrt(h*h+(l/2)*(l/2))
            T = np.matrix([[math.cos(zeta),-math.sin(zeta)],[math.sin(zeta),math.cos(zeta)]])
            a = np.matrix([[pmid[0]-start[0]],[pmid[1]-start[1]]])
            b = np.matmul(T,a)*(l2/l*2)
            p = np.array([start[0]+b[0,0],start[1]+b[1,0]])

            expression_[d+1].append([[start[0],p[0]],start,p])
            expression_[d+1].append([[p[0],end[0]],p,end])

RMDF_shared(10)
def dtw_distance(ts_a, ts_b, d=lambda x, y: abs(x-y), mww=10000):
    """Computes the DTW distance between two time series.

    Args:
        ts_a: time series a
        ts_b: time series b
        d: pointwise distance function
        mww: max warping window, int, optional (default = 10000)

    Returns:
        DTW distance
    """
    # Cost matrix, initialized to +inf so that cells outside the
    # warping window never win a min().
    ts_a, ts_b = np.array(ts_a), np.array(ts_b)
    M, N = len(ts_a), len(ts_b)
    cost = np.full((M, N), np.inf)

    # Initialize the first row and column
    cost[0, 0] = d(ts_a[0], ts_b[0])
    for i in range(1, M):
        cost[i, 0] = cost[i-1, 0] + d(ts_a[i], ts_b[0])

    for j in range(1, N):
        cost[0, j] = cost[0, j-1] + d(ts_a[0], ts_b[j])

    # Populate the rest of the cost matrix within the window
    for i in range(1, M):
        for j in range(max(1, i - mww), min(N, i + mww)):
            choices = cost[i-1, j-1], cost[i, j-1], cost[i-1, j]
            cost[i, j] = min(choices) + d(ts_a[i], ts_b[j])

    # Return the DTW distance for the given window
    return cost[-1, -1]


y = []
def gen_1000curves_and_save():
    # 100 anchor shapes, each paired with variants at forking depths 1..9:
    # 1000 curves in total.
    x_ = np.arange(0,1,1/1000)
    for i in tqdm(range(100)):
        clear(0,10)
        RMDF_shared(10)
        std10()
        ys = [[func_of_d(x,10) for x in x_]]           # the anchor curve
        for shared_depth in range(9, 0, -1):           # forking depth 1 .. 9
            clear(shared_depth,10)
            RMDF_diverge(shared_depth,10)
            std10()
            ys.append([func_of_d(x,10) for x in x_])
        y.append(tuple(ys))
    # save list
    a = np.array(y)
    np.save('exp_data_in_paper/curves.npy', a)

distance_list = [[],[],[],[],[],[],[],[],[]]
mse_list = [[],[],[],[],[],[],[],[],[]]
rmse_list = [[],[],[],[],[],[],[],[],[]]

def calculate_mse_and_save():
    curves = np.load('exp_data_in_paper/curves.npy')
    curves = curves.tolist()
    for curve in tqdm(curves):
        anchor = curve[0]
        for sample, idx in zip(curve[1:], range(9)):
            mse = mean_squared_error(anchor, sample)
            mse_list[idx].append(mse)
    mse = np.array(mse_list)
    np.save('exp_data_in_paper/mse.npy', mse)

def calculate_rmse_and_save():
    curves = np.load('exp_data_in_paper/curves.npy')
    curves = curves.tolist()
    for curve in tqdm(curves):
        anchor = curve[0]
        for sample, idx in zip(curve[1:], range(9)):
            rmse = np.sqrt(mean_squared_error(anchor, sample))
            rmse_list[idx].append(rmse)
    rmse = np.array(rmse_list)
    np.save('exp_data_in_paper/rmse.npy', rmse)

def calculate_DTW_and_save():
    curves = np.load('exp_data_in_paper/curves.npy')
    curves = curves.tolist()
    for curve in tqdm(curves):
        anchor = curve[0]
        for sample, idx in zip(curve[1:], range(9)):
            distance = dtw_distance(anchor, sample, mww=100)
            distance_list[idx].append(distance)
    dtw = np.array(distance_list)
    np.save('exp_data_in_paper/dtw.npy', dtw)
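# Editor-added example of dtw_distance (in this file's commented-experiment style):
# dtw_distance([0,1,2,1,0], [0,1,2,1,0])          # identical series -> 0.0
# dtw_distance([0,1,2,1,0], [0,0,1,2,1], mww=2)   # shifted copy -> small warping cost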
def draw_boxplot_of_DTW():
    distance_list = np.load('exp_data_in_paper/dtw.npy')
    distance_list = distance_list.tolist()
    # remove outliers
    new_distance_list = []
    for dist in distance_list:
        new_distance_list.append(np.sort(dist)[:-10])
    f = plt.figure(figsize=(16, 6))
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.boxplot(new_distance_list, labels=['1','2','3','4','5','6','7','8','9'], whis=1.5, sym='.', showmeans=True)
    plt.xlabel('forking depth', size='20')
    plt.ylabel('DTW cost', size='20')
    plt.show()

def draw_boxplot_of_MSE():
    distance_list = np.load('exp_data_in_paper/mse.npy')
    distance_list = distance_list.tolist()
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.boxplot(distance_list, labels=['1','2','3','4','5','6','7','8','9'], whis=1.5, sym='.', showmeans=True)
    plt.xlabel('forking depth', size='20')
    plt.ylabel('MSE', size='20')
    plt.show()

def draw_boxplot_of_RMSE():
    distance_list = np.load('exp_data_in_paper/rmse.npy')
    distance_list = distance_list.tolist()
    # remove outliers
    new_distance_list = []
    for dist in distance_list:
        new_distance_list.append(np.sort(dist)[:-10])
    f = plt.figure(figsize=(16, 6))
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.boxplot(new_distance_list, labels=['1','2','3','4','5','6','7','8','9'], whis=1.5, sym='*', showmeans=True)
    plt.xlabel('forking depth', size='20')
    plt.ylabel('RMSE', size='20')
    plt.show()

# # gen
# gen_1000curves_and_save()

# # calculate distance
# calculate_DTW_and_save()
# calculate_mse_and_save()
# calculate_rmse_and_save()

# # draw
# draw_boxplot_of_MSE()
# draw_boxplot_of_RMSE()
# draw_boxplot_of_DTW()


# # DO NOT TOUCH THIS!!!
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
air_force_blue = '#5D8AA8'
sub1 = plt.subplot(151)
sub2 = plt.subplot(152)
sub3 = plt.subplot(153)
sub4 = plt.subplot(154)
sub5 = plt.subplot(155)

plt.subplots_adjust(wspace=0, hspace=0)
x = np.arange(0,1,1/1000)

std10()
y1 = [func_of_d(x,10) for x in np.arange(0,1,1/1000)]

clear(6,10)
RMDF_diverge(6,10)
std10()
y2 = [func_of_d(x,10) for x in np.arange(0,1,1/1000)]

clear(3,10)
RMDF_diverge(3,10)
std10()
y3 = [func_of_d(x,10) for x in np.arange(0,1,1/1000)]

clear(2,10)
RMDF_diverge(2,10)
std10()
y4 = [func_of_d(x,10) for x in np.arange(0,1,1/1000)]

clear(1,10)
RMDF_diverge(1,10)
std10()
y5 = [func_of_d(x,10) for x in np.arange(0,1,1/1000)]

sub1.plot(x,y1,linewidth=2,color='#5D8AA8')
sub2.plot(x,y2,linewidth=2,color='#5D8AA8')
sub3.plot(x,y3,linewidth=2,color='#5D8AA8')
sub4.plot(x,y4,linewidth=2,color='#5D8AA8')
sub5.plot(x,y5,linewidth=2,color='#5D8AA8')

sub1.set_title("contrast", y=-0.3, fontsize=25)
sub2.set_title(r'$\hat{d} = 2$', y=-0.3, fontsize=25)
sub3.set_title(r'$\hat{d} = 6$', y=-0.3, fontsize=25)
sub4.set_title(r'$\hat{d} = 8$', y=-0.3, fontsize=25)
sub5.set_title(r'$\hat{d} = 9$', y=-0.3, fontsize=25)

sub1.set_yticks([])
sub2.set_yticks([])
sub3.set_yticks([])
sub4.set_yticks([])
sub5.set_yticks([])
sub1.set_xticks([])
sub2.set_xticks([])
sub3.set_xticks([])
sub4.set_xticks([])
sub5.set_xticks([])

plt.show()
--------------------------------------------------------------------------------
/EVT/spot.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 12 10:08:16 2016

@author: Alban Siffer
@company: Amossys
@license: GNU GPLv3
"""

from scipy.optimize import minimize
from math import log, floor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm

# colors for plot
deep_saffron = '#FF9933'
air_force_blue = '#5D8AA8'


"""
================================= MAIN CLASS ==================================
"""

class SPOT:
    """
    This class allows to run the SPOT algorithm on a univariate dataset (upper bound)

    Attributes
    ----------
    proba : float
        Detection level (risk), chosen by the user

    extreme_quantile : float
        current threshold (bound between normal and abnormal events)

    data : numpy.array
        stream

    init_data : numpy.array
        initial batch of observations (for the calibration/initialization step)

    init_threshold : float
        initial threshold computed during the calibration step

    peaks : numpy.array
        array of peaks (excesses above the initial threshold)

    n : int
        number of observed values

    Nt : int
        number of observed peaks
    """

    def __init__(self, q=1e-4):
        """
        Constructor

        Parameters
        ----------
        q
            Detection level (risk)

        Returns
        ----------
        SPOT object
        """
        self.proba = q
        self.extreme_quantile = None
        self.data = None
        self.init_data = None
        self.init_threshold = None
        self.peaks = None
        self.n = 0
        self.Nt = 0

    def __str__(self):
        s = ''
        s += 'Streaming Peaks-Over-Threshold Object\n'
        s += 'Detection level q = %s\n' % self.proba
        if self.data is not None:
            s += 'Data imported : Yes\n'
            s += '\t initialization : %s values\n' % self.init_data.size
            s += '\t stream : %s values\n' % self.data.size
        else:
            s += 'Data imported : No\n'
            return s

        if self.n == 0:
            s += 'Algorithm initialized : No\n'
        else:
            s += 'Algorithm initialized : Yes\n'
            s += '\t initial threshold : %s\n' % self.init_threshold

            r = self.n - self.init_data.size
            if r > 0:
                s += 'Algorithm run : Yes\n'
                s += '\t number of observations : %s (%.2f %%)\n' % (r, 100*r/self.n)
            else:
                s += '\t number of peaks : %s\n' % self.Nt
                s += '\t extreme quantile : %s\n' % self.extreme_quantile
                s += 'Algorithm run : No\n'
        return s

    def fit(self, init_data, data):
        """
        Import data to SPOT object

        Parameters
        ----------
        init_data : list, numpy.array or pandas.Series
            initial batch to calibrate the algorithm

        data : numpy.array
            data for the run (list, np.array or pd.series)

        """
        if isinstance(data, list):
            self.data = np.array(data)
        elif isinstance(data, np.ndarray):
            self.data = data
        elif isinstance(data, pd.Series):
            self.data = data.values
        else:
            print('This data format (%s) is not supported' % type(data))
            return

        if isinstance(init_data, list):
            self.init_data = np.array(init_data)
        elif isinstance(init_data, np.ndarray):
            self.init_data = init_data
        elif isinstance(init_data, pd.Series):
            self.init_data = init_data.values
        elif isinstance(init_data, int):
            self.init_data = self.data[:init_data]
            self.data = self.data[init_data:]
        elif isinstance(init_data, float) & (init_data < 1) & (init_data > 0):
            r = int(init_data * data.size)
            self.init_data = self.data[:r]
            self.data = self.data[r:]
        else:
            print('The initial data cannot be set')
            return

    def add(self, data):
        """
        This function allows to append data to the already fitted data

        Parameters
        ----------
        data : list, numpy.array, pandas.Series
            data to append
        """
        if isinstance(data, list):
            data = np.array(data)
        elif isinstance(data, np.ndarray):
            data = data
        elif isinstance(data, pd.Series):
            data = data.values
        else:
            print('This data format (%s) is not supported' % type(data))
            return

        self.data = np.append(self.data, data)
        return

    def initialize(self, level=0.98, verbose=True):
        """
        Run the calibration (initialization) step

        Parameters
        ----------
        level : float
            (default 0.98) Probability associated with the initial threshold t
        verbose : bool
            (default = True) If True, gives details about the batch initialization
        """
        level = level - floor(level)

        n_init = self.init_data.size

        S = np.sort(self.init_data)                   # we sort X to get the empirical quantile
        self.init_threshold = S[int(level * n_init)]  # t is fixed for the whole algorithm

        # initial peaks
        self.peaks = self.init_data[self.init_data > self.init_threshold] - self.init_threshold
self.Nt = self.peaks.size 192 | self.n = n_init 193 | 194 | if verbose: 195 | print('Initial threshold : %s' % self.init_threshold) 196 | print('Number of peaks : %s' % self.Nt) 197 | print('Grimshaw maximum log-likelihood estimation ... ', end = '') 198 | 199 | g,s,l = self._grimshaw() 200 | self.extreme_quantile = self._quantile(g,s) 201 | 202 | if verbose: 203 | print('[done]') 204 | print('\t'+chr(0x03B3) + ' = ' + str(g)) 205 | print('\t'+chr(0x03C3) + ' = ' + str(s)) 206 | print('\tL = ' + str(l)) 207 | print('Extreme quantile (probability = %s): %s' % (self.proba,self.extreme_quantile)) 208 | 209 | return 210 | 211 | 212 | 213 | 214 | def _rootsFinder(fun,jac,bounds,npoints,method): 215 | """ 216 | Find possible roots of a scalar function 217 | 218 | Parameters 219 | ---------- 220 | fun : function 221 | scalar function 222 | jac : function 223 | first order derivative of the function 224 | bounds : tuple 225 | (min,max) interval for the roots search 226 | npoints : int 227 | maximum number of roots to output 228 | method : str 229 | 'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval 230 | 231 | Returns 232 | ---------- 233 | numpy.array 234 | possible roots of the function 235 | """ 236 | if method == 'regular': 237 | step = (bounds[1]-bounds[0])/(npoints+1) 238 | X0 = np.arange(bounds[0]+step,bounds[1],step) 239 | elif method == 'random': 240 | X0 = np.random.uniform(bounds[0],bounds[1],npoints) 241 | 242 | def objFun(X,f,jac): 243 | g = 0 244 | j = np.zeros(X.shape) 245 | i = 0 246 | for x in X: 247 | fx = f(x) 248 | g = g+fx**2 249 | j[i] = 2*fx*jac(x) 250 | i = i+1 251 | return g,j 252 | 253 | opt = minimize(lambda X:objFun(X,fun,jac), X0, 254 | method='L-BFGS-B', 255 | jac=True, bounds=[bounds]*len(X0)) 256 | 257 | X = opt.x 258 | np.round(X,decimals = 5) 259 | return np.unique(X) 260 | 261 | 262 | def _log_likelihood(Y,gamma,sigma): 263 | """ 264 | Compute the log-likelihood for the Generalized Pareto Distribution (μ=0) 265 | 266 | Parameters 267 | ---------- 268 | Y : numpy.array 269 | observations 270 | gamma : float 271 | GPD index parameter 272 | sigma : float 273 | GPD scale parameter (>0) 274 | 275 | Returns 276 | ---------- 277 | float 278 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 279 | """ 280 | n = Y.size 281 | if gamma != 0: 282 | tau = gamma/sigma 283 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 284 | else: 285 | L = n * ( 1 + log(Y.mean()) ) 286 | return L 287 | 288 | 289 | def _grimshaw(self,epsilon = 1e-8, n_points = 10): 290 | """ 291 | Compute the GPD parameters estimation with the Grimshaw's trick 292 | 293 | Parameters 294 | ---------- 295 | epsilon : float 296 | numerical parameter to perform (default : 1e-8) 297 | n_points : int 298 | maximum number of candidates for maximum likelihood (default : 10) 299 | 300 | Returns 301 | ---------- 302 | gamma_best,sigma_best,ll_best 303 | gamma estimates, sigma estimates and corresponding log-likelihood 304 | """ 305 | def u(s): 306 | return 1 + np.log(s).mean() 307 | 308 | def v(s): 309 | return np.mean(1/s) 310 | 311 | def w(Y,t): 312 | s = 1+t*Y 313 | us = u(s) 314 | vs = v(s) 315 | return us*vs-1 316 | 317 | def jac_w(Y,t): 318 | s = 1+t*Y 319 | us = u(s) 320 | vs = v(s) 321 | jac_us = (1/t)*(1-vs) 322 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 323 | return us*jac_vs+vs*jac_us 324 | 325 | 326 | Ym = self.peaks.min() 327 | YM = self.peaks.max() 328 | Ymean = self.peaks.mean() 329 | 330 | 331 | a = -1/YM 332 
| if abs(a)<2*epsilon: 333 | epsilon = abs(a)/n_points 334 | 335 | a = a + epsilon 336 | b = 2*(Ymean-Ym)/(Ymean*Ym) 337 | c = 2*(Ymean-Ym)/(Ym**2) 338 | 339 | # We look for possible roots 340 | left_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 341 | lambda t: jac_w(self.peaks,t), 342 | (a+epsilon,-epsilon), 343 | n_points,'regular') 344 | 345 | right_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 346 | lambda t: jac_w(self.peaks,t), 347 | (b,c), 348 | n_points,'regular') 349 | 350 | # all the possible roots 351 | zeros = np.concatenate((left_zeros,right_zeros)) 352 | 353 | # 0 is always a solution so we initialize with it 354 | gamma_best = 0 355 | sigma_best = Ymean 356 | ll_best = SPOT._log_likelihood(self.peaks,gamma_best,sigma_best) 357 | 358 | # we look for better candidates 359 | for z in zeros: 360 | gamma = u(1+z*self.peaks)-1 361 | sigma = gamma/z 362 | ll = SPOT._log_likelihood(self.peaks,gamma,sigma) 363 | if ll>ll_best: 364 | gamma_best = gamma 365 | sigma_best = sigma 366 | ll_best = ll 367 | 368 | return gamma_best,sigma_best,ll_best 369 | 370 | 371 | 372 | def _quantile(self,gamma,sigma): 373 | """ 374 | Compute the quantile at level 1-q 375 | 376 | Parameters 377 | ---------- 378 | gamma : float 379 | GPD parameter 380 | sigma : float 381 | GPD parameter 382 | 383 | Returns 384 | ---------- 385 | float 386 | quantile at level 1-q for the GPD(γ,σ,μ=0) 387 | """ 388 | r = self.n * self.proba / self.Nt 389 | if gamma != 0: 390 | return self.init_threshold + (sigma/gamma)*(pow(r,-gamma)-1) 391 | else: 392 | return self.init_threshold - sigma*log(r) 393 | 394 | 395 | def run(self, with_alarm = True): 396 | """ 397 | Run SPOT on the stream 398 | 399 | Parameters 400 | ---------- 401 | with_alarm : bool 402 | (default = True) If False, SPOT will adapt the threshold assuming \ 403 | there is no abnormal values 404 | 405 | 406 | Returns 407 | ---------- 408 | dict 409 | keys : 'thresholds' and 'alarms' 410 | 411 | 'thresholds' contains the extreme quantiles and 'alarms' contains \ 412 | the indexes of the values which have triggered alarms 413 | 414 | """ 415 | if (self.n>self.init_data.size): 416 | print('Warning : the algorithm seems to have already been run, you \ 417 | should initialize before running again') 418 | return {} 419 | 420 | # list of the thresholds 421 | th = [] 422 | alarm = [] 423 | # Loop over the stream 424 | for i in tqdm.tqdm(range(self.data.size)): 425 | 426 | # If the observed value exceeds the current threshold (alarm case) 427 | if self.data[i]>self.extreme_quantile: 428 | # if we want to alarm, we put it in the alarm list 429 | if with_alarm: 430 | alarm.append(i) 431 | # otherwise we add it in the peaks 432 | else: 433 | self.peaks = np.append(self.peaks,self.data[i]-self.init_threshold) 434 | self.Nt += 1 435 | self.n += 1 436 | # and we update the thresholds 437 | 438 | g,s,l = self._grimshaw() 439 | self.extreme_quantile = self._quantile(g,s) 440 | 441 | # case where the value exceeds the initial threshold but not the alarm ones 442 | elif self.data[i]>self.init_threshold: 443 | # we add it in the peaks 444 | self.peaks = np.append(self.peaks,self.data[i]-self.init_threshold) 445 | self.Nt += 1 446 | self.n += 1 447 | # and we update the thresholds 448 | 449 | g,s,l = self._grimshaw() 450 | self.extreme_quantile = self._quantile(g,s) 451 | else: 452 | self.n += 1 453 | 454 | 455 | th.append(self.extreme_quantile) # thresholds record 456 | 457 | return {'thresholds' : th, 'alarms': alarm} 458 | 459 | 460 | def 
plot(self,run_results,with_alarm = True): 461 | """ 462 | Plot the results of given by the run 463 | 464 | Parameters 465 | ---------- 466 | run_results : dict 467 | results given by the 'run' method 468 | with_alarm : bool 469 | (default = True) If True, alarms are plotted. 470 | 471 | 472 | Returns 473 | ---------- 474 | list 475 | list of the plots 476 | 477 | """ 478 | x = range(self.data.size) 479 | K = run_results.keys() 480 | 481 | ts_fig, = plt.plot(x,self.data,color=air_force_blue) 482 | fig = [ts_fig] 483 | 484 | if 'thresholds' in K: 485 | th = run_results['thresholds'] 486 | th_fig, = plt.plot(x,th,color=deep_saffron,lw=2,ls='dashed') 487 | fig.append(th_fig) 488 | 489 | if with_alarm and ('alarms' in K): 490 | alarm = run_results['alarms'] 491 | al_fig = plt.scatter(alarm,self.data[alarm],color='red') 492 | fig.append(al_fig) 493 | 494 | plt.xlim((0,self.data.size)) 495 | 496 | 497 | return fig 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | """ 509 | ============================ UPPER & LOWER BOUNDS ============================= 510 | """ 511 | 512 | 513 | 514 | 515 | class biSPOT: 516 | """ 517 | This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds) 518 | 519 | Attributes 520 | ---------- 521 | proba : float 522 | Detection level (risk), chosen by the user 523 | 524 | extreme_quantile : float 525 | current threshold (bound between normal and abnormal events) 526 | 527 | data : numpy.array 528 | stream 529 | 530 | init_data : numpy.array 531 | initial batch of observations (for the calibration/initialization step) 532 | 533 | init_threshold : float 534 | initial threshold computed during the calibration step 535 | 536 | peaks : numpy.array 537 | array of peaks (excesses above the initial threshold) 538 | 539 | n : int 540 | number of observed values 541 | 542 | Nt : int 543 | number of observed peaks 544 | """ 545 | def __init__(self, q = 1e-4): 546 | """ 547 | Constructor 548 | 549 | Parameters 550 | ---------- 551 | q 552 | Detection level (risk) 553 | 554 | Returns 555 | ---------- 556 | biSPOT object 557 | """ 558 | self.proba = q 559 | self.data = None 560 | self.init_data = None 561 | self.n = 0 562 | nonedict = {'up':None,'down':None} 563 | 564 | self.extreme_quantile = dict.copy(nonedict) 565 | self.init_threshold = dict.copy(nonedict) 566 | self.peaks = dict.copy(nonedict) 567 | self.gamma = dict.copy(nonedict) 568 | self.sigma = dict.copy(nonedict) 569 | self.Nt = {'up':0,'down':0} 570 | 571 | 572 | def __str__(self): 573 | s = '' 574 | s += 'Streaming Peaks-Over-Threshold Object\n' 575 | s += 'Detection level q = %s\n' % self.proba 576 | if self.data is not None: 577 | s += 'Data imported : Yes\n' 578 | s += '\t initialization : %s values\n' % self.init_data.size 579 | s += '\t stream : %s values\n' % self.data.size 580 | else: 581 | s += 'Data imported : No\n' 582 | return s 583 | 584 | if self.n == 0: 585 | s += 'Algorithm initialized : No\n' 586 | else: 587 | s += 'Algorithm initialized : Yes\n' 588 | s += '\t initial threshold : %s\n' % self.init_threshold 589 | 590 | r = self.n-self.init_data.size 591 | if r > 0: 592 | s += 'Algorithm run : Yes\n' 593 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 594 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 595 | else: 596 | s += '\t number of peaks : %s\n' % self.Nt 597 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 598 | s += '\t lower extreme quantile : %s\n' % 
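# ---------------------------------------------------------------------------
# A minimal end-to-end sketch of the SPOT class above. The import path is an
# assumption (the module name may differ in your setup), and the constructor
# is assumed to take the risk level q, like biSPOT below; the data is synthetic.
import numpy as np
from spot import SPOT          # hypothetical import path

rng = np.random.default_rng(0)
stream = rng.standard_normal(12_000)
stream[[3_000, 7_500, 9_000]] += 8.0   # inject three obvious spikes

s = SPOT(q=1e-4)                       # ~1 expected false alarm per 10^4 points
s.fit(init_data=2_000, data=stream)    # int: first 2000 points calibrate t
s.initialize(verbose=True)             # empirical 98% quantile, then Grimshaw
out = s.run(with_alarm=True)           # {'thresholds': [...], 'alarms': [...]}
print(out['alarms'])                   # spike indexes relative to the stream
                                       # after the split: 1000, 5500, 7000
figs = s.plot(out)                     # series + dashed threshold + red alarms
# ---------------------------------------------------------------------------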
self.extreme_quantile['down'] 599 | s += 'Algorithm run : No\n' 600 | return s 601 | 602 | 603 | def fit(self,init_data,data): 604 | """ 605 | Import data to biSPOT object 606 | 607 | Parameters 608 | ---------- 609 | init_data : list, numpy.array or pandas.Series 610 | initial batch to calibrate the algorithm () 611 | 612 | data : numpy.array 613 | data for the run (list, np.array or pd.series) 614 | 615 | """ 616 | if isinstance(data,list): 617 | self.data = np.array(data) 618 | elif isinstance(data,np.ndarray): 619 | self.data = data 620 | elif isinstance(data,pd.Series): 621 | self.data = data.values 622 | else: 623 | print('This data format (%s) is not supported' % type(data)) 624 | return 625 | 626 | if isinstance(init_data,list): 627 | self.init_data = np.array(init_data) 628 | elif isinstance(init_data,np.ndarray): 629 | self.init_data = init_data 630 | elif isinstance(init_data,pd.Series): 631 | self.init_data = init_data.values 632 | elif isinstance(init_data,int): 633 | self.init_data = self.data[:init_data] 634 | self.data = self.data[init_data:] 635 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 636 | r = int(init_data*data.size) 637 | self.init_data = self.data[:r] 638 | self.data = self.data[r:] 639 | else: 640 | print('The initial data cannot be set') 641 | return 642 | 643 | def add(self,data): 644 | """ 645 | This function allows to append data to the already fitted data 646 | 647 | Parameters 648 | ---------- 649 | data : list, numpy.array, pandas.Series 650 | data to append 651 | """ 652 | if isinstance(data,list): 653 | data = np.array(data) 654 | elif isinstance(data,np.ndarray): 655 | data = data 656 | elif isinstance(data,pd.Series): 657 | data = data.values 658 | else: 659 | print('This data format (%s) is not supported' % type(data)) 660 | return 661 | 662 | self.data = np.append(self.data,data) 663 | return 664 | 665 | def initialize(self, verbose = True): 666 | """ 667 | Run the calibration (initialization) step 668 | 669 | Parameters 670 | ---------- 671 | verbose : bool 672 | (default = True) If True, gives details about the batch initialization 673 | """ 674 | n_init = self.init_data.size 675 | 676 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 677 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 678 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 679 | 680 | # initial peaks 681 | self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up'] 682 | self.peaks['down'] = -(self.init_data[self.init_data0) 774 | 775 | Returns 776 | ---------- 777 | float 778 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 779 | """ 780 | n = Y.size 781 | if gamma != 0: 782 | tau = gamma/sigma 783 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 784 | else: 785 | L = n * ( 1 + log(Y.mean()) ) 786 | return L 787 | 788 | 789 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 10): 790 | """ 791 | Compute the GPD parameters estimation with the Grimshaw's trick 792 | 793 | Parameters 794 | ---------- 795 | epsilon : float 796 | numerical parameter to perform (default : 1e-8) 797 | n_points : int 798 | maximum number of candidates for maximum likelihood (default : 10) 799 | 800 | Returns 801 | ---------- 802 | gamma_best,sigma_best,ll_best 803 | gamma estimates, sigma estimates and corresponding log-likelihood 804 | """ 805 | def u(s): 806 | return 1 + np.log(s).mean() 807 | 808 | def 
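# ---------------------------------------------------------------------------
# A small sketch of the two-sided calibration biSPOT performs in initialize():
# lower-tail excesses are negated so that the same one-sided GPD machinery
# (_grimshaw / _quantile) applies to both tails. The right-hand side of the
# peaks['down'] assignment above was truncated in this dump; the mirrored
# form below is consistent with how the down side is used later.
import numpy as np

rng = np.random.default_rng(1)
init_data = rng.standard_normal(5_000)

S = np.sort(init_data)
t_up = S[int(0.98 * init_data.size)]     # empirical 98% quantile
t_down = S[int(0.02 * init_data.size)]   # empirical 2% quantile

peaks_up = init_data[init_data > t_up] - t_up            # upper excesses
peaks_down = -(init_data[init_data < t_down] - t_down)   # mirrored lower ones
assert (peaks_up > 0).all() and (peaks_down > 0).all()   # both sides fit a GPD
# ---------------------------------------------------------------------------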
v(s): 809 | return np.mean(1/s) 810 | 811 | def w(Y,t): 812 | s = 1+t*Y 813 | us = u(s) 814 | vs = v(s) 815 | return us*vs-1 816 | 817 | def jac_w(Y,t): 818 | s = 1+t*Y 819 | us = u(s) 820 | vs = v(s) 821 | jac_us = (1/t)*(1-vs) 822 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 823 | return us*jac_vs+vs*jac_us 824 | 825 | 826 | Ym = self.peaks[side].min() 827 | YM = self.peaks[side].max() 828 | Ymean = self.peaks[side].mean() 829 | 830 | 831 | a = -1/YM 832 | if abs(a)<2*epsilon: 833 | epsilon = abs(a)/n_points 834 | 835 | a = a + epsilon 836 | b = 2*(Ymean-Ym)/(Ymean*Ym) 837 | c = 2*(Ymean-Ym)/(Ym**2) 838 | 839 | # We look for possible roots 840 | left_zeros = biSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 841 | lambda t: jac_w(self.peaks[side],t), 842 | (a+epsilon,-epsilon), 843 | n_points,'regular') 844 | 845 | right_zeros = biSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 846 | lambda t: jac_w(self.peaks[side],t), 847 | (b,c), 848 | n_points,'regular') 849 | 850 | # all the possible roots 851 | zeros = np.concatenate((left_zeros,right_zeros)) 852 | 853 | # 0 is always a solution so we initialize with it 854 | gamma_best = 0 855 | sigma_best = Ymean 856 | ll_best = biSPOT._log_likelihood(self.peaks[side],gamma_best,sigma_best) 857 | 858 | # we look for better candidates 859 | for z in zeros: 860 | gamma = u(1+z*self.peaks[side])-1 861 | sigma = gamma/z 862 | ll = biSPOT._log_likelihood(self.peaks[side],gamma,sigma) 863 | if ll>ll_best: 864 | gamma_best = gamma 865 | sigma_best = sigma 866 | ll_best = ll 867 | 868 | return gamma_best,sigma_best,ll_best 869 | 870 | 871 | 872 | def _quantile(self,side,gamma,sigma): 873 | """ 874 | Compute the quantile at level 1-q for a given side 875 | 876 | Parameters 877 | ---------- 878 | side : str 879 | 'up' or 'down' 880 | gamma : float 881 | GPD parameter 882 | sigma : float 883 | GPD parameter 884 | 885 | Returns 886 | ---------- 887 | float 888 | quantile at level 1-q for the GPD(γ,σ,μ=0) 889 | """ 890 | if side == 'up': 891 | r = self.n * self.proba / self.Nt[side] 892 | if gamma != 0: 893 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 894 | else: 895 | return self.init_threshold['up'] - sigma*log(r) 896 | elif side == 'down': 897 | r = self.n * self.proba / self.Nt[side] 898 | if gamma != 0: 899 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 900 | else: 901 | return self.init_threshold['down'] + sigma*log(r) 902 | else: 903 | print('error : the side is not right') 904 | 905 | 906 | def run(self, with_alarm = True): 907 | """ 908 | Run biSPOT on the stream 909 | 910 | Parameters 911 | ---------- 912 | with_alarm : bool 913 | (default = True) If False, SPOT will adapt the threshold assuming \ 914 | there is no abnormal values 915 | 916 | 917 | Returns 918 | ---------- 919 | dict 920 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 921 | 922 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 923 | the indexes of the values which have triggered alarms 924 | 925 | """ 926 | if (self.n>self.init_data.size): 927 | print('Warning : the algorithm seems to have already been run, you \ 928 | should initialize before running again') 929 | return {} 930 | 931 | # list of the thresholds 932 | thup = [] 933 | thdown = [] 934 | alarm = [] 935 | # Loop over the stream 936 | for i in tqdm.tqdm(range(self.data.size)): 937 | 938 | # If the observed value exceeds the current threshold (alarm case) 939 | if self.data[i]>self.extreme_quantile['up'] : 940 | # if we want to alarm, we 
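# ---------------------------------------------------------------------------
# A sketch of the core of Grimshaw's trick on simulated GPD data: a root t*
# of w(t) = u(1+tY) * v(1+tY) - 1 yields the ML estimates
# gamma = u(1+t*Y) - 1 and sigma = gamma / t*. For clarity this uses a sign
# scan plus brentq over a wide positive range instead of the class's
# multi-start L-BFGS-B search and (b, c) bounds; parameters are illustrative.
import numpy as np
from scipy.optimize import brentq
from scipy.stats import genpareto

rng = np.random.default_rng(2)
Y = genpareto.rvs(c=0.3, scale=2.0, size=20_000, random_state=rng)

u = lambda s: 1 + np.log(s).mean()
v = lambda s: np.mean(1 / s)
w = lambda t: u(1 + t * Y) * v(1 + t * Y) - 1

grid = np.logspace(-6, 2, 400)                     # positive t candidates
vals = np.array([w(t) for t in grid])
i = np.flatnonzero(np.sign(vals[:-1]) != np.sign(vals[1:]))[0]
t_star = brentq(w, grid[i], grid[i + 1])           # polish the bracketed root

gamma_hat = u(1 + t_star * Y) - 1
sigma_hat = gamma_hat / t_star
print(gamma_hat, sigma_hat)                        # close to (0.3, 2.0)
# ---------------------------------------------------------------------------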
put it in the alarm list 941 | if with_alarm: 942 | alarm.append(i) 943 | # otherwise we add it in the peaks 944 | else: 945 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 946 | self.Nt['up'] += 1 947 | self.n += 1 948 | # and we update the thresholds 949 | 950 | g,s,l = self._grimshaw('up') 951 | self.extreme_quantile['up'] = self._quantile('up',g,s) 952 | 953 | # case where the value exceeds the initial threshold but not the alarm ones 954 | elif self.data[i]>self.init_threshold['up']: 955 | # we add it in the peaks 956 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 957 | self.Nt['up'] += 1 958 | self.n += 1 959 | # and we update the thresholds 960 | 961 | g,s,l = self._grimshaw('up') 962 | self.extreme_quantile['up'] = self._quantile('up',g,s) 963 | 964 | elif self.data[i] 0: 1127 | s += 'Algorithm run : Yes\n' 1128 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 1129 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 1130 | else: 1131 | s += '\t number of peaks : %s\n' % self.Nt 1132 | s += '\t extreme quantile : %s\n' % self.extreme_quantile 1133 | s += 'Algorithm run : No\n' 1134 | return s 1135 | 1136 | 1137 | def fit(self,init_data,data): 1138 | """ 1139 | Import data to DSPOT object 1140 | 1141 | Parameters 1142 | ---------- 1143 | init_data : list, numpy.array or pandas.Series 1144 | initial batch to calibrate the algorithm 1145 | 1146 | data : numpy.array 1147 | data for the run (list, np.array or pd.series) 1148 | 1149 | """ 1150 | if isinstance(data,list): 1151 | self.data = np.array(data) 1152 | elif isinstance(data,np.ndarray): 1153 | self.data = data 1154 | elif isinstance(data,pd.Series): 1155 | self.data = data.values 1156 | else: 1157 | print('This data format (%s) is not supported' % type(data)) 1158 | return 1159 | 1160 | if isinstance(init_data,list): 1161 | self.init_data = np.array(init_data) 1162 | elif isinstance(init_data,np.ndarray): 1163 | self.init_data = init_data 1164 | elif isinstance(init_data,pd.Series): 1165 | self.init_data = init_data.values 1166 | elif isinstance(init_data,int): 1167 | self.init_data = self.data[:init_data] 1168 | self.data = self.data[init_data:] 1169 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 1170 | r = int(init_data*data.size) 1171 | self.init_data = self.data[:r] 1172 | self.data = self.data[r:] 1173 | else: 1174 | print('The initial data cannot be set') 1175 | return 1176 | 1177 | def add(self,data): 1178 | """ 1179 | This function allows to append data to the already fitted data 1180 | 1181 | Parameters 1182 | ---------- 1183 | data : list, numpy.array, pandas.Series 1184 | data to append 1185 | """ 1186 | if isinstance(data,list): 1187 | data = np.array(data) 1188 | elif isinstance(data,np.ndarray): 1189 | data = data 1190 | elif isinstance(data,pd.Series): 1191 | data = data.values 1192 | else: 1193 | print('This data format (%s) is not supported' % type(data)) 1194 | return 1195 | 1196 | self.data = np.append(self.data,data) 1197 | return 1198 | 1199 | def initialize(self, verbose = True): 1200 | """ 1201 | Run the calibration (initialization) step 1202 | 1203 | Parameters 1204 | ---------- 1205 | verbose : bool 1206 | (default = True) If True, gives details about the batch initialization 1207 | """ 1208 | n_init = self.init_data.size - self.depth 1209 | 1210 | M = backMean(self.init_data,self.depth) 1211 | T = self.init_data[self.depth:]-M[:-1] # new 
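# ---------------------------------------------------------------------------
# The body of the backMean helper that initialize() calls just below was lost
# in this dump; the sketch here reproduces its interface as used:
# backMean(X, d) returns the trailing length-d window means (len(X)-d+1
# values), so X[d:] - M[:-1] subtracts from every point the mean of the d
# points preceding it. The name back_mean marks this as a reconstruction,
# not the original code.
import numpy as np

def back_mean(X, d):
    # cumulative sums give every length-d window mean in O(len(X))
    c = np.cumsum(np.insert(np.asarray(X, dtype=float), 0, 0.0))
    return (c[d:] - c[:-d]) / d

X = np.arange(10.0)              # a pure linear trend
M = back_mean(X, 3)              # [1. 2. 3. 4. 5. 6. 7. 8.]
T = X[3:] - M[:-1]
print(T)                         # constant 2.0: the drift is removed
# ---------------------------------------------------------------------------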
variable 1212 | 1213 | S = np.sort(T) # we sort X to get the empirical quantile 1214 | self.init_threshold = S[int(0.98*n_init)] # t is fixed for the whole algorithm 1215 | 1216 | # initial peaks 1217 | self.peaks = T[T>self.init_threshold]-self.init_threshold 1218 | self.Nt = self.peaks.size 1219 | self.n = n_init 1220 | 1221 | if verbose: 1222 | print('Initial threshold : %s' % self.init_threshold) 1223 | print('Number of peaks : %s' % self.Nt) 1224 | print('Grimshaw maximum log-likelihood estimation ... ', end = '') 1225 | 1226 | g,s,l = self._grimshaw() 1227 | self.extreme_quantile = self._quantile(g,s) 1228 | 1229 | if verbose: 1230 | print('[done]') 1231 | print('\t'+chr(0x03B3) + ' = ' + str(g)) 1232 | print('\t'+chr(0x03C3) + ' = ' + str(s)) 1233 | print('\tL = ' + str(l)) 1234 | print('Extreme quantile (probability = %s): %s' % (self.proba,self.extreme_quantile)) 1235 | 1236 | return 1237 | 1238 | 1239 | 1240 | 1241 | def _rootsFinder(fun,jac,bounds,npoints,method): 1242 | """ 1243 | Find possible roots of a scalar function 1244 | 1245 | Parameters 1246 | ---------- 1247 | fun : function 1248 | scalar function 1249 | jac : function 1250 | first order derivative of the function 1251 | bounds : tuple 1252 | (min,max) interval for the roots search 1253 | npoints : int 1254 | maximum number of roots to output 1255 | method : str 1256 | 'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval 1257 | 1258 | Returns 1259 | ---------- 1260 | numpy.array 1261 | possible roots of the function 1262 | """ 1263 | if method == 'regular': 1264 | step = (bounds[1]-bounds[0])/(npoints+1) 1265 | X0 = np.arange(bounds[0]+step,bounds[1],step) 1266 | elif method == 'random': 1267 | X0 = np.random.uniform(bounds[0],bounds[1],npoints) 1268 | 1269 | def objFun(X,f,jac): 1270 | g = 0 1271 | j = np.zeros(X.shape) 1272 | i = 0 1273 | for x in X: 1274 | fx = f(x) 1275 | g = g+fx**2 1276 | j[i] = 2*fx*jac(x) 1277 | i = i+1 1278 | return g,j 1279 | 1280 | opt = minimize(lambda X:objFun(X,fun,jac), X0, 1281 | method='L-BFGS-B', 1282 | jac=True, bounds=[bounds]*len(X0)) 1283 | 1284 | X = opt.x 1285 | np.round(X,decimals = 5) 1286 | return np.unique(X) 1287 | 1288 | 1289 | def _log_likelihood(Y,gamma,sigma): 1290 | """ 1291 | Compute the log-likelihood for the Generalized Pareto Distribution (μ=0) 1292 | 1293 | Parameters 1294 | ---------- 1295 | Y : numpy.array 1296 | observations 1297 | gamma : float 1298 | GPD index parameter 1299 | sigma : float 1300 | GPD scale parameter (>0) 1301 | 1302 | Returns 1303 | ---------- 1304 | float 1305 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 1306 | """ 1307 | n = Y.size 1308 | if gamma != 0: 1309 | tau = gamma/sigma 1310 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 1311 | else: 1312 | L = n * ( 1 + log(Y.mean()) ) 1313 | return L 1314 | 1315 | 1316 | def _grimshaw(self,epsilon = 1e-8, n_points = 10): 1317 | """ 1318 | Compute the GPD parameters estimation with the Grimshaw's trick 1319 | 1320 | Parameters 1321 | ---------- 1322 | epsilon : float 1323 | numerical parameter to perform (default : 1e-8) 1324 | n_points : int 1325 | maximum number of candidates for maximum likelihood (default : 10) 1326 | 1327 | Returns 1328 | ---------- 1329 | gamma_best,sigma_best,ll_best 1330 | gamma estimates, sigma estimates and corresponding log-likelihood 1331 | """ 1332 | def u(s): 1333 | return 1 + np.log(s).mean() 1334 | 1335 | def v(s): 1336 | return np.mean(1/s) 1337 | 1338 | def 
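# ---------------------------------------------------------------------------
# A check that the gamma != 0 branch of _log_likelihood above is exactly the
# GPD log-density summed over the sample; scipy agrees to float precision.
# Note that the gamma == 0 branch returns n*(1+log(mean(Y))), which appears
# to flip the sign of the exponential profile log-likelihood
# -n*(1+log(mean(Y))); since _grimshaw ranks candidates by these values,
# that is worth keeping in mind when comparing the two branches.
import numpy as np
from math import log
from scipy.stats import genpareto

rng = np.random.default_rng(3)
gamma, sigma = 0.25, 1.0
Y = genpareto.rvs(c=gamma, scale=sigma, size=1_000, random_state=rng)

tau = gamma / sigma
L = -Y.size * log(sigma) - (1 + 1 / gamma) * np.log(1 + tau * Y).sum()
L_ref = genpareto.logpdf(Y, c=gamma, scale=sigma).sum()
assert abs(L - L_ref) < 1e-8
# ---------------------------------------------------------------------------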
w(Y,t): 1339 | s = 1+t*Y 1340 | us = u(s) 1341 | vs = v(s) 1342 | return us*vs-1 1343 | 1344 | def jac_w(Y,t): 1345 | s = 1+t*Y 1346 | us = u(s) 1347 | vs = v(s) 1348 | jac_us = (1/t)*(1-vs) 1349 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 1350 | return us*jac_vs+vs*jac_us 1351 | 1352 | 1353 | Ym = self.peaks.min() 1354 | YM = self.peaks.max() 1355 | Ymean = self.peaks.mean() 1356 | 1357 | 1358 | a = -1/YM 1359 | if abs(a)<2*epsilon: 1360 | epsilon = abs(a)/n_points 1361 | 1362 | a = a + epsilon 1363 | b = 2*(Ymean-Ym)/(Ymean*Ym) 1364 | c = 2*(Ymean-Ym)/(Ym**2) 1365 | 1366 | # We look for possible roots 1367 | left_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 1368 | lambda t: jac_w(self.peaks,t), 1369 | (a+epsilon,-epsilon), 1370 | n_points,'regular') 1371 | 1372 | right_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 1373 | lambda t: jac_w(self.peaks,t), 1374 | (b,c), 1375 | n_points,'regular') 1376 | 1377 | # all the possible roots 1378 | zeros = np.concatenate((left_zeros,right_zeros)) 1379 | 1380 | # 0 is always a solution so we initialize with it 1381 | gamma_best = 0 1382 | sigma_best = Ymean 1383 | ll_best = SPOT._log_likelihood(self.peaks,gamma_best,sigma_best) 1384 | 1385 | # we look for better candidates 1386 | for z in zeros: 1387 | gamma = u(1+z*self.peaks)-1 1388 | sigma = gamma/z 1389 | ll = dSPOT._log_likelihood(self.peaks,gamma,sigma) 1390 | if ll>ll_best: 1391 | gamma_best = gamma 1392 | sigma_best = sigma 1393 | ll_best = ll 1394 | 1395 | return gamma_best,sigma_best,ll_best 1396 | 1397 | 1398 | 1399 | def _quantile(self,gamma,sigma): 1400 | """ 1401 | Compute the quantile at level 1-q 1402 | 1403 | Parameters 1404 | ---------- 1405 | gamma : float 1406 | GPD parameter 1407 | sigma : float 1408 | GPD parameter 1409 | 1410 | Returns 1411 | ---------- 1412 | float 1413 | quantile at level 1-q for the GPD(γ,σ,μ=0) 1414 | """ 1415 | r = self.n * self.proba / self.Nt 1416 | if gamma != 0: 1417 | return self.init_threshold + (sigma/gamma)*(pow(r,-gamma)-1) 1418 | else: 1419 | return self.init_threshold - sigma*log(r) 1420 | 1421 | 1422 | def run(self, with_alarm = True): 1423 | """ 1424 | Run biSPOT on the stream 1425 | 1426 | Parameters 1427 | ---------- 1428 | with_alarm : bool 1429 | (default = True) If False, SPOT will adapt the threshold assuming \ 1430 | there is no abnormal values 1431 | 1432 | 1433 | Returns 1434 | ---------- 1435 | dict 1436 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 1437 | 1438 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 1439 | the indexes of the values which have triggered alarms 1440 | 1441 | """ 1442 | if (self.n>self.init_data.size): 1443 | print('Warning : the algorithm seems to have already been run, you \ 1444 | should initialize before running again') 1445 | return {} 1446 | 1447 | # actual normal window 1448 | W = self.init_data[-self.depth:] 1449 | 1450 | # list of the thresholds 1451 | th = [] 1452 | alarm = [] 1453 | # Loop over the stream 1454 | for i in tqdm.tqdm(range(self.data.size)): 1455 | Mi = W.mean() 1456 | # If the observed value exceeds the current threshold (alarm case) 1457 | if (self.data[i]-Mi)>self.extreme_quantile: 1458 | # if we want to alarm, we put it in the alarm list 1459 | if with_alarm: 1460 | alarm.append(i) 1461 | # otherwise we add it in the peaks 1462 | else: 1463 | self.peaks = np.append(self.peaks,self.data[i]-Mi-self.init_threshold) 1464 | self.Nt += 1 1465 | self.n += 1 1466 | # and we update the thresholds 1467 | 1468 | g,s,l = self._grimshaw() 1469 | 
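# ---------------------------------------------------------------------------
# A note on the '#+ Mi' comments around here: in DSPOT the GPD fit, the
# quantile and the alarm test all live in the residual space produced by
# subtracting the trailing mean, so the data-space threshold at step i is
# extreme_quantile + Mi and drifts with the signal. The run() loop only
# materializes it when recording th.append(self.extreme_quantile + Mi).
# Tiny illustration with made-up numbers:
z_q = 3.1                    # extreme quantile in residual space
W = [10.0, 10.4, 9.8]        # current window (depth = 3), illustrative
Mi = sum(W) / len(W)         # local mean of the previous points
print(Mi + z_q)              # ~13.17: the value an observation must exceed
# ---------------------------------------------------------------------------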
self.extreme_quantile = self._quantile(g,s) #+ Mi 1470 | W = np.append(W[1:],self.data[i]) 1471 | 1472 | # case where the value exceeds the initial threshold but not the alarm ones 1473 | elif (self.data[i]-Mi)>self.init_threshold: 1474 | # we add it in the peaks 1475 | self.peaks = np.append(self.peaks,self.data[i]-Mi-self.init_threshold) 1476 | self.Nt += 1 1477 | self.n += 1 1478 | # and we update the thresholds 1479 | 1480 | g,s,l = self._grimshaw() 1481 | self.extreme_quantile = self._quantile(g,s) #+ Mi 1482 | W = np.append(W[1:],self.data[i]) 1483 | else: 1484 | self.n += 1 1485 | W = np.append(W[1:],self.data[i]) 1486 | 1487 | 1488 | th.append(self.extreme_quantile+Mi) # thresholds record 1489 | 1490 | return {'thresholds' : th, 'alarms': alarm} 1491 | 1492 | 1493 | def plot(self,run_results, with_alarm = True): 1494 | """ 1495 | Plot the results given by the run 1496 | 1497 | Parameters 1498 | ---------- 1499 | run_results : dict 1500 | results given by the 'run' method 1501 | with_alarm : bool 1502 | (default = True) If True, alarms are plotted. 1503 | 1504 | 1505 | Returns 1506 | ---------- 1507 | list 1508 | list of the plots 1509 | 1510 | """ 1511 | x = range(self.data.size) 1512 | K = run_results.keys() 1513 | 1514 | ts_fig, = plt.plot(x,self.data,color=air_force_blue) 1515 | fig = [ts_fig] 1516 | 1517 | # if 'upper_thresholds' in K: 1518 | # thup = run_results['upper_thresholds'] 1519 | # uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed') 1520 | # fig.append(uth_fig) 1521 | # 1522 | # if 'lower_thresholds' in K: 1523 | # thdown = run_results['lower_thresholds'] 1524 | # lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed') 1525 | # fig.append(lth_fig) 1526 | 1527 | if 'thresholds' in K: 1528 | th = run_results['thresholds'] 1529 | th_fig, = plt.plot(x,th,color=deep_saffron,lw=2,ls='dashed') 1530 | fig.append(th_fig) 1531 | 1532 | if with_alarm and ('alarms' in K): 1533 | alarm = run_results['alarms'] 1534 | if len(alarm)>0: 1535 | plt.scatter(alarm,self.data[alarm],color='red') 1536 | 1537 | plt.xlim((0,self.data.size)) 1538 | 1539 | 1540 | return fig 1541 | 1542 | 1543 | 1544 | 1545 | 1546 | 1547 | 1548 | """ 1549 | =========================== DRIFT & DOUBLE BOUNDS ============================= 1550 | """ 1551 | 1552 | 1553 | 1554 | class bidSPOT: 1555 | """ 1556 | This class allows to run DSPOT algorithm on univariate dataset (upper and lower bounds) 1557 | 1558 | Attributes 1559 | ---------- 1560 | proba : float 1561 | Detection level (risk), chosen by the user 1562 | 1563 | depth : int 1564 | Number of observations to compute the moving average 1565 | 1566 | extreme_quantile : float 1567 | current threshold (bound between normal and abnormal events) 1568 | 1569 | data : numpy.array 1570 | stream 1571 | 1572 | init_data : numpy.array 1573 | initial batch of observations (for the calibration/initialization step) 1574 | 1575 | init_threshold : float 1576 | initial threshold computed during the calibration step 1577 | 1578 | peaks : numpy.array 1579 | array of peaks (excesses above the initial threshold) 1580 | 1581 | n : int 1582 | number of observed values 1583 | 1584 | Nt : int 1585 | number of observed peaks 1586 | """ 1587 | def __init__(self, q = 1e-4, depth = 10): 1588 | self.proba = q 1589 | self.data = None 1590 | self.init_data = None 1591 | self.n = 0 1592 | self.depth = depth 1593 | 1594 | nonedict = {'up':None,'down':None} 1595 | 1596 | self.extreme_quantile = dict.copy(nonedict) 1597 | self.init_threshold = dict.copy(nonedict) 1598 | 
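# ---------------------------------------------------------------------------
# A minimal end-to-end sketch of dSPOT above on a drifting stream. The import
# path is an assumption, and the constructor is assumed to take (q, depth)
# like bidSPOT here (dSPOT's own __init__ was truncated in this dump); the
# data is synthetic.
import numpy as np
from spot import dSPOT         # hypothetical import path

rng = np.random.default_rng(4)
n = 12_000
stream = 0.001 * np.arange(n) + rng.standard_normal(n)  # drift + noise
stream[6_000] += 9.0                                    # one injected spike

d = dSPOT(q=1e-4, depth=10)
d.fit(init_data=2_000, data=stream)
d.initialize()
out = d.run(with_alarm=True)    # recorded thresholds follow the drift upward
print(out['alarms'])            # ~[4000] relative to the streamed part
# ---------------------------------------------------------------------------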
self.peaks = dict.copy(nonedict) 1599 | self.gamma = dict.copy(nonedict) 1600 | self.sigma = dict.copy(nonedict) 1601 | self.Nt = {'up':0,'down':0} 1602 | 1603 | 1604 | def __str__(self): 1605 | s = '' 1606 | s += 'Streaming Peaks-Over-Threshold Object\n' 1607 | s += 'Detection level q = %s\n' % self.proba 1608 | if self.data is not None: 1609 | s += 'Data imported : Yes\n' 1610 | s += '\t initialization : %s values\n' % self.init_data.size 1611 | s += '\t stream : %s values\n' % self.data.size 1612 | else: 1613 | s += 'Data imported : No\n' 1614 | return s 1615 | 1616 | if self.n == 0: 1617 | s += 'Algorithm initialized : No\n' 1618 | else: 1619 | s += 'Algorithm initialized : Yes\n' 1620 | s += '\t initial threshold : %s\n' % self.init_threshold 1621 | 1622 | r = self.n-self.init_data.size 1623 | if r > 0: 1624 | s += 'Algorithm run : Yes\n' 1625 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 1626 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 1627 | else: 1628 | s += '\t number of peaks : %s\n' % self.Nt 1629 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 1630 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 1631 | s += 'Algorithm run : No\n' 1632 | return s 1633 | 1634 | 1635 | def fit(self,init_data,data): 1636 | """ 1637 | Import data to biDSPOT object 1638 | 1639 | Parameters 1640 | ---------- 1641 | init_data : list, numpy.array or pandas.Series 1642 | initial batch to calibrate the algorithm 1643 | 1644 | data : numpy.array 1645 | data for the run (list, np.array or pd.series) 1646 | 1647 | """ 1648 | if isinstance(data,list): 1649 | self.data = np.array(data) 1650 | elif isinstance(data,np.ndarray): 1651 | self.data = data 1652 | elif isinstance(data,pd.Series): 1653 | self.data = data.values 1654 | else: 1655 | print('This data format (%s) is not supported' % type(data)) 1656 | return 1657 | 1658 | if isinstance(init_data,list): 1659 | self.init_data = np.array(init_data) 1660 | elif isinstance(init_data,np.ndarray): 1661 | self.init_data = init_data 1662 | elif isinstance(init_data,pd.Series): 1663 | self.init_data = init_data.values 1664 | elif isinstance(init_data,int): 1665 | self.init_data = self.data[:init_data] 1666 | self.data = self.data[init_data:] 1667 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 1668 | r = int(init_data*data.size) 1669 | self.init_data = self.data[:r] 1670 | self.data = self.data[r:] 1671 | else: 1672 | print('The initial data cannot be set') 1673 | return 1674 | 1675 | def add(self,data): 1676 | """ 1677 | This function allows to append data to the already fitted data 1678 | 1679 | Parameters 1680 | ---------- 1681 | data : list, numpy.array, pandas.Series 1682 | data to append 1683 | """ 1684 | if isinstance(data,list): 1685 | data = np.array(data) 1686 | elif isinstance(data,np.ndarray): 1687 | data = data 1688 | elif isinstance(data,pd.Series): 1689 | data = data.values 1690 | else: 1691 | print('This data format (%s) is not supported' % type(data)) 1692 | return 1693 | 1694 | self.data = np.append(self.data,data) 1695 | return 1696 | 1697 | def initialize(self, verbose = True): 1698 | """ 1699 | Run the calibration (initialization) step 1700 | 1701 | Parameters 1702 | ---------- 1703 | verbose : bool 1704 | (default = True) If True, gives details about the batch initialization 1705 | """ 1706 | n_init = self.init_data.size - self.depth 1707 | 1708 | M = backMean(self.init_data,self.depth) 1709 | T = 
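# ---------------------------------------------------------------------------
# The same isinstance ladder appears in every fit()/add() of this file; a
# shared helper would keep the behaviour in one place. Also note the float
# branch above computes r from `data.size`, which raises AttributeError when
# the stream was passed as a plain list (only self.data was converted); the
# sketch below sizes the converted array instead, and uses `and` so
# unsupported inputs fall through to a clean error. Names are hypothetical,
# not part of this file's API, and it raises instead of printing.
import numpy as np
import pandas as pd

def to_array(x):
    """Normalize list / numpy.ndarray / pandas.Series input to an array."""
    if isinstance(x, list):
        return np.array(x)
    if isinstance(x, np.ndarray):
        return x
    if isinstance(x, pd.Series):
        return x.values
    raise TypeError('This data format (%s) is not supported' % type(x))

def split_init(data, init_data):
    """Return (init_batch, stream) under the same rules as fit()."""
    data = to_array(data)
    if isinstance(init_data, int):
        return data[:init_data], data[init_data:]
    if isinstance(init_data, float) and 0 < init_data < 1:
        r = int(init_data * data.size)    # size of the converted array
        return data[:r], data[r:]
    return to_array(init_data), data
# ---------------------------------------------------------------------------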
self.init_data[self.depth:]-M[:-1] # new variable 1710 | 1711 | S = np.sort(T) # we sort T to get the empirical quantile 1712 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 1713 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 1714 | 1715 | # initial peaks 1716 | self.peaks['up'] = T[T>self.init_threshold['up']]-self.init_threshold['up'] 1717 | self.peaks['down'] = -( T[ T0) 1810 | 1811 | Returns 1812 | ---------- 1813 | float 1814 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 1815 | """ 1816 | n = Y.size 1817 | if gamma != 0: 1818 | tau = gamma/sigma 1819 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 1820 | else: 1821 | L = n * ( 1 + log(Y.mean()) ) 1822 | return L 1823 | 1824 | 1825 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 8): 1826 | """ 1827 | Compute the GPD parameters estimation with the Grimshaw's trick 1828 | 1829 | Parameters 1830 | ---------- 1831 | epsilon : float 1832 | numerical parameter to perform (default : 1e-8) 1833 | n_points : int 1834 | maximum number of candidates for maximum likelihood (default : 10) 1835 | 1836 | Returns 1837 | ---------- 1838 | gamma_best,sigma_best,ll_best 1839 | gamma estimates, sigma estimates and corresponding log-likelihood 1840 | """ 1841 | def u(s): 1842 | return 1 + np.log(s).mean() 1843 | 1844 | def v(s): 1845 | return np.mean(1/s) 1846 | 1847 | def w(Y,t): 1848 | s = 1+t*Y 1849 | us = u(s) 1850 | vs = v(s) 1851 | return us*vs-1 1852 | 1853 | def jac_w(Y,t): 1854 | s = 1+t*Y 1855 | us = u(s) 1856 | vs = v(s) 1857 | jac_us = (1/t)*(1-vs) 1858 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 1859 | return us*jac_vs+vs*jac_us 1860 | 1861 | 1862 | Ym = self.peaks[side].min() 1863 | YM = self.peaks[side].max() 1864 | Ymean = self.peaks[side].mean() 1865 | 1866 | 1867 | a = -1/YM 1868 | if abs(a)<2*epsilon: 1869 | epsilon = abs(a)/n_points 1870 | 1871 | a = a + epsilon 1872 | b = 2*(Ymean-Ym)/(Ymean*Ym) 1873 | c = 2*(Ymean-Ym)/(Ym**2) 1874 | 1875 | # We look for possible roots 1876 | left_zeros = bidSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 1877 | lambda t: jac_w(self.peaks[side],t), 1878 | (a+epsilon,-epsilon), 1879 | n_points,'regular') 1880 | 1881 | right_zeros = bidSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 1882 | lambda t: jac_w(self.peaks[side],t), 1883 | (b,c), 1884 | n_points,'regular') 1885 | 1886 | # all the possible roots 1887 | zeros = np.concatenate((left_zeros,right_zeros)) 1888 | 1889 | # 0 is always a solution so we initialize with it 1890 | gamma_best = 0 1891 | sigma_best = Ymean 1892 | ll_best = bidSPOT._log_likelihood(self.peaks[side],gamma_best,sigma_best) 1893 | 1894 | # we look for better candidates 1895 | for z in zeros: 1896 | gamma = u(1+z*self.peaks[side])-1 1897 | sigma = gamma/z 1898 | ll = bidSPOT._log_likelihood(self.peaks[side],gamma,sigma) 1899 | if ll>ll_best: 1900 | gamma_best = gamma 1901 | sigma_best = sigma 1902 | ll_best = ll 1903 | 1904 | return gamma_best,sigma_best,ll_best 1905 | 1906 | 1907 | 1908 | def _quantile(self,side,gamma,sigma): 1909 | """ 1910 | Compute the quantile at level 1-q for a given side 1911 | 1912 | Parameters 1913 | ---------- 1914 | side : str 1915 | 'up' or 'down' 1916 | gamma : float 1917 | GPD parameter 1918 | sigma : float 1919 | GPD parameter 1920 | 1921 | Returns 1922 | ---------- 1923 | float 1924 | quantile at level 1-q for the GPD(γ,σ,μ=0) 1925 | """ 1926 | if side == 'up': 1927 | r = self.n * self.proba / self.Nt[side] 1928 | if gamma 
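# ---------------------------------------------------------------------------
# The two-sided quantile formulas of _quantile above, evaluated with
# illustrative numbers: the down side mirrors the up side because the GPD was
# fitted to negated lower excesses, so its offset is subtracted from the
# lower threshold instead of added to the upper one. (Side note: _grimshaw
# above defaults to n_points=8 while its docstring still says 10.)
t_up, t_down = 4.0, -4.1       # calibration thresholds (illustrative)
gamma, sigma = 0.15, 1.2       # GPD fit for one side
n, Nt, q = 50_000, 900, 1e-4
r = n * q / Nt

offset = (sigma / gamma) * (pow(r, -gamma) - 1)   # gamma != 0 branch
z_up = t_up + offset           # upper bound pushed further up
z_down = t_down - offset       # lower bound pushed further down
print(z_up, z_down)
# ---------------------------------------------------------------------------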
!= 0: 1929 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 1930 | else: 1931 | return self.init_threshold['up'] - sigma*log(r) 1932 | elif side == 'down': 1933 | r = self.n * self.proba / self.Nt[side] 1934 | if gamma != 0: 1935 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 1936 | else: 1937 | return self.init_threshold['down'] + sigma*log(r) 1938 | else: 1939 | print('error : the side is not right') 1940 | 1941 | 1942 | def run(self, with_alarm = True, plot = True): 1943 | """ 1944 | Run biDSPOT on the stream 1945 | 1946 | Parameters 1947 | ---------- 1948 | with_alarm : bool 1949 | (default = True) If False, SPOT will adapt the threshold assuming \ 1950 | there is no abnormal values 1951 | 1952 | 1953 | Returns 1954 | ---------- 1955 | dict 1956 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 1957 | 1958 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 1959 | the indexes of the values which have triggered alarms 1960 | 1961 | """ 1962 | if (self.n>self.init_data.size): 1963 | print('Warning : the algorithm seems to have already been run, you \ 1964 | should initialize before running again') 1965 | return {} 1966 | 1967 | # actual normal window 1968 | W = self.init_data[-self.depth:] 1969 | 1970 | # list of the thresholds 1971 | thup = [] 1972 | thdown = [] 1973 | alarm = [] 1974 | # Loop over the stream 1975 | for i in tqdm.tqdm(range(self.data.size)): 1976 | Mi = W.mean() 1977 | Ni = self.data[i]-Mi 1978 | # If the observed value exceeds the current threshold (alarm case) 1979 | if Ni>self.extreme_quantile['up'] : 1980 | # if we want to alarm, we put it in the alarm list 1981 | if with_alarm: 1982 | alarm.append(i) 1983 | # otherwise we add it in the peaks 1984 | else: 1985 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 1986 | self.Nt['up'] += 1 1987 | self.n += 1 1988 | # and we update the thresholds 1989 | 1990 | g,s,l = self._grimshaw('up') 1991 | self.extreme_quantile['up'] = self._quantile('up',g,s) 1992 | W = np.append(W[1:],self.data[i]) 1993 | 1994 | # case where the value exceeds the initial threshold but not the alarm ones 1995 | elif Ni>self.init_threshold['up']: 1996 | # we add it in the peaks 1997 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 1998 | self.Nt['up'] += 1 1999 | self.n += 1 2000 | # and we update the thresholds 2001 | g,s,l = self._grimshaw('up') 2002 | self.extreme_quantile['up'] = self._quantile('up',g,s) 2003 | W = np.append(W[1:],self.data[i]) 2004 | 2005 | elif Ni0: 2079 | al_fig = plt.scatter(alarm,self.data[alarm],color='red') 2080 | fig.append(al_fig) 2081 | 2082 | plt.xlim((0,self.data.size)) 2083 | 2084 | return fig --------------------------------------------------------------------------------
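# ---------------------------------------------------------------------------
# A minimal end-to-end sketch of bidSPOT (drift handling plus double bounds).
# The import path is an assumption; the data is synthetic. Passing
# with_alarm=False would instead fold every exceedance back into the peaks,
# so both bounds adapt without flagging anything.
import numpy as np
from spot import bidSPOT       # hypothetical import path

rng = np.random.default_rng(5)
n = 12_000
stream = np.sin(np.arange(n) / 300.0) + 0.1 * rng.standard_normal(n)
stream[8_000] -= 2.0           # one dip that should cross the lower bound

b = bidSPOT(q=1e-4, depth=10)
b.fit(init_data=2_000, data=stream)
b.initialize()
res = b.run(with_alarm=True)   # 'upper_thresholds', 'lower_thresholds', 'alarms'
figs = b.plot(res)             # series, both dashed bounds, red alarm dots
print(res['alarms'])           # ~[6000] relative to the streamed part
# ---------------------------------------------------------------------------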