├── AfterImage.py ├── AfterImage_extrapolate.pyx ├── FeatureExtractor.py ├── KitNET ├── KitNET.py ├── LICENSE.txt ├── __init__.py ├── corClust.py ├── dA.py └── utils.py ├── Kitsune paper.pdf ├── Kitsune.py ├── Kitsune_fig.png ├── LICENSE ├── README.md ├── example.py ├── mirai.zip ├── netStat.py └── setup.py /AfterImage.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | 5 | class incStat: 6 | def __init__(self, Lambda, ID, init_time=0, isTypeDiff=False): # timestamp is creation time 7 | self.ID = ID 8 | self.CF1 = 0 # linear sum 9 | self.CF2 = 0 # sum of squares 10 | self.w = 1e-20 # weight 11 | self.isTypeDiff = isTypeDiff 12 | self.Lambda = Lambda # Decay Factor 13 | self.lastTimestamp = init_time 14 | self.cur_mean = np.nan 15 | self.cur_var = np.nan 16 | self.cur_std = np.nan 17 | self.covs = [] # a list of incStat_covs (references) with relate to this incStat 18 | 19 | def insert(self, v, t=0): # v is a scalar, t is v's arrival the timestamp 20 | if self.isTypeDiff: 21 | dif = t - self.lastTimestamp 22 | if dif > 0: 23 | v = dif 24 | else: 25 | v = 0 26 | self.processDecay(t) 27 | 28 | # update with v 29 | self.CF1 += v 30 | self.CF2 += math.pow(v, 2) 31 | self.w += 1 32 | self.cur_mean = np.nan # force recalculation if called 33 | self.cur_var = np.nan 34 | self.cur_std = np.nan 35 | 36 | # update covs (if any) 37 | for cov in self.covs: 38 | cov.update_cov(self.ID, v, t) 39 | 40 | def processDecay(self, timestamp): 41 | factor=1 42 | # check for decay 43 | timeDiff = timestamp - self.lastTimestamp 44 | if timeDiff > 0: 45 | factor = math.pow(2, (-self.Lambda * timeDiff)) 46 | self.CF1 = self.CF1 * factor 47 | self.CF2 = self.CF2 * factor 48 | self.w = self.w * factor 49 | self.lastTimestamp = timestamp 50 | return factor 51 | 52 | def weight(self): 53 | return self.w 54 | 55 | def mean(self): 56 | if math.isnan(self.cur_mean): # calculate it only once when necessary 57 | self.cur_mean = self.CF1 / self.w 58 | return self.cur_mean 59 | 60 | def var(self): 61 | if math.isnan(self.cur_var): # calculate it only once when necessary 62 | self.cur_var = abs(self.CF2 / self.w - math.pow(self.mean(), 2)) 63 | return self.cur_var 64 | 65 | def std(self): 66 | if math.isnan(self.cur_std): # calculate it only once when necessary 67 | self.cur_std = math.sqrt(self.var()) 68 | return self.cur_std 69 | 70 | def cov(self,ID2): 71 | for cov in self.covs: 72 | if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2: 73 | return cov.cov() 74 | return [np.nan] 75 | 76 | def pcc(self,ID2): 77 | for cov in self.covs: 78 | if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2: 79 | return cov.pcc() 80 | return [np.nan] 81 | 82 | def cov_pcc(self,ID2): 83 | for cov in self.covs: 84 | if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2: 85 | return cov.get_stats1() 86 | return [np.nan]*2 87 | 88 | def radius(self, other_incStats): # the radius of a set of incStats 89 | A = self.var()**2 90 | for incS in other_incStats: 91 | A += incS.var()**2 92 | return math.sqrt(A) 93 | 94 | def magnitude(self, other_incStats): # the magnitude of a set of incStats 95 | A = math.pow(self.mean(), 2) 96 | for incS in other_incStats: 97 | A += math.pow(incS.mean(), 2) 98 | return math.sqrt(A) 99 | 100 | #calculates and pulls all stats on this stream 101 | def allstats_1D(self): 102 | self.cur_mean = self.CF1 / self.w 103 | self.cur_var = abs(self.CF2 / self.w - math.pow(self.cur_mean, 2)) 104 | return [self.w, self.cur_mean, self.cur_var] 105 | 106 | #calculates and pulls all stats on this stream, and stats shared with the indicated stream 107 | def allstats_2D(self, ID2): 108 | stats1D = self.allstats_1D() 109 | # Find cov component 110 | stats2D = [np.nan] * 4 111 | for cov in self.covs: 112 | if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2: 113 | stats2D = cov.get_stats2() 114 | break 115 | return stats1D + stats2D 116 | 117 | def getHeaders_1D(self, suffix=True): 118 | if self.ID is None: 119 | s0="" 120 | else: 121 | s0 = "_0" 122 | if suffix: 123 | s0 = "_"+self.ID 124 | headers = ["weight"+s0, "mean"+s0, "std"+s0] 125 | return headers 126 | 127 | def getHeaders_2D(self, ID2, suffix=True): 128 | hdrs1D = self.getHeaders_1D(suffix) 129 | if self.ID is None: 130 | s0="" 131 | s1="" 132 | else: 133 | s0 = "_0" 134 | s1 = "_1" 135 | if suffix: 136 | s0 = "_"+self.ID 137 | s1 = "_" + ID2 138 | hdrs2D = ["radius_" + s0 + "_" + s1, "magnitude_" + s0 + "_" + s1, "covariance_" + s0 + "_" + s1, 139 | "pcc_" + s0 + "_" + s1] 140 | return hdrs1D+hdrs2D 141 | 142 | 143 | #like incStat, but maintains stats between two streams 144 | class incStat_cov: 145 | def __init__(self, incS1, incS2, init_time = 0): 146 | # store references tot he streams' incStats 147 | self.incStats = [incS1,incS2] 148 | self.lastRes = [0,0] 149 | # init extrapolators 150 | #self.EXs = [extrapolator(),extrapolator()] 151 | 152 | # init sum product residuals 153 | self.CF3 = 0 # sum of residule products (A-uA)(B-uB) 154 | self.w3 = 1e-20 155 | self.lastTimestamp_cf3 = init_time 156 | 157 | #other_incS_decay is the decay factor of the other incstat 158 | # ID: the stream ID which produced (v,t) 159 | def update_cov(self, ID, v, t): # it is assumes that incStat "ID" has ALREADY been updated with (t,v) [this si performed automatically in method incStat.insert()] 160 | # find incStat 161 | if ID == self.incStats[0].ID: 162 | inc = 0 163 | elif ID == self.incStats[1].ID: 164 | inc = 1 165 | else: 166 | print("update_cov ID error") 167 | return ## error 168 | 169 | # Decay other incStat 170 | self.incStats[not(inc)].processDecay(t) 171 | 172 | # Decay residules 173 | self.processDecay(t,inc) 174 | 175 | # Update extrapolator for current stream 176 | #self.EXs[inc].insert(t,v) 177 | 178 | # Extrapolate other stream 179 | #v_other = self.EXs[not(inc)].predict(t) 180 | 181 | # Compute and update residule 182 | res = (v - self.incStats[inc].mean()) 183 | resid = (v - self.incStats[inc].mean()) * self.lastRes[not(inc)] 184 | self.CF3 += resid 185 | self.w3 += 1 186 | self.lastRes[inc] = res 187 | 188 | def processDecay(self,t,micro_inc_indx): 189 | factor = 1 190 | # check for decay cf3 191 | timeDiffs_cf3 = t - self.lastTimestamp_cf3 192 | if timeDiffs_cf3 > 0: 193 | factor = math.pow(2, (-(self.incStats[micro_inc_indx].Lambda) * timeDiffs_cf3)) 194 | self.CF3 *= factor 195 | self.w3 *= factor 196 | self.lastTimestamp_cf3 = t 197 | self.lastRes[micro_inc_indx] *= factor 198 | return factor 199 | 200 | #todo: add W3 for cf3 201 | 202 | #covariance approximation 203 | def cov(self): 204 | return self.CF3 / self.w3 205 | 206 | # Pearson corl. coef 207 | def pcc(self): 208 | ss = self.incStats[0].std() * self.incStats[1].std() 209 | if ss != 0: 210 | return self.cov() / ss 211 | else: 212 | return 0 213 | 214 | # calculates and pulls all correlative stats 215 | def get_stats1(self): 216 | return [self.cov(), self.pcc()] 217 | 218 | # calculates and pulls all correlative stats AND 2D stats from both streams (incStat) 219 | def get_stats2(self): 220 | return [self.incStats[0].radius([self.incStats[1]]),self.incStats[0].magnitude([self.incStats[1]]),self.cov(), self.pcc()] 221 | 222 | # calculates and pulls all correlative stats AND 2D stats AND the regular stats from both streams (incStat) 223 | def get_stats3(self): 224 | return [self.incStats[0].w,self.incStats[0].mean(),self.incStats[0].std(),self.incStats[1].w,self.incStats[1].mean(),self.incStats[1].std(),self.cov(), self.pcc()] 225 | 226 | # calculates and pulls all correlative stats AND the regular stats from both incStats AND 2D stats 227 | def get_stats4(self): 228 | return [self.incStats[0].w,self.incStats[0].mean(),self.incStats[0].std(),self.incStats[1].w,self.incStats[1].mean(),self.incStats[1].std(), self.incStats[0].radius([self.incStats[1]]),self.incStats[0].magnitude([self.incStats[1]]),self.cov(), self.pcc()] 229 | 230 | def getHeaders(self,ver,suffix=True): #ver = {1,2,3,4} 231 | headers = [] 232 | s0 = "0" 233 | s1 = "1" 234 | if suffix: 235 | s0 = self.incStats[0].ID 236 | s1 = self.incStats[1].ID 237 | 238 | if ver == 1: 239 | headers = ["covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1] 240 | if ver == 2: 241 | headers = ["radius_"+s0+"_"+s1, "magnitude_"+s0+"_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1] 242 | if ver == 3: 243 | headers = ["weight_"+s0, "mean_"+s0, "std_"+s0,"weight_"+s1, "mean_"+s1, "std_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1] 244 | if ver == 4: 245 | headers = ["weight_" + s0, "mean_" + s0, "std_" + s0, "covariance_" + s0 + "_" + s1, "pcc_" + s0 + "_" + s1] 246 | if ver == 5: 247 | headers = ["weight_"+s0, "mean_"+s0, "std_"+s0,"weight_"+s1, "mean_"+s1, "std_"+s1, "radius_"+s0+"_"+s1, "magnitude_"+s0+"_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1] 248 | return headers 249 | 250 | 251 | class incStatDB: 252 | # default_lambda: use this as the lambda for all streams. If not specified, then you must supply a Lambda with every query. 253 | def __init__(self,limit=np.Inf,default_lambda=np.nan): 254 | self.HT = dict() 255 | self.limit = limit 256 | self.df_lambda = default_lambda 257 | 258 | def get_lambda(self,Lambda): 259 | if not np.isnan(self.df_lambda): 260 | Lambda = self.df_lambda 261 | return Lambda 262 | 263 | # Registers a new stream. init_time: init lastTimestamp of the incStat 264 | def register(self,ID,Lambda=1,init_time=0,isTypeDiff=False): 265 | #Default Lambda? 266 | Lambda = self.get_lambda(Lambda) 267 | 268 | #Retrieve incStat 269 | key = ID+"_"+str(Lambda) 270 | incS = self.HT.get(key) 271 | if incS is None: #does not already exist 272 | if len(self.HT) + 1 > self.limit: 273 | raise LookupError( 274 | 'Adding Entry:\n' + key + '\nwould exceed incStatHT 1D limit of ' + str( 275 | self.limit) + '.\nObservation Rejected.') 276 | incS = incStat(Lambda, ID, init_time, isTypeDiff) 277 | self.HT[key] = incS #add new entry 278 | return incS 279 | 280 | # Registers covariance tracking for two streams, registers missing streams 281 | def register_cov(self,ID1,ID2,Lambda=1,init_time=0,isTypeDiff=False): 282 | #Default Lambda? 283 | Lambda = self.get_lambda(Lambda) 284 | 285 | # Lookup both streams 286 | incS1 = self.register(ID1,Lambda,init_time,isTypeDiff) 287 | incS2 = self.register(ID2,Lambda,init_time,isTypeDiff) 288 | 289 | #check for pre-exiting link 290 | for cov in incS1.covs: 291 | if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2: 292 | return cov #there is a pre-exiting link 293 | 294 | # Link incStats 295 | inc_cov = incStat_cov(incS1,incS2,init_time) 296 | incS1.covs.append(inc_cov) 297 | incS2.covs.append(inc_cov) 298 | return inc_cov 299 | 300 | # updates/registers stream 301 | def update(self,ID,t,v,Lambda=1,isTypeDiff=False): 302 | incS = self.register(ID,Lambda,t,isTypeDiff) 303 | incS.insert(v,t) 304 | return incS 305 | 306 | # Pulls current stats from the given ID 307 | def get_1D_Stats(self,ID,Lambda=1): #weight, mean, std 308 | #Default Lambda? 309 | Lambda = self.get_lambda(Lambda) 310 | 311 | #Get incStat 312 | incS = self.HT.get(ID+"_"+str(Lambda)) 313 | if incS is None: # does not already exist 314 | return [np.na]*3 315 | else: 316 | return incS.allstats_1D() 317 | 318 | # Pulls current correlational stats from the given IDs 319 | def get_2D_Stats(self, ID1, ID2, Lambda=1): #cov, pcc 320 | # Default Lambda? 321 | Lambda = self.get_lambda(Lambda) 322 | 323 | # Get incStat 324 | incS1 = self.HT.get(ID1 + "_" + str(Lambda)) 325 | if incS1 is None: # does not exist 326 | return [np.na]*2 327 | 328 | # find relevant cov entry 329 | return incS1.cov_pcc(ID2) 330 | 331 | # Pulls all correlational stats registered with the given ID 332 | # returns tuple [0]: stats-covs&pccs, [2]: IDs 333 | def get_all_2D_Stats(self, ID, Lambda=1): # cov, pcc 334 | # Default Lambda? 335 | Lambda = self.get_lambda(Lambda) 336 | 337 | # Get incStat 338 | incS1 = self.HT.get(ID + "_" + str(Lambda)) 339 | if incS1 is None: # does not exist 340 | return ([],[]) 341 | 342 | # find relevant cov entry 343 | stats = [] 344 | IDs = [] 345 | for cov in incS1.covs: 346 | stats.append(cov.get_stats1()) 347 | IDs.append([cov.incStats[0].ID,cov.incStats[1].ID]) 348 | return stats,IDs 349 | 350 | # Pulls current multidimensional stats from the given IDs 351 | def get_nD_Stats(self,IDs,Lambda=1): #radius, magnitude (IDs is a list) 352 | # Default Lambda? 353 | Lambda = self.get_lambda(Lambda) 354 | 355 | # Get incStats 356 | incStats = [] 357 | for ID in IDs: 358 | incS = self.HT.get(ID + "_" + str(Lambda)) 359 | if incS is not None: #exists 360 | incStats.append(incS) 361 | 362 | # Compute stats 363 | rad = 0 #radius 364 | mag = 0 #magnitude 365 | for incS in incStats: 366 | rad += incS.var() 367 | mag += incS.mean()**2 368 | 369 | return [np.sqrt(rad),np.sqrt(mag)] 370 | 371 | # Updates and then pulls current 1D stats from the given ID. Automatically registers previously unknown stream IDs 372 | def update_get_1D_Stats(self, ID,t,v,Lambda=1,isTypeDiff=False): # weight, mean, std 373 | incS = self.update(ID,t,v,Lambda,isTypeDiff) 374 | return incS.allstats_1D() 375 | 376 | 377 | # Updates and then pulls current correlative stats between the given IDs. Automatically registers previously unknown stream IDs, and cov tracking 378 | #Note: AfterImage does not currently support Diff Type streams for correlational statistics. 379 | def update_get_2D_Stats(self, ID1,ID2,t1,v1,Lambda=1,level=1): #level= 1:cov,pcc 2:radius,magnitude,cov,pcc 380 | #retrieve/add cov tracker 381 | inc_cov = self.register_cov(ID1, ID2, Lambda, t1) 382 | # Update cov tracker 383 | inc_cov.update_cov(ID1,v1,t1) 384 | if level == 1: 385 | return inc_cov.get_stats1() 386 | else: 387 | return inc_cov.get_stats2() 388 | 389 | # Updates and then pulls current 1D and 2D stats from the given IDs. Automatically registers previously unknown stream IDs 390 | def update_get_1D2D_Stats(self, ID1,ID2,t1,v1,Lambda=1): # weight, mean, std 391 | return self.update_get_1D_Stats(ID1,t1,v1,Lambda) + self.update_get_2D_Stats(ID1,ID2,t1,v1,Lambda,level=2) 392 | 393 | def getHeaders_1D(self,Lambda=1,ID=None): 394 | # Default Lambda? 395 | Lambda = self.get_lambda(Lambda) 396 | hdrs = incStat(Lambda,ID).getHeaders_1D(suffix=False) 397 | return [str(Lambda)+"_"+s for s in hdrs] 398 | 399 | def getHeaders_2D(self,Lambda=1,IDs=None, ver=1): #IDs is a 2-element list or tuple 400 | # Default Lambda? 401 | Lambda = self.get_lambda(Lambda) 402 | if IDs is None: 403 | IDs = [0,1] 404 | hdrs = incStat_cov(incStat(Lambda,IDs[0]),incStat(Lambda,IDs[0]),Lambda).getHeaders(ver,suffix=False) 405 | return [str(Lambda)+"_"+s for s in hdrs] 406 | 407 | def getHeaders_1D2D(self,Lambda=1,IDs=None, ver=1): 408 | # Default Lambda? 409 | Lambda = self.get_lambda(Lambda) 410 | if IDs is None: 411 | IDs = [0,1] 412 | hdrs1D = self.getHeaders_1D(Lambda,IDs[0]) 413 | hdrs2D = self.getHeaders_2D(Lambda,IDs, ver) 414 | return hdrs1D + hdrs2D 415 | 416 | def getHeaders_nD(self,Lambda=1,IDs=[]): #IDs is a n-element list or tuple 417 | # Default Lambda? 418 | ID = ":" 419 | for s in IDs: 420 | ID += "_"+s 421 | Lambda = self.get_lambda(Lambda) 422 | hdrs = ["radius"+ID, "magnitude"+ID] 423 | return [str(Lambda)+"_"+s for s in hdrs] 424 | 425 | 426 | #cleans out records that have a weight less than the cutoff. 427 | #returns number or removed records. 428 | def cleanOutOldRecords(self,cutoffWeight,curTime): 429 | n = 0 430 | dump = sorted(self.HT.items(), key=lambda tup: tup[1][0].getMaxW(curTime)) 431 | for entry in dump: 432 | entry[1][0].processDecay(curTime) 433 | W = entry[1][0].w 434 | if W <= cutoffWeight: 435 | key = entry[0] 436 | del entry[1][0] 437 | del self.HT[key] 438 | n=n+1 439 | elif W > cutoffWeight: 440 | break 441 | return n 442 | 443 | -------------------------------------------------------------------------------- /AfterImage_extrapolate.pyx: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | # MIT License 5 | # 6 | # Copyright (c) 2018 Yisroel mirsky 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | 26 | 27 | #compile with: python setup.py build_ext --inplace 28 | 29 | import pyximport; pyximport.install() 30 | 31 | cdef class incStat: 32 | cdef str ID 33 | cdef double CF1 34 | cdef double CF2 35 | cdef double w 36 | cdef int isTypeDiff 37 | cdef double Lambda 38 | cdef double lastTimestamp 39 | cdef double cur_mean 40 | cdef double cur_var 41 | cdef double cur_std 42 | cdef list covs 43 | 44 | def __init__(self, double Lambda, str ID, double init_time=0, int isTypeDiff=False): # timestamp is creation time 45 | self.ID = ID 46 | self.CF1 = 0 # linear sum 47 | self.CF2 = 0 # sum of squares 48 | self.w = 1e-20 # weight 49 | self.isTypeDiff = isTypeDiff 50 | self.Lambda = Lambda # Decay Factor 51 | self.lastTimestamp = init_time 52 | self.cur_mean = np.nan 53 | self.cur_var = np.nan 54 | self.cur_std = np.nan 55 | self.covs = [] # a list of incStat_covs (references) with relate to this incStat 56 | 57 | cdef insert(self, double v, double t=0): # v is a scalar, t is v's arrival the timestamp 58 | if self.isTypeDiff: 59 | if t - self.lastTimestamp > 0: 60 | v = t - self.lastTimestamp 61 | else: 62 | v = 0 63 | self.processDecay(t) 64 | 65 | # update with v 66 | self.CF1 += v 67 | self.CF2 += math.pow(v, 2) 68 | self.w += 1 69 | self.cur_mean = np.nan # force recalculation if called 70 | self.cur_var = np.nan 71 | self.cur_std = np.nan 72 | 73 | # update covs (if any) 74 | cdef incStat_cov cov 75 | for c in self.covs: 76 | cov = c 77 | cov.update_cov(self.ID, v, t) 78 | 79 | cdef processDecay(self, double timestamp): 80 | cdef double factor, timeDiff 81 | factor = 1 82 | # check for decay 83 | timeDiff = timestamp - self.lastTimestamp 84 | if timeDiff > 0: 85 | factor = math.pow(2, (-self.Lambda * timeDiff)) 86 | self.CF1 = self.CF1 * factor 87 | self.CF2 = self.CF2 * factor 88 | self.w = self.w * factor 89 | self.lastTimestamp = timestamp 90 | return factor 91 | 92 | cdef weight(self): 93 | return self.w 94 | 95 | cdef mean(self): 96 | if math.isnan(self.cur_mean): # calculate it only once when necessary 97 | self.cur_mean = self.CF1 / self.w 98 | return self.cur_mean 99 | 100 | cdef var(self): 101 | if math.isnan(self.cur_var): # calculate it only once when necessary 102 | self.cur_var = abs(self.CF2 / self.w - math.pow(self.mean(), 2)) 103 | return self.cur_var 104 | 105 | cdef std(self): 106 | if math.isnan(self.cur_std): # calculate it only once when necessary 107 | self.cur_std = math.sqrt(self.var()) 108 | return self.cur_std 109 | 110 | cdef cov(self,ID2): 111 | for cov in self.covs: 112 | if cov.isRelated(ID2): 113 | return cov.cov() 114 | return [np.nan] 115 | 116 | cdef pcc(self,ID2): 117 | for cov in self.covs: 118 | if cov.isRelated(ID2): 119 | return cov.pcc() 120 | return [np.nan] 121 | 122 | cdef cov_pcc(self,ID2): 123 | cdef incStat_cov cov 124 | for c in self.covs: 125 | cov = c 126 | if cov.isRelated(ID2): 127 | return cov.get_stats1() 128 | return [np.nan]*2 129 | 130 | cdef radius(self, other_incStats): # the radius of a set of incStats 131 | cdef double A 132 | A = self.var() 133 | cdef incStat incSc 134 | for incS in other_incStats: 135 | incSc = incS 136 | A += incSc.var() 137 | return math.sqrt(A) 138 | 139 | cdef magnitude(self, other_incStats): # the magnitude of a set of incStats 140 | cdef double A 141 | A = math.pow(self.mean(), 2) 142 | cdef incStat incSc 143 | for incS in other_incStats: 144 | incSc = incS 145 | A += math.pow(incSc.mean(), 2) 146 | return math.sqrt(A) 147 | 148 | #calculates and pulls all stats on this stream 149 | cdef allstats_1D(self): 150 | self.cur_mean = self.CF1 / self.w 151 | self.cur_var = abs(self.CF2 / self.w - math.pow(self.cur_mean, 2)) 152 | return [self.w, self.cur_mean, self.cur_var] 153 | 154 | #calculates and pulls all stats on this stream, and stats shared with the indicated stream 155 | cdef allstats_2D(self, str ID2): 156 | stats1D = self.allstats_1D() 157 | # Find cov component 158 | stats2D = [np.nan] * 4 159 | cdef incStat_cov cov 160 | for c in self.covs: 161 | cov = c 162 | if cov.isRelated(ID2): 163 | stats2D = cov.get_stats2() 164 | break 165 | return stats1D + stats2D 166 | 167 | cdef getHeaders_1D(self, suffix=True): 168 | if self.ID is None: 169 | s0="" 170 | else: 171 | s0 = "_0" 172 | if suffix: 173 | s0 = "_"+self.ID 174 | headers = ["weight"+s0, "mean"+s0, "std"+s0] 175 | return headers 176 | 177 | cdef getHeaders_2D(self, ID2, suffix=True): 178 | hdrs1D = self.getHeaders_1D(suffix) 179 | if self.ID is None: 180 | s0="" 181 | s1="" 182 | else: 183 | s0 = "_0" 184 | s1 = "_1" 185 | if suffix: 186 | s0 = "_"+self.ID 187 | s1 = "_" + ID2 188 | hdrs2D = ["radius_" + s0 + "_" + s1, "magnitude_" + s0 + "_" + s1, "covariance_" + s0 + "_" + s1, 189 | "pcc_" + s0 + "_" + s1] 190 | return hdrs1D+hdrs2D 191 | 192 | # def toJSON(self): 193 | # j = {} 194 | # j['CF1'] = self.CF1 195 | # j['CF2'] = self.CF2 196 | # j['w'] = self.w 197 | # j['isTypeDiff'] = self.isTypeDiff 198 | # j['Lambda'] = self.Lambda 199 | # j['lastTimestamp'] = self.lastTimestamp 200 | # return json.dumps(j) 201 | # 202 | # def loadFromJSON(self,JSONstring): 203 | # j = json.loads(JSONstring) 204 | # self.CF1 = j['CF1'] 205 | # self.CF2 = j['CF2'] 206 | # self.w = j['w'] 207 | # self.isTypeDiff = j['isTypeDiff'] 208 | # self.Lambda = j['Lambda'] 209 | # self.lastTimestamp = j['lastTimestamp'] 210 | 211 | #like incStat, but maintains stats between two streams 212 | #TODO: make it possble to call incstat magnitude and raduis withour list of incstsats (just single incstat objects) for cov.getstats2 typcast call 213 | cdef class incStat_cov: 214 | cdef double CF3 215 | cdef double w3 216 | cdef double lastTimestamp_cf3 217 | cdef incStat incS1 218 | cdef incStat incS2 219 | cdef extrapolator ex1 220 | cdef extrapolator ex2 221 | 222 | 223 | def __init__(self, incStat incS1,incStat incS2, double init_time = 0): 224 | # store references tot he streams' incStats 225 | self.incS1 = incS1 226 | self.incS2 = incS2 227 | 228 | # init extrapolators 229 | self.ex1 = extrapolator() 230 | self.ex2 = extrapolator() 231 | 232 | # init sum product residuals 233 | self.CF3 = 0 # sum of residule products (A-uA)(B-uB) 234 | self.w3 = 1e-20 235 | self.lastTimestamp_cf3 = init_time 236 | 237 | #other_incS_decay is the decay factor of the other incstat 238 | # ID: the stream ID which produced (v,t) 239 | cdef update_cov(self, str ID, double v, double t): # it is assumes that incStat "ID" has ALREADY been updated with (t,v) [this si performed automatically in method incStat.insert()] 240 | # find incStat 241 | cdef int inc 242 | if ID == self.incS1.ID: 243 | inc = 0 244 | else: 245 | inc = 1 246 | 247 | # Decay residules 248 | self.processDecay(t) 249 | 250 | # Update extrapolator for current stream AND 251 | # Extrapolate other stream AND 252 | # Compute and update residule 253 | cdef double v_other 254 | if inc == 0: 255 | self.ex1.insert(t,v) 256 | v_other = self.ex2.predict(t) 257 | self.CF3 += (v - self.incS1.mean()) * (v_other - self.incS2.mean()) 258 | else: 259 | self.ex2.insert(t,v) 260 | v_other = self.ex1.predict(t) 261 | self.CF3 += (v - self.incS2.mean()) * (v_other - self.incS1.mean()) 262 | self.w3 += 1 263 | 264 | cdef processDecay(self,double t): 265 | cdef double factor 266 | factor = 1 267 | # check for decay cf3 268 | cdef double timeDiffs_cf3 269 | timeDiffs_cf3 = t - self.lastTimestamp_cf3 270 | if timeDiffs_cf3 > 0: 271 | factor = math.pow(2, (-(self.incS1.Lambda) * timeDiffs_cf3)) 272 | self.CF3 *= factor 273 | self.w3 *= factor 274 | self.lastTimestamp_cf3 = t 275 | return factor 276 | 277 | #todo: add W3 for cf3 278 | 279 | #covariance approximation 280 | cdef cov(self): 281 | return self.CF3 / self.w3 282 | 283 | # Pearson corl. coef 284 | cdef pcc(self): 285 | cdef double ss 286 | ss = self.incS1.std() * self.incS2.std() 287 | if ss != 0: 288 | return self.cov() / ss 289 | else: 290 | return 0 291 | 292 | def isRelated(self, str ID): 293 | if self.incS1.ID == ID or self.incS2.ID == ID: 294 | return True 295 | else: 296 | return False 297 | 298 | # calculates and pulls all correlative stats 299 | cdef get_stats1(self): 300 | return [self.cov(), self.pcc()] 301 | 302 | # calculates and pulls all correlative stats AND 2D stats from both streams (incStat) 303 | cdef get_stats2(self): 304 | return [self.incS1.radius([self.incS2]),self.incS1.magnitude([self.incS2]),self.cov(), self.pcc()] 305 | 306 | # calculates and pulls all correlative stats AND 2D stats AND the regular stats from both streams (incStat) 307 | cdef get_stats3(self): 308 | return [self.incS1.w,self.incS1.mean(),self.incS1.std(),self.incS2.w,self.incS2.mean(),self.incS2.std(),self.cov(), self.pcc()] 309 | 310 | # calculates and pulls all correlative stats AND the regular stats from both incStats AND 2D stats 311 | cdef get_stats4(self): 312 | return [self.incS1.w,self.incS1.mean(),self.incS1.std(),self.incS2.w,self.incS2.mean(),self.incS2.std(), self.incS1.radius([self.incS2]),self.incS1.magnitude([self.incS2]),self.cov(), self.pcc()] 313 | 314 | cdef getHeaders(self,int ver,int suffix=True): #ver = {1,2,3,4} 315 | headers = [] 316 | s0 = "0" 317 | s1 = "1" 318 | if suffix: 319 | s0 = self.incS1.ID 320 | s1 = self.incS2.ID 321 | 322 | if ver == 1: 323 | headers = ["covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1] 324 | if ver == 2: 325 | headers = ["radius_"+s0+"_"+s1, "magnitude_"+s0+"_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1] 326 | if ver == 3: 327 | headers = ["weight_"+s0, "mean_"+s0, "std_"+s0,"weight_"+s1, "mean_"+s1, "std_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1] 328 | if ver == 4: 329 | headers = ["weight_" + s0, "mean_" + s0, "std_" + s0, "covariance_" + s0 + "_" + s1, "pcc_" + s0 + "_" + s1] 330 | if ver == 5: 331 | headers = ["weight_"+s0, "mean_"+s0, "std_"+s0,"weight_"+s1, "mean_"+s1, "std_"+s1, "radius_"+s0+"_"+s1, "magnitude_"+s0+"_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1] 332 | return headers 333 | 334 | 335 | cdef class incStatDB: 336 | cdef double limit 337 | cdef double df_lambda 338 | cdef dict HT 339 | 340 | # default_lambda: use this as the lambda for all streams. If not specified, then you must supply a Lambda with every query. 341 | def __init__(self,double limit=np.Inf,double default_lambda=np.nan): 342 | self.HT = dict() 343 | self.limit = limit 344 | self.df_lambda = default_lambda 345 | 346 | cdef get_lambda(self,double Lambda): 347 | if not np.isnan(self.df_lambda): 348 | Lambda = self.df_lambda 349 | return Lambda 350 | 351 | # Registers a new stream. init_time: init lastTimestamp of the incStat 352 | def register(self,str ID,double Lambda=1,double init_time=0,int isTypeDiff=False): 353 | #Default Lambda? 354 | Lambda = self.get_lambda(Lambda) 355 | 356 | #Retrieve incStat 357 | cdef str key 358 | key = ID+"_"+str(Lambda) 359 | 360 | cdef incStat incS 361 | incS = self.HT.get(key) 362 | if incS is None: #does not already exist 363 | if len(self.HT) + 1 > self.limit: 364 | raise LookupError( 365 | 'Adding Entry:\n' + key + '\nwould exceed incStatHT 1D limit of ' + str( 366 | self.limit) + '.\nObservation Rejected.') 367 | incS = incStat(Lambda, ID, init_time, isTypeDiff) 368 | self.HT[key] = incS #add new entry 369 | return incS 370 | 371 | # Registers covariance tracking for two streams, registers missing streams 372 | def register_cov(self,str ID1, str ID2, double Lambda=1, double init_time=0, int isTypeDiff=False): 373 | #Default Lambda? 374 | Lambda = self.get_lambda(Lambda) 375 | 376 | # Lookup both streams 377 | cdef incStat incS1 378 | cdef incStat incS2 379 | incS1 = self.register(ID1,Lambda,init_time,isTypeDiff) 380 | incS2 = self.register(ID2,Lambda,init_time,isTypeDiff) 381 | 382 | #check for pre-exiting link 383 | for cov in incS1.covs: 384 | if cov.isRelated(ID2): 385 | return cov #there is a pre-exiting link 386 | 387 | # Link incStats 388 | inc_cov = incStat_cov(incS1,incS2,init_time) 389 | incS1.covs.append(inc_cov) 390 | incS2.covs.append(inc_cov) 391 | return inc_cov 392 | 393 | # updates/registers stream 394 | def update(self,str ID,double t,double v,double Lambda=1,int isTypeDiff=False): 395 | cdef incStat incS 396 | incS = self.register(ID,Lambda,t,isTypeDiff) 397 | incS.insert(v,t) 398 | return incS 399 | 400 | # Pulls current stats from the given ID 401 | def get_1D_Stats(self,str ID,double Lambda=1): #weight, mean, std 402 | #Default Lambda? 403 | Lambda = self.get_lambda(Lambda) 404 | 405 | #Get incStat 406 | cdef incStat incS 407 | incS = self.HT.get(ID+"_"+str(Lambda)) 408 | if incS is None: # does not already exist 409 | return [np.na]*3 410 | else: 411 | return incS.allstats_1D() 412 | 413 | # Pulls current correlational stats from the given IDs 414 | def get_2D_Stats(self, str ID1, str ID2, double Lambda=1): #cov, pcc 415 | # Default Lambda? 416 | Lambda = self.get_lambda(Lambda) 417 | 418 | # Get incStat 419 | cdef incStat incS 420 | incS = self.HT.get(ID1 + "_" + str(Lambda)) 421 | if incS is None: # does not exist 422 | return [np.na]*2 423 | 424 | # find relevant cov entry 425 | return incS.cov_pcc(ID2) 426 | 427 | # Pulls all correlational stats registered with the given ID 428 | # returns tuple [0]: stats-covs&pccs, [2]: IDs 429 | def get_all_2D_Stats(self, str ID, double Lambda=1): # cov, pcc 430 | # Default Lambda? 431 | Lambda = self.get_lambda(Lambda) 432 | 433 | # Get incStat 434 | cdef incStat incS1 435 | incS1 = self.HT.get(ID + "_" + str(Lambda)) 436 | if incS1 is None: # does not exist 437 | return ([],[]) 438 | 439 | # find relevant cov entry 440 | stats = [] 441 | IDs = [] 442 | for cov in incS1.covs: 443 | stats.append(cov.get_stats1()) 444 | IDs.append([cov.incS1.ID,cov.incS2.ID]) 445 | return stats,IDs 446 | 447 | # Pulls current multidimensional stats from the given IDs 448 | def get_nD_Stats(self,IDs,double Lambda=1): #radius, magnitude (IDs is a list) 449 | # Default Lambda? 450 | Lambda = self.get_lambda(Lambda) 451 | 452 | # Get incStats 453 | incStats = [] 454 | for ID in IDs: 455 | incS = self.HT.get(ID + "_" + str(Lambda)) 456 | if incS is not None: #exists 457 | incStats.append(incS) 458 | 459 | # Compute stats 460 | cdef double rad, mag 461 | rad = 0 #radius 462 | mag = 0 #magnitude 463 | for incS in incStats: 464 | rad += incS.var() 465 | mag += incS.mean()**2 466 | 467 | return [np.sqrt(rad),np.sqrt(mag)] 468 | 469 | # Updates and then pulls current 1D stats from the given ID. Automatically registers previously unknown stream IDs 470 | def update_get_1D_Stats(self, str ID,double t, double v, double Lambda=1, int isTypeDiff=False): # weight, mean, std 471 | cdef incStat incS 472 | incS = self.update(ID,t,v,Lambda,isTypeDiff) 473 | return incS.allstats_1D() 474 | 475 | 476 | # Updates and then pulls current correlative stats between the given IDs. Automatically registers previously unknown stream IDs, and cov tracking 477 | #Note: AfterImage does not currently support Diff Type streams for correlational statistics. 478 | def update_get_2D_Stats(self, str ID1, str ID2,double t1,double v1,double Lambda=1, int level=1): #level= 1:cov,pcc 2:radius,magnitude,cov,pcc 479 | #retrieve/add cov tracker 480 | cdef incStat_cov inc_cov 481 | inc_cov = self.register_cov(ID1, ID2, Lambda, t1) 482 | # Update cov tracker 483 | inc_cov.update_cov(ID1,v1,t1) 484 | if level == 1: 485 | return inc_cov.get_stats1() 486 | else: 487 | return inc_cov.get_stats2() 488 | 489 | # Updates and then pulls current 1D and 2D stats from the given IDs. Automatically registers previously unknown stream IDs 490 | def update_get_1D2D_Stats(self, str ID1, str ID2, double t1,double v1,double Lambda=1): # weight, mean, std 491 | return self.update_get_1D_Stats(ID1,t1,v1,Lambda) + self.update_get_2D_Stats(ID1,ID2,t1,v1,Lambda,level=2) 492 | 493 | def getHeaders_1D(self,Lambda=1,ID=''): 494 | # Default Lambda? 495 | cdef double L 496 | L = Lambda 497 | L = self.get_lambda(L) 498 | hdrs = incStat(L,ID).getHeaders_1D(suffix=False) 499 | return [str(L)+"_"+s for s in hdrs] 500 | 501 | def getHeaders_2D(self,Lambda=1,IDs=None, ver=1): #IDs is a 2-element list or tuple 502 | # Default Lambda? 503 | cdef double L 504 | L = Lambda 505 | L = self.get_lambda(L) 506 | if IDs is None: 507 | IDs = ['0','1'] 508 | hdrs = incStat_cov(incStat(L,IDs[0]),incStat(L,IDs[0]),L).getHeaders(ver,suffix=False) 509 | return [str(Lambda)+"_"+s for s in hdrs] 510 | 511 | def getHeaders_1D2D(self,Lambda=1,IDs=None, ver=1): 512 | # Default Lambda? 513 | cdef double L 514 | L = Lambda 515 | L = self.get_lambda(L) 516 | if IDs is None: 517 | IDs = ['0','1'] 518 | hdrs1D = self.getHeaders_1D(L,IDs[0]) 519 | hdrs2D = self.getHeaders_2D(L,IDs, ver) 520 | return hdrs1D + hdrs2D 521 | 522 | def getHeaders_nD(self,Lambda=1,IDs=[]): #IDs is a n-element list or tuple 523 | # Default Lambda? 524 | cdef double L 525 | L = Lambda 526 | ID = ":" 527 | for s in IDs: 528 | ID += "_"+s 529 | L = self.get_lambda(L) 530 | hdrs = ["radius"+ID, "magnitude"+ID] 531 | return [str(L)+"_"+s for s in hdrs] 532 | 533 | 534 | #cleans out records that have a weight less than the cutoff. 535 | #returns number or removed records. 536 | def cleanOutOldRecords(self,double cutoffWeight,double curTime): 537 | cdef int n 538 | cdef double W 539 | n = 0 540 | dump = sorted(self.HT.items(), key=lambda tup: tup[1][0].getMaxW(curTime)) 541 | for entry in dump: 542 | entry[1][0].processDecay(curTime) 543 | W = entry[1][0].w 544 | if W <= cutoffWeight: 545 | key = entry[0] 546 | del entry[1][0] 547 | del self.HT[key] 548 | n=n+1 549 | elif W > cutoffWeight: 550 | break 551 | return n 552 | 553 | 554 | 555 | 556 | 557 | class incHist: 558 | #ubIsAnom means that the HBOS score for vals that fall past the upped bound are Inf (not 0) 559 | def __init__(self,nbins,Lambda=0,ubIsAnom=True,lbIsAnom=True,lbound=-10,ubound=10,scaleGrace=None): 560 | self.scaleGrace = scaleGrace #the numbe rof instances to observe until a range it determeined 561 | if scaleGrace is not None: 562 | self.lbound = np.Inf 563 | self.ubound = -np.Inf 564 | self.binSize = None 565 | self.isScaling = True 566 | else: 567 | self.lbound = lbound 568 | self.ubound = ubound 569 | self.binSize = (ubound - lbound)/nbins 570 | self.isScaling = False 571 | self.nbins = nbins 572 | self.ubIsAnom = ubIsAnom 573 | self.lbIsAnom = lbIsAnom 574 | self.n = 0 575 | 576 | self.Lambda = Lambda 577 | self.W = np.zeros(nbins) 578 | self.lT = np.zeros(nbins) #last timestamp of each respective bin 579 | self.tallestBin = 0 #indx to the bin that currently has the largest freq weight (assumed...) 580 | 581 | #assumes even bin width starting from lbound until ubound. beyond bounds are assigned to the closest bin 582 | def getBinIndx(self,val,win=0): 583 | indx = int(np.floor((val - self.lbound)/self.binSize)) 584 | if win == 0: 585 | if indx < 0: 586 | return -np.Inf 587 | if indx > (self.nbins - 1): 588 | return np.Inf 589 | return indx 590 | else: #windowed Histogram 591 | if indx - win < 0: #does the left of the window stick out of bounds? 592 | if indx + win >= 0: #if yes, then is there some overlap with inbounds? 593 | return range(0,indx+win+1) #return the inbounds range 594 | else: #then the entire window is our of bounds to the left 595 | return -np.Inf 596 | if indx + win > self.nbins - 1: #does the right of the window stick out of bounds? 597 | if indx - win < self.nbins: #if yes, then is there some overlap with inbounds? 598 | return range(indx - win,self.nbins) #return the inbounds range 599 | else: #then the entire window is our of bounds to the right 600 | return np.Inf 601 | return range(indx-win,indx+win+1) 602 | 603 | 604 | def processDecay(self, bin, timestamp): 605 | # check for decay 606 | timeDiff = timestamp - self.lT[bin] 607 | if np.isscalar(timeDiff): 608 | if timeDiff > 0: 609 | factor = math.pow(2, (-self.Lambda * timeDiff)) 610 | self.W[bin] = self.W[bin] * factor 611 | self.lT[bin] = timestamp 612 | else: #array 613 | timeDiff[timeDiff<0]=0 #don't affect decay of out of order entries 614 | factor = np.power(2, (-self.Lambda * timeDiff)) 615 | #b4 = self.W[bin] 616 | self.W[bin] = self.W[bin] * factor 617 | self.lT[bin] = timestamp 618 | 619 | def insert(self,val,timestamp,penalty=False): 620 | self.n = self.n + 1 621 | if self.isScaling: 622 | if self.n < self.scaleGrace: 623 | if self.lbound > val: 624 | self.lbound = val 625 | if self.ubound < val: 626 | self.ubound = val 627 | if self.n == self.scaleGrace: 628 | if self.ubound == self.lbound: 629 | self.scaleGrace = self.scaleGrace + 1000 630 | else: 631 | width = self.ubound - self.lbound 632 | self.ubound = self.ubound + width 633 | self.lbound = self.lbound - width 634 | self.binSize = (self.ubound - self.lbound) / self.nbins 635 | self.isScaling = False 636 | else: 637 | bin = self.getBinIndx(val) 638 | if not np.isinf(bin): # 639 | self.processDecay(bin, timestamp) 640 | if penalty: 641 | tallestW = self.W[self.tallestBin] 642 | scale = tallestW if tallestW > 0 else 1 643 | fn = self.W[bin]/scale 644 | inc = self.halfsigmoid(fn+0.005,-1.03) 645 | else: 646 | inc = 1 647 | self.W[bin] = self.W[bin] + inc 648 | #track who has the tallest bin (for normilization) 649 | if self.W[bin] > self.W[self.tallestBin]: 650 | self.tallestBin = bin 651 | 652 | def halfsigmoid(self,x,k): 653 | return (k*x)/(k-x+1) 654 | 655 | def score(self,val,timestamp=-1,win=0): #HBOS for one dimension 656 | if self.isScaling: 657 | return 0.0 658 | else: 659 | bin = self.getBinIndx(val,win=win) 660 | if np.isscalar(bin): 661 | if np.isinf(bin): 662 | if self.ubIsAnom and bin > 0: 663 | return np.Inf #it's an anomaly because it passes the upper bound 664 | elif self.lbIsAnom and bin < 0: 665 | return np.Inf # it's an anomaly because it passes the lower bound 666 | else: 667 | return 0.0 #it fell outside a bound which is consedered not anomalous 668 | self.processDecay(bin,timestamp) #if timestamp = -1, no decay will be applied 669 | w = np.mean(self.W[bin]) 670 | if w == 0: 671 | return np.Inf # no stat history, anomaly! 672 | else: 673 | return np.log(self.W[self.tallestBin] / (w)) # log( 1/( p/p_max ) ) 674 | 675 | 676 | def getFreq(self,val,timestamp=-1): #HBOS for one dimension 677 | bin = self.getBinIndx(val) 678 | self.processDecay(bin,timestamp) #if timestamp = -1, no decay will be applied 679 | if np.isinf(bin): 680 | return np.nan 681 | else: 682 | return self.W[bin] 683 | 684 | def getHist(self,timestamp=-1): #HBOS for one dimension 685 | H = np.zeros((len(self.W),1)) 686 | for i in range(0,len(self.W)): 687 | self.processDecay(i,timestamp) #if timestamp = -1, no decay will be applied 688 | H[i] = self.W[i] 689 | H = H/np.sum(self.W) 690 | return H 691 | # 692 | # def loadFromJSON(self,jsonstring): 693 | # return '' # !!!! very important: all timestamps in self.lT should be updated so the decay won't wipe out the histogram: 694 | # # self.lT = self.lT + curtime - max(self.lT) 695 | # # this also applies to when the system.train setting is toggled to 'on' 696 | 697 | from cpython cimport array 698 | 699 | #import cython 700 | 701 | cdef class Queue: 702 | 703 | cdef double[3] q 704 | cdef int indx 705 | cdef unsigned int n 706 | 707 | def __init__(self): 708 | self.q[0] = self.q[1] = self.q[2] = 0 709 | self.indx = 0 710 | self.n = 0 711 | 712 | cdef insert(self,double v): 713 | self.q[self.indx] = v 714 | self.indx = (self.indx + 1) % 3 715 | self.n += 1 716 | 717 | cdef unroll(self): 718 | 719 | cdef double[2] res 720 | if self.n == 2: 721 | res[0] = self.q[0] 722 | res[1] = self.q[1] 723 | return res 724 | if self.indx == 0: 725 | return self.q 726 | 727 | cdef double[3] res3 728 | if self.indx == 1: 729 | res3[0] = self.q[1] 730 | res3[1] = self.q[2] 731 | res3[2] = self.q[0] 732 | return res3 733 | else: 734 | res3[0] = self.q[2] 735 | res3[1] = self.q[0] 736 | res3[2] = self.q[1] 737 | return res3 738 | 739 | cdef get_last(self): 740 | return self.q[(self.indx-1)%3] 741 | 742 | cdef get_mean_diff(self): 743 | cdef double dif 744 | dif = 0 745 | if self.n == 2: 746 | dif=self.q[self.indx%3] - self.q[(self.indx-1)%3] 747 | return dif 748 | else: 749 | # for i in range(2): 750 | # dif+=self.q[(self.indx+i+1)%3] - self.q[(self.indx+i)%3] 751 | dif= (self.q[self.indx%3] - self.q[(self.indx-1)%3]) + (self.q[(self.indx-1)%3] - self.q[(self.indx-2)%3]) 752 | return dif/2 753 | 754 | cdef class extrapolator: 755 | 756 | cdef Queue Qt 757 | cdef Queue Qv 758 | 759 | def __init__(self):#,int winsize=3): 760 | self.Qt = Queue() #deque([],winsize) #window of timestamps 761 | self.Qv = Queue() #deque([],winsize) #window of values 762 | 763 | def insert(self,double t, double v): 764 | self.Qt.insert(t) 765 | self.Qv.insert(v) 766 | 767 | def predict(self, double t): 768 | if self.Qt.n < 2: #not enough points to extrapolate? 769 | if self.Qt.n == 1: 770 | return self.Qv.get_last() 771 | else: 772 | return 0 773 | if (t - self.Qt.get_last())/(self.Qt.get_mean_diff() + 1e-10) > 10: # is the next timestamp 10 time further than the average sample interval? 774 | return self.Qv.get_last() # prediction too far ahead (very likely that we will be way off) 775 | cdef double yp 776 | cdef array.array tm = array.array('d', self.Qt.unroll()) 777 | cdef array.array vm = array.array('d', self.Qv.unroll()) 778 | yp = self.interpolate(t,tm,vm) 779 | return yp 780 | #TODO: try cythonize lagrange 781 | 782 | 783 | cdef interpolate(self, double tp, array.array tm, array.array ym): 784 | cdef int n 785 | n = len(tm) - 1 786 | #cdef double[:] lagrpoly = np.array([self.lagrange(tp, i, tm) for i in range(n + 1)]) 787 | 788 | cdef double y 789 | for i in range(n +1): 790 | """ 791 | Evaluate the i-th Lagrange polynomial at x 792 | based on grid data xm 793 | """ 794 | y = 1 795 | for j in range(n + 1): 796 | if i != j: 797 | y *= (tp - tm[j]) / (tm[i] - tm[j] + 1e-20) 798 | ym[i]*=y 799 | 800 | return sum(ym) 801 | 802 | -------------------------------------------------------------------------------- /FeatureExtractor.py: -------------------------------------------------------------------------------- 1 | #Check if cython code has been compiled 2 | import os 3 | import subprocess 4 | 5 | use_extrapolation=False #experimental correlation code 6 | if use_extrapolation: 7 | print("Importing AfterImage Cython Library") 8 | if not os.path.isfile("AfterImage.c"): #has not yet been compiled, so try to do so... 9 | cmd = "python setup.py build_ext --inplace" 10 | subprocess.call(cmd,shell=True) 11 | #Import dependencies 12 | import netStat as ns 13 | import csv 14 | import numpy as np 15 | print("Importing Scapy Library") 16 | from scapy.all import * 17 | import os.path 18 | import platform 19 | import subprocess 20 | 21 | 22 | #Extracts Kitsune features from given pcap file one packet at a time using "get_next_vector()" 23 | # If wireshark is installed (tshark) it is used to parse (it's faster), otherwise, scapy is used (much slower). 24 | # If wireshark is used then a tsv file (parsed version of the pcap) will be made -which you can use as your input next time 25 | class FE: 26 | def __init__(self,file_path,limit=np.inf): 27 | self.path = file_path 28 | self.limit = limit 29 | self.parse_type = None #unknown 30 | self.curPacketIndx = 0 31 | self.tsvin = None #used for parsing TSV file 32 | self.scapyin = None #used for parsing pcap with scapy 33 | 34 | ### Prep pcap ## 35 | self.__prep__() 36 | 37 | ### Prep Feature extractor (AfterImage) ### 38 | maxHost = 100000000000 39 | maxSess = 100000000000 40 | self.nstat = ns.netStat(np.nan, maxHost, maxSess) 41 | 42 | def _get_tshark_path(self): 43 | if platform.system() == 'Windows': 44 | return 'C:\Program Files\Wireshark\\tshark.exe' 45 | else: 46 | system_path = os.environ['PATH'] 47 | for path in system_path.split(os.pathsep): 48 | filename = os.path.join(path, 'tshark') 49 | if os.path.isfile(filename): 50 | return filename 51 | return '' 52 | 53 | def __prep__(self): 54 | ### Find file: ### 55 | if not os.path.isfile(self.path): # file does not exist 56 | print("File: " + self.path + " does not exist") 57 | raise Exception() 58 | 59 | ### check file type ### 60 | type = self.path.split('.')[-1] 61 | 62 | self._tshark = self._get_tshark_path() 63 | ##If file is TSV (pre-parsed by wireshark script) 64 | if type == "tsv": 65 | self.parse_type = "tsv" 66 | 67 | ##If file is pcap 68 | elif type == "pcap" or type == 'pcapng': 69 | # Try parsing via tshark dll of wireshark (faster) 70 | if os.path.isfile(self._tshark): 71 | self.pcap2tsv_with_tshark() # creates local tsv file 72 | self.path += ".tsv" 73 | self.parse_type = "tsv" 74 | else: # Otherwise, parse with scapy (slower) 75 | print("tshark not found. Trying scapy...") 76 | self.parse_type = "scapy" 77 | else: 78 | print("File: " + self.path + " is not a tsv or pcap file") 79 | raise Exception() 80 | 81 | ### open readers ## 82 | if self.parse_type == "tsv": 83 | maxInt = sys.maxsize 84 | decrement = True 85 | while decrement: 86 | # decrease the maxInt value by factor 10 87 | # as long as the OverflowError occurs. 88 | decrement = False 89 | try: 90 | csv.field_size_limit(maxInt) 91 | except OverflowError: 92 | maxInt = int(maxInt / 10) 93 | decrement = True 94 | 95 | print("counting lines in file...") 96 | num_lines = sum(1 for line in open(self.path)) 97 | print("There are " + str(num_lines) + " Packets.") 98 | self.limit = min(self.limit, num_lines-1) 99 | self.tsvinf = open(self.path, 'rt', encoding="utf8") 100 | self.tsvin = csv.reader(self.tsvinf, delimiter='\t') 101 | row = self.tsvin.__next__() #move iterator past header 102 | 103 | else: # scapy 104 | print("Reading PCAP file via Scapy...") 105 | self.scapyin = rdpcap(self.path) 106 | self.limit = len(self.scapyin) 107 | print("Loaded " + str(len(self.scapyin)) + " Packets.") 108 | 109 | def get_next_vector(self): 110 | if self.curPacketIndx == self.limit: 111 | if self.parse_type == 'tsv': 112 | self.tsvinf.close() 113 | return [] 114 | 115 | ### Parse next packet ### 116 | if self.parse_type == "tsv": 117 | row = self.tsvin.__next__() 118 | IPtype = np.nan 119 | timestamp = row[0] 120 | framelen = row[1] 121 | srcIP = '' 122 | dstIP = '' 123 | if row[4] != '': # IPv4 124 | srcIP = row[4] 125 | dstIP = row[5] 126 | IPtype = 0 127 | elif row[17] != '': # ipv6 128 | srcIP = row[17] 129 | dstIP = row[18] 130 | IPtype = 1 131 | srcproto = row[6] + row[ 132 | 8] # UDP or TCP port: the concatenation of the two port strings will will results in an OR "[tcp|udp]" 133 | dstproto = row[7] + row[9] # UDP or TCP port 134 | srcMAC = row[2] 135 | dstMAC = row[3] 136 | if srcproto == '': # it's a L2/L1 level protocol 137 | if row[12] != '': # is ARP 138 | srcproto = 'arp' 139 | dstproto = 'arp' 140 | srcIP = row[14] # src IP (ARP) 141 | dstIP = row[16] # dst IP (ARP) 142 | IPtype = 0 143 | elif row[10] != '': # is ICMP 144 | srcproto = 'icmp' 145 | dstproto = 'icmp' 146 | IPtype = 0 147 | elif srcIP + srcproto + dstIP + dstproto == '': # some other protocol 148 | srcIP = row[2] # src MAC 149 | dstIP = row[3] # dst MAC 150 | 151 | elif self.parse_type == "scapy": 152 | packet = self.scapyin[self.curPacketIndx] 153 | IPtype = np.nan 154 | timestamp = packet.time 155 | framelen = len(packet) 156 | if packet.haslayer(IP): # IPv4 157 | srcIP = packet[IP].src 158 | dstIP = packet[IP].dst 159 | IPtype = 0 160 | elif packet.haslayer(IPv6): # ipv6 161 | srcIP = packet[IPv6].src 162 | dstIP = packet[IPv6].dst 163 | IPtype = 1 164 | else: 165 | srcIP = '' 166 | dstIP = '' 167 | 168 | if packet.haslayer(TCP): 169 | srcproto = str(packet[TCP].sport) 170 | dstproto = str(packet[TCP].dport) 171 | elif packet.haslayer(UDP): 172 | srcproto = str(packet[UDP].sport) 173 | dstproto = str(packet[UDP].dport) 174 | else: 175 | srcproto = '' 176 | dstproto = '' 177 | 178 | srcMAC = packet.src 179 | dstMAC = packet.dst 180 | if srcproto == '': # it's a L2/L1 level protocol 181 | if packet.haslayer(ARP): # is ARP 182 | srcproto = 'arp' 183 | dstproto = 'arp' 184 | srcIP = packet[ARP].psrc # src IP (ARP) 185 | dstIP = packet[ARP].pdst # dst IP (ARP) 186 | IPtype = 0 187 | elif packet.haslayer(ICMP): # is ICMP 188 | srcproto = 'icmp' 189 | dstproto = 'icmp' 190 | IPtype = 0 191 | elif srcIP + srcproto + dstIP + dstproto == '': # some other protocol 192 | srcIP = packet.src # src MAC 193 | dstIP = packet.dst # dst MAC 194 | else: 195 | return [] 196 | 197 | self.curPacketIndx = self.curPacketIndx + 1 198 | 199 | 200 | ### Extract Features 201 | try: 202 | return self.nstat.updateGetStats(IPtype, srcMAC, dstMAC, srcIP, srcproto, dstIP, dstproto, 203 | int(framelen), 204 | float(timestamp)) 205 | except Exception as e: 206 | print(e) 207 | return [] 208 | 209 | 210 | def pcap2tsv_with_tshark(self): 211 | print('Parsing with tshark...') 212 | fields = "-e frame.time_epoch -e frame.len -e eth.src -e eth.dst -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e icmp.type -e icmp.code -e arp.opcode -e arp.src.hw_mac -e arp.src.proto_ipv4 -e arp.dst.hw_mac -e arp.dst.proto_ipv4 -e ipv6.src -e ipv6.dst" 213 | cmd = '"' + self._tshark + '" -r '+ self.path +' -T fields '+ fields +' -E header=y -E occurrence=f > '+self.path+".tsv" 214 | subprocess.call(cmd,shell=True) 215 | print("tshark parsing complete. File saved as: "+self.path +".tsv") 216 | 217 | def get_num_features(self): 218 | return len(self.nstat.getNetStatHeaders()) 219 | -------------------------------------------------------------------------------- /KitNET/KitNET.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import KitNET.dA as AE 3 | import KitNET.corClust as CC 4 | 5 | # This class represents a KitNET machine learner. 6 | # KitNET is a lightweight online anomaly detection algorithm based on an ensemble of autoencoders. 7 | # For more information and citation, please see our NDSS'18 paper: Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection 8 | # For licensing information, see the end of this document 9 | 10 | class KitNET: 11 | #n: the number of features in your input dataset (i.e., x \in R^n) 12 | #m: the maximum size of any autoencoder in the ensemble layer 13 | #AD_grace_period: the number of instances the network will learn from before producing anomaly scores 14 | #FM_grace_period: the number of instances which will be taken to learn the feature mapping. If 'None', then FM_grace_period=AM_grace_period 15 | #learning_rate: the default stochastic gradient descent learning rate for all autoencoders in the KitNET instance. 16 | #hidden_ratio: the default ratio of hidden to visible neurons. E.g., 0.75 will cause roughly a 25% compression in the hidden layer. 17 | #feature_map: One may optionally provide a feature map instead of learning one. The map must be a list, 18 | # where the i-th entry contains a list of the feature indices to be assingned to the i-th autoencoder in the ensemble. 19 | # For example, [[2,5,3],[4,0,1],[6,7]] 20 | def __init__(self,n,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75, feature_map = None): 21 | # Parameters: 22 | self.AD_grace_period = AD_grace_period 23 | if FM_grace_period is None: 24 | self.FM_grace_period = AD_grace_period 25 | else: 26 | self.FM_grace_period = FM_grace_period 27 | if max_autoencoder_size <= 0: 28 | self.m = 1 29 | else: 30 | self.m = max_autoencoder_size 31 | self.lr = learning_rate 32 | self.hr = hidden_ratio 33 | self.n = n 34 | 35 | # Variables 36 | self.n_trained = 0 # the number of training instances so far 37 | self.n_executed = 0 # the number of executed instances so far 38 | self.v = feature_map 39 | if self.v is None: 40 | print("Feature-Mapper: train-mode, Anomaly-Detector: off-mode") 41 | else: 42 | self.__createAD__() 43 | print("Feature-Mapper: execute-mode, Anomaly-Detector: train-mode") 44 | self.FM = CC.corClust(self.n) #incremental feature cluatering for the feature mapping process 45 | self.ensembleLayer = [] 46 | self.outputLayer = None 47 | 48 | #If FM_grace_period+AM_grace_period has passed, then this function executes KitNET on x. Otherwise, this function learns from x. 49 | #x: a numpy array of length n 50 | #Note: KitNET automatically performs 0-1 normalization on all attributes. 51 | def process(self,x): 52 | if self.n_trained > self.FM_grace_period + self.AD_grace_period: #If both the FM and AD are in execute-mode 53 | return self.execute(x) 54 | else: 55 | self.train(x) 56 | return 0.0 57 | 58 | #force train KitNET on x 59 | #returns the anomaly score of x during training (do not use for alerting) 60 | def train(self,x): 61 | if self.n_trained <= self.FM_grace_period and self.v is None: #If the FM is in train-mode, and the user has not supplied a feature mapping 62 | #update the incremetnal correlation matrix 63 | self.FM.update(x) 64 | if self.n_trained == self.FM_grace_period: #If the feature mapping should be instantiated 65 | self.v = self.FM.cluster(self.m) 66 | self.__createAD__() 67 | print("The Feature-Mapper found a mapping: "+str(self.n)+" features to "+str(len(self.v))+" autoencoders.") 68 | print("Feature-Mapper: execute-mode, Anomaly-Detector: train-mode") 69 | else: #train 70 | ## Ensemble Layer 71 | S_l1 = np.zeros(len(self.ensembleLayer)) 72 | for a in range(len(self.ensembleLayer)): 73 | # make sub instance for autoencoder 'a' 74 | xi = x[self.v[a]] 75 | S_l1[a] = self.ensembleLayer[a].train(xi) 76 | ## OutputLayer 77 | self.outputLayer.train(S_l1) 78 | if self.n_trained == self.AD_grace_period+self.FM_grace_period: 79 | print("Feature-Mapper: execute-mode, Anomaly-Detector: execute-mode") 80 | self.n_trained += 1 81 | 82 | #force execute KitNET on x 83 | def execute(self,x): 84 | if self.v is None: 85 | raise RuntimeError('KitNET Cannot execute x, because a feature mapping has not yet been learned or provided. Try running process(x) instead.') 86 | else: 87 | self.n_executed += 1 88 | ## Ensemble Layer 89 | S_l1 = np.zeros(len(self.ensembleLayer)) 90 | for a in range(len(self.ensembleLayer)): 91 | # make sub inst 92 | xi = x[self.v[a]] 93 | S_l1[a] = self.ensembleLayer[a].execute(xi) 94 | ## OutputLayer 95 | return self.outputLayer.execute(S_l1) 96 | 97 | def __createAD__(self): 98 | # construct ensemble layer 99 | for map in self.v: 100 | params = AE.dA_params(n_visible=len(map), n_hidden=0, lr=self.lr, corruption_level=0, gracePeriod=0, hiddenRatio=self.hr) 101 | self.ensembleLayer.append(AE.dA(params)) 102 | 103 | # construct output layer 104 | params = AE.dA_params(len(self.v), n_hidden=0, lr=self.lr, corruption_level=0, gracePeriod=0, hiddenRatio=self.hr) 105 | self.outputLayer = AE.dA(params) 106 | 107 | # Copyright (c) 2017 Yisroel Mirsky 108 | # 109 | # MIT License 110 | # 111 | # Permission is hereby granted, free of charge, to any person obtaining 112 | # a copy of this software and associated documentation files (the 113 | # "Software"), to deal in the Software without restriction, including 114 | # without limitation the rights to use, copy, modify, merge, publish, 115 | # distribute, sublicense, and/or sell copies of the Software, and to 116 | # permit persons to whom the Software is furnished to do so, subject to 117 | # the following conditions: 118 | # 119 | # The above copyright notice and this permission notice shall be 120 | # included in all copies or substantial portions of the Software. 121 | # 122 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 123 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 124 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 125 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 126 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 127 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 128 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /KitNET/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Yisroel Mirsky 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /KitNET/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["corClust", "dA", "KitNET","utils"] -------------------------------------------------------------------------------- /KitNET/corClust.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.cluster.hierarchy import linkage, fcluster, to_tree 3 | 4 | # A helper class for KitNET which performs a correlation-based incremental clustering of the dimensions in X 5 | # n: the number of dimensions in the dataset 6 | # For more information and citation, please see our NDSS'18 paper: Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection 7 | class corClust: 8 | def __init__(self,n): 9 | #parameter: 10 | self.n = n 11 | #varaibles 12 | self.c = np.zeros(n) #linear num of features 13 | self.c_r = np.zeros(n) #linear sum of feature residules 14 | self.c_rs = np.zeros(n) #linear sum of feature residules 15 | self.C = np.zeros((n,n)) #partial correlation matrix 16 | self.N = 0 #number of updates performed 17 | 18 | # x: a numpy vector of length n 19 | def update(self,x): 20 | self.N += 1 21 | self.c += x 22 | c_rt = x - self.c/self.N 23 | self.c_r += c_rt 24 | self.c_rs += c_rt**2 25 | self.C += np.outer(c_rt,c_rt) 26 | 27 | # creates the current correlation distance matrix between the features 28 | def corrDist(self): 29 | c_rs_sqrt = np.sqrt(self.c_rs) 30 | C_rs_sqrt = np.outer(c_rs_sqrt,c_rs_sqrt) 31 | C_rs_sqrt[C_rs_sqrt==0] = 1e-100 #this protects against dive by zero erros (occurs when a feature is a constant) 32 | D = 1-self.C/C_rs_sqrt #the correlation distance matrix 33 | D[D<0] = 0 #small negatives may appear due to the incremental fashion in which we update the mean. Therefore, we 'fix' them 34 | return D 35 | 36 | # clusters the features together, having no more than maxClust features per cluster 37 | def cluster(self,maxClust): 38 | D = self.corrDist() 39 | Z = linkage(D[np.triu_indices(self.n, 1)]) # create a linkage matrix based on the distance matrix 40 | if maxClust < 1: 41 | maxClust = 1 42 | if maxClust > self.n: 43 | maxClust = self.n 44 | map = self.__breakClust__(to_tree(Z),maxClust) 45 | return map 46 | 47 | # a recursive helper function which breaks down the dendrogram branches until all clusters have no more than maxClust elements 48 | def __breakClust__(self,dendro,maxClust): 49 | if dendro.count <= maxClust: #base case: we found a minimal cluster, so mark it 50 | return [dendro.pre_order()] #return the origional ids of the features in this cluster 51 | return self.__breakClust__(dendro.get_left(),maxClust) + self.__breakClust__(dendro.get_right(),maxClust) 52 | 53 | # Copyright (c) 2017 Yisroel Mirsky 54 | # 55 | # MIT License 56 | # 57 | # Permission is hereby granted, free of charge, to any person obtaining 58 | # a copy of this software and associated documentation files (the 59 | # "Software"), to deal in the Software without restriction, including 60 | # without limitation the rights to use, copy, modify, merge, publish, 61 | # distribute, sublicense, and/or sell copies of the Software, and to 62 | # permit persons to whom the Software is furnished to do so, subject to 63 | # the following conditions: 64 | # 65 | # The above copyright notice and this permission notice shall be 66 | # included in all copies or substantial portions of the Software. 67 | # 68 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 69 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 70 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 71 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 72 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 73 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 74 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /KitNET/dA.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Yusuke Sugomori 2 | # 3 | # MIT License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | # Portions of this code have been adapted from Yusuke Sugomori's code on GitHub: https://github.com/yusugomori/DeepLearning 25 | 26 | import sys 27 | import numpy 28 | from KitNET.utils import * 29 | import json 30 | 31 | class dA_params: 32 | def __init__(self,n_visible = 5, n_hidden = 3, lr=0.001, corruption_level=0.0, gracePeriod = 10000, hiddenRatio=None): 33 | self.n_visible = n_visible# num of units in visible (input) layer 34 | self.n_hidden = n_hidden# num of units in hidden layer 35 | self.lr = lr 36 | self.corruption_level = corruption_level 37 | self.gracePeriod = gracePeriod 38 | self.hiddenRatio = hiddenRatio 39 | 40 | class dA: 41 | def __init__(self, params): 42 | self.params = params 43 | 44 | if self.params.hiddenRatio is not None: 45 | self.params.n_hidden = int(numpy.ceil(self.params.n_visible*self.params.hiddenRatio)) 46 | 47 | # for 0-1 normlaization 48 | self.norm_max = numpy.ones((self.params.n_visible,)) * -numpy.Inf 49 | self.norm_min = numpy.ones((self.params.n_visible,)) * numpy.Inf 50 | self.n = 0 51 | 52 | self.rng = numpy.random.RandomState(1234) 53 | 54 | a = 1. / self.params.n_visible 55 | self.W = numpy.array(self.rng.uniform( # initialize W uniformly 56 | low=-a, 57 | high=a, 58 | size=(self.params.n_visible, self.params.n_hidden))) 59 | 60 | self.hbias = numpy.zeros(self.params.n_hidden) # initialize h bias 0 61 | self.vbias = numpy.zeros(self.params.n_visible) # initialize v bias 0 62 | self.W_prime = self.W.T 63 | 64 | 65 | def get_corrupted_input(self, input, corruption_level): 66 | assert corruption_level < 1 67 | 68 | return self.rng.binomial(size=input.shape, 69 | n=1, 70 | p=1 - corruption_level) * input 71 | 72 | # Encode 73 | def get_hidden_values(self, input): 74 | return sigmoid(numpy.dot(input, self.W) + self.hbias) 75 | 76 | # Decode 77 | def get_reconstructed_input(self, hidden): 78 | return sigmoid(numpy.dot(hidden, self.W_prime) + self.vbias) 79 | 80 | def train(self, x): 81 | self.n = self.n + 1 82 | # update norms 83 | self.norm_max[x > self.norm_max] = x[x > self.norm_max] 84 | self.norm_min[x < self.norm_min] = x[x < self.norm_min] 85 | 86 | # 0-1 normalize 87 | x = (x - self.norm_min) / (self.norm_max - self.norm_min + 0.0000000000000001) 88 | 89 | if self.params.corruption_level > 0.0: 90 | tilde_x = self.get_corrupted_input(x, self.params.corruption_level) 91 | else: 92 | tilde_x = x 93 | y = self.get_hidden_values(tilde_x) 94 | z = self.get_reconstructed_input(y) 95 | 96 | L_h2 = x - z 97 | L_h1 = numpy.dot(L_h2, self.W) * y * (1 - y) 98 | 99 | L_vbias = L_h2 100 | L_hbias = L_h1 101 | L_W = numpy.outer(tilde_x.T, L_h1) + numpy.outer(L_h2.T, y) 102 | 103 | self.W += self.params.lr * L_W 104 | self.hbias += self.params.lr * L_hbias 105 | self.vbias += self.params.lr * L_vbias 106 | return numpy.sqrt(numpy.mean(L_h2**2)) #the RMSE reconstruction error during training 107 | 108 | 109 | def reconstruct(self, x): 110 | y = self.get_hidden_values(x) 111 | z = self.get_reconstructed_input(y) 112 | return z 113 | 114 | def execute(self, x): #returns MSE of the reconstruction of x 115 | if self.n < self.params.gracePeriod: 116 | return 0.0 117 | else: 118 | # 0-1 normalize 119 | x = (x - self.norm_min) / (self.norm_max - self.norm_min + 0.0000000000000001) 120 | z = self.reconstruct(x) 121 | rmse = numpy.sqrt(((x - z) ** 2).mean()) #MSE 122 | return rmse 123 | 124 | 125 | def inGrace(self): 126 | return self.n < self.params.gracePeriod 127 | -------------------------------------------------------------------------------- /KitNET/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy 3 | from scipy.stats import norm 4 | numpy.seterr(all='ignore') 5 | 6 | def pdf(x,mu,sigma): #normal distribution pdf 7 | x = (x-mu)/sigma 8 | return numpy.exp(-x**2/2)/(numpy.sqrt(2*numpy.pi)*sigma) 9 | 10 | def invLogCDF(x,mu,sigma): #normal distribution cdf 11 | x = (x - mu) / sigma 12 | return norm.logcdf(-x) #note: we mutiple by -1 after normalization to better get the 1-cdf 13 | 14 | def sigmoid(x): 15 | return 1. / (1 + numpy.exp(-x)) 16 | 17 | 18 | def dsigmoid(x): 19 | return x * (1. - x) 20 | 21 | def tanh(x): 22 | return numpy.tanh(x) 23 | 24 | def dtanh(x): 25 | return 1. - x * x 26 | 27 | def softmax(x): 28 | e = numpy.exp(x - numpy.max(x)) # prevent overflow 29 | if e.ndim == 1: 30 | return e / numpy.sum(e, axis=0) 31 | else: 32 | return e / numpy.array([numpy.sum(e, axis=1)]).T # ndim = 2 33 | 34 | 35 | def ReLU(x): 36 | return x * (x > 0) 37 | 38 | def dReLU(x): 39 | return 1. * (x > 0) 40 | 41 | class rollmean: 42 | def __init__(self,k): 43 | self.winsize = k 44 | self.window = numpy.zeros(self.winsize) 45 | self.pointer = 0 46 | 47 | def apply(self,newval): 48 | self.window[self.pointer]=newval 49 | self.pointer = (self.pointer+1) % self.winsize 50 | return numpy.mean(self.window) 51 | 52 | # probability density for the Gaussian dist 53 | # def gaussian(x, mean=0.0, scale=1.0): 54 | # s = 2 * numpy.power(scale, 2) 55 | # e = numpy.exp( - numpy.power((x - mean), 2) / s ) 56 | 57 | # return e / numpy.square(numpy.pi * s) 58 | 59 | -------------------------------------------------------------------------------- /Kitsune paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ymirsky/Kitsune-py/28a654b5813936380d264c0934136efda672174a/Kitsune paper.pdf -------------------------------------------------------------------------------- /Kitsune.py: -------------------------------------------------------------------------------- 1 | from FeatureExtractor import * 2 | from KitNET.KitNET import KitNET 3 | 4 | # MIT License 5 | # 6 | # Copyright (c) 2018 Yisroel mirsky 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | 26 | class Kitsune: 27 | def __init__(self,file_path,limit,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75,): 28 | #init packet feature extractor (AfterImage) 29 | self.FE = FE(file_path,limit) 30 | 31 | #init Kitnet 32 | self.AnomDetector = KitNET(self.FE.get_num_features(),max_autoencoder_size,FM_grace_period,AD_grace_period,learning_rate,hidden_ratio) 33 | 34 | def proc_next_packet(self): 35 | # create feature vector 36 | x = self.FE.get_next_vector() 37 | if len(x) == 0: 38 | return -1 #Error or no packets left 39 | 40 | # process KitNET 41 | return self.AnomDetector.process(x) # will train during the grace periods, then execute on all the rest. 42 | 43 | -------------------------------------------------------------------------------- /Kitsune_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ymirsky/Kitsune-py/28a654b5813936380d264c0934136efda672174a/Kitsune_fig.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Yisroel mirsky 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | In this repository you will find a Python implementation of Kitsune; an online network intrusion detection system, based on an ensemble of autoencoders. From, 3 | 4 | *Yisroel Mirsky, Tomer Doitshman, Yuval Elovici, and Asaf Shabtai, "Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection", Network and Distributed System Security Symposium 2018 (NDSS'18)* 5 | 6 | # What is Kitsune? 7 | 8 | Neural networks have become an increasingly popular solution for network intrusion detection systems (NIDS). Their capability of learning complex patterns and behaviors make them a suitable solution for differentiating between normal traffic and network attacks. However, a drawback of neural networks is the amount of resources needed to train them. Many network gateways and routers devices, which could potentially host an NIDS, simply do not have the memory or processing power to train and sometimes even execute such models. More importantly, the existing neural network solutions are trained in a supervised manner. Meaning that an expert must label the network traffic and update the model manually from time to time. 9 | 10 | Kitsune is a novel ANN-based NIDS which is online, unsupervised, and efficient. A Kitsune, in Japanese folklore, is a mythical fox-like creature that has a number of tails, can mimic different forms, and whose strength increases with experience. Similarly, Kitsune has an ensemble of small neural networks (autoencoders), which are trained to mimic (reconstruct) network traffic patterns, and whose performance incrementally improves overtime. 11 | 12 | The architecture of Kitsune is illustrated in the figure below: 13 | * First, a feature extraction framework called *AfterImage* efficiently tracks the patterns of every network channel using damped incremental statisitcs, and extracts a feature vector for each packet. The vector captures the temporal context of the packet's channel and sender. 14 | * Next, the features are mapped to the visible neurons of an ensemble of autoenoders (*KitNET* https://github.com/ymirsky/KitNET-py). 15 | * Then, each autoencoder attempts to reconstruct the instance's features, and computes the reconstruction error in terms of root mean squared errors (RMSE). 16 | * Finally, the RMSEs are forwarded to an output autoencoder, which acts as a non-linear voting mechanism for the ensemble. 17 | 18 | We note that while training \textbf{Kitsune}, no more than one instance is stored in memory at a time. Kitsune has one main parameter, which is the maximum number of inputs for any given autoencoder in the ensemble. This parameter is used to increase the algorithm's speed with a modest trade off in detection performance. 19 | 20 | ![An illustration of Kitsune's architecture](https://raw.githubusercontent.com/ymirsky/Kitsune-py/master/Kitsune_fig.png) 21 | 22 | 23 | Some points about KitNET: 24 | * It is completely plug-and-play. 25 | * It is based on an unsupervised machine learning algorithm (it does not need label, just train it on *normal* data!) 26 | * Its efficiency can be scaled with its input parameter m: the maximal size of any autoencoder in the ensemble layer (smaller autoencoders are exponentially cheaper to train and execute) 27 | 28 | # Implimentation Notes: 29 | 30 | * This python implimentation of Kitsune is **is not optimal** in terms of speed. To make Kitsune run as fast as described in the paper, the entire project must be cythonized, or implimented in C++ 31 | * For an experimental AfterImage version, change the import line in netStat.py to use AfterImage_extrapolate.py, and change line 5 of FeatureExtractor.py to True (uses cython). This version uses Lagrange-based Polynomial extrapolation to assit in computing the correlation based features. 32 | * We also require the scapy library for parsing (tshark [Wireshark] is default). 33 | * The source code has been tested with Anaconda 3.6.3 on a Windows 10 64bit machine. 34 | 35 | To install scapy, run in the terminal: 36 | ``` 37 | pip install scapy 38 | ``` 39 | 40 | 41 | 42 | # Using The Code 43 | Here is a simple example of how to make a Kitsune object: 44 | ``` 45 | from Kitsune import * 46 | 47 | 48 | # KitNET params: 49 | maxAE = 10 #maximum size for any autoencoder in the ensemble layer 50 | FMgrace = 5000 #the number of instances taken to learn the feature mapping (the ensemble's architecture) 51 | ADgrace = 50000 #the number of instances used to train the anomaly detector (ensemble itself) 52 | packet_limit = np.Inf #the number of packets from the input file to process 53 | path = "../../captured.pcap" #the pcap, pcapng, or tsv file which you wish to process. 54 | 55 | # Build Kitsune 56 | K = Kitsune(path,packet_limit,maxAE,FMgrace,ADgrace) 57 | ``` 58 | 59 | You can also configure the learning rate and hidden layer's neuron ratio via Kitsune's contructor. 60 | 61 | The input file can be any pcap network capture. When the object is created, the code check whether or not you have tshark (Wireshark) installed. If you do, then it uses tshark to parse the pcap into a tsv file which is saved to disk locally. This file is then later used when running Kitnet. You can also load this tsv file instead of the origional pcap to save time. Note that we currently only look for tshark in the Windows directory "C:\Program Files\Wireshark\tshark.exe" 62 | 63 | If tshark is not found, then the scapy packet parsing library is used. Scapy is significatly slower than using wireshark/tsv... 64 | 65 | To use the Kitsune object, simply tell Kitsune to process the next packet. After processing a packet, Kitsune returns the RMSE value of the packet (zero during the FM featuremapping and AD grace periods). 66 | 67 | Here is an example usage of the Kitsune object: 68 | ``` 69 | while True: 70 | rmse = K.proc_next_packet() #will train during the grace periods, then execute on all the rest. 71 | if rmse == -1: 72 | break 73 | print(rmse) 74 | ``` 75 | 76 | 77 | # Demo Code 78 | As a quick start, a demo script is provided in example.py. In the demo, we run Kitsune on a network capture of the Mirai malware. You can either run it directly or enter the following into your python console 79 | ``` 80 | import example.py 81 | ``` 82 | 83 | 84 | The code was written and with the Python environment Anaconda: https://anaconda.org/anaconda/python 85 | For significant speedups, as shown in our paper, you must implement Kitsune in C++, or entirely using cython. 86 | 87 | # Full Datasets 88 | The full datasets used in our NDSS paper can be found by following this google drive link: 89 | https://goo.gl/iShM7E 90 | 91 | # License 92 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details 93 | 94 | 95 | # Citations 96 | If you use the source code, the datasets, or implement KitNET, please cite the following paper: 97 | 98 | *Yisroel Mirsky, Tomer Doitshman, Yuval Elovici, and Asaf Shabtai, "Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection", Network and Distributed System Security Symposium 2018 (NDSS'18)* 99 | 100 | Yisroel Mirsky 101 | yisroel@post.bgu.ac.il 102 | 103 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from Kitsune import Kitsune 2 | import numpy as np 3 | import time 4 | 5 | ############################################################################## 6 | # Kitsune a lightweight online network intrusion detection system based on an ensemble of autoencoders (kitNET). 7 | # For more information and citation, please see our NDSS'18 paper: Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection 8 | 9 | # This script demonstrates Kitsune's ability to incrementally learn, and detect anomalies in recorded a pcap of the Mirai Malware. 10 | # The demo involves an m-by-n dataset with n=115 dimensions (features), and m=100,000 observations. 11 | # Each observation is a snapshot of the network's state in terms of incremental damped statistics (see the NDSS paper for more details) 12 | 13 | #The runtimes presented in the paper, are based on the C++ implimentation (roughly 100x faster than the python implimentation) 14 | ################### Last Tested with Anaconda 3.6.3 ####################### 15 | 16 | # Load Mirai pcap (a recording of the Mirai botnet malware being activated) 17 | # The first 70,000 observations are clean... 18 | print("Unzipping Sample Capture...") 19 | import zipfile 20 | with zipfile.ZipFile("mirai.zip","r") as zip_ref: 21 | zip_ref.extractall() 22 | 23 | 24 | # File location 25 | path = "mirai.pcap" #the pcap, pcapng, or tsv file to process. 26 | packet_limit = np.Inf #the number of packets to process 27 | 28 | # KitNET params: 29 | maxAE = 10 #maximum size for any autoencoder in the ensemble layer 30 | FMgrace = 5000 #the number of instances taken to learn the feature mapping (the ensemble's architecture) 31 | ADgrace = 50000 #the number of instances used to train the anomaly detector (ensemble itself) 32 | 33 | # Build Kitsune 34 | K = Kitsune(path,packet_limit,maxAE,FMgrace,ADgrace) 35 | 36 | print("Running Kitsune:") 37 | RMSEs = [] 38 | i = 0 39 | start = time.time() 40 | # Here we process (train/execute) each individual packet. 41 | # In this way, each observation is discarded after performing process() method. 42 | while True: 43 | i+=1 44 | if i % 1000 == 0: 45 | print(i) 46 | rmse = K.proc_next_packet() 47 | if rmse == -1: 48 | break 49 | RMSEs.append(rmse) 50 | stop = time.time() 51 | print("Complete. Time elapsed: "+ str(stop - start)) 52 | 53 | 54 | # Here we demonstrate how one can fit the RMSE scores to a log-normal distribution (useful for finding/setting a cutoff threshold \phi) 55 | from scipy.stats import norm 56 | benignSample = np.log(RMSEs[FMgrace+ADgrace+1:100000]) 57 | logProbs = norm.logsf(np.log(RMSEs), np.mean(benignSample), np.std(benignSample)) 58 | 59 | # plot the RMSE anomaly scores 60 | print("Plotting results") 61 | from matplotlib import pyplot as plt 62 | from matplotlib import cm 63 | plt.figure(figsize=(10,5)) 64 | fig = plt.scatter(range(FMgrace+ADgrace+1,len(RMSEs)),RMSEs[FMgrace+ADgrace+1:],s=0.1,c=logProbs[FMgrace+ADgrace+1:],cmap='RdYlGn') 65 | plt.yscale("log") 66 | plt.title("Anomaly Scores from Kitsune's Execution Phase") 67 | plt.ylabel("RMSE (log scaled)") 68 | plt.xlabel("Time elapsed [min]") 69 | figbar=plt.colorbar() 70 | figbar.ax.set_ylabel('Log Probability\n ', rotation=270) 71 | plt.show() 72 | -------------------------------------------------------------------------------- /mirai.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ymirsky/Kitsune-py/28a654b5813936380d264c0934136efda672174a/mirai.zip -------------------------------------------------------------------------------- /netStat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | ## Prep AfterImage cython package 3 | import os 4 | import subprocess 5 | import pyximport 6 | pyximport.install() 7 | import AfterImage as af 8 | #import AfterImage_NDSS as af 9 | 10 | # 11 | # MIT License 12 | # 13 | # Copyright (c) 2018 Yisroel mirsky 14 | # 15 | # Permission is hereby granted, free of charge, to any person obtaining a copy 16 | # of this software and associated documentation files (the "Software"), to deal 17 | # in the Software without restriction, including without limitation the rights 18 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | # copies of the Software, and to permit persons to whom the Software is 20 | # furnished to do so, subject to the following conditions: 21 | # 22 | # The above copyright notice and this permission notice shall be included in all 23 | # copies or substantial portions of the Software. 24 | # 25 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | # SOFTWARE. 32 | 33 | 34 | class netStat: 35 | #Datastructure for efficent network stat queries 36 | # HostLimit: no more that this many Host identifiers will be tracked 37 | # HostSimplexLimit: no more that this many outgoing channels from each host will be tracked (purged periodically) 38 | # Lambdas: a list of 'window sizes' (decay factors) to track for each stream. nan resolved to default [5,3,1,.1,.01] 39 | def __init__(self, Lambdas = np.nan, HostLimit=255,HostSimplexLimit=1000): 40 | #Lambdas 41 | if np.isnan(Lambdas): 42 | self.Lambdas = [5,3,1,.1,.01] 43 | else: 44 | self.Lambdas = Lambdas 45 | 46 | #HT Limits 47 | self.HostLimit = HostLimit 48 | self.SessionLimit = HostSimplexLimit*self.HostLimit*self.HostLimit #*2 since each dual creates 2 entries in memory 49 | self.MAC_HostLimit = self.HostLimit*10 50 | 51 | #HTs 52 | self.HT_jit = af.incStatDB(limit=self.HostLimit*self.HostLimit)#H-H Jitter Stats 53 | self.HT_MI = af.incStatDB(limit=self.MAC_HostLimit)#MAC-IP relationships 54 | self.HT_H = af.incStatDB(limit=self.HostLimit) #Source Host BW Stats 55 | self.HT_Hp = af.incStatDB(limit=self.SessionLimit)#Source Host BW Stats 56 | 57 | 58 | def findDirection(self,IPtype,srcIP,dstIP,eth_src,eth_dst): #cpp: this is all given to you in the direction string of the instance (NO NEED FOR THIS FUNCTION) 59 | if IPtype==0: #is IPv4 60 | lstP = srcIP.rfind('.') 61 | src_subnet = srcIP[0:lstP:] 62 | lstP = dstIP.rfind('.') 63 | dst_subnet = dstIP[0:lstP:] 64 | elif IPtype==1: #is IPv6 65 | src_subnet = srcIP[0:round(len(srcIP)/2):] 66 | dst_subnet = dstIP[0:round(len(dstIP)/2):] 67 | else: #no Network layer, use MACs 68 | src_subnet = eth_src 69 | dst_subnet = eth_dst 70 | 71 | return src_subnet, dst_subnet 72 | 73 | def updateGetStats(self, IPtype, srcMAC,dstMAC, srcIP, srcProtocol, dstIP, dstProtocol, datagramSize, timestamp): 74 | # Host BW: Stats on the srcIP's general Sender Statistics 75 | # Hstat = np.zeros((3*len(self.Lambdas,))) 76 | # for i in range(len(self.Lambdas)): 77 | # Hstat[(i*3):((i+1)*3)] = self.HT_H.update_get_1D_Stats(srcIP, timestamp, datagramSize, self.Lambdas[i]) 78 | 79 | #MAC.IP: Stats on src MAC-IP relationships 80 | MIstat = np.zeros((3*len(self.Lambdas,))) 81 | for i in range(len(self.Lambdas)): 82 | MIstat[(i*3):((i+1)*3)] = self.HT_MI.update_get_1D_Stats(srcMAC+srcIP, timestamp, datagramSize, self.Lambdas[i]) 83 | 84 | # Host-Host BW: Stats on the dual traffic behavior between srcIP and dstIP 85 | HHstat = np.zeros((7*len(self.Lambdas,))) 86 | for i in range(len(self.Lambdas)): 87 | HHstat[(i*7):((i+1)*7)] = self.HT_H.update_get_1D2D_Stats(srcIP, dstIP,timestamp,datagramSize,self.Lambdas[i]) 88 | 89 | # Host-Host Jitter: 90 | HHstat_jit = np.zeros((3*len(self.Lambdas,))) 91 | for i in range(len(self.Lambdas)): 92 | HHstat_jit[(i*3):((i+1)*3)] = self.HT_jit.update_get_1D_Stats(srcIP+dstIP, timestamp, 0, self.Lambdas[i],isTypeDiff=True) 93 | 94 | # Host-Host BW: Stats on the dual traffic behavior between srcIP and dstIP 95 | HpHpstat = np.zeros((7*len(self.Lambdas,))) 96 | if srcProtocol == 'arp': 97 | for i in range(len(self.Lambdas)): 98 | HpHpstat[(i*7):((i+1)*7)] = self.HT_Hp.update_get_1D2D_Stats(srcMAC, dstMAC, timestamp, datagramSize, self.Lambdas[i]) 99 | else: # some other protocol (e.g. TCP/UDP) 100 | for i in range(len(self.Lambdas)): 101 | HpHpstat[(i*7):((i+1)*7)] = self.HT_Hp.update_get_1D2D_Stats(srcIP + srcProtocol, dstIP + dstProtocol, timestamp, datagramSize, self.Lambdas[i]) 102 | 103 | return np.concatenate((MIstat, HHstat, HHstat_jit, HpHpstat)) # concatenation of stats into one stat vector 104 | 105 | def getNetStatHeaders(self): 106 | MIstat_headers = [] 107 | Hstat_headers = [] 108 | HHstat_headers = [] 109 | HHjitstat_headers = [] 110 | HpHpstat_headers = [] 111 | 112 | for i in range(len(self.Lambdas)): 113 | MIstat_headers += ["MI_dir_"+h for h in self.HT_MI.getHeaders_1D(Lambda=self.Lambdas[i],ID=None)] 114 | HHstat_headers += ["HH_"+h for h in self.HT_H.getHeaders_1D2D(Lambda=self.Lambdas[i],IDs=None,ver=2)] 115 | HHjitstat_headers += ["HH_jit_"+h for h in self.HT_jit.getHeaders_1D(Lambda=self.Lambdas[i],ID=None)] 116 | HpHpstat_headers += ["HpHp_" + h for h in self.HT_Hp.getHeaders_1D2D(Lambda=self.Lambdas[i], IDs=None, ver=2)] 117 | return MIstat_headers + Hstat_headers + HHstat_headers + HHjitstat_headers + HpHpstat_headers 118 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | 4 | setup( 5 | ext_modules = cythonize(["*.pyx"]) 6 | ) --------------------------------------------------------------------------------