├── AfterImage.py
├── AfterImage_extrapolate.pyx
├── FeatureExtractor.py
├── KitNET
    ├── KitNET.py
    ├── LICENSE.txt
    ├── __init__.py
    ├── corClust.py
    ├── dA.py
    └── utils.py
├── Kitsune paper.pdf
├── Kitsune.py
├── Kitsune_fig.png
├── LICENSE
├── README.md
├── example.py
├── mirai.zip
├── netStat.py
└── setup.py


/AfterImage.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | 
  4 | 
  5 | class incStat:
  6 |     def __init__(self, Lambda, ID, init_time=0, isTypeDiff=False):  # timestamp is creation time
  7 |         self.ID = ID
  8 |         self.CF1 = 0  # linear sum
  9 |         self.CF2 = 0  # sum of squares
 10 |         self.w = 1e-20  # weight
 11 |         self.isTypeDiff = isTypeDiff
 12 |         self.Lambda = Lambda  # Decay Factor
 13 |         self.lastTimestamp = init_time
 14 |         self.cur_mean = np.nan
 15 |         self.cur_var = np.nan
 16 |         self.cur_std = np.nan
 17 |         self.covs = [] # a list of incStat_covs (references) with relate to this incStat
 18 | 
 19 |     def insert(self, v, t=0):  # v is a scalar, t is v's arrival the timestamp
 20 |         if self.isTypeDiff:
 21 |             dif = t - self.lastTimestamp
 22 |             if dif > 0:
 23 |                 v = dif
 24 |             else:
 25 |                 v = 0
 26 |         self.processDecay(t)
 27 | 
 28 |         # update with v
 29 |         self.CF1 += v
 30 |         self.CF2 += math.pow(v, 2)
 31 |         self.w += 1
 32 |         self.cur_mean = np.nan  # force recalculation if called
 33 |         self.cur_var = np.nan
 34 |         self.cur_std = np.nan
 35 | 
 36 |         # update covs (if any)
 37 |         for cov in self.covs:
 38 |             cov.update_cov(self.ID, v, t)
 39 | 
 40 |     def processDecay(self, timestamp):
 41 |         factor=1
 42 |         # check for decay
 43 |         timeDiff = timestamp - self.lastTimestamp
 44 |         if timeDiff > 0:
 45 |             factor = math.pow(2, (-self.Lambda * timeDiff))
 46 |             self.CF1 = self.CF1 * factor
 47 |             self.CF2 = self.CF2 * factor
 48 |             self.w = self.w * factor
 49 |             self.lastTimestamp = timestamp
 50 |         return factor
 51 | 
 52 |     def weight(self):
 53 |         return self.w
 54 | 
 55 |     def mean(self):
 56 |         if math.isnan(self.cur_mean):  # calculate it only once when necessary
 57 |             self.cur_mean = self.CF1 / self.w
 58 |         return self.cur_mean
 59 | 
 60 |     def var(self):
 61 |         if math.isnan(self.cur_var):  # calculate it only once when necessary
 62 |             self.cur_var = abs(self.CF2 / self.w - math.pow(self.mean(), 2))
 63 |         return self.cur_var
 64 | 
 65 |     def std(self):
 66 |         if math.isnan(self.cur_std):  # calculate it only once when necessary
 67 |             self.cur_std = math.sqrt(self.var())
 68 |         return self.cur_std
 69 | 
 70 |     def cov(self,ID2):
 71 |         for cov in self.covs:
 72 |             if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2:
 73 |                 return cov.cov()
 74 |         return [np.nan]
 75 | 
 76 |     def pcc(self,ID2):
 77 |         for cov in self.covs:
 78 |             if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2:
 79 |                 return cov.pcc()
 80 |         return [np.nan]
 81 | 
 82 |     def cov_pcc(self,ID2):
 83 |         for cov in self.covs:
 84 |             if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2:
 85 |                 return cov.get_stats1()
 86 |         return [np.nan]*2
 87 | 
 88 |     def radius(self, other_incStats):  # the radius of a set of incStats
 89 |         A = self.var()**2
 90 |         for incS in other_incStats:
 91 |             A += incS.var()**2
 92 |         return math.sqrt(A)
 93 | 
 94 |     def magnitude(self, other_incStats):  # the magnitude of a set of incStats
 95 |         A = math.pow(self.mean(), 2)
 96 |         for incS in other_incStats:
 97 |             A += math.pow(incS.mean(), 2)
 98 |         return math.sqrt(A)
 99 | 
100 |     #calculates and pulls all stats on this stream
101 |     def allstats_1D(self):
102 |         self.cur_mean = self.CF1 / self.w
103 |         self.cur_var = abs(self.CF2 / self.w - math.pow(self.cur_mean, 2))
104 |         return [self.w, self.cur_mean, self.cur_var]
105 | 
106 |     #calculates and pulls all stats on this stream, and stats shared with the indicated stream
107 |     def allstats_2D(self, ID2):
108 |         stats1D = self.allstats_1D()
109 |         # Find cov component
110 |         stats2D = [np.nan] * 4
111 |         for cov in self.covs:
112 |             if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2:
113 |                 stats2D = cov.get_stats2()
114 |                 break
115 |         return stats1D + stats2D
116 | 
117 |     def getHeaders_1D(self, suffix=True):
118 |         if self.ID is None:
119 |             s0=""
120 |         else:
121 |             s0 = "_0"
122 |         if suffix:
123 |             s0 = "_"+self.ID
124 |         headers = ["weight"+s0, "mean"+s0, "std"+s0]
125 |         return headers
126 | 
127 |     def getHeaders_2D(self, ID2, suffix=True):
128 |         hdrs1D = self.getHeaders_1D(suffix)
129 |         if self.ID is None:
130 |             s0=""
131 |             s1=""
132 |         else:
133 |             s0 = "_0"
134 |             s1 = "_1"
135 |         if suffix:
136 |             s0 = "_"+self.ID
137 |             s1 = "_" + ID2
138 |         hdrs2D = ["radius_" + s0 + "_" + s1, "magnitude_" + s0 + "_" + s1, "covariance_" + s0 + "_" + s1,
139 |                    "pcc_" + s0 + "_" + s1]
140 |         return hdrs1D+hdrs2D
141 | 
142 | 
143 | #like incStat, but maintains stats between two streams
144 | class incStat_cov:
145 |     def __init__(self, incS1, incS2, init_time = 0):
146 |         # store references tot he streams' incStats
147 |         self.incStats = [incS1,incS2]
148 |         self.lastRes = [0,0]
149 |         # init extrapolators
150 |         #self.EXs = [extrapolator(),extrapolator()]
151 | 
152 |         # init sum product residuals
153 |         self.CF3 = 0 # sum of residule products (A-uA)(B-uB)
154 |         self.w3 = 1e-20
155 |         self.lastTimestamp_cf3 = init_time
156 | 
157 |     #other_incS_decay is the decay factor of the other incstat
158 |     # ID: the stream ID which produced (v,t)
159 |     def update_cov(self, ID, v, t):  # it is assumes that incStat "ID" has ALREADY been updated with (t,v) [this si performed automatically in method incStat.insert()]
160 |         # find incStat
161 |         if ID == self.incStats[0].ID:
162 |             inc = 0
163 |         elif ID == self.incStats[1].ID:
164 |             inc = 1
165 |         else:
166 |             print("update_cov ID error")
167 |             return ## error
168 | 
169 |         # Decay other incStat
170 |         self.incStats[not(inc)].processDecay(t)
171 | 
172 |         # Decay residules
173 |         self.processDecay(t,inc)
174 | 
175 |         # Update extrapolator for current stream
176 |         #self.EXs[inc].insert(t,v)
177 | 
178 |         # Extrapolate other stream
179 |         #v_other = self.EXs[not(inc)].predict(t)
180 | 
181 |         # Compute and update residule
182 |         res = (v - self.incStats[inc].mean())
183 |         resid = (v - self.incStats[inc].mean()) * self.lastRes[not(inc)]
184 |         self.CF3 += resid
185 |         self.w3 += 1
186 |         self.lastRes[inc] = res
187 | 
188 |     def processDecay(self,t,micro_inc_indx):
189 |         factor = 1
190 |         # check for decay cf3
191 |         timeDiffs_cf3 = t - self.lastTimestamp_cf3
192 |         if timeDiffs_cf3 > 0:
193 |             factor = math.pow(2, (-(self.incStats[micro_inc_indx].Lambda) * timeDiffs_cf3))
194 |             self.CF3 *= factor
195 |             self.w3 *= factor
196 |             self.lastTimestamp_cf3 = t
197 |             self.lastRes[micro_inc_indx] *= factor
198 |         return factor
199 | 
200 |     #todo: add W3 for cf3
201 | 
202 |     #covariance approximation
203 |     def cov(self):
204 |         return self.CF3 / self.w3
205 | 
206 |     # Pearson corl. coef
207 |     def pcc(self):
208 |         ss = self.incStats[0].std() * self.incStats[1].std()
209 |         if ss != 0:
210 |             return self.cov() / ss
211 |         else:
212 |             return 0
213 | 
214 |     # calculates and pulls all correlative stats
215 |     def get_stats1(self):
216 |         return [self.cov(), self.pcc()]
217 | 
218 |     # calculates and pulls all correlative stats AND 2D stats from both streams (incStat)
219 |     def get_stats2(self):
220 |         return [self.incStats[0].radius([self.incStats[1]]),self.incStats[0].magnitude([self.incStats[1]]),self.cov(), self.pcc()]
221 | 
222 |     # calculates and pulls all correlative stats AND 2D stats AND the regular stats from both streams (incStat)
223 |     def get_stats3(self):
224 |         return [self.incStats[0].w,self.incStats[0].mean(),self.incStats[0].std(),self.incStats[1].w,self.incStats[1].mean(),self.incStats[1].std(),self.cov(), self.pcc()]
225 | 
226 |     # calculates and pulls all correlative stats AND the regular stats from both incStats AND 2D stats
227 |     def get_stats4(self):
228 |         return [self.incStats[0].w,self.incStats[0].mean(),self.incStats[0].std(),self.incStats[1].w,self.incStats[1].mean(),self.incStats[1].std(), self.incStats[0].radius([self.incStats[1]]),self.incStats[0].magnitude([self.incStats[1]]),self.cov(), self.pcc()]
229 | 
230 |     def getHeaders(self,ver,suffix=True): #ver = {1,2,3,4}
231 |         headers = []
232 |         s0 = "0"
233 |         s1 = "1"
234 |         if suffix:
235 |             s0 = self.incStats[0].ID
236 |             s1 = self.incStats[1].ID
237 | 
238 |         if ver == 1:
239 |             headers = ["covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1]
240 |         if ver == 2:
241 |             headers = ["radius_"+s0+"_"+s1, "magnitude_"+s0+"_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1]
242 |         if ver == 3:
243 |             headers = ["weight_"+s0, "mean_"+s0, "std_"+s0,"weight_"+s1, "mean_"+s1, "std_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1]
244 |         if ver == 4:
245 |             headers = ["weight_" + s0, "mean_" + s0, "std_" + s0, "covariance_" + s0 + "_" + s1, "pcc_" + s0 + "_" + s1]
246 |         if ver == 5:
247 |             headers = ["weight_"+s0, "mean_"+s0, "std_"+s0,"weight_"+s1, "mean_"+s1, "std_"+s1, "radius_"+s0+"_"+s1, "magnitude_"+s0+"_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1]
248 |         return headers
249 | 
250 | 
251 | class incStatDB:
252 |     # default_lambda: use this as the lambda for all streams. If not specified, then you must supply a Lambda with every query.
253 |     def __init__(self,limit=np.Inf,default_lambda=np.nan):
254 |         self.HT = dict()
255 |         self.limit = limit
256 |         self.df_lambda = default_lambda
257 | 
258 |     def get_lambda(self,Lambda):
259 |         if not np.isnan(self.df_lambda):
260 |             Lambda = self.df_lambda
261 |         return Lambda
262 | 
263 |     # Registers a new stream. init_time: init lastTimestamp of the incStat
264 |     def register(self,ID,Lambda=1,init_time=0,isTypeDiff=False):
265 |         #Default Lambda?
266 |         Lambda = self.get_lambda(Lambda)
267 | 
268 |         #Retrieve incStat
269 |         key = ID+"_"+str(Lambda)
270 |         incS = self.HT.get(key)
271 |         if incS is None: #does not already exist
272 |             if len(self.HT) + 1 > self.limit:
273 |                 raise LookupError(
274 |                     'Adding Entry:\n' + key + '\nwould exceed incStatHT 1D limit of ' + str(
275 |                         self.limit) + '.\nObservation Rejected.')
276 |             incS = incStat(Lambda, ID, init_time, isTypeDiff)
277 |             self.HT[key] = incS #add new entry
278 |         return incS
279 | 
280 |     # Registers covariance tracking for two streams, registers missing streams
281 |     def register_cov(self,ID1,ID2,Lambda=1,init_time=0,isTypeDiff=False):
282 |         #Default Lambda?
283 |         Lambda = self.get_lambda(Lambda)
284 | 
285 |         # Lookup both streams
286 |         incS1 = self.register(ID1,Lambda,init_time,isTypeDiff)
287 |         incS2 = self.register(ID2,Lambda,init_time,isTypeDiff)
288 | 
289 |         #check for pre-exiting link
290 |         for cov in incS1.covs:
291 |             if cov.incStats[0].ID == ID2 or cov.incStats[1].ID == ID2:
292 |                 return cov #there is a pre-exiting link
293 | 
294 |         # Link incStats
295 |         inc_cov = incStat_cov(incS1,incS2,init_time)
296 |         incS1.covs.append(inc_cov)
297 |         incS2.covs.append(inc_cov)
298 |         return inc_cov
299 | 
300 |     # updates/registers stream
301 |     def update(self,ID,t,v,Lambda=1,isTypeDiff=False):
302 |         incS = self.register(ID,Lambda,t,isTypeDiff)
303 |         incS.insert(v,t)
304 |         return incS
305 | 
306 |     # Pulls current stats from the given ID
307 |     def get_1D_Stats(self,ID,Lambda=1): #weight, mean, std
308 |         #Default Lambda?
309 |         Lambda = self.get_lambda(Lambda)
310 | 
311 |         #Get incStat
312 |         incS = self.HT.get(ID+"_"+str(Lambda))
313 |         if incS is None:  # does not already exist
314 |             return [np.na]*3
315 |         else:
316 |             return incS.allstats_1D()
317 | 
318 |     # Pulls current correlational stats from the given IDs
319 |     def get_2D_Stats(self, ID1, ID2, Lambda=1): #cov, pcc
320 |         # Default Lambda?
321 |         Lambda = self.get_lambda(Lambda)
322 | 
323 |         # Get incStat
324 |         incS1 = self.HT.get(ID1 + "_" + str(Lambda))
325 |         if incS1 is None:  # does not exist
326 |             return [np.na]*2
327 | 
328 |         # find relevant cov entry
329 |         return incS1.cov_pcc(ID2)
330 | 
331 |     # Pulls all correlational stats registered with the given ID
332 |     # returns tuple [0]: stats-covs&pccs, [2]: IDs
333 |     def get_all_2D_Stats(self, ID, Lambda=1):  # cov, pcc
334 |         # Default Lambda?
335 |         Lambda = self.get_lambda(Lambda)
336 | 
337 |         # Get incStat
338 |         incS1 = self.HT.get(ID + "_" + str(Lambda))
339 |         if incS1 is None:  # does not exist
340 |             return ([],[])
341 | 
342 |         # find relevant cov entry
343 |         stats = []
344 |         IDs = []
345 |         for cov in incS1.covs:
346 |             stats.append(cov.get_stats1())
347 |             IDs.append([cov.incStats[0].ID,cov.incStats[1].ID])
348 |         return stats,IDs
349 | 
350 |     # Pulls current multidimensional stats from the given IDs
351 |     def get_nD_Stats(self,IDs,Lambda=1): #radius, magnitude (IDs is a list)
352 |         # Default Lambda?
353 |         Lambda = self.get_lambda(Lambda)
354 | 
355 |         # Get incStats
356 |         incStats = []
357 |         for ID in IDs:
358 |             incS = self.HT.get(ID + "_" + str(Lambda))
359 |             if incS is not None:  #exists
360 |                 incStats.append(incS)
361 | 
362 |         # Compute stats
363 |         rad = 0 #radius
364 |         mag = 0 #magnitude
365 |         for incS in incStats:
366 |             rad += incS.var()
367 |             mag += incS.mean()**2
368 | 
369 |         return [np.sqrt(rad),np.sqrt(mag)]
370 | 
371 |     # Updates and then pulls current 1D stats from the given ID. Automatically registers previously unknown stream IDs
372 |     def update_get_1D_Stats(self, ID,t,v,Lambda=1,isTypeDiff=False):  # weight, mean, std
373 |         incS = self.update(ID,t,v,Lambda,isTypeDiff)
374 |         return incS.allstats_1D()
375 | 
376 | 
377 |     # Updates and then pulls current correlative stats between the given IDs. Automatically registers previously unknown stream IDs, and cov tracking
378 |     #Note: AfterImage does not currently support Diff Type streams for correlational statistics.
379 |     def update_get_2D_Stats(self, ID1,ID2,t1,v1,Lambda=1,level=1):  #level=  1:cov,pcc  2:radius,magnitude,cov,pcc
380 |         #retrieve/add cov tracker
381 |         inc_cov = self.register_cov(ID1, ID2, Lambda,  t1)
382 |         # Update cov tracker
383 |         inc_cov.update_cov(ID1,v1,t1)
384 |         if level == 1:
385 |             return inc_cov.get_stats1()
386 |         else:
387 |             return inc_cov.get_stats2()
388 | 
389 |     # Updates and then pulls current 1D and 2D stats from the given IDs. Automatically registers previously unknown stream IDs
390 |     def update_get_1D2D_Stats(self, ID1,ID2,t1,v1,Lambda=1):  # weight, mean, std
391 |         return self.update_get_1D_Stats(ID1,t1,v1,Lambda) + self.update_get_2D_Stats(ID1,ID2,t1,v1,Lambda,level=2)
392 | 
393 |     def getHeaders_1D(self,Lambda=1,ID=None):
394 |         # Default Lambda?
395 |         Lambda = self.get_lambda(Lambda)
396 |         hdrs = incStat(Lambda,ID).getHeaders_1D(suffix=False)
397 |         return [str(Lambda)+"_"+s for s in hdrs]
398 | 
399 |     def getHeaders_2D(self,Lambda=1,IDs=None, ver=1): #IDs is a 2-element list or tuple
400 |         # Default Lambda?
401 |         Lambda = self.get_lambda(Lambda)
402 |         if IDs is None:
403 |             IDs = [0,1]
404 |         hdrs = incStat_cov(incStat(Lambda,IDs[0]),incStat(Lambda,IDs[0]),Lambda).getHeaders(ver,suffix=False)
405 |         return [str(Lambda)+"_"+s for s in hdrs]
406 | 
407 |     def getHeaders_1D2D(self,Lambda=1,IDs=None, ver=1):
408 |         # Default Lambda?
409 |         Lambda = self.get_lambda(Lambda)
410 |         if IDs is None:
411 |             IDs = [0,1]
412 |         hdrs1D = self.getHeaders_1D(Lambda,IDs[0])
413 |         hdrs2D = self.getHeaders_2D(Lambda,IDs, ver)
414 |         return hdrs1D + hdrs2D
415 | 
416 |     def getHeaders_nD(self,Lambda=1,IDs=[]): #IDs is a n-element list or tuple
417 |         # Default Lambda?
418 |         ID = ":"
419 |         for s in IDs:
420 |             ID += "_"+s
421 |         Lambda = self.get_lambda(Lambda)
422 |         hdrs = ["radius"+ID, "magnitude"+ID]
423 |         return [str(Lambda)+"_"+s for s in hdrs]
424 | 
425 | 
426 |     #cleans out records that have a weight less than the cutoff.
427 |     #returns number or removed records.
428 |     def cleanOutOldRecords(self,cutoffWeight,curTime):
429 |         n = 0
430 |         dump = sorted(self.HT.items(), key=lambda tup: tup[1][0].getMaxW(curTime))
431 |         for entry in dump:
432 |             entry[1][0].processDecay(curTime)
433 |             W = entry[1][0].w
434 |             if W <= cutoffWeight:
435 |                 key = entry[0]
436 |                 del entry[1][0]
437 |                 del self.HT[key]
438 |                 n=n+1
439 |             elif W > cutoffWeight:
440 |                 break
441 |         return n
442 | 
443 | 


--------------------------------------------------------------------------------
/AfterImage_extrapolate.pyx:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | 
  4 | # MIT License
  5 | #
  6 | # Copyright (c) 2018 Yisroel mirsky
  7 | #
  8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  9 | # of this software and associated documentation files (the "Software"), to deal
 10 | # in the Software without restriction, including without limitation the rights
 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 | # copies of the Software, and to permit persons to whom the Software is
 13 | # furnished to do so, subject to the following conditions:
 14 | #
 15 | # The above copyright notice and this permission notice shall be included in all
 16 | # copies or substantial portions of the Software.
 17 | #
 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 | # SOFTWARE.
 25 | 
 26 | 
 27 | #compile with:   python setup.py build_ext --inplace
 28 | 
 29 | import pyximport; pyximport.install()
 30 | 
 31 | cdef class incStat:
 32 |     cdef str ID
 33 |     cdef double CF1
 34 |     cdef double CF2
 35 |     cdef double w
 36 |     cdef int isTypeDiff
 37 |     cdef double Lambda
 38 |     cdef double lastTimestamp
 39 |     cdef double cur_mean
 40 |     cdef double cur_var
 41 |     cdef double cur_std
 42 |     cdef list covs
 43 | 
 44 |     def __init__(self, double Lambda, str ID, double init_time=0, int isTypeDiff=False):  # timestamp is creation time
 45 |         self.ID = ID
 46 |         self.CF1 = 0  # linear sum
 47 |         self.CF2 = 0  # sum of squares
 48 |         self.w = 1e-20  # weight
 49 |         self.isTypeDiff = isTypeDiff
 50 |         self.Lambda = Lambda  # Decay Factor
 51 |         self.lastTimestamp = init_time
 52 |         self.cur_mean = np.nan
 53 |         self.cur_var = np.nan
 54 |         self.cur_std = np.nan
 55 |         self.covs = [] # a list of incStat_covs (references) with relate to this incStat
 56 | 
 57 |     cdef insert(self, double v, double t=0):  # v is a scalar, t is v's arrival the timestamp
 58 |         if self.isTypeDiff:
 59 |             if t - self.lastTimestamp > 0:
 60 |                 v = t - self.lastTimestamp
 61 |             else:
 62 |                 v = 0
 63 |         self.processDecay(t)
 64 | 
 65 |         # update with v
 66 |         self.CF1 += v
 67 |         self.CF2 += math.pow(v, 2)
 68 |         self.w += 1
 69 |         self.cur_mean = np.nan  # force recalculation if called
 70 |         self.cur_var = np.nan
 71 |         self.cur_std = np.nan
 72 | 
 73 |         # update covs (if any)
 74 |         cdef incStat_cov cov
 75 |         for c in self.covs:
 76 |             cov = c
 77 |             cov.update_cov(self.ID, v, t)
 78 | 
 79 |     cdef processDecay(self, double timestamp):
 80 |         cdef double factor, timeDiff
 81 |         factor = 1
 82 |         # check for decay
 83 |         timeDiff = timestamp - self.lastTimestamp
 84 |         if timeDiff > 0:
 85 |             factor = math.pow(2, (-self.Lambda * timeDiff))
 86 |             self.CF1 = self.CF1 * factor
 87 |             self.CF2 = self.CF2 * factor
 88 |             self.w = self.w * factor
 89 |             self.lastTimestamp = timestamp
 90 |         return factor
 91 | 
 92 |     cdef weight(self):
 93 |         return self.w
 94 | 
 95 |     cdef mean(self):
 96 |         if math.isnan(self.cur_mean):  # calculate it only once when necessary
 97 |             self.cur_mean = self.CF1 / self.w
 98 |         return self.cur_mean
 99 | 
100 |     cdef var(self):
101 |         if math.isnan(self.cur_var):  # calculate it only once when necessary
102 |             self.cur_var = abs(self.CF2 / self.w - math.pow(self.mean(), 2))
103 |         return self.cur_var
104 | 
105 |     cdef std(self):
106 |         if math.isnan(self.cur_std):  # calculate it only once when necessary
107 |             self.cur_std = math.sqrt(self.var())
108 |         return self.cur_std
109 | 
110 |     cdef cov(self,ID2):
111 |         for cov in self.covs:
112 |             if cov.isRelated(ID2):
113 |                 return cov.cov()
114 |         return [np.nan]
115 | 
116 |     cdef pcc(self,ID2):
117 |         for cov in self.covs:
118 |             if cov.isRelated(ID2):
119 |                 return cov.pcc()
120 |         return [np.nan]
121 | 
122 |     cdef cov_pcc(self,ID2):
123 |         cdef incStat_cov cov
124 |         for c in self.covs:
125 |             cov = c
126 |             if cov.isRelated(ID2):
127 |                 return cov.get_stats1()
128 |         return [np.nan]*2
129 | 
130 |     cdef radius(self, other_incStats):  # the radius of a set of incStats
131 |         cdef double A
132 |         A = self.var()
133 |         cdef incStat incSc
134 |         for incS in other_incStats:
135 |             incSc = incS
136 |             A += incSc.var()
137 |         return math.sqrt(A)
138 | 
139 |     cdef magnitude(self, other_incStats):  # the magnitude of a set of incStats
140 |         cdef double A
141 |         A = math.pow(self.mean(), 2)
142 |         cdef incStat incSc
143 |         for incS in other_incStats:
144 |             incSc = incS
145 |             A += math.pow(incSc.mean(), 2)
146 |         return math.sqrt(A)
147 | 
148 |     #calculates and pulls all stats on this stream
149 |     cdef allstats_1D(self):
150 |         self.cur_mean = self.CF1 / self.w
151 |         self.cur_var = abs(self.CF2 / self.w - math.pow(self.cur_mean, 2))
152 |         return [self.w, self.cur_mean, self.cur_var]
153 | 
154 |     #calculates and pulls all stats on this stream, and stats shared with the indicated stream
155 |     cdef allstats_2D(self, str ID2):
156 |         stats1D = self.allstats_1D()
157 |         # Find cov component
158 |         stats2D = [np.nan] * 4
159 |         cdef incStat_cov cov
160 |         for c in self.covs:
161 |             cov = c
162 |             if cov.isRelated(ID2):
163 |                 stats2D = cov.get_stats2()
164 |                 break
165 |         return stats1D + stats2D
166 | 
167 |     cdef getHeaders_1D(self, suffix=True):
168 |         if self.ID is None:
169 |             s0=""
170 |         else:
171 |             s0 = "_0"
172 |         if suffix:
173 |             s0 = "_"+self.ID
174 |         headers = ["weight"+s0, "mean"+s0, "std"+s0]
175 |         return headers
176 | 
177 |     cdef getHeaders_2D(self, ID2, suffix=True):
178 |         hdrs1D = self.getHeaders_1D(suffix)
179 |         if self.ID is None:
180 |             s0=""
181 |             s1=""
182 |         else:
183 |             s0 = "_0"
184 |             s1 = "_1"
185 |         if suffix:
186 |             s0 = "_"+self.ID
187 |             s1 = "_" + ID2
188 |         hdrs2D = ["radius_" + s0 + "_" + s1, "magnitude_" + s0 + "_" + s1, "covariance_" + s0 + "_" + s1,
189 |                    "pcc_" + s0 + "_" + s1]
190 |         return hdrs1D+hdrs2D
191 | 
192 |     # def toJSON(self):
193 |     #     j = {}
194 |     #     j['CF1'] = self.CF1
195 |     #     j['CF2'] = self.CF2
196 |     #     j['w'] = self.w
197 |     #     j['isTypeDiff'] = self.isTypeDiff
198 |     #     j['Lambda'] = self.Lambda
199 |     #     j['lastTimestamp'] = self.lastTimestamp
200 |     #     return json.dumps(j)
201 |     #
202 |     # def loadFromJSON(self,JSONstring):
203 |     #     j = json.loads(JSONstring)
204 |     #     self.CF1 = j['CF1']
205 |     #     self.CF2 = j['CF2']
206 |     #     self.w = j['w']
207 |     #     self.isTypeDiff = j['isTypeDiff']
208 |     #     self.Lambda = j['Lambda']
209 |     #     self.lastTimestamp = j['lastTimestamp']
210 | 
211 | #like incStat, but maintains stats between two streams
212 | #TODO: make it possble to call incstat magnitude and raduis withour list of incstsats (just single incstat objects) for cov.getstats2 typcast call
213 | cdef class incStat_cov:
214 |     cdef double CF3
215 |     cdef double w3
216 |     cdef double lastTimestamp_cf3
217 |     cdef incStat incS1
218 |     cdef incStat incS2
219 |     cdef extrapolator ex1
220 |     cdef extrapolator ex2
221 | 
222 | 
223 |     def __init__(self, incStat incS1,incStat incS2, double init_time = 0):
224 |         # store references tot he streams' incStats
225 |         self.incS1 = incS1
226 |         self.incS2 = incS2
227 | 
228 |         # init extrapolators
229 |         self.ex1 = extrapolator()
230 |         self.ex2 = extrapolator()
231 | 
232 |         # init sum product residuals
233 |         self.CF3 = 0 # sum of residule products (A-uA)(B-uB)
234 |         self.w3 = 1e-20
235 |         self.lastTimestamp_cf3 = init_time
236 | 
237 |     #other_incS_decay is the decay factor of the other incstat
238 |     # ID: the stream ID which produced (v,t)
239 |     cdef update_cov(self, str ID, double v, double t):  # it is assumes that incStat "ID" has ALREADY been updated with (t,v) [this si performed automatically in method incStat.insert()]
240 |         # find incStat
241 |         cdef int inc
242 |         if ID == self.incS1.ID:
243 |             inc = 0
244 |         else:
245 |             inc = 1
246 | 
247 |         # Decay residules
248 |         self.processDecay(t)
249 | 
250 |         # Update extrapolator for current stream AND
251 |         # Extrapolate other stream AND
252 |         # Compute and update residule
253 |         cdef double v_other
254 |         if inc == 0:
255 |             self.ex1.insert(t,v)
256 |             v_other = self.ex2.predict(t)
257 |             self.CF3 += (v - self.incS1.mean()) * (v_other - self.incS2.mean())
258 |         else:
259 |             self.ex2.insert(t,v)
260 |             v_other = self.ex1.predict(t)
261 |             self.CF3 += (v - self.incS2.mean()) * (v_other - self.incS1.mean())
262 |         self.w3 += 1
263 | 
264 |     cdef processDecay(self,double t):
265 |         cdef double factor
266 |         factor = 1
267 |         # check for decay cf3
268 |         cdef double timeDiffs_cf3
269 |         timeDiffs_cf3 = t - self.lastTimestamp_cf3
270 |         if timeDiffs_cf3 > 0:
271 |             factor = math.pow(2, (-(self.incS1.Lambda) * timeDiffs_cf3))
272 |             self.CF3 *= factor
273 |             self.w3 *= factor
274 |             self.lastTimestamp_cf3 = t
275 |         return factor
276 | 
277 |     #todo: add W3 for cf3
278 | 
279 |     #covariance approximation
280 |     cdef cov(self):
281 |         return self.CF3 / self.w3
282 | 
283 |     # Pearson corl. coef
284 |     cdef pcc(self):
285 |         cdef double ss
286 |         ss = self.incS1.std() * self.incS2.std()
287 |         if ss != 0:
288 |             return self.cov() / ss
289 |         else:
290 |             return 0
291 | 
292 |     def isRelated(self, str ID):
293 |         if self.incS1.ID == ID or self.incS2.ID == ID:
294 |             return True
295 |         else:
296 |             return False
297 | 
298 |     # calculates and pulls all correlative stats
299 |     cdef get_stats1(self):
300 |         return [self.cov(), self.pcc()]
301 | 
302 |     # calculates and pulls all correlative stats AND 2D stats from both streams (incStat)
303 |     cdef get_stats2(self):
304 |         return [self.incS1.radius([self.incS2]),self.incS1.magnitude([self.incS2]),self.cov(), self.pcc()]
305 | 
306 |     # calculates and pulls all correlative stats AND 2D stats AND the regular stats from both streams (incStat)
307 |     cdef get_stats3(self):
308 |         return [self.incS1.w,self.incS1.mean(),self.incS1.std(),self.incS2.w,self.incS2.mean(),self.incS2.std(),self.cov(), self.pcc()]
309 | 
310 |     # calculates and pulls all correlative stats AND the regular stats from both incStats AND 2D stats
311 |     cdef get_stats4(self):
312 |         return [self.incS1.w,self.incS1.mean(),self.incS1.std(),self.incS2.w,self.incS2.mean(),self.incS2.std(), self.incS1.radius([self.incS2]),self.incS1.magnitude([self.incS2]),self.cov(), self.pcc()]
313 | 
314 |     cdef getHeaders(self,int ver,int suffix=True): #ver = {1,2,3,4}
315 |         headers = []
316 |         s0 = "0"
317 |         s1 = "1"
318 |         if suffix:
319 |             s0 = self.incS1.ID
320 |             s1 = self.incS2.ID
321 | 
322 |         if ver == 1:
323 |             headers = ["covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1]
324 |         if ver == 2:
325 |             headers = ["radius_"+s0+"_"+s1, "magnitude_"+s0+"_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1]
326 |         if ver == 3:
327 |             headers = ["weight_"+s0, "mean_"+s0, "std_"+s0,"weight_"+s1, "mean_"+s1, "std_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1]
328 |         if ver == 4:
329 |             headers = ["weight_" + s0, "mean_" + s0, "std_" + s0, "covariance_" + s0 + "_" + s1, "pcc_" + s0 + "_" + s1]
330 |         if ver == 5:
331 |             headers = ["weight_"+s0, "mean_"+s0, "std_"+s0,"weight_"+s1, "mean_"+s1, "std_"+s1, "radius_"+s0+"_"+s1, "magnitude_"+s0+"_"+s1, "covariance_"+s0+"_"+s1, "pcc_"+s0+"_"+s1]
332 |         return headers
333 | 
334 | 
335 | cdef class incStatDB:
336 |     cdef double limit
337 |     cdef double df_lambda
338 |     cdef dict HT
339 | 
340 |     # default_lambda: use this as the lambda for all streams. If not specified, then you must supply a Lambda with every query.
341 |     def __init__(self,double limit=np.Inf,double default_lambda=np.nan):
342 |         self.HT = dict()
343 |         self.limit = limit
344 |         self.df_lambda = default_lambda
345 | 
346 |     cdef get_lambda(self,double Lambda):
347 |         if not np.isnan(self.df_lambda):
348 |             Lambda = self.df_lambda
349 |         return Lambda
350 | 
351 |     # Registers a new stream. init_time: init lastTimestamp of the incStat
352 |     def register(self,str ID,double Lambda=1,double init_time=0,int isTypeDiff=False):
353 |         #Default Lambda?
354 |         Lambda = self.get_lambda(Lambda)
355 | 
356 |         #Retrieve incStat
357 |         cdef str key
358 |         key = ID+"_"+str(Lambda)
359 | 
360 |         cdef incStat incS
361 |         incS = self.HT.get(key)
362 |         if incS is None: #does not already exist
363 |             if len(self.HT) + 1 > self.limit:
364 |                 raise LookupError(
365 |                     'Adding Entry:\n' + key + '\nwould exceed incStatHT 1D limit of ' + str(
366 |                         self.limit) + '.\nObservation Rejected.')
367 |             incS = incStat(Lambda, ID, init_time, isTypeDiff)
368 |             self.HT[key] = incS #add new entry
369 |         return incS
370 | 
371 |     # Registers covariance tracking for two streams, registers missing streams
372 |     def register_cov(self,str ID1, str ID2, double Lambda=1, double init_time=0, int isTypeDiff=False):
373 |         #Default Lambda?
374 |         Lambda = self.get_lambda(Lambda)
375 | 
376 |         # Lookup both streams
377 |         cdef incStat incS1
378 |         cdef incStat incS2
379 |         incS1 = self.register(ID1,Lambda,init_time,isTypeDiff)
380 |         incS2 = self.register(ID2,Lambda,init_time,isTypeDiff)
381 | 
382 |         #check for pre-exiting link
383 |         for cov in incS1.covs:
384 |             if cov.isRelated(ID2):
385 |                 return cov #there is a pre-exiting link
386 | 
387 |         # Link incStats
388 |         inc_cov = incStat_cov(incS1,incS2,init_time)
389 |         incS1.covs.append(inc_cov)
390 |         incS2.covs.append(inc_cov)
391 |         return inc_cov
392 | 
393 |     # updates/registers stream
394 |     def update(self,str ID,double t,double v,double Lambda=1,int isTypeDiff=False):
395 |         cdef incStat incS
396 |         incS = self.register(ID,Lambda,t,isTypeDiff)
397 |         incS.insert(v,t)
398 |         return incS
399 | 
400 |     # Pulls current stats from the given ID
401 |     def get_1D_Stats(self,str ID,double Lambda=1): #weight, mean, std
402 |         #Default Lambda?
403 |         Lambda = self.get_lambda(Lambda)
404 | 
405 |         #Get incStat
406 |         cdef incStat incS
407 |         incS = self.HT.get(ID+"_"+str(Lambda))
408 |         if incS is None:  # does not already exist
409 |             return [np.na]*3
410 |         else:
411 |             return incS.allstats_1D()
412 | 
413 |     # Pulls current correlational stats from the given IDs
414 |     def get_2D_Stats(self, str ID1, str ID2, double Lambda=1): #cov, pcc
415 |         # Default Lambda?
416 |         Lambda = self.get_lambda(Lambda)
417 | 
418 |         # Get incStat
419 |         cdef incStat incS
420 |         incS = self.HT.get(ID1 + "_" + str(Lambda))
421 |         if incS is None:  # does not exist
422 |             return [np.na]*2
423 | 
424 |         # find relevant cov entry
425 |         return incS.cov_pcc(ID2)
426 | 
427 |     # Pulls all correlational stats registered with the given ID
428 |     # returns tuple [0]: stats-covs&pccs, [2]: IDs
429 |     def get_all_2D_Stats(self, str ID, double Lambda=1):  # cov, pcc
430 |         # Default Lambda?
431 |         Lambda = self.get_lambda(Lambda)
432 | 
433 |         # Get incStat
434 |         cdef incStat incS1
435 |         incS1 = self.HT.get(ID + "_" + str(Lambda))
436 |         if incS1 is None:  # does not exist
437 |             return ([],[])
438 | 
439 |         # find relevant cov entry
440 |         stats = []
441 |         IDs = []
442 |         for cov in incS1.covs:
443 |             stats.append(cov.get_stats1())
444 |             IDs.append([cov.incS1.ID,cov.incS2.ID])
445 |         return stats,IDs
446 | 
447 |     # Pulls current multidimensional stats from the given IDs
448 |     def get_nD_Stats(self,IDs,double Lambda=1): #radius, magnitude (IDs is a list)
449 |         # Default Lambda?
450 |         Lambda = self.get_lambda(Lambda)
451 | 
452 |         # Get incStats
453 |         incStats = []
454 |         for ID in IDs:
455 |             incS = self.HT.get(ID + "_" + str(Lambda))
456 |             if incS is not None:  #exists
457 |                 incStats.append(incS)
458 | 
459 |         # Compute stats
460 |         cdef double rad, mag
461 |         rad = 0 #radius
462 |         mag = 0 #magnitude
463 |         for incS in incStats:
464 |             rad += incS.var()
465 |             mag += incS.mean()**2
466 | 
467 |         return [np.sqrt(rad),np.sqrt(mag)]
468 | 
469 |     # Updates and then pulls current 1D stats from the given ID. Automatically registers previously unknown stream IDs
470 |     def update_get_1D_Stats(self, str ID,double t, double v, double Lambda=1, int isTypeDiff=False):  # weight, mean, std
471 |         cdef incStat incS
472 |         incS = self.update(ID,t,v,Lambda,isTypeDiff)
473 |         return incS.allstats_1D()
474 | 
475 | 
476 |     # Updates and then pulls current correlative stats between the given IDs. Automatically registers previously unknown stream IDs, and cov tracking
477 |     #Note: AfterImage does not currently support Diff Type streams for correlational statistics.
478 |     def update_get_2D_Stats(self, str ID1, str ID2,double t1,double v1,double Lambda=1, int level=1):  #level=  1:cov,pcc  2:radius,magnitude,cov,pcc
479 |         #retrieve/add cov tracker
480 |         cdef incStat_cov inc_cov
481 |         inc_cov = self.register_cov(ID1, ID2, Lambda,  t1)
482 |         # Update cov tracker
483 |         inc_cov.update_cov(ID1,v1,t1)
484 |         if level == 1:
485 |             return inc_cov.get_stats1()
486 |         else:
487 |             return inc_cov.get_stats2()
488 | 
489 |     # Updates and then pulls current 1D and 2D stats from the given IDs. Automatically registers previously unknown stream IDs
490 |     def update_get_1D2D_Stats(self, str ID1, str ID2, double t1,double v1,double Lambda=1):  # weight, mean, std
491 |         return self.update_get_1D_Stats(ID1,t1,v1,Lambda) + self.update_get_2D_Stats(ID1,ID2,t1,v1,Lambda,level=2)
492 | 
493 |     def getHeaders_1D(self,Lambda=1,ID=''):
494 |         # Default Lambda?
495 |         cdef double L
496 |         L = Lambda
497 |         L = self.get_lambda(L)
498 |         hdrs = incStat(L,ID).getHeaders_1D(suffix=False)
499 |         return [str(L)+"_"+s for s in hdrs]
500 | 
501 |     def getHeaders_2D(self,Lambda=1,IDs=None, ver=1): #IDs is a 2-element list or tuple
502 |         # Default Lambda?
503 |         cdef double L
504 |         L = Lambda
505 |         L = self.get_lambda(L)
506 |         if IDs is None:
507 |             IDs = ['0','1']
508 |         hdrs = incStat_cov(incStat(L,IDs[0]),incStat(L,IDs[0]),L).getHeaders(ver,suffix=False)
509 |         return [str(Lambda)+"_"+s for s in hdrs]
510 | 
511 |     def getHeaders_1D2D(self,Lambda=1,IDs=None, ver=1):
512 |         # Default Lambda?
513 |         cdef double L
514 |         L = Lambda
515 |         L = self.get_lambda(L)
516 |         if IDs is None:
517 |             IDs = ['0','1']
518 |         hdrs1D = self.getHeaders_1D(L,IDs[0])
519 |         hdrs2D = self.getHeaders_2D(L,IDs, ver)
520 |         return hdrs1D + hdrs2D
521 | 
522 |     def getHeaders_nD(self,Lambda=1,IDs=[]): #IDs is a n-element list or tuple
523 |         # Default Lambda?
524 |         cdef double L
525 |         L = Lambda
526 |         ID = ":"
527 |         for s in IDs:
528 |             ID += "_"+s
529 |         L = self.get_lambda(L)
530 |         hdrs = ["radius"+ID, "magnitude"+ID]
531 |         return [str(L)+"_"+s for s in hdrs]
532 | 
533 | 
534 |     #cleans out records that have a weight less than the cutoff.
535 |     #returns number or removed records.
536 |     def cleanOutOldRecords(self,double cutoffWeight,double curTime):
537 |         cdef int n
538 |         cdef double W
539 |         n = 0
540 |         dump = sorted(self.HT.items(), key=lambda tup: tup[1][0].getMaxW(curTime))
541 |         for entry in dump:
542 |             entry[1][0].processDecay(curTime)
543 |             W = entry[1][0].w
544 |             if W <= cutoffWeight:
545 |                 key = entry[0]
546 |                 del entry[1][0]
547 |                 del self.HT[key]
548 |                 n=n+1
549 |             elif W > cutoffWeight:
550 |                 break
551 |         return n
552 | 
553 | 
554 | 
555 | 
556 | 
557 | class incHist:
558 |     #ubIsAnom means that the HBOS score for vals that fall past the upped bound are Inf (not 0)
559 |     def __init__(self,nbins,Lambda=0,ubIsAnom=True,lbIsAnom=True,lbound=-10,ubound=10,scaleGrace=None):
560 |         self.scaleGrace = scaleGrace #the numbe rof instances to observe until a range it determeined
561 |         if scaleGrace is not None:
562 |             self.lbound = np.Inf
563 |             self.ubound = -np.Inf
564 |             self.binSize = None
565 |             self.isScaling = True
566 |         else:
567 |             self.lbound = lbound
568 |             self.ubound = ubound
569 |             self.binSize = (ubound - lbound)/nbins
570 |             self.isScaling = False
571 |         self.nbins = nbins
572 |         self.ubIsAnom = ubIsAnom
573 |         self.lbIsAnom = lbIsAnom
574 |         self.n = 0
575 | 
576 |         self.Lambda = Lambda
577 |         self.W = np.zeros(nbins)
578 |         self.lT = np.zeros(nbins) #last timestamp of each respective bin
579 |         self.tallestBin = 0 #indx to the bin that currently has the largest freq weight (assumed...)
580 | 
581 |     #assumes even bin width starting from lbound until ubound. beyond bounds are assigned to the closest bin
582 |     def getBinIndx(self,val,win=0):
583 |         indx = int(np.floor((val - self.lbound)/self.binSize))
584 |         if win == 0:
585 |             if indx < 0:
586 |                 return -np.Inf
587 |             if indx > (self.nbins - 1):
588 |                 return np.Inf
589 |             return indx
590 |         else: #windowed Histogram
591 |             if indx - win < 0: #does the left of the window stick out of bounds?
592 |                 if indx + win >= 0: #if yes, then is there some overlap with inbounds?
593 |                     return range(0,indx+win+1) #return the inbounds range
594 |                 else: #then the entire window is our of bounds to the left
595 |                     return -np.Inf
596 |             if indx + win > self.nbins - 1: #does the right of the window stick out of bounds?
597 |                 if indx - win < self.nbins: #if yes, then is there some overlap with inbounds?
598 |                     return range(indx - win,self.nbins) #return the inbounds range
599 |                 else: #then the entire window is our of bounds to the right
600 |                     return np.Inf
601 |             return range(indx-win,indx+win+1)
602 | 
603 | 
604 |     def processDecay(self, bin, timestamp):
605 |         # check for decay
606 |         timeDiff = timestamp - self.lT[bin]
607 |         if np.isscalar(timeDiff):
608 |             if timeDiff > 0:
609 |                 factor = math.pow(2, (-self.Lambda * timeDiff))
610 |                 self.W[bin] = self.W[bin] * factor
611 |                 self.lT[bin] = timestamp
612 |         else: #array
613 |             timeDiff[timeDiff<0]=0 #don't affect decay of out of order entries
614 |             factor = np.power(2, (-self.Lambda * timeDiff))
615 |             #b4 = self.W[bin]
616 |             self.W[bin] = self.W[bin] * factor
617 |             self.lT[bin] = timestamp
618 | 
619 |     def insert(self,val,timestamp,penalty=False):
620 |         self.n = self.n + 1
621 |         if self.isScaling:
622 |             if self.n < self.scaleGrace:
623 |                 if self.lbound > val:
624 |                     self.lbound = val
625 |                 if self.ubound < val:
626 |                     self.ubound = val
627 |             if self.n == self.scaleGrace:
628 |                 if self.ubound == self.lbound:
629 |                     self.scaleGrace = self.scaleGrace + 1000
630 |                 else:
631 |                     width = self.ubound - self.lbound
632 |                     self.ubound = self.ubound + width
633 |                     self.lbound = self.lbound - width
634 |                     self.binSize = (self.ubound - self.lbound) / self.nbins
635 |                     self.isScaling = False
636 |         else:
637 |             bin = self.getBinIndx(val)
638 |             if not np.isinf(bin): #
639 |                 self.processDecay(bin, timestamp)
640 |                 if penalty:
641 |                     tallestW = self.W[self.tallestBin]
642 |                     scale = tallestW if tallestW > 0 else 1
643 |                     fn = self.W[bin]/scale
644 |                     inc = self.halfsigmoid(fn+0.005,-1.03)
645 |                 else:
646 |                     inc = 1
647 |                 self.W[bin] = self.W[bin] + inc
648 |                 #track who has the tallest bin (for normilization)
649 |                 if self.W[bin] > self.W[self.tallestBin]:
650 |                     self.tallestBin = bin
651 | 
652 |     def halfsigmoid(self,x,k):
653 |         return (k*x)/(k-x+1)
654 | 
655 |     def score(self,val,timestamp=-1,win=0): #HBOS for one dimension
656 |         if self.isScaling:
657 |             return 0.0
658 |         else:
659 |             bin = self.getBinIndx(val,win=win)
660 |             if np.isscalar(bin):
661 |                 if np.isinf(bin):
662 |                     if self.ubIsAnom and bin > 0:
663 |                         return np.Inf #it's an anomaly because it passes the upper bound
664 |                     elif self.lbIsAnom and bin < 0:
665 |                         return np.Inf  # it's an anomaly because it passes the lower bound
666 |                     else:
667 |                         return 0.0 #it fell outside a bound which is consedered not anomalous
668 |             self.processDecay(bin,timestamp) #if timestamp = -1, no decay will be applied
669 |             w = np.mean(self.W[bin])
670 |             if w == 0:
671 |                 return np.Inf  # no stat history, anomaly!
672 |             else:
673 |                 return np.log(self.W[self.tallestBin] / (w))  # log(  1/(  p/p_max  )    )
674 | 
675 | 
676 |     def getFreq(self,val,timestamp=-1): #HBOS for one dimension
677 |         bin = self.getBinIndx(val)
678 |         self.processDecay(bin,timestamp) #if timestamp = -1, no decay will be applied
679 |         if np.isinf(bin):
680 |             return np.nan
681 |         else:
682 |             return self.W[bin]
683 | 
684 |     def getHist(self,timestamp=-1): #HBOS for one dimension
685 |         H = np.zeros((len(self.W),1))
686 |         for i in range(0,len(self.W)):
687 |             self.processDecay(i,timestamp) #if timestamp = -1, no decay will be applied
688 |             H[i] = self.W[i]
689 |         H = H/np.sum(self.W)
690 |         return H
691 |     #
692 |     # def loadFromJSON(self,jsonstring):
693 |     #     return '' # !!!! very  important: all timestamps in self.lT should be updated so the decay won't wipe out the histogram:
694 |     #             # self.lT = self.lT + curtime - max(self.lT)
695 |     #             # this also applies to when the system.train setting is toggled to 'on'
696 | 
697 | from cpython cimport array
698 | 
699 | #import cython
700 | 
701 | cdef class Queue:
702 | 
703 |     cdef double[3] q
704 |     cdef int indx
705 |     cdef unsigned int n
706 | 
707 |     def __init__(self):
708 |         self.q[0] = self.q[1] = self.q[2] = 0
709 |         self.indx = 0
710 |         self.n = 0
711 | 
712 |     cdef insert(self,double v):
713 |         self.q[self.indx] = v
714 |         self.indx = (self.indx + 1) % 3
715 |         self.n += 1
716 | 
717 |     cdef unroll(self):
718 | 
719 |         cdef double[2] res
720 |         if self.n == 2:
721 |             res[0] = self.q[0]
722 |             res[1] = self.q[1]
723 |             return res
724 |         if self.indx == 0:
725 |             return self.q
726 | 
727 |         cdef double[3] res3
728 |         if self.indx == 1:
729 |             res3[0] = self.q[1]
730 |             res3[1] = self.q[2]
731 |             res3[2] = self.q[0]
732 |             return res3
733 |         else:
734 |             res3[0] = self.q[2]
735 |             res3[1] = self.q[0]
736 |             res3[2] = self.q[1]
737 |             return res3
738 | 
739 |     cdef get_last(self):
740 |         return self.q[(self.indx-1)%3]
741 | 
742 |     cdef get_mean_diff(self):
743 |         cdef double dif
744 |         dif = 0
745 |         if self.n == 2:
746 |             dif=self.q[self.indx%3] - self.q[(self.indx-1)%3]
747 |             return dif
748 |         else:
749 |             # for i in range(2):
750 |             #     dif+=self.q[(self.indx+i+1)%3] - self.q[(self.indx+i)%3]
751 |             dif= (self.q[self.indx%3] - self.q[(self.indx-1)%3]) + (self.q[(self.indx-1)%3] - self.q[(self.indx-2)%3])
752 |             return dif/2
753 | 
754 | cdef class extrapolator:
755 | 
756 |     cdef Queue Qt
757 |     cdef Queue Qv
758 | 
759 |     def __init__(self):#,int winsize=3):
760 |         self.Qt = Queue() #deque([],winsize) #window of timestamps
761 |         self.Qv = Queue() #deque([],winsize) #window of values
762 | 
763 |     def insert(self,double t, double v):
764 |         self.Qt.insert(t)
765 |         self.Qv.insert(v)
766 | 
767 |     def predict(self, double t):
768 |         if self.Qt.n < 2: #not enough points to extrapolate?
769 |             if self.Qt.n == 1:
770 |                 return self.Qv.get_last()
771 |             else:
772 |                 return 0
773 |         if (t - self.Qt.get_last())/(self.Qt.get_mean_diff() + 1e-10) > 10: # is the next timestamp 10 time further than the average sample interval?
774 |             return self.Qv.get_last() # prediction too far ahead (very likely that we will be way off)
775 |         cdef double yp
776 |         cdef array.array tm = array.array('d', self.Qt.unroll())
777 |         cdef array.array vm = array.array('d', self.Qv.unroll())
778 |         yp = self.interpolate(t,tm,vm)
779 |         return yp
780 |     #TODO: try cythonize lagrange
781 | 
782 | 
783 |     cdef interpolate(self, double tp, array.array tm, array.array ym):
784 |         cdef int n
785 |         n = len(tm) - 1
786 |         #cdef double[:] lagrpoly = np.array([self.lagrange(tp, i, tm) for i in range(n + 1)])
787 | 
788 |         cdef double y
789 |         for i in range(n +1):
790 |             """
791 |             Evaluate the i-th Lagrange polynomial at x
792 |             based on grid data xm
793 |             """
794 |             y = 1
795 |             for j in range(n + 1):
796 |                 if i != j:
797 |                     y *= (tp - tm[j]) / (tm[i] - tm[j] + 1e-20)
798 |             ym[i]*=y
799 | 
800 |         return sum(ym)
801 | 
802 | 


--------------------------------------------------------------------------------
/FeatureExtractor.py:
--------------------------------------------------------------------------------
  1 | #Check if cython code has been compiled
  2 | import os
  3 | import subprocess
  4 | 
  5 | use_extrapolation=False #experimental correlation code
  6 | if use_extrapolation:
  7 |     print("Importing AfterImage Cython Library")
  8 |     if not os.path.isfile("AfterImage.c"): #has not yet been compiled, so try to do so...
  9 |         cmd = "python setup.py build_ext --inplace"
 10 |         subprocess.call(cmd,shell=True)
 11 | #Import dependencies
 12 | import netStat as ns
 13 | import csv
 14 | import numpy as np
 15 | print("Importing Scapy Library")
 16 | from scapy.all import *
 17 | import os.path
 18 | import platform
 19 | import subprocess
 20 | 
 21 | 
 22 | #Extracts Kitsune features from given pcap file one packet at a time using "get_next_vector()"
 23 | # If wireshark is installed (tshark) it is used to parse (it's faster), otherwise, scapy is used (much slower).
 24 | # If wireshark is used then a tsv file (parsed version of the pcap) will be made -which you can use as your input next time
 25 | class FE:
 26 |     def __init__(self,file_path,limit=np.inf):
 27 |         self.path = file_path
 28 |         self.limit = limit
 29 |         self.parse_type = None #unknown
 30 |         self.curPacketIndx = 0
 31 |         self.tsvin = None #used for parsing TSV file
 32 |         self.scapyin = None #used for parsing pcap with scapy
 33 | 
 34 |         ### Prep pcap ##
 35 |         self.__prep__()
 36 | 
 37 |         ### Prep Feature extractor (AfterImage) ###
 38 |         maxHost = 100000000000
 39 |         maxSess = 100000000000
 40 |         self.nstat = ns.netStat(np.nan, maxHost, maxSess)
 41 | 
 42 |     def _get_tshark_path(self):
 43 |         if platform.system() == 'Windows':
 44 |             return 'C:\Program Files\Wireshark\\tshark.exe'
 45 |         else:
 46 |             system_path = os.environ['PATH']
 47 |             for path in system_path.split(os.pathsep):
 48 |                 filename = os.path.join(path, 'tshark')
 49 |                 if os.path.isfile(filename):
 50 |                     return filename
 51 |         return ''
 52 | 
 53 |     def __prep__(self):
 54 |         ### Find file: ###
 55 |         if not os.path.isfile(self.path):  # file does not exist
 56 |             print("File: " + self.path + " does not exist")
 57 |             raise Exception()
 58 | 
 59 |         ### check file type ###
 60 |         type = self.path.split('.')[-1]
 61 | 
 62 |         self._tshark = self._get_tshark_path()
 63 |         ##If file is TSV (pre-parsed by wireshark script)
 64 |         if type == "tsv":
 65 |             self.parse_type = "tsv"
 66 | 
 67 |         ##If file is pcap
 68 |         elif type == "pcap" or type == 'pcapng':
 69 |             # Try parsing via tshark dll of wireshark (faster)
 70 |             if os.path.isfile(self._tshark):
 71 |                 self.pcap2tsv_with_tshark()  # creates local tsv file
 72 |                 self.path += ".tsv"
 73 |                 self.parse_type = "tsv"
 74 |             else: # Otherwise, parse with scapy (slower)
 75 |                 print("tshark not found. Trying scapy...")
 76 |                 self.parse_type = "scapy"
 77 |         else:
 78 |             print("File: " + self.path + " is not a tsv or pcap file")
 79 |             raise Exception()
 80 | 
 81 |         ### open readers ##
 82 |         if self.parse_type == "tsv":
 83 |             maxInt = sys.maxsize
 84 |             decrement = True
 85 |             while decrement:
 86 |                 # decrease the maxInt value by factor 10
 87 |                 # as long as the OverflowError occurs.
 88 |                 decrement = False
 89 |                 try:
 90 |                     csv.field_size_limit(maxInt)
 91 |                 except OverflowError:
 92 |                     maxInt = int(maxInt / 10)
 93 |                     decrement = True
 94 | 
 95 |             print("counting lines in file...")
 96 |             num_lines = sum(1 for line in open(self.path))
 97 |             print("There are " + str(num_lines) + " Packets.")
 98 |             self.limit = min(self.limit, num_lines-1)
 99 |             self.tsvinf = open(self.path, 'rt', encoding="utf8")
100 |             self.tsvin = csv.reader(self.tsvinf, delimiter='\t')
101 |             row = self.tsvin.__next__() #move iterator past header
102 | 
103 |         else: # scapy
104 |             print("Reading PCAP file via Scapy...")
105 |             self.scapyin = rdpcap(self.path)
106 |             self.limit = len(self.scapyin)
107 |             print("Loaded " + str(len(self.scapyin)) + " Packets.")
108 | 
109 |     def get_next_vector(self):
110 |         if self.curPacketIndx == self.limit:
111 |             if self.parse_type == 'tsv':
112 |                 self.tsvinf.close()
113 |             return []
114 | 
115 |         ### Parse next packet ###
116 |         if self.parse_type == "tsv":
117 |             row = self.tsvin.__next__()
118 |             IPtype = np.nan
119 |             timestamp = row[0]
120 |             framelen = row[1]
121 |             srcIP = ''
122 |             dstIP = ''
123 |             if row[4] != '':  # IPv4
124 |                 srcIP = row[4]
125 |                 dstIP = row[5]
126 |                 IPtype = 0
127 |             elif row[17] != '':  # ipv6
128 |                 srcIP = row[17]
129 |                 dstIP = row[18]
130 |                 IPtype = 1
131 |             srcproto = row[6] + row[
132 |                 8]  # UDP or TCP port: the concatenation of the two port strings will will results in an OR "[tcp|udp]"
133 |             dstproto = row[7] + row[9]  # UDP or TCP port
134 |             srcMAC = row[2]
135 |             dstMAC = row[3]
136 |             if srcproto == '':  # it's a L2/L1 level protocol
137 |                 if row[12] != '':  # is ARP
138 |                     srcproto = 'arp'
139 |                     dstproto = 'arp'
140 |                     srcIP = row[14]  # src IP (ARP)
141 |                     dstIP = row[16]  # dst IP (ARP)
142 |                     IPtype = 0
143 |                 elif row[10] != '':  # is ICMP
144 |                     srcproto = 'icmp'
145 |                     dstproto = 'icmp'
146 |                     IPtype = 0
147 |                 elif srcIP + srcproto + dstIP + dstproto == '':  # some other protocol
148 |                     srcIP = row[2]  # src MAC
149 |                     dstIP = row[3]  # dst MAC
150 | 
151 |         elif self.parse_type == "scapy":
152 |             packet = self.scapyin[self.curPacketIndx]
153 |             IPtype = np.nan
154 |             timestamp = packet.time
155 |             framelen = len(packet)
156 |             if packet.haslayer(IP):  # IPv4
157 |                 srcIP = packet[IP].src
158 |                 dstIP = packet[IP].dst
159 |                 IPtype = 0
160 |             elif packet.haslayer(IPv6):  # ipv6
161 |                 srcIP = packet[IPv6].src
162 |                 dstIP = packet[IPv6].dst
163 |                 IPtype = 1
164 |             else:
165 |                 srcIP = ''
166 |                 dstIP = ''
167 | 
168 |             if packet.haslayer(TCP):
169 |                 srcproto = str(packet[TCP].sport)
170 |                 dstproto = str(packet[TCP].dport)
171 |             elif packet.haslayer(UDP):
172 |                 srcproto = str(packet[UDP].sport)
173 |                 dstproto = str(packet[UDP].dport)
174 |             else:
175 |                 srcproto = ''
176 |                 dstproto = ''
177 | 
178 |             srcMAC = packet.src
179 |             dstMAC = packet.dst
180 |             if srcproto == '':  # it's a L2/L1 level protocol
181 |                 if packet.haslayer(ARP):  # is ARP
182 |                     srcproto = 'arp'
183 |                     dstproto = 'arp'
184 |                     srcIP = packet[ARP].psrc  # src IP (ARP)
185 |                     dstIP = packet[ARP].pdst  # dst IP (ARP)
186 |                     IPtype = 0
187 |                 elif packet.haslayer(ICMP):  # is ICMP
188 |                     srcproto = 'icmp'
189 |                     dstproto = 'icmp'
190 |                     IPtype = 0
191 |                 elif srcIP + srcproto + dstIP + dstproto == '':  # some other protocol
192 |                     srcIP = packet.src  # src MAC
193 |                     dstIP = packet.dst  # dst MAC
194 |         else:
195 |             return []
196 | 
197 |         self.curPacketIndx = self.curPacketIndx + 1
198 | 
199 | 
200 |         ### Extract Features
201 |         try:
202 |             return self.nstat.updateGetStats(IPtype, srcMAC, dstMAC, srcIP, srcproto, dstIP, dstproto,
203 |                                                  int(framelen),
204 |                                                  float(timestamp))
205 |         except Exception as e:
206 |             print(e)
207 |             return []
208 | 
209 | 
210 |     def pcap2tsv_with_tshark(self):
211 |         print('Parsing with tshark...')
212 |         fields = "-e frame.time_epoch -e frame.len -e eth.src -e eth.dst -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e icmp.type -e icmp.code -e arp.opcode -e arp.src.hw_mac -e arp.src.proto_ipv4 -e arp.dst.hw_mac -e arp.dst.proto_ipv4 -e ipv6.src -e ipv6.dst"
213 |         cmd =  '"' + self._tshark + '" -r '+ self.path +' -T fields '+ fields +' -E header=y -E occurrence=f > '+self.path+".tsv"
214 |         subprocess.call(cmd,shell=True)
215 |         print("tshark parsing complete. File saved as: "+self.path +".tsv")
216 | 
217 |     def get_num_features(self):
218 |         return len(self.nstat.getNetStatHeaders())
219 | 


--------------------------------------------------------------------------------
/KitNET/KitNET.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import KitNET.dA as AE
  3 | import KitNET.corClust as CC
  4 | 
  5 | # This class represents a KitNET machine learner.
  6 | # KitNET is a lightweight online anomaly detection algorithm based on an ensemble of autoencoders.
  7 | # For more information and citation, please see our NDSS'18 paper: Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection
  8 | # For licensing information, see the end of this document
  9 | 
 10 | class KitNET:
 11 |     #n: the number of features in your input dataset (i.e., x \in R^n)
 12 |     #m: the maximum size of any autoencoder in the ensemble layer
 13 |     #AD_grace_period: the number of instances the network will learn from before producing anomaly scores
 14 |     #FM_grace_period: the number of instances which will be taken to learn the feature mapping. If 'None', then FM_grace_period=AM_grace_period
 15 |     #learning_rate: the default stochastic gradient descent learning rate for all autoencoders in the KitNET instance.
 16 |     #hidden_ratio: the default ratio of hidden to visible neurons. E.g., 0.75 will cause roughly a 25% compression in the hidden layer.
 17 |     #feature_map: One may optionally provide a feature map instead of learning one. The map must be a list,
 18 |     #           where the i-th entry contains a list of the feature indices to be assingned to the i-th autoencoder in the ensemble.
 19 |     #           For example, [[2,5,3],[4,0,1],[6,7]]
 20 |     def __init__(self,n,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75, feature_map = None):
 21 |         # Parameters:
 22 |         self.AD_grace_period = AD_grace_period
 23 |         if FM_grace_period is None:
 24 |             self.FM_grace_period = AD_grace_period
 25 |         else:
 26 |             self.FM_grace_period = FM_grace_period
 27 |         if max_autoencoder_size <= 0:
 28 |             self.m = 1
 29 |         else:
 30 |             self.m = max_autoencoder_size
 31 |         self.lr = learning_rate
 32 |         self.hr = hidden_ratio
 33 |         self.n = n
 34 | 
 35 |         # Variables
 36 |         self.n_trained = 0 # the number of training instances so far
 37 |         self.n_executed = 0 # the number of executed instances so far
 38 |         self.v = feature_map
 39 |         if self.v is None:
 40 |             print("Feature-Mapper: train-mode, Anomaly-Detector: off-mode")
 41 |         else:
 42 |             self.__createAD__()
 43 |             print("Feature-Mapper: execute-mode, Anomaly-Detector: train-mode")
 44 |         self.FM = CC.corClust(self.n) #incremental feature cluatering for the feature mapping process
 45 |         self.ensembleLayer = []
 46 |         self.outputLayer = None
 47 | 
 48 |     #If FM_grace_period+AM_grace_period has passed, then this function executes KitNET on x. Otherwise, this function learns from x.
 49 |     #x: a numpy array of length n
 50 |     #Note: KitNET automatically performs 0-1 normalization on all attributes.
 51 |     def process(self,x):
 52 |         if self.n_trained > self.FM_grace_period + self.AD_grace_period: #If both the FM and AD are in execute-mode
 53 |             return self.execute(x)
 54 |         else:
 55 |             self.train(x)
 56 |             return 0.0
 57 | 
 58 |     #force train KitNET on x
 59 |     #returns the anomaly score of x during training (do not use for alerting)
 60 |     def train(self,x):
 61 |         if self.n_trained <= self.FM_grace_period and self.v is None: #If the FM is in train-mode, and the user has not supplied a feature mapping
 62 |             #update the incremetnal correlation matrix
 63 |             self.FM.update(x)
 64 |             if self.n_trained == self.FM_grace_period: #If the feature mapping should be instantiated
 65 |                 self.v = self.FM.cluster(self.m)
 66 |                 self.__createAD__()
 67 |                 print("The Feature-Mapper found a mapping: "+str(self.n)+" features to "+str(len(self.v))+" autoencoders.")
 68 |                 print("Feature-Mapper: execute-mode, Anomaly-Detector: train-mode")
 69 |         else: #train
 70 |             ## Ensemble Layer
 71 |             S_l1 = np.zeros(len(self.ensembleLayer))
 72 |             for a in range(len(self.ensembleLayer)):
 73 |                 # make sub instance for autoencoder 'a'
 74 |                 xi = x[self.v[a]]
 75 |                 S_l1[a] = self.ensembleLayer[a].train(xi)
 76 |             ## OutputLayer
 77 |             self.outputLayer.train(S_l1)
 78 |             if self.n_trained == self.AD_grace_period+self.FM_grace_period:
 79 |                 print("Feature-Mapper: execute-mode, Anomaly-Detector: execute-mode")
 80 |         self.n_trained += 1
 81 | 
 82 |     #force execute KitNET on x
 83 |     def execute(self,x):
 84 |         if self.v is None:
 85 |             raise RuntimeError('KitNET Cannot execute x, because a feature mapping has not yet been learned or provided. Try running process(x) instead.')
 86 |         else:
 87 |             self.n_executed += 1
 88 |             ## Ensemble Layer
 89 |             S_l1 = np.zeros(len(self.ensembleLayer))
 90 |             for a in range(len(self.ensembleLayer)):
 91 |                 # make sub inst
 92 |                 xi = x[self.v[a]]
 93 |                 S_l1[a] = self.ensembleLayer[a].execute(xi)
 94 |             ## OutputLayer
 95 |             return self.outputLayer.execute(S_l1)
 96 | 
 97 |     def __createAD__(self):
 98 |         # construct ensemble layer
 99 |         for map in self.v:
100 |             params = AE.dA_params(n_visible=len(map), n_hidden=0, lr=self.lr, corruption_level=0, gracePeriod=0, hiddenRatio=self.hr)
101 |             self.ensembleLayer.append(AE.dA(params))
102 | 
103 |         # construct output layer
104 |         params = AE.dA_params(len(self.v), n_hidden=0, lr=self.lr, corruption_level=0, gracePeriod=0, hiddenRatio=self.hr)
105 |         self.outputLayer = AE.dA(params)
106 | 
107 | # Copyright (c) 2017 Yisroel Mirsky
108 | #
109 | # MIT License
110 | #
111 | # Permission is hereby granted, free of charge, to any person obtaining
112 | # a copy of this software and associated documentation files (the
113 | # "Software"), to deal in the Software without restriction, including
114 | # without limitation the rights to use, copy, modify, merge, publish,
115 | # distribute, sublicense, and/or sell copies of the Software, and to
116 | # permit persons to whom the Software is furnished to do so, subject to
117 | # the following conditions:
118 | #
119 | # The above copyright notice and this permission notice shall be
120 | # included in all copies or substantial portions of the Software.
121 | #
122 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
123 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
124 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
125 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
126 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
127 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
128 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/KitNET/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Yisroel Mirsky
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/KitNET/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["corClust", "dA", "KitNET","utils"]


--------------------------------------------------------------------------------
/KitNET/corClust.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from scipy.cluster.hierarchy import linkage, fcluster, to_tree
 3 | 
 4 | # A helper class for KitNET which performs a correlation-based incremental clustering of the dimensions in X
 5 | # n: the number of dimensions in the dataset
 6 | # For more information and citation, please see our NDSS'18 paper: Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection
 7 | class corClust:
 8 |     def __init__(self,n):
 9 |         #parameter:
10 |         self.n = n
11 |         #varaibles
12 |         self.c = np.zeros(n) #linear num of features
13 |         self.c_r = np.zeros(n) #linear sum of feature residules
14 |         self.c_rs = np.zeros(n) #linear sum of feature residules
15 |         self.C = np.zeros((n,n)) #partial correlation matrix
16 |         self.N = 0 #number of updates performed
17 | 
18 |     # x: a numpy vector of length n
19 |     def update(self,x):
20 |         self.N += 1
21 |         self.c += x
22 |         c_rt = x - self.c/self.N
23 |         self.c_r += c_rt
24 |         self.c_rs += c_rt**2
25 |         self.C += np.outer(c_rt,c_rt)
26 | 
27 |     # creates the current correlation distance matrix between the features
28 |     def corrDist(self):
29 |         c_rs_sqrt = np.sqrt(self.c_rs)
30 |         C_rs_sqrt = np.outer(c_rs_sqrt,c_rs_sqrt)
31 |         C_rs_sqrt[C_rs_sqrt==0] = 1e-100 #this protects against dive by zero erros (occurs when a feature is a constant)
32 |         D = 1-self.C/C_rs_sqrt #the correlation distance matrix
33 |         D[D<0] = 0 #small negatives may appear due to the incremental fashion in which we update the mean. Therefore, we 'fix' them
34 |         return D
35 | 
36 |     # clusters the features together, having no more than maxClust features per cluster
37 |     def cluster(self,maxClust):
38 |         D = self.corrDist()
39 |         Z = linkage(D[np.triu_indices(self.n, 1)])  # create a linkage matrix based on the distance matrix
40 |         if maxClust < 1:
41 |             maxClust = 1
42 |         if maxClust > self.n:
43 |             maxClust = self.n
44 |         map = self.__breakClust__(to_tree(Z),maxClust)
45 |         return map
46 | 
47 |     # a recursive helper function which breaks down the dendrogram branches until all clusters have no more than maxClust elements
48 |     def __breakClust__(self,dendro,maxClust):
49 |         if dendro.count <= maxClust: #base case: we found a minimal cluster, so mark it
50 |             return [dendro.pre_order()] #return the origional ids of the features in this cluster
51 |         return self.__breakClust__(dendro.get_left(),maxClust) + self.__breakClust__(dendro.get_right(),maxClust)
52 | 
53 | # Copyright (c) 2017 Yisroel Mirsky
54 | #
55 | # MIT License
56 | #
57 | # Permission is hereby granted, free of charge, to any person obtaining
58 | # a copy of this software and associated documentation files (the
59 | # "Software"), to deal in the Software without restriction, including
60 | # without limitation the rights to use, copy, modify, merge, publish,
61 | # distribute, sublicense, and/or sell copies of the Software, and to
62 | # permit persons to whom the Software is furnished to do so, subject to
63 | # the following conditions:
64 | #
65 | # The above copyright notice and this permission notice shall be
66 | # included in all copies or substantial portions of the Software.
67 | #
68 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
69 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
70 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
71 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
72 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
73 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
74 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/KitNET/dA.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2017 Yusuke Sugomori
  2 | #
  3 | # MIT License
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining
  6 | # a copy of this software and associated documentation files (the
  7 | # "Software"), to deal in the Software without restriction, including
  8 | # without limitation the rights to use, copy, modify, merge, publish,
  9 | # distribute, sublicense, and/or sell copies of the Software, and to
 10 | # permit persons to whom the Software is furnished to do so, subject to
 11 | # the following conditions:
 12 | #
 13 | # The above copyright notice and this permission notice shall be
 14 | # included in all copies or substantial portions of the Software.
 15 | #
 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 23 | 
 24 | # Portions of this code have been adapted from Yusuke Sugomori's code on GitHub: https://github.com/yusugomori/DeepLearning
 25 | 
 26 | import sys
 27 | import numpy
 28 | from KitNET.utils import *
 29 | import json
 30 | 
 31 | class dA_params:
 32 |     def __init__(self,n_visible = 5, n_hidden = 3, lr=0.001, corruption_level=0.0, gracePeriod = 10000, hiddenRatio=None):
 33 |         self.n_visible = n_visible# num of units in visible (input) layer
 34 |         self.n_hidden = n_hidden# num of units in hidden layer
 35 |         self.lr = lr
 36 |         self.corruption_level = corruption_level
 37 |         self.gracePeriod = gracePeriod
 38 |         self.hiddenRatio = hiddenRatio
 39 | 
 40 | class dA:
 41 |     def __init__(self, params):
 42 |         self.params = params
 43 | 
 44 |         if self.params.hiddenRatio is not None:
 45 |             self.params.n_hidden = int(numpy.ceil(self.params.n_visible*self.params.hiddenRatio))
 46 | 
 47 |         # for 0-1 normlaization
 48 |         self.norm_max = numpy.ones((self.params.n_visible,)) * -numpy.Inf
 49 |         self.norm_min = numpy.ones((self.params.n_visible,)) * numpy.Inf
 50 |         self.n = 0
 51 | 
 52 |         self.rng = numpy.random.RandomState(1234)
 53 | 
 54 |         a = 1. / self.params.n_visible
 55 |         self.W = numpy.array(self.rng.uniform(  # initialize W uniformly
 56 |             low=-a,
 57 |             high=a,
 58 |             size=(self.params.n_visible, self.params.n_hidden)))
 59 | 
 60 |         self.hbias = numpy.zeros(self.params.n_hidden)  # initialize h bias 0
 61 |         self.vbias = numpy.zeros(self.params.n_visible)  # initialize v bias 0
 62 |         self.W_prime = self.W.T
 63 | 
 64 | 
 65 |     def get_corrupted_input(self, input, corruption_level):
 66 |         assert corruption_level < 1
 67 | 
 68 |         return self.rng.binomial(size=input.shape,
 69 |                                  n=1,
 70 |                                  p=1 - corruption_level) * input
 71 | 
 72 |     # Encode
 73 |     def get_hidden_values(self, input):
 74 |         return sigmoid(numpy.dot(input, self.W) + self.hbias)
 75 | 
 76 |     # Decode
 77 |     def get_reconstructed_input(self, hidden):
 78 |         return sigmoid(numpy.dot(hidden, self.W_prime) + self.vbias)
 79 | 
 80 |     def train(self, x):
 81 |         self.n = self.n + 1
 82 |         # update norms
 83 |         self.norm_max[x > self.norm_max] = x[x > self.norm_max]
 84 |         self.norm_min[x < self.norm_min] = x[x < self.norm_min]
 85 | 
 86 |         # 0-1 normalize
 87 |         x = (x - self.norm_min) / (self.norm_max - self.norm_min + 0.0000000000000001)
 88 | 
 89 |         if self.params.corruption_level > 0.0:
 90 |             tilde_x = self.get_corrupted_input(x, self.params.corruption_level)
 91 |         else:
 92 |             tilde_x = x
 93 |         y = self.get_hidden_values(tilde_x)
 94 |         z = self.get_reconstructed_input(y)
 95 | 
 96 |         L_h2 = x - z
 97 |         L_h1 = numpy.dot(L_h2, self.W) * y * (1 - y)
 98 | 
 99 |         L_vbias = L_h2
100 |         L_hbias = L_h1
101 |         L_W = numpy.outer(tilde_x.T, L_h1) + numpy.outer(L_h2.T, y)
102 | 
103 |         self.W += self.params.lr * L_W
104 |         self.hbias += self.params.lr * L_hbias
105 |         self.vbias += self.params.lr * L_vbias
106 |         return numpy.sqrt(numpy.mean(L_h2**2)) #the RMSE reconstruction error during training
107 | 
108 | 
109 |     def reconstruct(self, x):
110 |         y = self.get_hidden_values(x)
111 |         z = self.get_reconstructed_input(y)
112 |         return z
113 | 
114 |     def execute(self, x): #returns MSE of the reconstruction of x
115 |         if self.n < self.params.gracePeriod:
116 |             return 0.0
117 |         else:
118 |             # 0-1 normalize
119 |             x = (x - self.norm_min) / (self.norm_max - self.norm_min + 0.0000000000000001)
120 |             z = self.reconstruct(x)
121 |             rmse = numpy.sqrt(((x - z) ** 2).mean()) #MSE
122 |             return rmse
123 | 
124 | 
125 |     def inGrace(self):
126 |         return self.n < self.params.gracePeriod
127 | 


--------------------------------------------------------------------------------
/KitNET/utils.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy
 3 | from scipy.stats import norm
 4 | numpy.seterr(all='ignore')
 5 | 
 6 | def pdf(x,mu,sigma): #normal distribution pdf
 7 |     x = (x-mu)/sigma
 8 |     return numpy.exp(-x**2/2)/(numpy.sqrt(2*numpy.pi)*sigma)
 9 | 
10 | def invLogCDF(x,mu,sigma): #normal distribution cdf
11 |     x = (x - mu) / sigma
12 |     return norm.logcdf(-x) #note: we mutiple by -1 after normalization to better get the 1-cdf
13 | 
14 | def sigmoid(x):
15 |     return 1. / (1 + numpy.exp(-x))
16 | 
17 | 
18 | def dsigmoid(x):
19 |     return x * (1. - x)
20 | 
21 | def tanh(x):
22 |     return numpy.tanh(x)
23 | 
24 | def dtanh(x):
25 |     return 1. - x * x
26 | 
27 | def softmax(x):
28 |     e = numpy.exp(x - numpy.max(x))  # prevent overflow
29 |     if e.ndim == 1:
30 |         return e / numpy.sum(e, axis=0)
31 |     else:  
32 |         return e / numpy.array([numpy.sum(e, axis=1)]).T  # ndim = 2
33 | 
34 | 
35 | def ReLU(x):
36 |     return x * (x > 0)
37 | 
38 | def dReLU(x):
39 |     return 1. * (x > 0)
40 | 
41 | class rollmean:
42 |     def __init__(self,k):
43 |         self.winsize = k
44 |         self.window = numpy.zeros(self.winsize)
45 |         self.pointer = 0
46 | 
47 |     def apply(self,newval):
48 |         self.window[self.pointer]=newval
49 |         self.pointer = (self.pointer+1) % self.winsize
50 |         return numpy.mean(self.window)
51 | 
52 | # probability density for the Gaussian dist
53 | # def gaussian(x, mean=0.0, scale=1.0):
54 | #     s = 2 * numpy.power(scale, 2)
55 | #     e = numpy.exp( - numpy.power((x - mean), 2) / s )
56 | 
57 | #     return e / numpy.square(numpy.pi * s)
58 | 
59 | 


--------------------------------------------------------------------------------
/Kitsune paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ymirsky/Kitsune-py/28a654b5813936380d264c0934136efda672174a/Kitsune paper.pdf


--------------------------------------------------------------------------------
/Kitsune.py:
--------------------------------------------------------------------------------
 1 | from FeatureExtractor import *
 2 | from KitNET.KitNET import KitNET
 3 | 
 4 | # MIT License
 5 | #
 6 | # Copyright (c) 2018 Yisroel mirsky
 7 | #
 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in all
16 | # copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | # SOFTWARE.
25 | 
26 | class Kitsune:
27 |     def __init__(self,file_path,limit,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75,):
28 |         #init packet feature extractor (AfterImage)
29 |         self.FE = FE(file_path,limit)
30 | 
31 |         #init Kitnet
32 |         self.AnomDetector = KitNET(self.FE.get_num_features(),max_autoencoder_size,FM_grace_period,AD_grace_period,learning_rate,hidden_ratio)
33 | 
34 |     def proc_next_packet(self):
35 |         # create feature vector
36 |         x = self.FE.get_next_vector()
37 |         if len(x) == 0:
38 |             return -1 #Error or no packets left
39 | 
40 |         # process KitNET
41 |         return self.AnomDetector.process(x)  # will train during the grace periods, then execute on all the rest.
42 | 
43 | 


--------------------------------------------------------------------------------
/Kitsune_fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ymirsky/Kitsune-py/28a654b5813936380d264c0934136efda672174a/Kitsune_fig.png


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Yisroel mirsky
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Overview
  2 | In this repository you will find a Python implementation of Kitsune; an online network intrusion detection system, based on an ensemble of autoencoders. From,
  3 | 
  4 | *Yisroel Mirsky, Tomer Doitshman, Yuval Elovici, and Asaf Shabtai, "Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection", Network and Distributed System Security Symposium 2018 (NDSS'18)*
  5 | 
  6 | # What is Kitsune?
  7 | 
  8 | Neural networks have become an increasingly popular solution for network intrusion detection systems (NIDS). Their capability of learning complex patterns and behaviors make them a suitable solution for differentiating between normal traffic and network attacks. However, a drawback of neural networks is the amount of resources needed to train them. Many network gateways and routers devices, which could potentially host an NIDS, simply do not have the memory or processing power to train and sometimes even execute such models. More importantly, the existing neural network solutions are trained in a supervised manner. Meaning that an expert must label the network traffic and update the model manually from time to time.
  9 | 
 10 | Kitsune is a novel ANN-based NIDS which is online, unsupervised, and efficient. A Kitsune, in Japanese folklore, is a mythical fox-like creature that has a number of tails, can mimic different forms, and whose strength increases with experience. Similarly, Kitsune has an ensemble of small neural networks (autoencoders), which are trained to mimic (reconstruct) network traffic patterns, and whose performance incrementally improves overtime. 
 11 | 	
 12 | The architecture of Kitsune is illustrated in the figure below:
 13 | * First, a feature extraction framework called *AfterImage* efficiently tracks the patterns of every network channel using damped incremental statisitcs, and extracts a feature vector for each packet. The vector captures the temporal context of the packet's channel and sender. 
 14 | * Next, the features are mapped to the visible neurons of an ensemble of autoenoders (*KitNET* https://github.com/ymirsky/KitNET-py). 
 15 | * Then, each autoencoder attempts to reconstruct the instance's features, and computes the reconstruction error in terms of root mean squared errors (RMSE). 
 16 | * Finally, the RMSEs are forwarded to an output autoencoder, which acts as a non-linear voting mechanism for the ensemble. 
 17 | 
 18 | We note that while training \textbf{Kitsune}, no more than one instance is stored in memory at a time. Kitsune has one main parameter, which is the maximum number of inputs for any given autoencoder in the ensemble. This parameter is used to increase the algorithm's speed with a modest trade off in detection performance.
 19 | 	
 20 | ![An illustration of Kitsune's architecture](https://raw.githubusercontent.com/ymirsky/Kitsune-py/master/Kitsune_fig.png)
 21 | 
 22 | 	
 23 | Some points about KitNET:
 24 | * It is completely plug-and-play.
 25 | * It is based on an unsupervised machine learning algorithm (it does not need label, just train it on *normal* data!)
 26 | * Its efficiency can be scaled with its input parameter m: the maximal size of any autoencoder in the ensemble layer (smaller autoencoders are exponentially cheaper to train and execute)
 27 | 
 28 | # Implimentation Notes: 
 29 | 
 30 | * This python implimentation of Kitsune is **is not optimal** in terms of speed. To make Kitsune run as fast as described in the paper, the entire project must be cythonized, or implimented in C++
 31 | * For an experimental AfterImage version, change the import line in netStat.py to use AfterImage_extrapolate.py, and change line 5 of FeatureExtractor.py to True (uses cython). This version uses Lagrange-based Polynomial extrapolation to assit in computing the correlation based features.
 32 | * We also require the scapy library for parsing (tshark [Wireshark] is default).
 33 | * The source code has been tested with Anaconda 3.6.3 on a Windows 10 64bit machine.
 34 | 
 35 | To install scapy, run in the terminal:
 36 | ```
 37 | pip install scapy
 38 | ```
 39 |  
 40 | 
 41 | 
 42 | # Using The Code
 43 | Here is a simple example of how to make a Kitsune object:
 44 | ```
 45 | from Kitsune import *
 46 | 
 47 | 
 48 | # KitNET params:
 49 | maxAE = 10 #maximum size for any autoencoder in the ensemble layer
 50 | FMgrace = 5000 #the number of instances taken to learn the feature mapping (the ensemble's architecture)
 51 | ADgrace = 50000 #the number of instances used to train the anomaly detector (ensemble itself)
 52 | packet_limit = np.Inf #the number of packets from the input file to process
 53 | path = "../../captured.pcap" #the pcap, pcapng, or tsv file which you wish to process.
 54 | 
 55 | # Build Kitsune
 56 | K = Kitsune(path,packet_limit,maxAE,FMgrace,ADgrace)
 57 | ```
 58 | 
 59 | You can also configure the learning rate and hidden layer's neuron ratio via Kitsune's contructor.
 60 | 
 61 | The input file can be any pcap network capture. When the object is created, the code check whether or not you have tshark (Wireshark) installed. If you do, then it uses tshark to parse the pcap into a tsv file which is saved to disk locally. This file is then later used when running Kitnet. You can also load this tsv file instead of the origional pcap to save time. Note that we currently only look for tshark in the Windows directory "C:\Program Files\Wireshark\tshark.exe"
 62 | 
 63 | If tshark is not found, then the scapy packet parsing library is used. Scapy is significatly slower than using wireshark/tsv...
 64 | 
 65 | To use the Kitsune object, simply tell Kitsune to process the next packet. After processing a packet, Kitsune returns the RMSE value of the packet (zero during the FM featuremapping and AD grace periods).
 66 | 
 67 | Here is an example usage of the Kitsune object:
 68 | ```
 69 | while True: 
 70 |     rmse = K.proc_next_packet() #will train during the grace periods, then execute on all the rest.
 71 |     if rmse == -1:
 72 |         break
 73 |     print(rmse)
 74 | ```
 75 | 
 76 | 
 77 | # Demo Code
 78 | As a quick start, a demo script is provided in example.py. In the demo, we run Kitsune on a network capture of the Mirai malware. You can either run it directly or enter the following into your python console
 79 | ```
 80 | import example.py
 81 | ```
 82 | 
 83 | 
 84 | The code was written and with the Python environment Anaconda: https://anaconda.org/anaconda/python
 85 | For significant speedups, as shown in our paper, you must implement Kitsune in C++, or entirely using cython.
 86 | 
 87 | # Full Datasets
 88 | The full datasets used in our NDSS paper can be found by following this google drive link:
 89 | https://goo.gl/iShM7E
 90 | 
 91 | # License
 92 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details
 93 | 
 94 | 
 95 | # Citations
 96 | If you use the source code, the datasets, or implement KitNET, please cite the following paper:
 97 | 
 98 | *Yisroel Mirsky, Tomer Doitshman, Yuval Elovici, and Asaf Shabtai, "Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection", Network and Distributed System Security Symposium 2018 (NDSS'18)*
 99 | 
100 | Yisroel Mirsky
101 | yisroel@post.bgu.ac.il
102 | 
103 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
 1 | from Kitsune import Kitsune
 2 | import numpy as np
 3 | import time
 4 | 
 5 | ##############################################################################
 6 | # Kitsune a lightweight online network intrusion detection system based on an ensemble of autoencoders (kitNET).
 7 | # For more information and citation, please see our NDSS'18 paper: Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection
 8 | 
 9 | # This script demonstrates Kitsune's ability to incrementally learn, and detect anomalies in recorded a pcap of the Mirai Malware.
10 | # The demo involves an m-by-n dataset with n=115 dimensions (features), and m=100,000 observations.
11 | # Each observation is a snapshot of the network's state in terms of incremental damped statistics (see the NDSS paper for more details)
12 | 
13 | #The runtimes presented in the paper, are based on the C++ implimentation (roughly 100x faster than the python implimentation)
14 | ###################  Last Tested with Anaconda 3.6.3   #######################
15 | 
16 | # Load Mirai pcap (a recording of the Mirai botnet malware being activated)
17 | # The first 70,000 observations are clean...
18 | print("Unzipping Sample Capture...")
19 | import zipfile
20 | with zipfile.ZipFile("mirai.zip","r") as zip_ref:
21 |     zip_ref.extractall()
22 | 
23 | 
24 | # File location
25 | path = "mirai.pcap" #the pcap, pcapng, or tsv file to process.
26 | packet_limit = np.Inf #the number of packets to process
27 | 
28 | # KitNET params:
29 | maxAE = 10 #maximum size for any autoencoder in the ensemble layer
30 | FMgrace = 5000 #the number of instances taken to learn the feature mapping (the ensemble's architecture)
31 | ADgrace = 50000 #the number of instances used to train the anomaly detector (ensemble itself)
32 | 
33 | # Build Kitsune
34 | K = Kitsune(path,packet_limit,maxAE,FMgrace,ADgrace)
35 | 
36 | print("Running Kitsune:")
37 | RMSEs = []
38 | i = 0
39 | start = time.time()
40 | # Here we process (train/execute) each individual packet.
41 | # In this way, each observation is discarded after performing process() method.
42 | while True:
43 |     i+=1
44 |     if i % 1000 == 0:
45 |         print(i)
46 |     rmse = K.proc_next_packet()
47 |     if rmse == -1:
48 |         break
49 |     RMSEs.append(rmse)
50 | stop = time.time()
51 | print("Complete. Time elapsed: "+ str(stop - start))
52 | 
53 | 
54 | # Here we demonstrate how one can fit the RMSE scores to a log-normal distribution (useful for finding/setting a cutoff threshold \phi)
55 | from scipy.stats import norm
56 | benignSample = np.log(RMSEs[FMgrace+ADgrace+1:100000])
57 | logProbs = norm.logsf(np.log(RMSEs), np.mean(benignSample), np.std(benignSample))
58 | 
59 | # plot the RMSE anomaly scores
60 | print("Plotting results")
61 | from matplotlib import pyplot as plt
62 | from matplotlib import cm
63 | plt.figure(figsize=(10,5))
64 | fig = plt.scatter(range(FMgrace+ADgrace+1,len(RMSEs)),RMSEs[FMgrace+ADgrace+1:],s=0.1,c=logProbs[FMgrace+ADgrace+1:],cmap='RdYlGn')
65 | plt.yscale("log")
66 | plt.title("Anomaly Scores from Kitsune's Execution Phase")
67 | plt.ylabel("RMSE (log scaled)")
68 | plt.xlabel("Time elapsed [min]")
69 | figbar=plt.colorbar()
70 | figbar.ax.set_ylabel('Log Probability\n ', rotation=270)
71 | plt.show()
72 | 


--------------------------------------------------------------------------------
/mirai.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ymirsky/Kitsune-py/28a654b5813936380d264c0934136efda672174a/mirai.zip


--------------------------------------------------------------------------------
/netStat.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | ## Prep AfterImage cython package
  3 | import os
  4 | import subprocess
  5 | import pyximport
  6 | pyximport.install()
  7 | import AfterImage as af
  8 | #import AfterImage_NDSS as af
  9 | 
 10 | #
 11 | # MIT License
 12 | #
 13 | # Copyright (c) 2018 Yisroel mirsky
 14 | #
 15 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 16 | # of this software and associated documentation files (the "Software"), to deal
 17 | # in the Software without restriction, including without limitation the rights
 18 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 19 | # copies of the Software, and to permit persons to whom the Software is
 20 | # furnished to do so, subject to the following conditions:
 21 | #
 22 | # The above copyright notice and this permission notice shall be included in all
 23 | # copies or substantial portions of the Software.
 24 | #
 25 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 26 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 27 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 28 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 29 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 30 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 31 | # SOFTWARE.
 32 | 
 33 | 
 34 | class netStat:
 35 |     #Datastructure for efficent network stat queries
 36 |     # HostLimit: no more that this many Host identifiers will be tracked
 37 |     # HostSimplexLimit: no more that this many outgoing channels from each host will be tracked (purged periodically)
 38 |     # Lambdas: a list of 'window sizes' (decay factors) to track for each stream. nan resolved to default [5,3,1,.1,.01]
 39 |     def __init__(self, Lambdas = np.nan, HostLimit=255,HostSimplexLimit=1000):
 40 |         #Lambdas
 41 |         if np.isnan(Lambdas):
 42 |             self.Lambdas = [5,3,1,.1,.01]
 43 |         else:
 44 |             self.Lambdas = Lambdas
 45 | 
 46 |         #HT Limits
 47 |         self.HostLimit = HostLimit
 48 |         self.SessionLimit = HostSimplexLimit*self.HostLimit*self.HostLimit #*2 since each dual creates 2 entries in memory
 49 |         self.MAC_HostLimit = self.HostLimit*10
 50 | 
 51 |         #HTs
 52 |         self.HT_jit = af.incStatDB(limit=self.HostLimit*self.HostLimit)#H-H Jitter Stats
 53 |         self.HT_MI = af.incStatDB(limit=self.MAC_HostLimit)#MAC-IP relationships
 54 |         self.HT_H = af.incStatDB(limit=self.HostLimit) #Source Host BW Stats
 55 |         self.HT_Hp = af.incStatDB(limit=self.SessionLimit)#Source Host BW Stats
 56 | 
 57 | 
 58 |     def findDirection(self,IPtype,srcIP,dstIP,eth_src,eth_dst): #cpp: this is all given to you in the direction string of the instance (NO NEED FOR THIS FUNCTION)
 59 |         if IPtype==0: #is IPv4
 60 |             lstP = srcIP.rfind('.')
 61 |             src_subnet = srcIP[0:lstP:]
 62 |             lstP = dstIP.rfind('.')
 63 |             dst_subnet = dstIP[0:lstP:]
 64 |         elif IPtype==1: #is IPv6
 65 |             src_subnet = srcIP[0:round(len(srcIP)/2):]
 66 |             dst_subnet = dstIP[0:round(len(dstIP)/2):]
 67 |         else: #no Network layer, use MACs
 68 |             src_subnet = eth_src
 69 |             dst_subnet = eth_dst
 70 | 
 71 |         return src_subnet, dst_subnet
 72 | 
 73 |     def updateGetStats(self, IPtype, srcMAC,dstMAC, srcIP, srcProtocol, dstIP, dstProtocol, datagramSize, timestamp):
 74 |         # Host BW: Stats on the srcIP's general Sender Statistics
 75 |         # Hstat = np.zeros((3*len(self.Lambdas,)))
 76 |         # for i in range(len(self.Lambdas)):
 77 |         #     Hstat[(i*3):((i+1)*3)] = self.HT_H.update_get_1D_Stats(srcIP, timestamp, datagramSize, self.Lambdas[i])
 78 | 
 79 |         #MAC.IP: Stats on src MAC-IP relationships
 80 |         MIstat =  np.zeros((3*len(self.Lambdas,)))
 81 |         for i in range(len(self.Lambdas)):
 82 |             MIstat[(i*3):((i+1)*3)] = self.HT_MI.update_get_1D_Stats(srcMAC+srcIP, timestamp, datagramSize, self.Lambdas[i])
 83 | 
 84 |         # Host-Host BW: Stats on the dual traffic behavior between srcIP and dstIP
 85 |         HHstat =  np.zeros((7*len(self.Lambdas,)))
 86 |         for i in range(len(self.Lambdas)):
 87 |             HHstat[(i*7):((i+1)*7)] = self.HT_H.update_get_1D2D_Stats(srcIP, dstIP,timestamp,datagramSize,self.Lambdas[i])
 88 | 
 89 |         # Host-Host Jitter:
 90 |         HHstat_jit =  np.zeros((3*len(self.Lambdas,)))
 91 |         for i in range(len(self.Lambdas)):
 92 |             HHstat_jit[(i*3):((i+1)*3)] = self.HT_jit.update_get_1D_Stats(srcIP+dstIP, timestamp, 0, self.Lambdas[i],isTypeDiff=True)
 93 | 
 94 |         # Host-Host BW: Stats on the dual traffic behavior between srcIP and dstIP
 95 |         HpHpstat =  np.zeros((7*len(self.Lambdas,)))
 96 |         if srcProtocol == 'arp':
 97 |             for i in range(len(self.Lambdas)):
 98 |                 HpHpstat[(i*7):((i+1)*7)] = self.HT_Hp.update_get_1D2D_Stats(srcMAC, dstMAC, timestamp, datagramSize, self.Lambdas[i])
 99 |         else:  # some other protocol (e.g. TCP/UDP)
100 |             for i in range(len(self.Lambdas)):
101 |                 HpHpstat[(i*7):((i+1)*7)] = self.HT_Hp.update_get_1D2D_Stats(srcIP + srcProtocol, dstIP + dstProtocol, timestamp, datagramSize, self.Lambdas[i])
102 | 
103 |         return np.concatenate((MIstat, HHstat, HHstat_jit, HpHpstat))  # concatenation of stats into one stat vector
104 | 
105 |     def getNetStatHeaders(self):
106 |         MIstat_headers = []
107 |         Hstat_headers = []
108 |         HHstat_headers = []
109 |         HHjitstat_headers = []
110 |         HpHpstat_headers = []
111 | 
112 |         for i in range(len(self.Lambdas)):
113 |             MIstat_headers += ["MI_dir_"+h for h in self.HT_MI.getHeaders_1D(Lambda=self.Lambdas[i],ID=None)]
114 |             HHstat_headers += ["HH_"+h for h in self.HT_H.getHeaders_1D2D(Lambda=self.Lambdas[i],IDs=None,ver=2)]
115 |             HHjitstat_headers += ["HH_jit_"+h for h in self.HT_jit.getHeaders_1D(Lambda=self.Lambdas[i],ID=None)]
116 |             HpHpstat_headers += ["HpHp_" + h for h in self.HT_Hp.getHeaders_1D2D(Lambda=self.Lambdas[i], IDs=None, ver=2)]
117 |         return MIstat_headers + Hstat_headers + HHstat_headers + HHjitstat_headers + HpHpstat_headers
118 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | 
4 | setup(
5 |     ext_modules = cythonize(["*.pyx"])
6 | )


--------------------------------------------------------------------------------