Module DiFF_RF

Created on Tue Mar 24 12:19:32 2020

@author: Pierre-François Marteau (https://people.irisa.fr/Pierre-Francois.Marteau/)

Source code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 24 12:19:32 2020

@author: Pierre-François Marteau (https://people.irisa.fr/Pierre-Francois.Marteau/)
"""

# Inspired by an implementation of the isolation forest algorithm provided at
# https://github.com/xhan0909/isolation_forest

import numpy as np
import time
from functools import partial
from multiprocessing import Pool

import random as rn

def getSplit(X):
    """
    Randomly selects a split value from a set of scalar values 'X'.
    Returns the split value.

    Parameters
    ----------
    X : array
        Array of scalar values
    Returns
    -------
    float
        split value
    """
    xmin = X.min()
    xmax = X.max()
    return np.random.uniform(xmin, xmax)

def similarityScore(S, node, alpha):
    """
    Given a set of instances S falling into node and a value alpha >= 0,
    returns, for each element x in S, the weighted similarity score between x
    and the centroid M of the training instances that reached the node (node.M).

    Parameters
    ----------
    S : array of instances
        Array of instances that fall into a node
    node: a DiFF tree node
        S is the set of instances "falling" into the node
    alpha: float
        alpha is the distance scaling hyper-parameter
    Returns
    -------
    array
        the array of similarity values between the instances in S and the mean of the training instances falling into node

    """
    if len(S) > 0:
        d = np.shape(S)[1]
        U = (S - node.M) / node.Mstd  # normalize using the standard-deviation vector around the mean
        U = 2.0 ** (-alpha * (np.sum(U * U / d, axis=1)))
    else:
        U = 0

    return U

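# Illustrative note (not part of the original module): for an instance x reaching a
# node with centroid M and per-dimension standard deviation Mstd, the score computed
# above is
#     2 ** (-alpha * (1/d) * sum_j(((x_j - M_j) / Mstd_j) ** 2)),
# i.e. it equals 1 when x coincides with the centroid and decays towards 0 as the
# normalized squared distance grows; alpha controls the speed of that decay.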

def EE(hist):
    """
    given a list of positive values as a histogram drawn from any information source,
    returns the empirical entropy of its discrete probability function.

    Parameters
    ----------
    hist: array
        histogram
    Returns
    -------
    float
        empirical entropy estimated from the histogram

    """
    h = np.asarray(hist, dtype=np.float64)
    if h.sum() <= 0 or (h < 0).any():
        return 0
    h = h/h.sum()
    return -(h*np.ma.log2(h)).sum()

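# Illustrative check (not part of the original module): a uniform 4-bin histogram has
# entropy log2(4) = 2 bits, whereas a histogram with all of its mass in a single bin
# has entropy 0:
#     EE([1, 1, 1, 1])  ->  2.0
#     EE([5, 0, 0, 0])  ->  0.0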

def weightFeature(s, nbins):
    '''
    Given a list of values corresponding to a feature dimension, returns a weight (in [0,1]) that is
    one minus the normalized empirical entropy, a way to characterize the importance of the feature dimension.

    Parameters
    ----------
    s: array
        list of scalar values corresponding to a feature dimension
    nbins: int
        the number of bins used to discretize the feature dimension using a histogram.
    Returns
    -------
    float
        the importance weight for feature s.
    '''
    if s.min() == s.max():
        return 0
    hist = np.histogram(s, bins=nbins, density=True)
    ent = EE(hist[0])
    ent = ent/np.log2(nbins)
    return 1-ent

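# Illustrative behaviour (assumption, not part of the original module): a feature spread
# roughly uniformly over its range gets a weight close to 0 (high normalized entropy),
# while a feature concentrated in a few bins gets a weight close to 1; constant features
# return 0 and are therefore never favoured when sampling split dimensions.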

def walk_tree(forest, node, treeIdx, obsIdx, X, featureDistrib, depth=0, alpha=1e-2):
    '''
    Recursive function that walks a tree of an already fitted forest to compute the
    per-tree scores of the new observations.

    Parameters
    ----------
    forest : DiFF_RF
        A fitted forest of DiFF trees
    node: DiFF tree node
        the current node
    treeIdx: int
        index of the tree that is being walked.
    obsIdx: array
        1D boolean array of length n_obs. True/False if the obs has reached / has not reached the node.
    X: nD array.
        array of observations/instances.
    featureDistrib: 1D array
        per-feature weights; passed along for the recursion, not used otherwise.
    depth: int
        current depth.
    alpha: float
        distance scaling hyper-parameter.
    Returns
    -------
    None
    '''

    if isinstance(node, LeafNode):
        Xnode = X[obsIdx]
        # f is the ratio between the (smoothed) frequency of training instances reaching
        # this leaf and the (smoothed) frequency of test instances reaching it.
        f = ((node.size+1)/forest.sample_size) / ((1+len(Xnode))/forest.XtestSize)
        if alpha == 0:
            forest.LD[obsIdx, treeIdx] = 0
            forest.LF[obsIdx, treeIdx] = -f
            forest.LDF[obsIdx, treeIdx] = -f
        else:
            z = similarityScore(Xnode, node, alpha)
            forest.LD[obsIdx, treeIdx] = z
            forest.LF[obsIdx, treeIdx] = -f
            forest.LDF[obsIdx, treeIdx] = z*f

    else:

        idx = (X[:, node.splitAtt] <= node.splitValue) * obsIdx
        walk_tree(forest, node.left, treeIdx, idx, X, featureDistrib, depth + 1, alpha=alpha)

        idx = (X[:, node.splitAtt] > node.splitValue) * obsIdx
        walk_tree(forest, node.right, treeIdx, idx, X, featureDistrib, depth + 1, alpha=alpha)


def create_tree(X, featureDistrib, sample_size, max_height):
    '''
    Creates a DiFF tree using a sample of size sample_size of the original data.

    Parameters
    ----------
    X: nD array.
        nD array with the observations. Dimensions should be (n_obs, n_features).
    featureDistrib: 1D array
        The distribution weight affected to each feature dimension.
    sample_size: int
        Size of the sample from which a DiFF tree is built.
    max_height: int
        Maximum height of the tree.
    Returns
    -------
    a DiFF tree
    '''
    rows = np.random.choice(len(X), sample_size, replace=False)
    featureDistrib = np.array(featureDistrib)
    return DiFF_Tree(max_height).fit(X[rows, :], featureDistrib)


class DiFF_TreeEnsemble:
    '''
    DiFF Forest.
    Even though all the methods are thought to be public, the main functionality of the class is given by:
    - __init__
    - fit
    - predict
    '''
    def __init__(self, sample_size: int, n_trees: int = 10):
        '''
        Creates the DiFF-RF object.

        Parameters
        ----------
        sample_size: int.
            size of the sample randomly drawn from the train instances to build each DiFF tree.
        n_trees: int
            The number of trees in the forest
        Returns
        -------
        None
        '''

        self.sample_size = sample_size
        self.n_trees = n_trees
        self.alpha = 1.0
        np.random.seed(int(time.time()))
        rn.seed(int(time.time()))


    def fit(self, X: np.ndarray, n_jobs: int = 4):
        """
        Fits the forest to the training data.
        Given a 2D matrix of observations, create an ensemble of DiFF trees
        and store them in the list self.trees.
        Uses parallel computing.

        Parameters
        ----------
        X: nD array.
            nD array with the train instances. Dimensions should be (n_obs, n_features).
        n_jobs: int
            number of parallel jobs that will be launched
        Returns
        -------
        the object itself.
        """
        self.X = X
        self.path_normFactor = np.sqrt(len(X))

        self.sample_size = min(self.sample_size, len(X))

        limit_height = 1.0*np.ceil(np.log2(self.sample_size))

        # Weight each feature dimension by one minus its normalized empirical entropy.
        featureDistrib = []
        nbins = int(len(X)/8)+2
        for i in range(np.shape(X)[1]):
            featureDistrib.append(weightFeature(X[:, i], nbins))
        featureDistrib = np.array(featureDistrib)
        featureDistrib = featureDistrib/(np.sum(featureDistrib)+1e-5)
        self.featureDistrib = featureDistrib

        create_tree_partial = partial(create_tree,
                                      featureDistrib=self.featureDistrib,
                                      sample_size=self.sample_size,
                                      max_height=limit_height)

        with Pool(n_jobs) as p:
            self.trees = p.map(create_tree_partial,
                               [X for _ in range(self.n_trees)]
                               )
        return self



    def walk(self, X: np.ndarray) -> None:
        """
        Given a nD matrix of observations X, walk every tree in self.trees and store,
        for each instance and each tree, the distance, frequency and collective scores
        in self.LD, self.LF and self.LDF (arrays of shape (len(X), n_trees)).

        Parameters
        ----------
        X: nD array.
            nD array with the instances to be tested. Dimensions should be (n_obs, n_features).
        Returns
        -------
        None
        """

        self.L = np.zeros((len(X), self.n_trees))
        self.LD = np.zeros((len(X), self.n_trees))
        self.LF = np.zeros((len(X), self.n_trees))
        self.LDF = np.zeros((len(X), self.n_trees))

        for treeIdx, itree in enumerate(self.trees):
            obsIdx = np.ones(len(X)).astype(bool)
            walk_tree(self, itree, treeIdx, obsIdx, X, self.featureDistrib, alpha=self.alpha)


    def anomaly_score(self, X: np.ndarray, alpha=1) -> np.ndarray:
        """
        Given a nD matrix of observations X, compute the anomaly scores
        for the instances in X, returning three 1D arrays of anomaly scores.

        Parameters
        ----------
        X: nD array.
            nD array with the tested observations to be predicted. Dimensions should be (n_obs, n_features).
        alpha: float
            scaling distance hyper-parameter.
        Returns
        -------
        scD, scF, scDF: 1D arrays
            respectively the distance scores (point-wise anomaly score), the frequency-of-visit scores and the collective anomaly scores
        """
        self.XtestSize = len(X)
        self.alpha = alpha

        # Walk the trees to get the per-tree scores for each of the observations.
        self.walk(X)

        # Average the per-tree scores.
        if self.sample_size >= 2:
            scD = -self.LD.mean(1)
        else:
            scD = 0

        scF = self.LF.mean(1)
        scDF = -self.LDF.mean(1)
        return scD, scF, scDF


    def predict_from_anomaly_scores(self, scores: np.ndarray, threshold: float) -> np.ndarray:
        """
        Given an array of scores and a score threshold, return an array of
        the predictions: 1 for any score >= the threshold and 0 otherwise.

        Parameters
        ----------
        scores: 1D array.
            1D array of scores of shape (n_obs,), as produced by the forest.
        threshold: float
            Threshold above which an observation is considered an anomaly; the higher the threshold, the fewer anomalies.
        Returns
        -------
        1D array
            The prediction array, 1/0 if anomaly/not anomaly respectively.
        """
        out = scores >= threshold
        return out*1


    def predict(self, X: np.ndarray, threshold: float) -> np.ndarray:
        """
        A shorthand for calling anomaly_score() and predict_from_anomaly_scores().
        The collective anomaly score (scDF) is the one that is thresholded.

        Parameters
        ----------
        X: nD array.
            nD array with the tested observations to be predicted. Dimensions should be (n_obs, n_features).
        threshold: float
            Threshold above which an observation is considered an anomaly; the higher the threshold, the fewer anomalies.
        Returns
        -------
        1D array
            The prediction array, 1/0 if anomaly/not anomaly respectively.
        """

        scD, scF, scDF = self.anomaly_score(X)
        return self.predict_from_anomaly_scores(scDF, threshold)


class DiFF_Tree:
    '''
    Constructs a tree via randomized splits, with maximum height height_limit.
    '''
    def __init__(self, height_limit):
        '''
        Parameters
        ----------
        height_limit: int
            Maximum height of the tree.
        Returns
        -------
        None
        '''
        self.height_limit = height_limit

    def fit(self, X: np.ndarray, featureDistrib: np.ndarray):
        """
        Given a 2D matrix of observations, create a DiFF tree. Set field
        self.root to the root of that tree and return it.

        Parameters
        ----------
        X: nD array.
            nD array with the observations. Dimensions should be (n_obs, n_features).
        featureDistrib: 1D array
            The distribution weight affected to each feature dimension.
        Returns
        -------
        A DiFF tree root.
        """
        self.root = InNode(X, self.height_limit, featureDistrib, len(X), 0)

        return self.root


class InNode:
    '''
    Node of the tree that is not a leaf node.
    The functionality of the class is:
        - Draw a split dimension (weighted by the feature distribution) and a random
          split value within that dimension's range.
        - Partition the space of observations according to the split and send them
          along to two child nodes.
    The partitioning is vectorized with NumPy, which makes it efficient time-wise.
    '''
    def __init__(self, X, height_limit, featureDistrib, sample_size, current_height):
        '''
        Parameters
        ----------
        X: nD array.
            nD array with the training instances that have reached the node.
        height_limit: int
            Maximum height of the tree.
        featureDistrib: 1D array.
            distribution used to randomly select a dimension (feature), as computed at the parent level.
        sample_size: int
            Size of the sample used to build the tree.
        current_height: int
            Current height of the tree.
        Returns
        -------
        None
        '''

        self.size = len(X)
        self.height = current_height+1
        n_obs, n_features = X.shape
        next_height = current_height + 1
        limit_not_reached = height_limit > next_height

        # With enough instances, re-estimate the feature weights locally at this node.
        if len(X) > 32:
            featureDistrib = []
            nbins = int(len(X)/8)+2
            for i in range(np.shape(X)[1]):
                featureDistrib.append(weightFeature(X[:, i], nbins))
            featureDistrib = np.array(featureDistrib)
            featureDistrib = featureDistrib/(np.sum(featureDistrib)+1e-5)

        self.featureDistrib = featureDistrib

        cols = np.arange(np.shape(X)[1], dtype='int')

        self.splitAtt = rn.choices(cols, weights=featureDistrib)[0]
        splittingCol = X[:, self.splitAtt]
        self.splitValue = getSplit(splittingCol)

        idx = splittingCol <= self.splitValue
        X_aux = X[idx, :]

        self.left = (InNode(X_aux, height_limit, featureDistrib, sample_size, next_height)
                     if limit_not_reached and X_aux.shape[0] > 5 and (np.any(X_aux.max(0) != X_aux.min(0)))
                     else LeafNode(X_aux, next_height, X, sample_size))

        idx = np.invert(idx)
        X_aux = X[idx, :]
        self.right = (InNode(X_aux, height_limit, featureDistrib, sample_size, next_height)
                      if limit_not_reached and X_aux.shape[0] > 5 and (np.any(X_aux.max(0) != X_aux.min(0)))
                      else LeafNode(X_aux, next_height, X, sample_size))

        self.n_nodes = 1 + self.left.n_nodes + self.right.n_nodes


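# Illustrative note (not part of the original module): InNode draws the split dimension
# with rn.choices(cols, weights=featureDistrib), so low-entropy ("more informative")
# features are selected more often, and the split value is then drawn uniformly within
# that feature's observed range by getSplit().
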
class LeafNode:
    '''
    Leaf node.
    The base functionality is storing the mean and standard deviation of the training
    observations falling into that node.
    We also evaluate the frequency of visit for the training data.
    '''
    def __init__(self, X, height, Xp, sample_size):
        '''
        Parameters
        ----------
        X: nD array.
            nD array with the training instances falling into the leaf node.
        height: int
            Current height of the tree.
        Xp: nD array.
            nD array with the training instances falling into the parent node.
        sample_size: int
            Size of the sample used to build the tree.
        Returns
        -------
        None
        '''
        self.height = height+1
        self.size = len(X)
        self.n_nodes = 1
        self.freq = self.size/sample_size
        self.freqs = 0

        if len(X) != 0:
            self.M = np.mean(X, axis=0)
            if len(X) > 10:
                self.Mstd = np.std(X, axis=0)
                self.Mstd[self.Mstd == 0] = 1e-2
            else:
                self.Mstd = np.ones(np.shape(X)[1])
        else:
            # Empty leaf: fall back to the statistics of the parent node.
            self.M = np.mean(Xp, axis=0)
            if len(Xp) > 10:
                self.Mstd = np.std(Xp, axis=0)
                self.Mstd[self.Mstd == 0] = 1e-2
            else:
                self.Mstd = np.ones(np.shape(X)[1])
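
A minimal usage sketch (not part of the original module): it builds a DiFF-RF on a
synthetic inlier cloud and scores a test set mixing inliers with a shifted cluster of
outliers, which should receive larger collective scores (scDF). The data, sample size,
number of trees and alpha below are illustrative assumptions.

if __name__ == "__main__":
    # Synthetic data: 4-dimensional Gaussian inliers plus a shifted outlier cluster.
    rng = np.random.RandomState(0)
    X_train = rng.normal(0.0, 1.0, size=(512, 4))
    X_test = np.vstack([rng.normal(0.0, 1.0, size=(64, 4)),
                        rng.normal(5.0, 1.0, size=(8, 4))])

    # Build and fit the forest (the multiprocessing Pool used by fit() requires
    # running under the __main__ guard).
    diff_rf = DiFF_TreeEnsemble(sample_size=128, n_trees=32)
    diff_rf.fit(X_train, n_jobs=2)

    # Score the test set; scDF is the collective anomaly score.
    scD, scF, scDF = diff_rf.anomaly_score(X_test, alpha=1.0)
    print("mean collective score, inliers :", scDF[:64].mean())
    print("mean collective score, outliers:", scDF[64:].mean())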

Functions

def EE(hist)
    Given a list of positive values as a histogram drawn from any information source, returns the empirical entropy of its discrete probability function.

def create_tree(X, featureDistrib, sample_size, max_height)
    Creates a DiFF tree using a sample of size sample_size of the original data.

def getSplit(X)
    Randomly selects a split value from a set of scalar values 'X'.

def similarityScore(S, node, alpha)
    Returns, for each instance in S, the weighted similarity score to the node centroid node.M.

def walk_tree(forest, node, treeIdx, obsIdx, X, featureDistrib, depth=0, alpha=0.01)
    Recursive function that walks a tree of an already fitted forest to compute the per-tree scores of new observations.

def weightFeature(s, nbins)
    Returns a weight in [0, 1], one minus the normalized empirical entropy of the feature dimension.

Classes

class DiFF_Tree(height_limit)
    Constructs a tree via randomized splits, with maximum height height_limit.
    Methods: fit(X, featureDistrib)

class DiFF_TreeEnsemble(sample_size, n_trees=10)
    DiFF Forest.
    Methods: fit(X, n_jobs=4), walk(X), anomaly_score(X, alpha=1), predict_from_anomaly_scores(scores, threshold), predict(X, threshold)

class InNode(X, height_limit, featureDistrib, sample_size, current_height)
    Internal node: draws a weighted random split dimension and a random split value, then partitions the observations between its two children.

class LeafNode(X, height, Xp, sample_size)
    Leaf node: stores the mean, standard deviation and visiting frequency of the training observations falling into it.