├── License.txt ├── README.md ├── demo_iforest.ipynb ├── demo_vis_pdf.ipynb ├── icdm08b.pdf ├── iso_forest.py ├── setup.py ├── test.py └── version.py /License.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 University of Illinois at Urbana-Champaign 2 | All rights reserved. 3 | 4 | Developed by: Matias Carrasco Kind 5 | NCSA/UIUC 6 | https://github.com/mgckind/iso_forest 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | 10 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. 11 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. 12 | Neither the names of Matias Carrasco Kind, University of Illinois at Urbana-Champaign and NCSA, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. 
14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2553679.svg)](https://doi.org/10.5281/zenodo.2553679) 2 | 3 | # iso_forest 4 | 5 | This is a simple implementation of the isolation forest method described (among other places) in this [paper](icdm08b.pdf) for detecting anomalies and outliers in a distribution of data points. 6 | 7 | ## Extended isolation forest 8 | 9 | For an extended version of this algorithm that produces more precise scoring maps, please visit this repository: 10 | 11 | [https://github.com/sahandha/eif](https://github.com/sahandha/eif) 12 | 13 | 14 | ## Installation 15 | 16 | 17 | pip install iso_forest 18 | 19 | 20 | or directly from the GitHub repository: 21 | 22 | 23 | pip install git+https://github.com/mgckind/iso_forest.git 24 | 25 | 26 | It supports Python 2 and Python 3. 27 | 28 | ## Requirements 29 | 30 | - numpy 31 | 32 | No extra requirements are needed for the algorithm. 33 | 34 | In addition, it can draw the generated trees using the optional [igraph](http://igraph.org/) library. 
"""Isolation forest functions.

An isolation forest is an ensemble of random binary trees.  Each tree
recursively splits a sub-sample of the data on a random feature at a random
value; points that end up alone after only a few splits get a high anomaly
score.
"""
__author__ = 'Matias Carrasco Kind'
import os
import random as rn
import warnings

import numpy as np

try:
    from version import __version__
except ImportError:
    # Keep the module importable when it is used without its companion
    # version.py (for instance when this single file is copied elsewhere).
    __version__ = 'unknown'

try:
    import igraph as ig
except ImportError:
    # igraph is only needed by gen_graph(); everything else works without it.
    ig = None
    warnings.warn("No igraph interface for plotting trees")


def c_factor(n):
    """Return the path-length normalisation term for a sample of size ``n``.

    Computes ``2 * (ln(n - 1) + 0.5772156649) - 2 * (n - 1) / n``, where
    0.5772156649 approximates the Euler-Mascheroni constant.

    The expression is only meaningful for ``n >= 2``: ``n == 1`` yields
    ``-inf`` and a plain integer ``0`` raises ``ZeroDivisionError``.
    """
    return 2.0 * (np.log(n - 1) + 0.5772156649) - (2.0 * (n - 1.) / (n * 1.0))


class iForest(object):
    """Ensemble of isolation trees built from random sub-samples of ``X``.

    Parameters
    ----------
    X : array-like, indexed as ``X[row, feature]``
        Training data.  Converted with ``numpy.asarray`` so nested lists are
        accepted as well as arrays.
    ntrees : int
        Number of trees to build.
    sample_size : int
        Number of rows drawn (without replacement) for each tree.
    limit : int, optional
        Maximum tree depth.  Defaults to ``ceil(log2(sample_size))``.

    Attributes
    ----------
    Trees : list of iTree
    c : float
        ``c_factor(sample_size)``, used to normalise path lengths.

    Raises
    ------
    ValueError
        If ``sample_size`` is larger than the number of rows in ``X``.
    """

    def __init__(self, X, ntrees, sample_size, limit=None):
        X = np.asarray(X)  # fancy indexing below needs an ndarray
        self.ntrees = ntrees
        self.X = X
        self.nobjs = len(X)
        self.sample = sample_size
        if sample_size > self.nobjs:
            raise ValueError(
                "sample_size (%d) cannot exceed the number of objects (%d)"
                % (sample_size, self.nobjs))
        self.Trees = []
        self.limit = limit
        if limit is None:
            self.limit = int(np.ceil(np.log2(self.sample)))
        self.c = c_factor(self.sample)
        for _ in range(self.ntrees):
            ix = rn.sample(range(self.nobjs), self.sample)
            self.Trees.append(iTree(X[ix], 0, self.limit))

    def compute_paths(self, X_in=None):
        """Return one anomaly score per row of ``X_in``.

        The score of a point is ``2 ** (-E / c)`` where ``E`` is its path
        length averaged over all trees and ``c`` is ``self.c``.  When
        ``X_in`` is omitted the training data is scored.
        """
        if X_in is None:
            X_in = self.X
        S = np.zeros(len(X_in))
        for i, point in enumerate(X_in):
            h_temp = 0
            for tree in self.Trees:
                h_temp += PathFactor(point, tree).path * 1.0
            Eh = h_temp / self.ntrees
            S[i] = 2.0 ** (-Eh / self.c)
        return S

    def compute_paths_single(self, x):
        """Return the score of the single point ``x`` in every tree.

        The result has one entry per tree, each equal to
        ``2 ** (-path / c)`` for that tree alone (no averaging).
        """
        S = np.zeros(self.ntrees)
        for j, tree in enumerate(self.Trees[:self.ntrees]):
            path = PathFactor(x, tree).path * 1.0
            S[j] = 2.0 ** (-1.0 * path / self.c)
        return S


class Node(object):
    """A tree node: internal (``'inNode'``) or external/leaf (``'exNode'``).

    Attributes
    ----------
    e : depth of the node
    size : number of rows of ``X`` that reached the node
    X : the rows themselves (kept for inspection; marked "to be removed"
        by the original author)
    q, p : split feature index and split value
    left, right : child nodes, ``None`` for external nodes
    ntype : the ``node_type`` string
    """

    def __init__(self, X, q, p, e, left, right, node_type=''):
        self.e = e
        self.size = len(X)
        self.X = X  # to be removed
        self.q = q
        self.p = p
        self.left = left
        self.right = right
        self.ntype = node_type


class iTree(object):
    """A single isolation tree grown on ``X`` from depth ``e`` up to ``l``.

    ``self.root`` holds the tree; ``self.exnodes`` counts its external
    nodes.  ``self.e``, ``self.q`` and ``self.p`` are scratch state that is
    overwritten while the tree is grown.
    """

    def __init__(self, X, e, l):
        self.e = e  # depth
        self.X = X  # save data for now
        self.size = len(X)  # n objects
        self.Q = np.arange(np.shape(X)[1], dtype='int')  # feature indices
        self.l = l  # depth limit
        self.p = None
        self.q = None
        self.exnodes = 0
        self.root = self.make_tree(X, e, l)

    def make_tree(self, X, e, l):
        """Recursively build and return the sub-tree for rows ``X`` at depth ``e``."""
        self.e = e
        if e >= l or len(X) <= 1:
            self.exnodes += 1
            return Node(X, self.q, self.p, e, None, None, node_type='exNode')

        # Index with randrange rather than calling rn.choice(self.Q):
        # random.choice() is not guaranteed to accept numpy arrays, while
        # randrange draws the same random value.
        self.q = self.Q[rn.randrange(len(self.Q))]
        column = X[:, self.q]
        mini = column.min()
        maxi = column.max()
        if mini == maxi:
            # The chosen feature is constant here, so the node becomes a
            # leaf even if another feature could still separate the rows.
            self.exnodes += 1
            return Node(X, self.q, self.p, e, None, None, node_type='exNode')

        self.p = rn.uniform(mini, maxi)
        goes_left = column < self.p
        # Capture the split before the recursive calls overwrite the
        # scratch attributes.
        q, p = self.q, self.p
        left = self.make_tree(X[goes_left], e + 1, l)
        right = self.make_tree(X[~goes_left], e + 1, l)
        return Node(X, q, p, e, left=left, right=right, node_type='inNode')

    def get_node(self, path):
        """Return the node reached from the root by following ``path``.

        ``path`` is an iterable of ``'L'`` / ``'R'`` steps; any other
        element is ignored.
        """
        node = self.root
        for step in path:
            if step == 'L':
                node = node.left
            elif step == 'R':
                node = node.right
        return node


class PathFactor(object):
    """Path of a point ``x`` through an ``iTree``.

    After construction ``self.path`` is the path length and
    ``self.path_list`` the list of ``'L'`` / ``'R'`` turns taken.
    """

    def __init__(self, x, itree):
        self.path_list = []
        self.x = x
        self.e = 0
        self.path = self.find_path(itree.root)

    def find_path(self, T):
        """Walk ``x`` down from node ``T`` and return the path length.

        Each internal node traversed adds 1.  An external node holding more
        than one row adds ``c_factor(size)``.  External nodes holding one
        row or none add nothing; an empty leaf can occur when a split value
        equals the column minimum, and evaluating ``c_factor(0)`` there
        would divide by zero.
        """
        node = T
        while node.ntype != 'exNode':
            self.e += 1
            if self.x[node.q] < node.p:
                self.path_list.append('L')
                node = node.left
            else:
                self.path_list.append('R')
                node = node.right
        if node.size > 1:
            self.e = self.e + c_factor(node.size)
        return self.e


def all_branches(node, current=None, branches=None):
    """Return every root-to-leaf path below ``node`` as lists of 'L'/'R'.

    ``current`` is the path already taken to reach ``node`` (it is
    truncated to ``node.e`` entries and never modified in place);
    ``branches`` is the list the results are appended to, created on demand.
    """
    current = [] if current is None else current[:node.e]
    if branches is None:
        branches = []
    if node.ntype == 'inNode':
        all_branches(node.left, current=current + ['L'], branches=branches)
        all_branches(node.right, current=current + ['R'], branches=branches)
    else:
        branches.append(current)
    return branches


def branch2num(branch, init_root=0):
    """Convert a path of 'L'/'R' steps into node numbers.

    The numbering starts at ``init_root``; the left child of node ``k`` is
    ``2 * k + 1`` and the right child ``2 * k + 2``.  The returned list
    starts with ``init_root`` and has one more entry per recognised step.
    """
    num = [init_root]
    for step in branch:
        if step == 'L':
            num.append(num[-1] * 2 + 1)
        elif step == 'R':
            num.append(num[-1] * 2 + 2)
    return num


def gen_graph(branches, g=None, init_root=0, pre=''):
    """Add the tree described by ``branches`` to a graph.

    Parameters
    ----------
    branches : list of paths as returned by ``all_branches``
    g : graph object exposing ``add_vertex(name)`` and
        ``add_edge(source, target)``; a new ``igraph.Graph`` is created
        when omitted
    init_root : number given to the root vertex
    pre : string prefixed to every vertex name

    Returns
    -------
    (g, largest_node_number)

    Raises
    ------
    ImportError
        If ``g`` is omitted and igraph could not be imported.
    ValueError
        If ``branches`` is empty (there is no largest node number).
    """
    num_branches = [branch2num(branch, init_root) for branch in branches]
    all_nodes = np.unique(
        [j for branch in num_branches for j in branch]).tolist()
    if g is None:
        if ig is None:
            raise ImportError("igraph is required to create a new graph")
        g = ig.Graph()
    for k in all_nodes:
        g.add_vertex(pre + str(k))
    seen = set()  # edges already added; branches share their upper part
    for branch in num_branches:
        for parent, child in zip(branch[:-1], branch[1:]):
            if (parent, child) not in seen:
                seen.add((parent, child))
                g.add_edge(pre + str(parent), pre + str(child))
    return g, max(all_nodes)
"""Demo script for the isolation forest.

Draws a 2-D Gaussian sample with one planted outlier at (3, 3), scores every
point with hand-built isolation trees, highlights the ten highest-scoring
points and plots the per-tree scores of the lowest-, median- and
highest-scoring point.
"""
import random as rn

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import multivariate_normal

# The module is called iso_forest; the alias keeps the short name used below.
import iso_forest as iso

mean = [0, 0]
cov = [[1, 0], [0, 1]]  # diagonal covariance
Nobjs = 2000
x, y = np.random.multivariate_normal(mean, cov, Nobjs).T
x[0] = 3
y[0] = 3
X = np.array([x, y]).T

ntrees = 500
sample = 256
limit = int(np.ceil(np.log2(sample)))  # same depth limit for every tree

# One tree per random sub-sample of the data.
CT = []
for i in range(ntrees):
    ix = rn.sample(range(Nobjs), sample)
    CT.append(iso.iTree(X[ix], 0, limit))

# Anomaly score of every point: 2 ** (-mean path length / c).
S = np.zeros(Nobjs)
c = iso.c_factor(sample)
for i in range(Nobjs):
    h_temp = 0
    for tree in CT:
        h_temp += iso.PathFactor(X[i], tree).path * 1.0
    Eh = h_temp / ntrees
    S[i] = 2.0 ** (-Eh / c)

ss = np.argsort(S)
plt.plot(x, y, 'bo')
plt.plot(x[ss[-10:]], y[ss[-10:]], 'ro')

plt.figure()


def tree_scores(point):
    """Return the score of ``point`` in each tree of ``CT`` separately."""
    return [2 ** (-iso.PathFactor(point, tree).path * 1.0 / c) for tree in CT]


sv1 = tree_scores(X[ss[0]])
# Integer division: a float is not a valid array index on Python 3.
sv2 = tree_scores(X[ss[Nobjs // 2]])
sv3 = tree_scores(X[ss[-1]])

plt.plot(sv1, label='normal')
plt.plot(sv2, label='semi')
plt.plot(sv3, label='outlier')
plt.legend(loc=0)

plt.show()