├── Computation_time.ipynb ├── Figures ├── anomaly_example-1.png ├── anomaly_example_rank-1.png └── anomaly_example_score-1.png ├── License.txt ├── MFIF_python └── Section_5 │ ├── DDplot_3D.pdf │ ├── MFIF.py │ ├── MNIST.csv │ └── section_5.ipynb ├── README.rst ├── Toy_example.ipynb ├── __fif.pxd ├── __init__.py ├── _fif.pyx ├── fif.cxx ├── fif.hxx ├── old_fif.py ├── setup.cfg ├── setup.py └── version.py /Computation_time.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## This notebook provides a simple comparison of computation time of the python version and the new Cython/C++ version." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Importing libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import time\n", 24 | "import numpy as np\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import fif as FIF\n", 27 | "import old_fif as old_FIF" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Simulated functions" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "np.random.seed(42)\n", 44 | "m =1000 ## m measures\n", 45 | "n =1000 ## n curves\n", 46 | "tps = np.linspace(0,1,m)\n", 47 | "v = np.linspace(1,1.4,n)\n", 48 | "X = np.zeros((n,m))\n", 49 | "for i in range(n):\n", 50 | " X[i] = 30 * ((1-tps) ** v[i]) * tps ** v[i]\n", 51 | "\n", 52 | "\n", 53 | "Z1 = np.zeros((m))\n", 54 | "for j in range(m):\n", 55 | " if (tps[j]<0.2 or tps[j]>0.8):\n", 56 | " Z1[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2\n", 57 | " else:\n", 58 | " Z1[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + np.random.normal(0,0.3,1)\n", 59 | "Z1[0] = 0\n", 60 | "Z1[m-1] = 0\n", 61 | 
"\n", 62 | "\n", 63 | "Z2 = 30 * ((1-tps) ** 1.6) * tps ** 1.6\n", 64 | "\n", 65 | "\n", 66 | "Z3 = np.zeros((m))\n", 67 | "for j in range(m):\n", 68 | " Z3[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + np.sin(2*np.pi*tps[j])\n", 69 | "\n", 70 | "Z4 = np.zeros((m))\n", 71 | "for j in range(m):\n", 72 | " Z4[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2\n", 73 | "\n", 74 | "for j in range(70,71):\n", 75 | " Z4[j] += 2\n", 76 | "\n", 77 | "Z5 = np.zeros((m))\n", 78 | "for j in range(m):\n", 79 | " Z5[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + 0.5*np.sin(10*np.pi*tps[j])\n", 80 | "\n", 81 | "X = np.concatenate((X,Z1.reshape(1,-1),Z2.reshape(1,-1),\n", 82 | " Z3.reshape(1,-1), Z4.reshape(1,-1), Z5.reshape(1,-1)), axis = 0)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Cython/C++ version of FIF" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "CPU times: user 7.21 s, sys: 175 ms, total: 7.38 s\n", 102 | "Wall time: 7.33 s\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "%%time\n", 108 | "time = np.linspace(0,1,m)\n", 109 | "F = FIF.FiForest(X, time, ntrees=300, sample_size=64, alpha=0.5, dic_number=1, seed=0)\n", 110 | "S_new_F = F.compute_paths(X_in=X)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Old python version of FIF" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 9, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "CPU times: user 5min 48s, sys: 595 ms, total: 5min 48s\n", 130 | "Wall time: 5min 49s\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "%%time \n", 136 | "np.random.seed(0)\n", 137 | "old_F = old_FIF.FIForest(X,ntrees=300,subsample_size=64,time=time, D='gaussian_wavelets', 
innerproduct='auto', alpha=0.5)\n", 138 | "S_old_F = old_F.compute_paths(X)" 139 | ] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python 3", 145 | "language": "python", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.7.4" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 2 163 | } 164 | -------------------------------------------------------------------------------- /Figures/anomaly_example-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuillaumeStaermanML/FIF/a92273974fd860f9441af99d4a4e418d1e739998/Figures/anomaly_example-1.png -------------------------------------------------------------------------------- /Figures/anomaly_example_rank-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuillaumeStaermanML/FIF/a92273974fd860f9441af99d4a4e418d1e739998/Figures/anomaly_example_rank-1.png -------------------------------------------------------------------------------- /Figures/anomaly_example_score-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuillaumeStaermanML/FIF/a92273974fd860f9441af99d4a4e418d1e739998/Figures/anomaly_example_score-1.png -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Télécom Paris, France. 2 | All rights reserved. 
3 | 4 | Developed by: Guillaume Staerman 5 | LTCI 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. 10 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. 11 | Neither the names of Guillaume Staerman, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. 
13 | -------------------------------------------------------------------------------- /MFIF_python/Section_5/DDplot_3D.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuillaumeStaermanML/FIF/a92273974fd860f9441af99d4a4e418d1e739998/MFIF_python/Section_5/DDplot_3D.pdf -------------------------------------------------------------------------------- /MFIF_python/Section_5/MFIF.py: -------------------------------------------------------------------------------- 1 | """ Multivariate Functional Isolation Forest 2 | 3 | Author : Guillaume Staerman 4 | """ 5 | 6 | 7 | """Multivariate Functional Isolation Forest Algorithm 8 | 9 | This is the implementation of The Multivariate Functional Isolation Forest which is an 10 | extension of the original Isolation Forest applied to functional data. 11 | 12 | It return the anomaly score of each sample using the FIF algorithm. 13 | The Functional Isolation Forest 'isolates' observations by 14 | randomly selecting a multivariate curve among a dictionary 15 | and then randomly selecting a split value between the maximum 16 | and minimum values of the selected feature. 17 | 18 | Since recursive partitioning can be represented by a tree structure, the 19 | number of splittings required to isolate a sample is equivalent to the path 20 | length from the root node to the terminating node. 21 | 22 | This path length, averaged over a forest of such random trees, is a 23 | measure of normality. 24 | 25 | Random partitioning produces noticeably shorter paths for anomalies. 26 | Hence, when a forest of random trees collectively produce shorter path 27 | lengths for particular samples, they are highly likely to be anomalies. 28 | 29 | """ 30 | import numpy as np 31 | #from version import __version__ 32 | 33 | 34 | 35 | def derivateM(X, step): 36 | """Compute de derivative of a multivariate function X on each dimension. 
37 | """ 38 | step = step.astype(dtype = float) 39 | A = np.zeros((X.shape[0],X.shape[1]-1)) 40 | for i in range(X.shape[0]): 41 | A[i,:] = np.diff(X[i,:]) / step 42 | return A 43 | 44 | def c_factor(n_samples_leaf) : 45 | """ 46 | Average path length of unsuccesful search in a binary search tree given n points 47 | 48 | Parameters 49 | ---------- 50 | n_samples_lead : int 51 | Number of curves for the BST. 52 | Returns 53 | ------- 54 | float 55 | Average path length of unsuccesful search in a BST 56 | 57 | """ 58 | return 2.0 * (np.log(n_samples_leaf - 1) + np.euler_gamma) - (2. * ( 59 | n_samples_leaf - 1.) / (n_samples_leaf * 1.0)) 60 | 61 | 62 | class MFIForest(object): 63 | """ 64 | Multivariate Functional Isolation Forest 65 | 66 | Creates an MFIForest object. This object holds the data as well as the trained trees (iTree objects). 67 | 68 | Attributes 69 | ---------- 70 | X : Array-like (n_samples, dimension, discretization) 71 | Data used for training. 72 | 73 | nobjs: int 74 | Size of the dataset. 75 | 76 | sample: int 77 | Size of the sample to be used for tree creation. 78 | 79 | Trees: list 80 | A list of tree objects. 81 | 82 | limit: int 83 | Maximum depth a tree can have. 84 | 85 | c: float 86 | Multiplicative factor used in computing the anomaly scores. 87 | 88 | step : array 89 | Vector of the length of intervals of discretization. 90 | 91 | D : Array-like 92 | Dictionnary of functions used as directions. 93 | 94 | Dsize : int 95 | The size of the dictionary. It is the number of curves that we will use in our 96 | dictionary to build the forest. 97 | 98 | mean : float or None, optional (Default=None) 99 | The mean of the stochastic process used to build a stochastic dictionary. 100 | This is set to zero by default. If a stochastic dictionary is called 101 | and no mean is given, it is set to 0. 102 | 103 | sd : float or None, optional (Default=None) 104 | The standard deviation of the stochastic process used to build a stochastic dictionary. 
105 | This is set to one by default. If a stochastic dictionary is called 106 | and no standard deviation is given, it is set to 1. 107 | 108 | J_max : int or None, optional (Default=None) 109 | This parameter fix the size of the dictionary of Haar_wavelet_father. 110 | It will build 2 power J_max functions. 111 | 112 | amplitude_min : float or None, optional (Default=None) 113 | This parameter is used for cosinus dictionary. 114 | It is the minimum amplitude with which one draws the amplitude. 115 | 116 | amplitude_max : float or None, optional (Default=None) 117 | This parameter is used for cosinus dictionary. 118 | It is the minimum amplitude with which one draws the amplitude. 119 | 120 | innerproduct : str or function 121 | An inner product that we use for the construction of the tree. The innerproduct in the paper 122 | is already implemented, call it with 'auto' and fixe and alpha. If a function is given by 123 | the user, it should have three argument : (x, y, step) where x and y are curve (represented 124 | by a vector of length of the discretization). "step" is a vector of length len(time)-1 which 125 | represents the vector of length of step between the discretization. 126 | 127 | alpha : float 128 | a float number between [0,1] used in the innerproduct of the paper. 129 | 130 | deriv_X : Array like 131 | A matrix of derivate of X if needed for the scalar product. 132 | 133 | deriv_dictionary : Array like 134 | A matrix of derivate of D if needed for the scalar product. 135 | 136 | Attributes 137 | ------- 138 | compute_paths(X_in) : 139 | Computes the anomaly score for data X_in 140 | 141 | threshold(score_sample, contamination) : 142 | Given the score returned by the fit function on training sample and a proportion 143 | of anomalies, compute the threshold which separates anomalies and normal data. 
144 | 145 | predict_label(score, contamination) : 146 | Given any score (training or testing) and the proportion of anomalies 147 | it return the labels predicted. The function return +1 for outliers and 148 | -1 for inliers. 149 | 150 | 151 | References 152 | ---------- 153 | 154 | .. [1] Staerman, G, Mozharovskyi, P, D'Alché-buc, F and Clémençon,S. "Functional Isolation forest." 155 | 156 | 157 | """ 158 | 159 | def __init__(self, 160 | X, 161 | D, 162 | innerproduct, 163 | time, 164 | ntrees=None, 165 | subsample_size=None, 166 | Dsize=None, 167 | limit=None, 168 | mean=None, 169 | sd=None, 170 | J_max=None, 171 | amplitude_min=None, 172 | amplitude_max=None, 173 | alpha=None): 174 | 175 | 176 | 177 | self.X = X 178 | self.nobjs = len(X) 179 | self.Trees = [] 180 | self.time = time 181 | 182 | 183 | if (ntrees == None): 184 | self.ntrees = 100 185 | else: self.ntrees = ntrees 186 | 187 | 188 | if (subsample_size == None): 189 | if (len(X)>800): 190 | self.sample = 256 191 | else: self.sample = 64 192 | else : self.sample = subsample_size 193 | 194 | 195 | if (Dsize == None): 196 | self.Dsize = 1000 197 | else: self.Dsize = Dsize 198 | 199 | 200 | if (type(D) == str): 201 | """Some dictionary pre-implemented. 202 | """ 203 | 204 | 205 | if (D == 'Brownian'): 206 | """ We build a dictionary from brownian motion (standard or drift). 207 | We use a discretization on [0,1] since we are interested only in the shape. 
208 | """ 209 | if (mean == None): 210 | mean = np.zeros(((self.X).shape[1])) 211 | 212 | if (sd == None): 213 | sd = np.eye((self.X).shape[1],(self.X).shape[1]) 214 | 215 | self.D = np.zeros((self.Dsize,(self.X).shape[1], len(self.time))) 216 | t = np.linspace(0, 1, len(self.time)) 217 | self.D[:,:,0] = np.random.multivariate_normal(mean = mean, cov = sd, size = self.Dsize) 218 | for i in range(self.Dsize): 219 | for j in range(1,np.size(self.time)): 220 | self.D[i,:,j] = self.D[i,:, j-1] + np.dot(sd, np.random.multivariate_normal(mean = mean, 221 | cov = np.eye((self.X).shape[1],(self.X).shape[1]) * np.sqrt( t[2] - t[1]) 222 | , size = 1).T).T + mean * (t[2] - t[1]) 223 | 224 | elif (D == 'Brownian_bridge'): 225 | """ We build a dictionary from Brownian bridge. 226 | """ 227 | mean = np.zeros(((self.X).shape[1])) 228 | sd = np.eye((self.X).shape[1],(self.X).shape[1]) 229 | self.D = np.zeros((self.Dsize,(self.X).shape[1],len(self.time))) 230 | t = np.linspace(0, 1, len(self.time)) 231 | for i in range(self.Dsize): 232 | for k in range((self.X).shape[1]): 233 | for j in range(1,(len(self.time)-1)): 234 | self.D[i,k,j] = self.D[i,k, j-1] + np.random.normal(0, np.sqrt(t[2]-t[1]) 235 | , size = 1) - self.D[i,k,j-1] * (t[2]-t[1]) / (1 - t[j]) 236 | 237 | elif (D == 'gaussian_wavelets'): 238 | """ We build a dictionary from gaussian wavelets. We use a discretization on [-5,5] 239 | and add two random parameters to get an interesting dictionary. 240 | The standard deviation sigma and a translationparameter K. The range of these 241 | parameters are fixed. 
242 | """ 243 | t = np.linspace(-5,5,len(self.time)) 244 | self.D = np.zeros((self.Dsize, (self.X).shape[1], len(self.time))) 245 | for i in range(self.Dsize): 246 | for j in range((self.X).shape[1]): 247 | sigma = np.random.uniform(0.2,1) 248 | K = np.random.uniform(-4,4) 249 | for l in range(len(self.time)): 250 | self.D[i,j,l] = (-(2 / (np.power(np.pi,0.25) * np.sqrt(3 * sigma)) ) 251 | * ((t[l]-K) ** 2 / (sigma ** 2) -1) * (np.exp(-(t[l] - K) ** 2 / (2 * sigma ** 2)))) 252 | 253 | elif (D == 'Dyadic_indicator'): 254 | """ We build a dictionary from the basis of the Haar wavelets using 255 | only the father wavelets. We use a discretization on [0,1] since 256 | we are interested only in the shape of the curves. 257 | """ 258 | if (J_max == None): 259 | J_max = 7 260 | a =0 261 | t = np.linspace(0,1,len(self.time)) 262 | self.D = np.zeros((np.sum(np.power(2,np.arange(J_max))) ** 2, (self.X).shape[1], len(self.time))) 263 | for J in range(J_max): 264 | for j in range((self.X).shape[1]): 265 | b = np.power(2,J) 266 | for k in range(0,b): 267 | for l in range(0,len(self.time)): 268 | x = b * t[l] - k 269 | self.D[a,j,l] = a*(0 <= x < 1) 270 | a += 1 271 | 272 | elif (D == 'cosinus'): 273 | """ We build a cosinus dictionary with random amplitudes and frequences. 274 | Amplitudes are fixed by the user while freq are fixed by the algorithm 275 | with a large range to avoid overloading parameters. 
276 | """ 277 | if (amplitude_min == None): 278 | amplitude_min = -1 279 | 280 | if (amplitude_max == None): 281 | amplitude_max = 1 282 | 283 | t = np.linspace(0,1,len(self.time)) 284 | self.D = np.zeros((self.Dsize,(self.X).shape[1],len(self.time))) 285 | for i in range(self.Dsize): 286 | for j in range((self.X).shape[1]): 287 | freq = np.random.uniform(0, 10, 1) 288 | amp = np.random.uniform(amplitude_min, amplitude_max, 1) 289 | self.D[i,j,:] = amp * np.cos(2 * np.pi * freq * t) 290 | 291 | elif (D == 'SinusCosinus'): 292 | """ We build a cosinus dictionary with random amplitudes and frequences. 293 | Amplitudes are fixed by the user while freq are fixed by the algorithm 294 | with a large range to avoid overloading parameters. 295 | """ 296 | if (amplitude_min == None): 297 | amplitude_min = -1 298 | 299 | if (amplitude_max == None): 300 | amplitude_max = 1 301 | 302 | t = np.linspace(0,1,len(self.time)) 303 | self.D = np.zeros((self.Dsize,(self.X).shape[1],len(self.time))) 304 | for i in range(self.Dsize): 305 | for j in range((self.X).shape[1]): 306 | freq = np.random.uniform(0, 10, 1) 307 | amp = np.random.uniform(amplitude_min, amplitude_max, 1) 308 | choice = np.random.choice(np.array([0,1,])) 309 | if (choice == 0): 310 | self.D[i,j,:] = amp * np.cos(2 * np.pi * freq * t) 311 | else: 312 | self.D[i,j,:] = amp * np.sin(2 * np.pi * freq * t) 313 | elif ( D == 'Self'): 314 | self.D = self.X.copy() 315 | else: raise TypeError('This Dictionary is not pre-defined') 316 | else: self.D = D 317 | 318 | self.alpha = alpha 319 | self.step = np.diff(self.time) 320 | self.deriv_D = None 321 | self.deriv_X = None 322 | 323 | if not callable(innerproduct): 324 | """ Some inner product implemented. 
325 | """ 326 | if (innerproduct == 'auto1'): 327 | 328 | 329 | if (self.alpha == None): 330 | self.alpha = 1 331 | if (self.alpha == 1): 332 | def innerproduct(x, y, xderiv = None, yderiv = None ): 333 | """We build the inner product in the paper with alpha = 1 which corresponds 334 | to L2 dot product. 335 | """ 336 | F1 = x * y 337 | A = 0 338 | for i in range(F1.shape[0]): 339 | A += np.sum((self.step * (F1[i][((np.arange(len(F1[i])) + 1) % len(F1[i]))[:len(F1[i])-1]] 340 | + F1[i][((np.arange(len(F1[i])) + -1) % len(F1[i]))[1:len(F1[i])]]) / 2)) 341 | return A 342 | 343 | 344 | 345 | elif (self.alpha == 0): 346 | self.deriv_X = np.zeros((self.X.shape[0], self.X.shape[1],self.X.shape[2]-1)) 347 | self.deriv_D = np.zeros((self.D.shape[0], self.D.shape[1],self.D.shape[2]-1)) 348 | for i in range(self.X.shape[0]): 349 | self.deriv_X[i] = derivateM(self.X[i], self.step) 350 | for i in range(self.D.shape[0]): 351 | self.deriv_D[i] = derivateM(self.D[i], self.step) 352 | def innerproduct(x,y, xderiv, yderiv): 353 | """We build the inner product in the paper with alpha = 0 which corresponds 354 | to L2 of derivate dot product. 
355 | """ 356 | A = 0 357 | F1 = x * y 358 | F2 = xderiv * yderiv 359 | F11 = np.zeros((F1.shape[0],F1.shape[1]-1)) 360 | F12 = np.zeros((F1.shape[0],F1.shape[1]-1)) 361 | F21 = np.zeros((F2.shape[0],F2.shape[1]-1)) 362 | F22 = np.zeros((F2.shape[0],F2.shape[1]-1)) 363 | for i in range(F1.shape[0]): 364 | F11[i] = F1[i][((np.arange(len(F1[i])) + 1) % len(F1[i]))[:len(F1[i])-1]] 365 | F12[i] = F1[i][((np.arange(len(F1[i])) + -1) % len(F1[i]))[1:len(F1[i])]] 366 | F21[i] = F2[i][((np.arange(len(F2[i])) + 1) % len(F2[i]))[:len(F2[i])-1]] 367 | F22[i] = F2[i][((np.arange(len(F2[i])) + -1) % len(F2[i]))[1:len(F2[i])]] 368 | 369 | for i in range(F1.shape[0]): 370 | A += (self.alpha *np.sum(( self.step * (F11[i] + F12[i]) / 2)) 371 | +(1-self.alpha) * np.sum((self.step[0:(len(self.step) - 1)] 372 | * (F21[i] + F22[i]) / 2))) 373 | return A 374 | 375 | else: 376 | self.deriv_X = np.zeros((self.X.shape[0], self.X.shape[1],self.X.shape[2]-1)) 377 | self.deriv_D = np.zeros((self.D.shape[0], self.D.shape[1],self.D.shape[2]-1)) 378 | for i in range(X.shape[0]): 379 | self.deriv_X[i] = derivateM(self.X[i], self.step) 380 | for i in range(self.D.shape[0]): 381 | self.deriv_D[i] = derivateM(self.D[i], self.step) 382 | 383 | def innerproduct(x,y, xderiv, yderiv): 384 | """We build the inner product in the paper which is a compromise between 385 | L2 scalar product and the L2 scalar product of derivate. 386 | The function that we use work only with if we have the observations 387 | of curves at constant steps. 
388 | """ 389 | A = 0 390 | F1 = x * y 391 | F2 = xderiv * yderiv 392 | F11 = np.zeros((F1.shape[0],F1.shape[1]-1)) 393 | F12 = np.zeros((F1.shape[0],F1.shape[1]-1)) 394 | F21 = np.zeros((F2.shape[0],F2.shape[1]-1)) 395 | F22 = np.zeros((F2.shape[0],F2.shape[1]-1)) 396 | x11 = np.zeros((x.shape[0],x.shape[1]-1)) 397 | x12 = np.zeros((x.shape[0],x.shape[1]-1)) 398 | x21 = np.zeros((xderiv.shape[0],xderiv.shape[1]-1)) 399 | x22 = np.zeros((xderiv.shape[0],xderiv.shape[1]-1)) 400 | y11 = np.zeros((y.shape[0],y.shape[1]-1)) 401 | y12 = np.zeros((y.shape[0],y.shape[1]-1)) 402 | y21 = np.zeros((yderiv.shape[0],yderiv.shape[1]-1)) 403 | y22 = np.zeros((yderiv.shape[0],yderiv.shape[1]-1)) 404 | for i in range(F1.shape[0]): 405 | F11[i] = F1[i][((np.arange(len(F1[i])) + 1) % len(F1[i]))[:len(F1[i])-1]] 406 | F12[i] = F1[i][((np.arange(len(F1[i])) + -1) % len(F1[i]))[1:len(F1[i])]] 407 | F21[i] = F2[i][((np.arange(len(F2[i])) + 1) % len(F2[i]))[:len(F2[i])-1]] 408 | F22[i] = F2[i][((np.arange(len(F2[i])) + -1) % len(F2[i]))[1:len(F2[i])]] 409 | x11[i] = x[i][((np.arange(len(x[i])) + 1) % len(x[i]))[:len(x[i])-1]] 410 | x12[i] = x[i][((np.arange(len(x[i])) + -1) % len(x[i]))[1:len(x[i])]] 411 | x21[i] = xderiv[i][((np.arange(len(xderiv[i])) + 1) % len(xderiv[i]))[:len(xderiv[i])-1]] 412 | x22[i] = xderiv[i][((np.arange(len(xderiv[i])) + -1) % len(xderiv[i]))[1:len(xderiv[i])]] 413 | y11[i] = y[i][((np.arange(len(y[i])) + 1) % len(y[i]))[:len(y[i])-1]] 414 | y12[i] = y[i][((np.arange(len(y[i])) + -1) % len(y[i]))[1:len(y[i])]] 415 | y21[i] = yderiv[i][((np.arange(len(yderiv[i])) + 1) % len(yderiv[i]))[:len(yderiv[i])-1]] 416 | y22[i] = yderiv[i][((np.arange(len(yderiv[i])) + -1) % len(yderiv[i]))[1:len(yderiv[i])]] 417 | 418 | 419 | 420 | 421 | 422 | for i in range(F1.shape[0]): 423 | A += (self.alpha * np.sum(F11[i] + F12[i]) / (np.sqrt(np.sum(x11[i] ** 2 + 424 | x12[i] ** 2)) * np.sqrt(np.sum(y11[i] ** 2 + y12[i] ** 2))) 425 | + (1 - self.alpha) * np.sum(F21[i] + F22[i]) / 
(np.sqrt(np.sum(x21[i] ** 2 + 426 | x22[i] ** 2)) * np.sqrt(np.sum(y21[i] ** 2 + y22[i] ** 2)))) 427 | return A 428 | elif (innerproduct == 'auto2'): 429 | if (self.alpha == None or self.alpha !=1): 430 | self.alpha = 1 431 | def innerproduct(x, y, xderiv = None, yderiv = None): 432 | """ We build the second type of generalization of the dot product in 433 | multivariate setting. 434 | """ 435 | A = 0 436 | for i in range(x.shape[1]): 437 | A += np.inner(x[:,i],y[:,i]) 438 | return A 439 | 440 | 441 | else: raise TypeError('This inner product is not pre-defined') 442 | else: self.alpha = 1 443 | 444 | self.innerproduct = innerproduct 445 | self.limit = limit 446 | if limit is None: 447 | """Set limit to the default as specified by the paper 448 | (average depth of unsuccesful search through a binary tree). 449 | """ 450 | self.limit = int(np.ceil(np.log2(self.sample))) 451 | 452 | self.c = c_factor(self.sample) 453 | 454 | if (self.alpha == 1): 455 | for i in range(self.ntrees): 456 | """This loop builds an ensemble of iTrees (the forest). 457 | """ 458 | ix = np.random.choice(np.arange(self.nobjs), size = self.sample, replace = False) 459 | 460 | self.Trees.append(iTree(X[ix], self.step, 461 | 0, self.limit, 462 | self.D, self.innerproduct, 463 | self.alpha, self.deriv_X, 464 | self.deriv_D)) 465 | else: 466 | for i in range(self.ntrees): 467 | """This loop builds an ensemble of iTrees (the forest). 468 | """ 469 | ix = np.random.choice(np.arange(self.nobjs), size = self.sample, replace = False) 470 | 471 | self.Trees.append(iTree(X[ix], self.step, 472 | 0, self.limit, 473 | self.D, self.innerproduct, 474 | self.alpha, self.deriv_X[ix], 475 | self.deriv_D)) 476 | 477 | def compute_paths(self, X_in = None): 478 | """ 479 | compute_paths(X_in = None) 480 | 481 | Compute the anomaly score of an input sample is computed as 482 | the mean anomaly score of the trees in the forest. 483 | Parameters 484 | ---------- 485 | X_in : Array-like 486 | Data to be scored. 
FIForest.Trees are used for computing the depth reached in 487 | each tree by each data curve. 488 | Returns 489 | ------- 490 | float 491 | Anomaly score for a given data curve. 492 | """ 493 | if X_in is None: 494 | X_in = self.X 495 | if(self.alpha != 1): 496 | deriv_X_in = self.deriv_X 497 | else: 498 | if(self.alpha != 1): 499 | deriv_X_in = np.zeros((X_in.shape[0],X_in.shape[1],X_in.shape[2]-1)) 500 | for i in range(X_in.shape[0]): 501 | deriv_X_in[i] = derivateM(X_in[i], self.step) 502 | S = np.zeros(len(X_in)) 503 | 504 | for i in range(len(X_in)): 505 | h_temp = 0 506 | for j in range(self.ntrees): 507 | # Compute path length for each curve 508 | if(self.alpha != 1): 509 | h_temp += PathFactor(X_in[i], self.step, 510 | self.Trees[j], self.alpha, 511 | deriv_X_in[i]).path * 1.0 512 | else: 513 | h_temp += PathFactor(X_in[i], self.step, 514 | self.Trees[j],self.alpha).path * 1.0 515 | 516 | # Average of path length travelled by the point in all trees. 517 | Eh = h_temp / self.ntrees 518 | 519 | # Anomaly Score 520 | S[i] = 2.0 ** (- Eh / self.c) 521 | return S 522 | def threshold(self, score_samples, contamination = 0.1): 523 | """Compute the treshold to declare curves as anomalies or not. 524 | The choice of 'lower' interpolation in the percentile function come from 525 | the fact that it should be a little gap between the score of anomalies and the normal score. 526 | This choice could be different depending on the problem given. 527 | 528 | Parameters 529 | ---------- 530 | 531 | score_samples : Array 532 | The score array for a dataset of curves. 533 | 534 | contamination : float, optional (default=0.1) 535 | The amount of contamination of the data set, i.e. the proportion 536 | of outliers in the data set. Used when fitting to define the threshold 537 | on the decision function. 
538 | 539 | """ 540 | return np.percentile(score_samples, 100 * (1-contamination), interpolation = 'lower') 541 | 542 | def predict_label(self, score, contamination = 0.1): 543 | 544 | """Compute the label vector of curves. 545 | 546 | Parameters 547 | ---------- 548 | 549 | score : Array 550 | The score array for a dataset of curves. 551 | 552 | contamination : float, optional (default=0.1) 553 | The amount of contamination of the data set, i.e. the proportion 554 | of outliers in the data set. Used when fitting to define the threshold 555 | on the decision function. 556 | 557 | Returns 558 | ------- 559 | 560 | y_label : array 561 | An array of predict label, -1 if the curve is considered as normal and +1 if not. 562 | """ 563 | y_label = np.zeros((len(score))) 564 | return -1 + 2.0 * (score > self.threshold(score, contamination)) 565 | 566 | 567 | class Node(object): 568 | """ 569 | A single node from each tree (each iTree object). Nodes containe information on hyperplanes used for data division, date to be passed to left and right nodes, whether they are external or internal nodes. 570 | Attributes 571 | ---------- 572 | e: int 573 | Depth of the tree to which the node belongs. 574 | 575 | size: int 576 | Size of the dataset present at the node. 577 | 578 | X: Array-like 579 | Data at the node. 580 | 581 | d: Array-like 582 | Direction function used to build the hyperplane that splits the data in the node. 583 | 584 | dd : int 585 | The index of the direction chosen at this node. 586 | 587 | q: Array 588 | Intercept point through which the hyperplane passes. 589 | 590 | left: Node object 591 | Left child node. 592 | 593 | right: Node object 594 | Right child node. 595 | 596 | ntype: str 597 | The type of the node: 'exNode', 'inNode'. 
class iTree(object):

    """
    A single tree of the forest, built recursively from one subsample of curves.

    Attributes
    ----------
    e : int
        Depth passed to the most recent ``make_tree`` call.

    X : Array-like
        Data present at the root node of this tree.

    step : array
        Vector of the lengths of the discretization intervals (stored, not
        used by the tree construction itself).

    size : int
        Number of curves at the root (``len(X)``).

    l : int
        Maximum depth a branch can reach before it is closed by an external node.

    d : Array
        Direction (row of ``D``) most recently drawn by ``make_tree``.

    dd : int
        Row index of ``d`` inside ``D``.

    q : float
        Split threshold most recently drawn by ``make_tree``.

    exnodes : int
        Number of external nodes created so far.

    D : Array-like
        Dictionary of functions used as directions (one per row).

    innerproduct : callable
        Inner product used to project curves on a direction. It is called as
        ``innerproduct(x, d)`` when ``alpha == 1`` and as
        ``innerproduct(x, d, x_deriv, d_deriv)`` otherwise.

    alpha : float
        Weight of the inner product; the value 1 selects the two-argument call.

    deriv_X : Array-like
        Derivatives of the rows of ``X`` (row ``i`` belongs to ``X[i]``).

    deriv_D : Array-like
        Derivatives of the rows of ``D``.

    root : Node object
        Root of the tree returned by ``make_tree``.

    Methods
    -------
    make_tree(X, e, deriv_X=None)
        Builds the tree recursively from a given node. Returns a Node object.
    """

    def __init__(self,
                 X,
                 step,
                 e,
                 l,
                 D,
                 innerproduct,
                 alpha,
                 deriv_X=None,
                 deriv_D=None):

        self.e = e
        self.X = X
        self.step = step
        self.size = len(X)
        self.l = l
        self.q = None
        self.d = None
        self.dd = None
        self.exnodes = 0
        self.D = D
        self.innerproduct = innerproduct
        self.alpha = alpha
        self.deriv_X = deriv_X
        self.deriv_D = deriv_D
        self.root = self.make_tree(self.X, self.e)

    def make_tree(self, X, e, deriv_X=None):
        """
        make_tree(X, e, deriv_X=None)
        Builds the tree recursively from a given node. Returns a Node object.

        Parameters
        ----------
        X : Array like
            Curves reaching the node being built.

        e : int
            Depth of the node being built. e <= l.

        deriv_X : Array like, optional
            Derivatives of the rows of ``X``, row-aligned with ``X``. When
            omitted, ``self.deriv_X`` is used, which is only aligned with the
            root sample. The recursion always passes the matching subset.

        Returns
        -------
        Node object
        """
        if deriv_X is None:
            deriv_X = self.deriv_X
        self.e = e

        # A curve is isolated, or the depth limit has been reached: close the
        # branch with an external node.
        if e >= self.l or len(X) <= 1:
            self.exnodes += 1
            return Node(X, self.d, self.dd, self.q, e, None, None, node_type='exNode')

        # Internal node: draw a direction, project the curves, draw a threshold.
        sample_size = X.shape[0]
        idx = np.random.choice(range(0, (self.D).shape[0]), size=1)
        self.d = self.D[idx[0]]
        self.dd = idx[0]
        Z = np.zeros((sample_size))
        use_derivatives = (self.alpha != 1)
        if use_derivatives:
            # Row i of deriv_X must be the derivative of X[i]; indexing the
            # root-level self.deriv_X here would mix up curves below the root.
            deriv_X = np.asarray(deriv_X)
            for i in range(sample_size):
                Z[i] = self.innerproduct(X[i], self.d, deriv_X[i], self.deriv_D[idx[0]])
        else:
            for i in range(sample_size):
                Z[i] = self.innerproduct(X[i], self.d)

        # Picking a random threshold for the hyperplane splitting data.
        self.q = np.random.uniform(np.min(Z), np.max(Z))
        # Criteria that determines if a curve should go to the left or right child node.
        w = Z - self.q < 0

        # The recursive calls overwrite self.d / self.dd / self.q, so keep this
        # node's own split before descending.
        d, dd, q = self.d, self.dd, self.q
        if use_derivatives:
            left = self.make_tree(X[w], e + 1, deriv_X[w])
            right = self.make_tree(X[~w], e + 1, deriv_X[~w])
        else:
            left = self.make_tree(X[w], e + 1)
            right = self.make_tree(X[~w], e + 1)
        return Node(X, d, dd, q, e, left=left, right=right, node_type='inNode')
def find_path(self, T):
    """
    find_path(T)
    Walk a single curve down a tree and return the depth it reaches.

    At every internal node the curve ``self.x`` is projected on the node's
    direction with ``self.innerproduct``; a projection below the node's
    threshold sends the curve left ('L'), otherwise right ('R'). Each
    decision is appended to ``self.path_list`` and adds one to ``self.e``.

    Parameters
    ----------
    T : Node object
        Node from which the descent starts (the root of an iTree).

    Returns
    -------
    int or float
        ``self.e`` after the descent. When the external node reached holds
        more than one curve, ``c_factor(size)`` is added to it.
    """
    node = T
    while node.ntype != 'exNode':
        self.e += 1
        if self.alpha != 1:
            projection = self.innerproduct(self.x, node.d, self.deriv_x, self.deriv_D[node.dd])
        else:
            projection = self.innerproduct(self.x, node.d, self.step)

        goes_left = projection - node.q < 0
        self.path_list.append('L' if goes_left else 'R')
        node = node.left if goes_left else node.right

    # External node reached: account for the curves that were not separated.
    if node.size > 1:
        self.e = self.e + c_factor(node.size)
    return self.e
The C++ code is heavily inspired by the awesome work of https://github.com/sahandha/eif and is much faster than the Python version. Three dictionaries are implemented: Brownian motion, Gaussian wavelets and cosine; see the paper for more details. Feel free to suggest any additional dictionaries. 5 | 6 | 7 | ========================================= 8 | 9 | 10 | Installation 11 | ------------ 12 | Download this repository and then run this python command in the folder: 13 | 14 | .. code:: python 15 | 16 | python setup.py build_ext --inplace 17 | 18 | Then you can import the algorithm with the following command in your Python script: 19 | 20 | .. code:: python 21 | 22 | import fif as FIF 23 | 24 | NB: our algorithm is not the one from https://pypi.org/project/fif/. Uninstall that package if you want to use Functional Isolation Forest. 25 | 26 | 27 | Algorithm 28 | --------- 29 | Functional Isolation Forest is an anomaly detection (and anomaly ranking) algorithm for functional data (i.e., time-series). 30 | It is flexible enough to distinguish most types of anomalies in functional data. 31 | 32 | The algorithm returns the anomaly score of each sample with the function compute_paths(); see the notebooks for examples or the quick start below. 33 | 34 | Some parameters have to be set by the user : 35 | - X [numpy array of size (n,dim)]: 'n' functional data with 'dim' measurements. 36 | - time [numpy array of size dim]: vector of measurement times of size 'dim'. 37 | - sample_size [int]: the size of samples used for each tree. 38 | - ntrees [int]: the number of trees, default value is 100. 39 | - alpha [float between 0 and 1]: convex combination parameter for the innerproduct (as explained in the paper), default value is 1. 40 | - dic_number [int: 0,1,2]: three dictionaries are implemented (0: Brownian motion; 1: Gaussian wavelets; 2: cosine), default value is 1. 41 | 42 | 43 | Quick Start : 44 | ------------ 45 | 46 | Create a toy dataset : 47 | 48 | ..
code:: python 49 | 50 | 51 | import numpy as np 52 | np.random.seed(42) 53 | m =100;n =100;tps = np.linspace(0,1,m);v = np.linspace(1,1.4,n) 54 | X = np.zeros((n,m)) 55 | for i in range(n): 56 | X[i] = 30 * ((1-tps) ** v[i]) * tps ** v[i] 57 | Z1 = np.zeros((m)) 58 | for j in range(m): 59 | if (tps[j]<0.2 or tps[j]>0.8): 60 | Z1[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 61 | else: 62 | Z1[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + np.random.normal(0,0.3,1) 63 | Z1[0] = 0 64 | Z1[m-1] = 0 65 | Z2 = 30 * ((1-tps) ** 1.6) * tps ** 1.6 66 | Z3 = np.zeros((m)) 67 | for j in range(m): 68 | Z3[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + np.sin(2*np.pi*tps[j]) 69 | 70 | Z4 = np.zeros((m)) 71 | for j in range(m): 72 | Z4[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 73 | 74 | for j in range(70,71): 75 | Z4[j] += 2 76 | 77 | Z5 = np.zeros((m)) 78 | for j in range(m): 79 | Z5[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + 0.5*np.sin(10*np.pi*tps[j]) 80 | 81 | X = np.concatenate((X,Z1.reshape(1,-1),Z2.reshape(1,-1), 82 | Z3.reshape(1,-1), Z4.reshape(1,-1), Z5.reshape(1,-1)), axis = 0) 83 | 84 | 85 | 86 | And then use FIF to rank functional dataset : 87 | 88 | .. code:: python 89 | 90 | import fif as FIF 91 | F = FIF.FiForest(X, time=tps, ntrees=100, sample_size=64, dic_number=1, alpha=0.5, seed=0) 92 | Anomaly_score = F.compute_paths() 93 | 94 | The simulated dataset with the five introduced anomalies (top). The sorted dataset (middle), the darker the color, the more the curves are considered anomalies. The sorted anomaly score of the dataset (bottom). 95 | 96 | .. image:: Figures/anomaly_example-1.png 97 | .. image:: Figures/anomaly_example_rank-1.png 98 | .. 
cdef class FiForest:
    """Python wrapper around the C++ ``FiForest`` class declared in ``__fif.pxd``.

    The forest is built in ``__cinit__`` (which calls the C++ ``fit``) and
    scored with ``compute_paths`` / ``compute_paths_single_tree``.

    Parameters
    ----------
    X : 2-D float64 array
        Training curves, one per row.
    time : 1-D float64 array
        Measurement times; must have one entry per column of ``X``.
    sample_size : int
        Number of curves drawn for each tree; cannot exceed ``X.shape[0]``.
    ntrees, limit, seed, dic_number, alpha
        Forwarded unchanged to the C++ constructor.

    Raises
    ------
    ValueError
        If ``time`` does not match the width of ``X`` or ``sample_size`` is
        larger than the number of curves.
    """
    cdef int size_X
    cdef int dim
    cdef int _ntrees
    cdef int _limit
    cdef int sample
    cdef int tree_index
    cdef int dic_number
    cdef double alpha
    cdef __fif.FiForest* thisptr
    # The C++ fit() stores the raw data pointers it is given, so the arrays
    # that own those buffers must stay alive as long as this object does.
    cdef object _X_ref
    cdef object _time_ref

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def __cinit__ (self, np.ndarray[double, ndim=2] X not None, np.ndarray[double, ndim=1] time not None, int sample_size, int ntrees=100, int limit=0, int seed=-1, int dic_number=1, double alpha=1.0):
        if time.shape[0] != X.shape[1]:
            raise ValueError("time must have one entry per column of X "
                             "(got %d, expected %d)" % (time.shape[0], X.shape[1]))
        if sample_size > X.shape[0]:
            # The C++ side only prints a message and leaves the forest unbuilt.
            raise ValueError("sample_size (%d) cannot be larger than the number "
                             "of curves (%d)" % (sample_size, X.shape[0]))
        self.thisptr = new __fif.FiForest (ntrees, sample_size, limit, seed, dic_number, alpha)
        if not X.flags['C_CONTIGUOUS']:
            X = X.copy(order='C')
        if not time.flags['C_CONTIGUOUS']:
            time = time.copy(order='C')
        # Keep the (possibly copied) arrays alive: without these references a
        # contiguous copy made above would be freed when __cinit__ returns.
        self._X_ref = X
        self._time_ref = time
        self.size_X = X.shape[0]
        self.dim = X.shape[1]
        self.sample = sample_size
        self._ntrees = ntrees
        self._limit = self.thisptr.limit
        self.alpha = alpha
        self.dic_number = dic_number
        self.thisptr.fit (<double*> np.PyArray_DATA(X), <double*> np.PyArray_DATA(time), self.size_X, self.dim)

    @property
    def ntrees(self):
        """Number of trees requested at construction."""
        return self._ntrees

    @property
    def limit(self):
        """Depth limit read from the C++ object right after its construction."""
        return self._limit

    def __dealloc__ (self):
        # Deleting a NULL pointer is a no-op, so this is safe even when
        # __cinit__ raised before the C++ object was created.
        del self.thisptr

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def compute_paths (self, np.ndarray[double, ndim=2] X_in=None):
        """Return one anomaly score per curve.

        With ``X_in=None`` a NULL pointer is passed to the C++ ``predict`` and
        one score per training curve is returned (presumably computed on the
        data given at construction -- confirm in fif.cxx). Otherwise ``X_in``
        must have the same number of columns as the training data.
        """
        cdef np.ndarray[double, ndim=1, mode="c"] S
        if X_in is None:
            S = np.empty(self.size_X, dtype=np.float64, order='C')
            self.thisptr.predict (<double*> np.PyArray_DATA(S), NULL, 0)
        else:
            # The C++ code reads `dim` values per row without checking.
            if X_in.shape[1] != self.dim:
                raise ValueError("X_in has %d columns, expected %d" % (X_in.shape[1], self.dim))
            if not X_in.flags['C_CONTIGUOUS']:
                X_in = X_in.copy(order='C')
            S = np.empty(X_in.shape[0], dtype=np.float64, order='C')
            self.thisptr.predict (<double*> np.PyArray_DATA(S), <double*> np.PyArray_DATA(X_in), X_in.shape[0])
        return S

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def compute_paths_single_tree (self, np.ndarray[double, ndim=2] X_in=None, tree_index=0):
        """Same as ``compute_paths`` but using only the tree ``tree_index``.

        Raises ``IndexError`` when ``tree_index`` is outside ``[0, ntrees)``.
        """
        cdef np.ndarray[double, ndim=1, mode="c"] S
        cdef int index = tree_index
        if index < 0 or index >= self._ntrees:
            raise IndexError("tree_index %d out of range [0, %d)" % (index, self._ntrees))
        if X_in is None:
            S = np.empty(self.size_X, dtype=np.float64, order='C')
            self.thisptr.predictSingleTree (<double*> np.PyArray_DATA(S), NULL, 0, index)
        else:
            if X_in.shape[1] != self.dim:
                raise ValueError("X_in has %d columns, expected %d" % (X_in.shape[1], self.dim))
            if not X_in.flags['C_CONTIGUOUS']:
                X_in = X_in.copy(order='C')
            S = np.empty(X_in.shape[0], dtype=np.float64, order='C')
            self.thisptr.predictSingleTree (<double*> np.PyArray_DATA(S), <double*> np.PyArray_DATA(X_in), X_in.shape[0], index)
        return S

    def output_tree_nodes (self, int tree_index):
        """Print the nodes of tree ``tree_index`` through the C++ ``OutputTreeNodes``."""
        if tree_index < 0 or tree_index >= self._ntrees:
            raise IndexError("tree_index %d out of range [0, %d)" % (tree_index, self._ntrees))
        self.thisptr.OutputTreeNodes (tree_index)
*/ 20 | { 21 | std::vector linspaced; 22 | double delta = (end - start) / (num - 1); 23 | 24 | if (num == 0) { return linspaced; } 25 | if (num == 1) 26 | { 27 | linspaced.push_back(start); 28 | return linspaced; 29 | } 30 | 31 | for(int i=0; i < num-1; ++i) 32 | { 33 | linspaced.push_back(start + delta * i); 34 | } 35 | linspaced.push_back(end); 36 | 37 | return linspaced; 38 | } 39 | 40 | inline std::vector dictionary_function (int dim, int dic_number, RANDOM_ENGINE& random_engine_in) 41 | /* return a function sampled from a dictionary. Three choices are possible: 42 | * 43 | * 'dic_number=0' means Brownian motion 44 | * 'dic_num=1' means gaussian wavelets 45 | * 'dic_number=2' means cosine dictionary. 46 | */ 47 | { 48 | std::vector dic_function (dim, 0.0); 49 | std::vector t (dim, 0.0); 50 | t = linspace(-5,5,dim); 51 | 52 | if (dic_number == 0) // Standard Brownian motion 53 | { 54 | dic_function[0] = std::normal_distribution (0.0, 1.0) (random_engine_in); 55 | for (int i=1; i (0.0, std::sqrt(t[i] - t[i-1]))(random_engine_in); 58 | } 59 | } 60 | else if (dic_number == 1) // gaussian wavelets with various mean and std 61 | { 62 | 63 | double sigma; 64 | double K; 65 | 66 | 67 | sigma = std::uniform_real_distribution (0.2, 1)(random_engine_in); 68 | K = std::uniform_real_distribution (-4.0, 4.0)(random_engine_in); 69 | for (int i=0; i (-1, 1)(random_engine_in); 81 | freq = std::uniform_real_distribution (0, 10)(random_engine_in); 82 | 83 | for (int i=0; i prod (dim, 0.0); 112 | 113 | for (int i=0; i prod_derivate (dim-1, 0.0); 128 | std::vector X1_derivate (dim-1, 0.0); 129 | std::vector X2_derivate (dim-1, 0.0); 130 | 131 | X1_derivate = derivate(X1, time, dim); 132 | X2_derivate = derivate(X2, time, dim); 133 | prod_derivate[0] = X1_derivate[0] * X2_derivate[0]; 134 | 135 | for (int i=1; i prod_derivate (dim-1, 0.0); 145 | std::vector step_time (dim-1, 0.0); 146 | std::vector X1_derivate (dim-1, 0.0); 147 | std::vector X2_derivate (dim-1, 0.0); 148 | double 
inner = 0.0; 149 | double inner_derivate = 0.0; 150 | double norm_X1 = 0.0; 151 | double norm_X2 = 0.0; 152 | double norm_X1_derivate = 0.0; 153 | double norm_X2_derivate = 0.0; 154 | 155 | 156 | prod[0] = X1[0] * X2[0]; 157 | for (int i=1; i sample_without_replacement (int k, int N, RANDOM_ENGINE& gen) 197 | /* Sample k elements from the range [1, N] without replacement */ 198 | { 199 | 200 | // Create an unordered set to store the samples 201 | std::unordered_set samples; 202 | 203 | // Sample and insert values into samples 204 | for (int r=N-k+1; r(1, r)(gen); 207 | if (!samples.insert(v).second) samples.insert(r); 208 | } 209 | 210 | // Copy samples into vector 211 | std::vector result(samples.begin(), samples.end()); 212 | 213 | // Shuffle vector 214 | std::shuffle(result.begin(), result.end(), gen); 215 | 216 | return result; 217 | 218 | } 219 | 220 | void output_tree_node (Node* node_in, std::string string_in) 221 | { 222 | 223 | std::cout << "==== Node ====" << std::endl; 224 | std::cout << "path: " << string_in << std::endl; 225 | std::cout << "e : " << node_in[0].e << std::endl; 226 | std::cout << "size: " << node_in[0].size << std::endl; 227 | std::cout << "n : ["; 228 | int size_n = node_in[0].dic_vector.size(); 229 | for (int i=0; i dic_vector (dim, 0.0); 318 | 319 | if (e_in >= limit || size_in <= 1) { 320 | 321 | Node* left = NULL; 322 | Node* right = NULL; 323 | exnodes += 1; 324 | Node* node = new Node (size_in, dim, &dic_vector[0], treshold, e_in, left, right, "exNode"); 325 | return node; 326 | 327 | } else { 328 | 329 | std::vector innerprod (size_in, 0.0); 330 | std::vector XL, XR; 331 | int sizeXL = 0; 332 | int sizeXR = 0; 333 | 334 | dic_vector = dictionary_function(dim, dic_number, random_engine_in); 335 | for (int i=0; i (innermin, innermax)(random_engine_in); 347 | 348 | // Assign data in left and right leaves. 349 | for (int i=0; i nobjs) 467 | { 468 | std::cout << "No. of data points is " << nobjs << ". 
Subsample size cannot be larger than " << nobjs << "." << std::endl; 469 | return false; 470 | } 471 | 472 | return true; 473 | 474 | } 475 | 476 | void FiForest::fit (double* X_in, double* time_in, int nobjs_in, int dim_in) 477 | { 478 | std::vector Xsubset; 479 | 480 | X = X_in; 481 | time = time_in; 482 | nobjs = nobjs_in; 483 | dim = dim_in; 484 | if (!CheckSampleSize ()) return; 485 | 486 | 487 | 488 | for (int i=0; i sample_index = sample_without_replacement (sample, nobjs, random_engine); 493 | Xsubset.clear(); 494 | for (int j=0; j 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define EULER_CONSTANT 0.5772156649 10 | #define PI_CONSTANT 3.1415926535 11 | 12 | #define RANDOM_ENGINE std::mt19937_64 13 | #define RANDOM_SEED_GENERATOR std::random_device 14 | 15 | 16 | /**************************** 17 | Class Node 18 | ****************************/ 19 | class Node 20 | { 21 | 22 | private: 23 | 24 | protected: 25 | 26 | public: 27 | int e; 28 | int size; 29 | std::vector dic_vector; 30 | double treshold; 31 | Node* left; 32 | Node* right; 33 | std::string node_type; 34 | 35 | Node (int, int, double*, double, int, Node*, Node*, std::string); 36 | ~Node (); 37 | 38 | }; 39 | 40 | 41 | /**************************** 42 | Class FiTree 43 | ****************************/ 44 | class FiTree 45 | { 46 | 47 | private: 48 | int e; 49 | int size; 50 | double alpha; 51 | int dic_number; 52 | int dim; 53 | int limit; 54 | int exnodes; 55 | 56 | protected: 57 | 58 | public: 59 | Node* root; 60 | 61 | FiTree (); 62 | ~FiTree (); 63 | void build_tree (double*, double*, int, int, int, double,int, int, RANDOM_ENGINE&); 64 | Node* add_node (double*, double*, int, int, RANDOM_ENGINE&); 65 | 66 | }; 67 | 68 | 69 | /************************* 70 | Class Path 71 | *************************/ 72 | class Path 73 | { 74 | 75 | private: 76 | int dim; 77 | double alpha; 78 | double* time; 79 | double* x; 80 | double e; 81 | protected: 82 | 83 | public: 
84 | std::vector path_list; 85 | double pathlength; 86 | 87 | Path (double*, int, double, double*, FiTree); 88 | ~Path (); 89 | double find_path (Node*); 90 | 91 | }; 92 | 93 | 94 | /**************************** 95 | Class FiForest 96 | ****************************/ 97 | class FiForest 98 | { 99 | 100 | private: 101 | int nobjs; 102 | int dim; 103 | int sample; 104 | int ntrees; 105 | int dic_number; 106 | double alpha; 107 | double* X; 108 | double * time; 109 | double c; 110 | FiTree* Trees; 111 | unsigned random_seed; 112 | 113 | bool CheckSampleSize (); 114 | protected: 115 | 116 | public: 117 | int limit; 118 | FiForest (int, int, int, int, int, double); 119 | ~FiForest (); 120 | void fit (double*, double*, int, int); 121 | void predict (double*, double*, int); 122 | void predictSingleTree (double*, double*, int, int); 123 | void OutputTreeNodes (int); 124 | 125 | }; 126 | 127 | 128 | /******************************** 129 | Utility functions 130 | ********************************/ 131 | inline std::vector derivate (double* , double*, int); 132 | inline std::vector linspace(double, double, int); 133 | inline std::vector dictionary_function (int , int, RANDOM_ENGINE&); 134 | inline std::vector sample_without_replacement (int, int, RANDOM_ENGINE&); 135 | inline double inner_product (double*, double*, double*, double, int); 136 | inline double c_factor (int); 137 | void output_tree_node (Node*, std::string); 138 | void delete_tree_node (Node*); 139 | -------------------------------------------------------------------------------- /old_fif.py: -------------------------------------------------------------------------------- 1 | """ Functional Isolation Forest 2 | 3 | Author : Guillaume Staerman 4 | """ 5 | 6 | 7 | """Functional Isolation Forest Algorithm 8 | 9 | This is the implementation of The Functional Isolation Forest which is an 10 | extension of the original Isolation Forest applied to functional data. 
11 | 12 | It returns the anomaly score of each sample using the FIF algorithm. 13 | The Functional Isolation Forest 'isolates' observations by 14 | randomly selecting a curve from a dictionary 15 | and then randomly selecting a split value between the maximum 16 | and minimum values of the projections on the selected curve. 17 | 18 | Since recursive partitioning can be represented by a tree structure, the 19 | number of splittings required to isolate a sample is equivalent to the path 20 | length from the root node to the terminating node. 21 | 22 | This path length, averaged over a forest of such random trees, is a 23 | measure of normality. 24 | 25 | Random partitioning produces noticeably shorter paths for anomalies. 26 | Hence, when a forest of random trees collectively produces shorter path 27 | lengths for particular samples, they are highly likely to be anomalies. 28 | 29 | Since the probability distribution nu defined in the paper is (in the interesting case) 30 | continuous on an infinite dimensional space, we do not represent it in this implementation. 31 | Instead, several dictionaries are already defined, such as Brownian dictionaries, Brownian bridges, etc., 32 | where the input of the Wiener measure would be more difficult for the user. If one wants to use 33 | a discrete measure nu, one has to 'replace it' with an appropriate dictionary. 34 | Example : if nu is a discrete measure with ten values with different probability weights 35 | and you have a dictionary D of size 10, then you build a larger dictionary that repeats 36 | the ten functions according to their weights.
def derivate(X, step):
    """Compute the derivative of each function (row) of X by finite differences.

    Parameters
    ----------
    X : Array-like of shape (n_curves, n_points)
        One discretized function per row.
    step : Array-like of length n_points - 1
        Lengths of the discretization intervals (e.g. ``np.diff(time)``).
        Any array-like is accepted; it is converted to float.

    Returns
    -------
    ndarray of shape (n_curves, n_points - 1)
        ``(X[i, j + 1] - X[i, j]) / step[j]`` for every row ``i``.
    """
    step = np.asarray(step, dtype=float)
    # One vectorised difference along the time axis replaces the per-row loop.
    return np.diff(np.asarray(X, dtype=float), axis=1) / step


def derivate_piecewise(X, step):
    """Compute the derivative of each piecewise function (row) of X.

    Only the non-zero samples of a row are differentiated: consecutive
    non-zero values are differenced and divided by ``step`` taken at the
    position of the first value of each pair. All other entries stay 0.

    Parameters
    ----------
    X : ndarray of shape (n_curves, n_points)
    step : ndarray of length n_points - 1

    Returns
    -------
    ndarray of shape (n_curves, n_points - 1)
    """
    A = np.zeros((X.shape[0], X.shape[1] - 1))
    for i, row in enumerate(X):
        support = np.where(row != 0)[0]
        # Every non-zero position except the last one starts a difference.
        starts = support[:-1]
        A[i, starts] = np.diff(row[support]) / step[starts]
    return A


def c_factor(n_samples_leaf):
    """
    Average path length of unsuccessful search in a binary search tree given n points

    Parameters
    ----------
    n_samples_leaf : int
        Number of curves for the BST. The formula takes ``log(n - 1)``, so it
        is only finite for ``n_samples_leaf >= 2``.

    Returns
    -------
    float
        Average path length of unsuccessful search in a BST
    """
    harmonic_term = 2.0 * (np.log(n_samples_leaf - 1) + np.euler_gamma)
    correction = 2. * (n_samples_leaf - 1.) / (n_samples_leaf * 1.0)
    return harmonic_term - correction
It is the number of curves that we will use in our 113 | dictionary to build the forest. 114 | 115 | innerproduct : str or function 116 | An inner product that we use for the construction of the tree. The innerproduct in the paper 117 | is already implemented, call it with 'auto' and fixe and alpha. If a function is given by 118 | the user, it should have three argument : (x, y, step) where x and y are curve (represented 119 | by a vector of length of the discretization). "step" is a vector of length len(time)-1 which 120 | represents the vector of length of step between the discretization. 121 | 122 | alpha : float 123 | a float number between [0,1] used in the innerproduct of the paper. 124 | 125 | deriv_X : Array like 126 | A matrix of derivate of X if needed for the scalar product. 127 | 128 | deriv_dictionary : Array like 129 | A matrix of derivate of D if needed for the scalar product. 130 | 131 | Attributes 132 | ------- 133 | compute_paths(X_in) : 134 | Computes the anomaly score for data X_in 135 | 136 | threshold(score_sample, contamination) : 137 | Given the score returned by the fit function on training sample and a proportion 138 | of anomalies, compute the threshold which separates anomalies and normal data. 139 | 140 | predict_label(score, contamination) : 141 | Given any score (training or testing) and the proportion of anomalies 142 | it return the labels predicted. The function return +1 for outliers and 143 | -1 for inliers. 144 | 145 | 146 | References 147 | ---------- 148 | 149 | .. [1] Staerman, G, Mozharovskyi, P, D'Alché-buc, F and Clémençon,S. "Functional Isolation forest." 
150 | 151 | 152 | """ 153 | 154 | def __init__(self, 155 | X, 156 | D, 157 | time, 158 | innerproduct, 159 | criterion="naive", 160 | ntrees=None, 161 | subsample_size=None, 162 | Dsize=None, 163 | limit=None, 164 | mean=None, 165 | sd=None, 166 | J_max=None, 167 | alpha=None): 168 | 169 | self.X = X 170 | self.nobjs = len(X) 171 | self.Trees = [] 172 | self.time = time 173 | self.criterion = criterion 174 | self.mean = mean 175 | self.sd = sd 176 | self.D = D 177 | 178 | 179 | if (ntrees == None): 180 | self.ntrees = 100 181 | else: self.ntrees = ntrees 182 | 183 | if (subsample_size == None): 184 | if (self.nobjs > 500): 185 | self.sample = 256 186 | else: self.sample = np.minimum(64, self.X.shape[0]) 187 | else : self.sample = subsample_size 188 | 189 | 190 | if (Dsize == None): 191 | self.Dsize = 1000 192 | else: self.Dsize = Dsize 193 | 194 | 195 | if (type(D) == str): 196 | """Finite dictionaries are pre-implemented. 197 | """ 198 | 199 | if (D == 'Dyadic_indicator'): 200 | """ We build a dictionary from the basis of the Haar wavelets using 201 | only the father wavelets. We use a discretization on [0,1] since 202 | we are interested only in the shape. 203 | """ 204 | if (J_max == None): 205 | J_max = 7 206 | a =0 207 | t = np.linspace(0,1,len(self.time)) 208 | self.D = np.zeros((np.sum(np.power(2, np.arange(J_max))), len(self.time))) 209 | for J in range(J_max): 210 | b = np.power(2, J) 211 | for k in range(b): 212 | for l in range(len(self.time)): 213 | x = b * t[l] - k 214 | self.D[a,l] = 1 * (0 <= x < 1) 215 | a += 1 216 | 217 | elif (D == 'Multiresolution_linear'): 218 | """ We build a dictionary from the basis of the Haar wavelets using 219 | only the father wavelets. We use a discretization on [0,1] since 220 | we are interested only in the shape. 
221 | """ 222 | if (J_max == None): 223 | J_max = 7 224 | a =0 225 | t = np.linspace(0, 1, len(self.time)) 226 | self.D = np.zeros((np.sum(np.power(2, np.arange(J_max))), len(self.time))) 227 | for J in range(J_max): 228 | b = np.power(2,J) 229 | for k in range(b): 230 | for l in range(len(self.time)): 231 | x = b * t[l] - k 232 | self.D[a,l] = t[l] * (0 <= x < 1) 233 | a += 1 234 | 235 | elif(D == 'Self_local'): 236 | """ 237 | """ 238 | self.D = np.zeros((self.Dsize, len(self.time))) 239 | for i in range(self.Dsize): 240 | a = (self.time[len(self.time) - 1] - self.time[0]) * np.random.random() + self.time[0] 241 | b = (self.time[len(self.time) - 1] - self.time[0]) * np.random.random() + self.time[0] 242 | for j in range(len(self.time)): 243 | k = np.random.randint(low=0, high=X.shape[0], size=1) 244 | self.D[i,j] = self.X.copy()[k,j] * (np.maximum(a, b) > self.time[j] > np.minimum(a, b)) 245 | 246 | elif (D == 'Self'): 247 | self.D = self.X.copy() 248 | 249 | self.alpha = alpha 250 | self.step = np.diff(self.time) 251 | 252 | if (type(D) == str): 253 | 254 | if (D == 'Self_local' or D == 'Self'): 255 | self.deriv_dictionary = derivate(self.D, self.step) 256 | 257 | elif(D == 'Multiresolution_linear' or D == 'Dyadic_indicator'): 258 | self.deriv_dictionary = derivate_piecewise(self.D, self.step) 259 | 260 | else: self.deriv_dictionary = [] 261 | 262 | self.deriv_X = None 263 | 264 | if not callable(innerproduct): 265 | """ Some inner product implemented. 266 | """ 267 | if (innerproduct == 'auto'): 268 | 269 | if (self.alpha == None): 270 | self.alpha = 1 271 | 272 | if (self.alpha == 0): 273 | self.deriv_X = derivate(self.X, self.step) 274 | 275 | def innerproduct(x, y, xderiv, yderiv): 276 | """We build the inner product in the paper with alpha = 0 which corresponds 277 | to L2 of derivate dot product. 
278 | """ 279 | F1 = x * y 280 | F2 = xderiv * yderiv 281 | 282 | F11 = F1[((np.arange(len(F1)) + 1) % len(F1))[:len(F1)-1]] 283 | F12 = F1[((np.arange(len(F1)) + -1) % len(F1))[1:len(F1)]] 284 | F21 = F2[((np.arange(len(F2)) + 1) % len(F2))[:len(F2)-1]] 285 | F22 = F2[((np.arange(len(F2)) + -1) % len(F2))[1:len(F2)]] 286 | 287 | return (self.alpha *np.sum(( self.step * (F11 + F12) / 2)) 288 | +(1-self.alpha) * np.sum((self.step[0:(len(self.step) - 1)] 289 | * (F21 + F22) / 2))) 290 | 291 | elif (self.alpha == 1): 292 | def innerproduct(x, y, xderiv=None, yderiv=None ): 293 | """We build the inner product in the paper with alpha = 1 which corresponds 294 | to L2 dot product. 295 | """ 296 | F1 = x * y 297 | return np.sum((self.step * (F1[((np.arange(len(F1)) + 1) % len(F1))[:len(F1)-1]] 298 | + F1[((np.arange(len(F1)) + -1) % len(F1))[1:len(F1)]]) / 2)) 299 | 300 | 301 | else: 302 | self.deriv_X = derivate(self.X, self.step) 303 | def innerproduct(x, y, xderiv, yderiv): 304 | """We build the inner product in the paper which is a compromise between 305 | L2 scalar product and the L2 scalar product of derivate. 306 | The function that we use work only with if we have the observations 307 | of curves at constant steps. 
308 | """ 309 | F1 = x * y 310 | F2 = xderiv * yderiv 311 | 312 | F11 = F1[((np.arange(len(F1)) + 1) % len(F1))[:len(F1) - 1]] 313 | F12 = F1[((np.arange(len(F1)) - 1) % len(F1))[1:len(F1)]] 314 | F21 = F2[((np.arange(len(F2)) + 1) % len(F2))[:len(F2) - 1]] 315 | F22 = F2[((np.arange(len(F2)) - 1) % len(F2))[1:len(F2)]] 316 | 317 | x11 = x[((np.arange(len(x)) + 1) % len(x))[:len(x) - 1]] 318 | x12 = x[((np.arange(len(x)) - 1) % len(x))[1:len(x)]] 319 | x21 = xderiv[((np.arange(len(xderiv)) + 1) % len(xderiv))[:len(xderiv) - 1]] 320 | x22 = xderiv[((np.arange(len(xderiv)) - 1) % len(xderiv))[1:len(xderiv)]] 321 | 322 | y11 = y[((np.arange(len(y)) + 1) % len(y))[:len(y) - 1]] 323 | y12 = y[((np.arange(len(y)) - 1) % len(y))[1:len(y)]] 324 | y21 = yderiv[((np.arange(len(yderiv)) + 1) % len(yderiv))[:len(yderiv) - 1]] 325 | y22 = yderiv[((np.arange(len(yderiv)) - 1) % len(yderiv))[1:len(yderiv)]] 326 | return (self.alpha * np.sum(F11 + F12) / (np.sqrt(np.sum(x11 ** 2 + x12 ** 2)) * np.sqrt(np.sum(y11 ** 2 + y12 ** 2))) 327 | + (1 - self.alpha) * np.sum(F21 + F22) / (np.sqrt(np.sum(x21 ** 2 + 328 | x22 ** 2)) * np.sqrt(np.sum(y21 ** 2 + y22 ** 2)))) 329 | 330 | 331 | else: raise TypeError('This inner product is not pre-defined') 332 | else: self.alpha = 1 333 | 334 | self.innerproduct = innerproduct 335 | self.limit = limit 336 | self.c = c_factor(self.sample) 337 | 338 | 339 | if limit is None: 340 | """Set limit to the default as specified by the original paper 341 | (average depth of unsuccesful search through a binary tree). 342 | """ 343 | self.limit = int(np.ceil(np.log2(self.sample))) 344 | 345 | 346 | if (self.alpha == 1): 347 | for i in range(self.ntrees): 348 | """This loop builds an ensemble of f-itrees (the forest). 
349 | """ 350 | ix = np.random.choice(np.arange(self.nobjs), size=self.sample, replace=False) 351 | 352 | self.Trees.append(iTree(X[ix], self.time, self.step, 353 | 0, self.limit, 354 | self.D, self.innerproduct, 355 | self.alpha, self.deriv_X, 356 | None, self.sample, self.criterion, self.mean, self.sd)) 357 | else: 358 | for i in range(self.ntrees): 359 | """This loop builds an ensemble of f-itrees (the forest). 360 | """ 361 | ix = np.random.choice(np.arange(self.nobjs), size=self.sample, replace=False) 362 | 363 | self.Trees.append(iTree(X[ix], self.time, self.step, 364 | 0, self.limit, 365 | self.D, self.innerproduct, 366 | self.alpha, self.deriv_X[ix], 367 | self.deriv_dictionary, self.sample, self.criterion, self.mean, self.sd)) 368 | 369 | 370 | def compute_paths(self, X_in=None): 371 | """ 372 | compute_paths(X_in = None) 373 | 374 | Compute the anomaly score of an input sample is computed as 375 | the mean anomaly score of the trees in the forest. 376 | Parameters 377 | ---------- 378 | X_in : Array-like 379 | Data to be scored. FIForest.Trees are used for computing the depth reached in 380 | each tree by each data curve. 381 | Returns 382 | ------- 383 | float 384 | Anomaly score for a given data curve. 385 | """ 386 | if X_in is None: 387 | X_in = self.X 388 | if(self.alpha != 1): 389 | deriv_X_in = self.deriv_X 390 | else: 391 | if(self.alpha != 1): 392 | deriv_X_in = derivate(X_in, self.step) 393 | 394 | S = np.zeros(len(X_in)) 395 | 396 | for i in range(len(X_in)): 397 | h_temp = 0 398 | for j in range(self.ntrees): 399 | # Compute path length for each curve 400 | if(self.alpha != 1): 401 | h_temp += PathFactor(X_in[i], self.step, 402 | self.Trees[j], self.alpha, 403 | deriv_X_in[i]).path * 1.0 404 | else: 405 | h_temp += PathFactor(X_in[i], self.step, 406 | self.Trees[j], 407 | self.alpha).path * 1.0 408 | 409 | # Average of path length travelled by the point in all trees. 
# NOTE(review): the three functions below take ``self`` and are presumably
# methods of the forest class whose header lies outside this chunk. The
# flattened source carries no indentation, so they are written at column 0.


def compute_paths(self, X_in=None):
    """Compute the anomaly score of every curve in ``X_in``.

    For each curve the path length returned by ``PathFactor`` is averaged
    over the ``self.ntrees`` trees of ``self.Trees`` and mapped to a score
    ``2 ** (-mean_path / self.c)``.

    Parameters
    ----------
    X_in : Array-like, optional
        Curves to score, one per row. When omitted, ``self.X`` is scored.

    Returns
    -------
    numpy.ndarray
        One anomaly score per curve of ``X_in``.
    """
    # Derivatives are only needed by the inner product when alpha != 1.
    use_deriv = self.alpha != 1
    deriv_X_in = None
    if X_in is None:
        X_in = self.X
        if use_deriv:
            deriv_X_in = self.deriv_X
    elif use_deriv:
        deriv_X_in = derivate(X_in, self.step)

    S = np.zeros(len(X_in))
    for i in range(len(X_in)):
        h_temp = 0.0
        for j in range(self.ntrees):
            # Path length travelled by curve i in tree j.
            if use_deriv:
                h_temp += PathFactor(X_in[i], self.step,
                                     self.Trees[j], self.alpha,
                                     deriv_X_in[i]).path
            else:
                h_temp += PathFactor(X_in[i], self.step,
                                     self.Trees[j],
                                     self.alpha).path

        # Average path length over all trees, then the anomaly score.
        Eh = h_temp / self.ntrees
        S[i] = 2.0 ** (-Eh / self.c)
    return S


def threshold(self, score_samples, contamination=0.1):
    """Compute the score threshold above which curves are declared anomalies.

    The ``(1 - contamination)`` percentile of the scores is returned, using
    the 'lower' rule so that the threshold is always one of the observed
    scores rather than an interpolated value.

    Parameters
    ----------
    score_samples : Array
        The score array for a dataset of curves.

    contamination : float, optional (default=0.1)
        The proportion of outliers expected in the data set.

    Returns
    -------
    float
        The threshold on the anomaly score.
    """
    q = 100 * (1 - contamination)
    try:
        # ``method`` replaced the deprecated ``interpolation`` keyword.
        return np.percentile(score_samples, q, method='lower')
    except TypeError:
        # Older NumPy releases only know the previous keyword name.
        return np.percentile(score_samples, q, interpolation='lower')


def predict_label(self, score, contamination=0.1):
    """Compute the label vector of curves.

    Parameters
    ----------
    score : Array
        The score array for a dataset of curves (training or testing).

    contamination : float, optional (default=0.1)
        The proportion of outliers expected in the data set; forwarded to
        ``self.threshold``.

    Returns
    -------
    y_label : array
        ``-1.0`` where the score is strictly above the threshold (curve
        flagged as anomalous) and ``+1.0`` otherwise.
    """
    return 1 - 2.0 * (score > self.threshold(score, contamination))
def importance_feature(self):
    """Sum the per-tree importance vectors ``Trees[i].IF``.

    NOTE(review): takes ``self`` and is presumably a method of the forest
    class whose header lies outside this chunk.

    Returns
    -------
    numpy.ndarray
        One accumulated importance value per row of ``self.D``.
    """
    IF = np.zeros(self.D.shape[0])
    for i in range(self.ntrees):
        IF += self.Trees[i].IF
    return IF


class Node(object):
    """
    A single node of an iTree. A node stores the hyperplane used to split
    the data, the data it received, and its children.

    Attributes
    ----------
    e: int
        Depth of the node in its tree.

    size: int
        Size of the dataset present at the node.

    X: Array-like
        Data at the node.

    d: Array
        Direction function used to build the hyperplane that splits the data.

    dd : int
        The index of the direction chosen at this node.

    q: float
        Intercept point through which the hyperplane passes.

    left: Node object
        Left child node.

    right: Node object
        Right child node.

    ntype: str
        The type of the node: 'exNode', 'inNode'.
    """

    def __init__(self, X, d, dd, q, e, left, right, node_type=''):
        self.e = e
        self.size = len(X)
        self.X = X
        self.d = d
        self.dd = dd
        self.q = q
        self.left = left
        self.right = right
        self.ntype = node_type


class iTree(object):
    """
    A single tree of the forest, built on one subsample.

    Attributes
    ----------
    e: int
        Depth of the node most recently visited while building.

    X: Array-like
        Data present at the root node of this tree.

    time : array
        Observation times; read by the indicator dictionaries.

    step : array
        Vector of the length of intervals of discretization.

    size: int
        Number of curves in ``X``.

    dim: int
        ``X.shape[1]``.

    l: int
        Maximum depth a tree can reach before its creation is terminated.

    d, dd, q:
        Direction, direction index and split threshold drawn most recently.

    exnodes: int
        Number of external nodes created so far.

    root: Node object
        Root of the tree.

    D: Array-like or str
        Finite dictionary (one direction per row) or the name of a
        pre-defined random dictionary.

    innerproduct : function
        Called as ``innerproduct(x, d)`` when ``alpha == 1`` and as
        ``innerproduct(x, d, x_deriv, d_deriv)`` otherwise.

    alpha : float
        Weight used by the inner product; ``1`` means derivatives are unused.

    deriv_X : Array-like
        Derivatives of the rows of ``X`` (used when ``alpha != 1``).

    deriv_dictionary : Array-like
        Derivatives of the directions. For named dictionaries this must be a
        list: each drawn direction appends its derivative to it.

    IF : numpy.ndarray
        Per-direction importance counters (finite dictionaries only).

    Methods
    -------
    make_tree(X, e)
        Builds the tree recursively from a given node. Returns a Node object.
    """

    def __init__(self,
                 X,
                 time,
                 step,
                 e,
                 l,
                 D,
                 innerproduct,
                 alpha,
                 deriv_X=None,
                 deriv_dictionary=None,
                 subsample_size=None,
                 criterion=None,
                 mean=None,
                 sd=None):

        self.e = e
        self.X = X
        self.step = step
        self.time = time
        self.size = len(X)
        self.dim = self.X.shape[1]
        self.l = l
        self.q = None
        self.d = None
        self.dd = None
        self.exnodes = 0
        self.D = D
        self.innerproduct = innerproduct
        self.alpha = alpha
        self.deriv_X = deriv_X
        self.mean = mean
        self.sd = sd
        self.deriv_dictionary = deriv_dictionary

        if not isinstance(self.D, str):
            self.IF = np.zeros(self.D.shape[0])

        self.subsample_size = subsample_size
        self.criterion = criterion
        # The whole tree is built here, starting with the root node.
        self.root = self.make_tree(self.X, self.e)

    def _register_derivative(self):
        """Append the derivative of ``self.d`` and point ``self.dd`` at it."""
        self.deriv_dictionary.append(np.diff(self.d) / self.step)
        self.dd = len(self.deriv_dictionary) - 1

    def _draw_direction(self):
        """Draw a split direction into ``self.d`` (and ``self.dd``).

        Raises
        ------
        TypeError
            If ``self.D`` names a dictionary that is not pre-defined.
        """
        if not isinstance(self.D, str):
            # Finite dictionary: pick one of its rows uniformly at random.
            idx = np.random.choice(np.arange(self.D.shape[0]), size=1)
            self.d = self.D[idx[0], :]
            self.dd = idx[0]
            return

        t = np.linspace(0, 1, len(self.step) + 1)

        if self.D == 'cosinus':
            # Cosine with random amplitude (drawn first) and frequency.
            amplitude = np.random.uniform(-1, 1, 1)
            frequency = np.random.uniform(0, 10, 1)
            self.d = amplitude * np.cos(2 * np.pi * frequency * t)

        elif self.D == 'Brownian':
            if self.mean is None:
                self.mean = 0
            if self.sd is None:
                self.sd = 1

            self.d = np.zeros(len(t))
            # ``[0]`` extracts the scalar: assigning a one-element array to
            # an array slot is deprecated in recent NumPy.
            self.d[0] = np.random.normal(self.mean, scale=self.sd, size=1)[0]
            # NOTE(review): increments are not accumulated (no d[i-1] term),
            # so this is not a cumulative path; behaviour kept as found.
            for i in range(1, len(t)):
                self.d[i] = (self.sd * np.random.normal(
                    0, scale=np.sqrt(t[2] - t[1]), size=1)[0]
                    + self.mean * (t[2] - t[1]))

        elif self.D == 'gaussian_wavelets':
            # Discretized on [-5, 5] with a random standard deviation
            # ``sigma`` and a random translation ``K``.
            t = np.linspace(-5, 5, len(self.step) + 1)
            sigma = np.random.uniform(0.2, 1)
            K = np.random.uniform(-4, 4)
            self.d = (-(2 / (np.power(np.pi, 0.25) * np.sqrt(3 * sigma)))
                      * ((t - K) ** 2 / (sigma ** 2) - 1)
                      * (np.exp(-(t - K) ** 2 / (2 * sigma ** 2))))

        elif self.D == 'Brownian_bridge':
            self.d = np.zeros(len(t))
            # NOTE(review): as for 'Brownian', d[i-1] only enters through
            # the drift term; behaviour kept as found.
            for i in range(1, len(t) - 1):
                self.d[i] = (np.random.normal(
                    0, np.sqrt(t[2] - t[1]), size=1)[0]
                    - self.d[i - 1] * (t[2] - t[1]) / (1 - t[i]))

        elif self.D in ('indicator_uniform', 'linear_indicator_uniform'):
            # Indicator of a random sub-interval (a, b) of the time range,
            # optionally multiplied by the time itself.
            self.d = np.zeros(len(t))
            span = self.time[len(self.time) - 1] - self.time[0]
            a = span * np.random.random() + self.time[0]
            b = span * np.random.random() + self.time[0]
            low, high = np.minimum(a, b), np.maximum(a, b)
            linear = self.D == 'linear_indicator_uniform'
            for j in range(len(self.time)):
                inside = high > self.time[j] > low
                self.d[j] = (self.time[j] if linear else 1.) * inside

        else:
            raise TypeError('This Dictionary is not pre-defined')

        if self.alpha != 1:
            # Every randomly drawn direction needs its derivative for the
            # inner product, including 'indicator_uniform', which previously
            # left ``self.dd`` unset.
            self._register_derivative()

    def make_tree(self, X, e, deriv_X=None):
        """
        make_tree(X, e)
        Builds the tree recursively from a given node. Returns a Node object.

        Parameters
        ----------
        X: Array like
            Curves reaching this node.

        e : int
            Depth of the node being built. e <= l.

        deriv_X : Array like, optional
            Derivatives of the rows of ``X``, row-aligned with ``X``.
            Defaults to ``self.deriv_X`` (correct for the root call).

        Returns
        -------
        Node object
        """
        if deriv_X is None:
            deriv_X = self.deriv_X

        self.e = e
        # A curve is isolated, or the depth limit has been reached.
        if e >= self.l or len(X) <= 1:
            self.exnodes += 1
            return Node(X, self.d, self.dd, self.q, e, None, None,
                        node_type='exNode')

        # Internal node: draw a direction, project, split at a random level.
        sample_size = X.shape[0]
        self._draw_direction()

        Z = np.zeros(sample_size)
        if self.alpha != 1:
            direction_deriv = self.deriv_dictionary[self.dd]
            for i in range(sample_size):
                # deriv_X is row-aligned with this node's X.
                Z[i] = self.innerproduct(X[i, :], self.d, deriv_X[i],
                                         direction_deriv)
        else:
            for i in range(sample_size):
                Z[i] = self.innerproduct(X[i, :], self.d)

        # Random threshold for the splitting hyperplane.
        self.q = np.random.uniform(np.min(Z), np.max(Z))

        # True for curves sent to the left child.
        w = Z - self.q < 0

        if not isinstance(self.D, str) and sample_size > 2:
            n_left = np.sum(w)
            if n_left == 1 or n_left == sample_size - 1:
                # This split isolated exactly one curve: credit the direction.
                if self.criterion == "naive":
                    self.IF[self.dd] += 1
                elif self.criterion == "sample":
                    self.IF[self.dd] += sample_size / self.subsample_size
                else:
                    self.IF[self.dd] += 1 / (e + 1)

        # The recursive calls overwrite self.d / self.dd / self.q, so this
        # node's own split must be captured first.
        d, dd, q = self.d, self.dd, self.q
        if self.alpha != 1:
            deriv_arr = np.asarray(deriv_X)
            left_deriv, right_deriv = deriv_arr[w], deriv_arr[~w]
        else:
            left_deriv = right_deriv = None

        left = self.make_tree(X[w], e + 1, left_deriv)
        right = self.make_tree(X[~w], e + 1, right_deriv)
        return Node(X, d, dd, q, e, left=left, right=right,
                    node_type='inNode')
class PathFactor(object):
    """
    Given a single tree (iTree object) and a curve x, compute the length of
    the path traversed by the curve until it reaches an external node.

    Attributes
    ----------
    path_list: list
        A list of strings 'L' or 'R' tracing the path of the curve.

    x: Array
        A single discretized curve.

    deriv_x : Array
        The derivative of the curve, used by the inner product when
        ``alpha != 1``.

    e: float
        Running depth while the tree is descended.

    step : array
        Vector of the length of intervals of discretization.

    D, deriv_dictionary, innerproduct:
        Copied from the tree.

    alpha : float
        Weight used by the inner product; ``1`` means derivatives are unused.

    path: float
        Result of ``find_path`` on the tree root.

    Methods
    -------
    find_path(T)
        Given the root node of a tree, find the path length of the curve.
    """

    def __init__(self,
                 x,
                 step,
                 itree,
                 alpha,
                 deriv_x=None):

        self.path_list = []
        self.x = x
        self.deriv_x = deriv_x
        self.e = 0
        self.alpha = alpha
        self.step = step
        self.D = itree.D
        self.deriv_dictionary = itree.deriv_dictionary
        self.innerproduct = itree.innerproduct
        self.path = self.find_path(itree.root)

    def find_path(self, T):
        """
        find_path(T)
        Follow the splits stored in the nodes until an external node.

        Parameters
        ----------
        T : Node object
            Node from which the descent starts.

        Returns
        -------
        float
            Number of internal nodes crossed, plus ``c_factor(size)`` when
            the external node reached holds more than one curve.
        """
        node = T
        while node.ntype != 'exNode':
            self.e += 1
            if self.alpha != 1:
                projection = self.innerproduct(
                    self.x, node.d, self.deriv_x,
                    self.deriv_dictionary[node.dd])
            else:
                # Same two-argument call the tree uses while training, so a
                # two-argument inner product works for scoring as well.
                projection = self.innerproduct(self.x, node.d)

            if projection - node.q < 0:
                self.path_list.append('L')
                node = node.left
            else:
                self.path_list.append('R')
                node = node.right

        if node.size > 1:
            # Leaf not fully isolated: add the adjustment for its size.
            self.e = self.e + c_factor(node.size)
        return self.e
# Build script (setup.py) for the Cython/C++ extension module "fif".
import sys
import os
import numpy
from Cython.Distutils import build_ext
try:
    from setuptools import setup, find_packages
    from setuptools.extension import Extension
except ImportError:
    from distutils.core import setup
    from distutils.extension import Extension

prjdir = os.path.dirname(__file__)


def read(filename):
    """Return the text of ``filename``, resolved against this script's directory."""
    # The context manager closes the handle; the bare open().read() leaked it.
    with open(os.path.join(prjdir, filename)) as handle:
        return handle.read()


extra_link_args = []
libraries = []
library_dirs = []
include_dirs = []

# Defines __version__ by executing the project's own version.py.
# NOTE(review): exec on a project-local, trusted file only.
exec(read('version.py'))

setup(
    name='pyt-fif',
    version=__version__,
    author='Guillaume Staerman',
    author_email='guillaume.staerman@telecom-paris.fr',
    cmdclass={'build_ext': build_ext},
    ext_modules=[Extension("fif",
                           sources=["_fif.pyx", "fif.cxx"],
                           include_dirs=[numpy.get_include()],
                           extra_compile_args=['-std=c++11', '-Wcpp'],
                           language="c++")],
    scripts=[],
    py_modules=['version'],
    packages=[],
    license='License.txt',
    include_package_data=True,
    description='Functional Isolation Forest',
    long_description_content_type='text/markdown',
    url='https://github.com/GuillaumeStaermanML/FIF',
    download_url='https://github.com/GuillaumeStaermanML/FIF/archive/refs/tags/1.0.2.tar.gz',
    install_requires=["numpy", "cython"],
)

# --------------------------------------------------------------------------------
# /version.py:
# --------------------------------------------------------------------------------
"""Functional Isolation Forest version"""

version_tag = (1, 0, 2)
__version__ = '.'.join(map(str, version_tag[:3]))

# An optional fourth component is appended as a suffix.
if len(version_tag) > 3:
    __version__ = '%s-%s' % (__version__, version_tag[3])