├── Computation_time.ipynb
├── Figures
    ├── anomaly_example-1.png
    ├── anomaly_example_rank-1.png
    └── anomaly_example_score-1.png
├── License.txt
├── MFIF_python
    └── Section_5
    │   ├── DDplot_3D.pdf
    │   ├── MFIF.py
    │   ├── MNIST.csv
    │   └── section_5.ipynb
├── README.rst
├── Toy_example.ipynb
├── __fif.pxd
├── __init__.py
├── _fif.pyx
├── fif.cxx
├── fif.hxx
├── old_fif.py
├── setup.cfg
├── setup.py
└── version.py


/Computation_time.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## This notebook provides a simple comparison of computation time of the python version and the new Cython/C++ version."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "### Importing libraries"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import time\n",
 24 |     "import numpy as np\n",
 25 |     "import matplotlib.pyplot as plt\n",
 26 |     "import fif as FIF\n",
 27 |     "import old_fif as old_FIF"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "### Simulated functions"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 2,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "np.random.seed(42)\n",
 44 |     "m =1000 ## m measures\n",
 45 |     "n =1000 ## n curves\n",
 46 |     "tps = np.linspace(0,1,m)\n",
 47 |     "v = np.linspace(1,1.4,n)\n",
 48 |     "X = np.zeros((n,m))\n",
 49 |     "for i in range(n):\n",
 50 |     "    X[i] = 30 * ((1-tps) ** v[i]) * tps ** v[i]\n",
 51 |     "\n",
 52 |     "\n",
 53 |     "Z1 = np.zeros((m))\n",
 54 |     "for j in range(m):\n",
 55 |     "    if (tps[j]<0.2 or tps[j]>0.8):\n",
 56 |     "        Z1[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2\n",
 57 |     "    else:\n",
 58 |     "        Z1[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + np.random.normal(0,0.3,1)\n",
 59 |     "Z1[0] = 0\n",
 60 |     "Z1[m-1] = 0\n",
 61 |     "\n",
 62 |     "\n",
 63 |     "Z2 = 30 * ((1-tps) ** 1.6) * tps ** 1.6\n",
 64 |     "\n",
 65 |     "\n",
 66 |     "Z3 = np.zeros((m))\n",
 67 |     "for j in range(m):\n",
 68 |     "    Z3[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + np.sin(2*np.pi*tps[j])\n",
 69 |     "\n",
 70 |     "Z4 = np.zeros((m))\n",
 71 |     "for j in range(m):\n",
 72 |     "    Z4[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2\n",
 73 |     "\n",
 74 |     "for j in range(70,71):\n",
 75 |     "    Z4[j] += 2\n",
 76 |     "\n",
 77 |     "Z5 = np.zeros((m))\n",
 78 |     "for j in range(m):\n",
 79 |     "    Z5[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + 0.5*np.sin(10*np.pi*tps[j])\n",
 80 |     "\n",
 81 |     "X = np.concatenate((X,Z1.reshape(1,-1),Z2.reshape(1,-1),\n",
 82 |     "                     Z3.reshape(1,-1), Z4.reshape(1,-1), Z5.reshape(1,-1)), axis = 0)"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "## Cython/C++ version of FIF"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 8,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "name": "stdout",
 99 |      "output_type": "stream",
100 |      "text": [
101 |       "CPU times: user 7.21 s, sys: 175 ms, total: 7.38 s\n",
102 |       "Wall time: 7.33 s\n"
103 |      ]
104 |     }
105 |    ],
106 |    "source": [
107 |     "%%time\n",
108 |     "time = np.linspace(0,1,m)\n",
109 |     "F = FIF.FiForest(X, time, ntrees=300, sample_size=64, alpha=0.5, dic_number=1, seed=0)\n",
110 |     "S_new_F = F.compute_paths(X_in=X)"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "## Old python version of FIF"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 9,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "CPU times: user 5min 48s, sys: 595 ms, total: 5min 48s\n",
130 |       "Wall time: 5min 49s\n"
131 |      ]
132 |     }
133 |    ],
134 |    "source": [
135 |     "%%time \n",
136 |     "np.random.seed(0)\n",
137 |     "old_F = old_FIF.FIForest(X,ntrees=300,subsample_size=64,time=time, D='gaussian_wavelets', innerproduct='auto', alpha=0.5)\n",
138 |     "S_old_F = old_F.compute_paths(X)"
139 |    ]
140 |   }
141 |  ],
142 |  "metadata": {
143 |   "kernelspec": {
144 |    "display_name": "Python 3",
145 |    "language": "python",
146 |    "name": "python3"
147 |   },
148 |   "language_info": {
149 |    "codemirror_mode": {
150 |     "name": "ipython",
151 |     "version": 3
152 |    },
153 |    "file_extension": ".py",
154 |    "mimetype": "text/x-python",
155 |    "name": "python",
156 |    "nbconvert_exporter": "python",
157 |    "pygments_lexer": "ipython3",
158 |    "version": "3.7.4"
159 |   }
160 |  },
161 |  "nbformat": 4,
162 |  "nbformat_minor": 2
163 | }
164 | 


--------------------------------------------------------------------------------
/Figures/anomaly_example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuillaumeStaermanML/FIF/a92273974fd860f9441af99d4a4e418d1e739998/Figures/anomaly_example-1.png


--------------------------------------------------------------------------------
/Figures/anomaly_example_rank-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuillaumeStaermanML/FIF/a92273974fd860f9441af99d4a4e418d1e739998/Figures/anomaly_example_rank-1.png


--------------------------------------------------------------------------------
/Figures/anomaly_example_score-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuillaumeStaermanML/FIF/a92273974fd860f9441af99d4a4e418d1e739998/Figures/anomaly_example_score-1.png


--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2021 Télécom Paris, France.
 2 | All rights reserved.
 3 | 
 4 | Developed by: 	  Guillaume Staerman
 5 |                   LTCI
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 8 | 
 9 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers.
10 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution.
11 | Neither the names of Guillaume Staerman, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission.
12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
13 | 


--------------------------------------------------------------------------------
/MFIF_python/Section_5/DDplot_3D.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuillaumeStaermanML/FIF/a92273974fd860f9441af99d4a4e418d1e739998/MFIF_python/Section_5/DDplot_3D.pdf


--------------------------------------------------------------------------------
/MFIF_python/Section_5/MFIF.py:
--------------------------------------------------------------------------------
  1 | """ Multivariate Functional Isolation Forest
  2 | 
  3 |     Author : Guillaume Staerman
  4 | """
  5 | 
  6 | 
  7 | """Multivariate Functional Isolation Forest Algorithm
  8 | 
  9 | This is the implementation of The Multivariate Functional Isolation Forest which is an
 10 | extension of the original Isolation Forest applied to functional data.
 11 | 
  12 | It returns the anomaly score of each sample using the FIF algorithm.
 13 | The Functional Isolation Forest 'isolates' observations by 
 14 | randomly selecting a multivariate curve among a dictionary
 15 | and then randomly selecting a split value between the maximum 
 16 | and minimum values of the selected feature.
 17 | 
 18 | Since recursive partitioning can be represented by a tree structure, the
 19 | number of splittings required to isolate a sample is equivalent to the path
 20 | length from the root node to the terminating node.
 21 | 
 22 | This path length, averaged over a forest of such random trees, is a
 23 | measure of normality.
 24 | 
 25 | Random partitioning produces noticeably shorter paths for anomalies.
 26 | Hence, when a forest of random trees collectively produce shorter path
 27 | lengths for particular samples, they are highly likely to be anomalies.
 28 | 
 29 | """
 30 | import numpy as np
 31 | #from version import __version__
 32 | 
 33 | 
 34 | 
 35 | def derivateM(X, step):
 36 |     """Compute de derivative of a multivariate function X on each dimension.
 37 |     """
 38 |     step = step.astype(dtype = float)
 39 |     A = np.zeros((X.shape[0],X.shape[1]-1))
 40 |     for i in range(X.shape[0]):
 41 |             A[i,:] = np.diff(X[i,:]) / step
 42 |     return A
 43 | 
 44 | def c_factor(n_samples_leaf) :
 45 |     """
 46 |     Average path length of unsuccesful search in a binary search tree given n points
 47 |     
 48 |     Parameters
 49 |     ----------
 50 |     n_samples_lead : int
 51 |         Number of curves for the BST.
 52 |     Returns
 53 |     -------
 54 |     float
 55 |         Average path length of unsuccesful search in a BST
 56 |         
 57 |     """
 58 |     return 2.0 * (np.log(n_samples_leaf - 1) + np.euler_gamma) - (2. * (
 59 |         n_samples_leaf - 1.) / (n_samples_leaf * 1.0))
 60 | 
 61 | 
 62 | class MFIForest(object):
 63 |     """
 64 |     Multivariate Functional Isolation Forest
 65 |     
 66 |     Creates an MFIForest object. This object holds the data as well as the trained trees (iTree objects).
 67 |     
 68 |     Attributes
 69 |     ----------
 70 |     X : Array-like (n_samples, dimension, discretization)
 71 |         Data used for training.
 72 |         
 73 |     nobjs: int
 74 |         Size of the dataset.
 75 |         
 76 |     sample: int
 77 |         Size of the sample to be used for tree creation.
 78 |         
 79 |     Trees: list
 80 |         A list of tree objects.
 81 |         
 82 |     limit: int
 83 |         Maximum depth a tree can have.
 84 |         
 85 |     c: float
 86 |         Multiplicative factor used in computing the anomaly scores.
 87 |         
 88 |     step : array
 89 |         Vector of the length of intervals of discretization.
 90 |         
 91 |     D : Array-like
 92 |         Dictionnary of functions used as directions.
 93 |         
 94 |     Dsize : int
 95 |         The size of the dictionary. It is the number of curves that we will use in our 
 96 |         dictionary to build the forest.
 97 |         
 98 |     mean : float or None, optional (Default=None)
 99 |         The mean of the stochastic process used to build a stochastic dictionary. 
100 |         This is set to zero by default. If a stochastic dictionary is called 
101 |         and no mean is given, it is set to 0.
102 | 
103 |     sd : float or None, optional (Default=None)
104 |         The standard deviation of the stochastic process used to build a stochastic dictionary.
105 |         This is set to one by default. If a stochastic dictionary is called 
106 |         and no standard deviation is given, it is set to 1.
107 | 
108 |     J_max : int or None, optional (Default=None)
109 |         This parameter fix the size of the dictionary of Haar_wavelet_father. 
110 |         It will build 2 power J_max functions.
111 | 
112 |     amplitude_min : float or None, optional (Default=None)
113 |         This parameter is used for cosinus dictionary. 
114 |         It is the minimum amplitude with which one draws the amplitude.
115 | 
116 |     amplitude_max : float or None, optional (Default=None)
117 |         This parameter is used for cosinus dictionary. 
 118 |         It is the maximum amplitude with which one draws the amplitude.
119 |         
120 |     innerproduct : str or function  
121 |         An inner product that we use for the construction of the tree. The innerproduct in the paper
 122 |         is already implemented: call it with 'auto1' and fix alpha ('auto2' is a second built-in variant). If a function is given by 
123 |         the user, it should have three argument : (x, y, step) where x and y are curve (represented
124 |         by a vector of length of the discretization). "step" is a vector of length len(time)-1 which
125 |         represents the vector of length of step between the discretization.
126 |                 
127 |     alpha : float
128 |         a float number between [0,1] used in the innerproduct of the paper.
129 |             
130 |     deriv_X : Array like
131 |         A matrix of derivate of X if needed for the scalar product.
132 |         
133 |     deriv_dictionary : Array like
134 |         A matrix of derivate of D if needed for the scalar product.
135 |         
 136 |     Methods
137 |     -------
138 |     compute_paths(X_in) :
139 |         Computes the anomaly score for data X_in
140 |         
141 |     threshold(score_sample, contamination) :
142 |         Given the score returned by the fit function on training sample and a proportion 
143 |         of anomalies, compute the threshold which separates anomalies and normal data.
144 |         
145 |     predict_label(score, contamination) :
146 |         Given any score (training or testing) and the proportion of anomalies 
147 |         it return the labels predicted. The function return +1 for outliers and
148 |         -1 for inliers.
149 |     
150 |     
151 |     References
152 |     ----------
153 |     
154 |     .. [1] Staerman, G, Mozharovskyi, P, D'Alché-buc, F and Clémençon,S. "Functional Isolation forest."
155 | 
156 |     
157 |     """
158 |     
159 |     def __init__(self,
160 |                  X,                
161 |                  D,
162 |                  innerproduct,
163 |                  time,
164 |                  ntrees=None,
165 |                  subsample_size=None,
166 |                  Dsize=None,
167 |                  limit=None,
168 |                  mean=None,
169 |                  sd=None,
170 |                  J_max=None,
171 |                  amplitude_min=None,
172 |                  amplitude_max=None,
173 |                  alpha=None):
174 |         
175 |         
176 | 
177 |         self.X = X
178 |         self.nobjs = len(X)
179 |         self.Trees = []
180 |         self.time = time
181 | 
182 | 
183 |         if (ntrees == None):
184 |             self.ntrees = 100
185 |         else: self.ntrees = ntrees
186 | 
187 | 
188 |         if (subsample_size == None):
189 |             if (len(X)>800):
190 |                 self.sample = 256
191 |             else: self.sample = 64
192 |         else : self.sample = subsample_size
193 | 
194 | 
195 |         if (Dsize == None):
196 |             self.Dsize = 1000
197 |         else: self.Dsize = Dsize 
198 |         
199 | 
200 |         if (type(D) == str):
201 |             """Some dictionary pre-implemented.
202 |             """ 
203 |             
204 |                     
205 |             if (D == 'Brownian'):
206 |                 """ We build a dictionary from brownian motion (standard or drift).
207 |                 We use a discretization on [0,1] since we are interested only in the shape.
208 |                 """
209 |                 if (mean == None):
210 |                     mean = np.zeros(((self.X).shape[1]))
211 |                 
212 |                 if (sd == None):
213 |                     sd = np.eye((self.X).shape[1],(self.X).shape[1])
214 |                     
215 |                 self.D = np.zeros((self.Dsize,(self.X).shape[1], len(self.time)))
216 |                 t = np.linspace(0, 1, len(self.time))
217 |                 self.D[:,:,0] = np.random.multivariate_normal(mean = mean, cov = sd, size = self.Dsize) 
218 |                 for i in range(self.Dsize):
219 |                     for j in range(1,np.size(self.time)):
220 |                         self.D[i,:,j] = self.D[i,:, j-1] + np.dot(sd, np.random.multivariate_normal(mean = mean, 
221 |                                     cov = np.eye((self.X).shape[1],(self.X).shape[1]) * np.sqrt( t[2] - t[1])
222 |                                                                 , size = 1).T).T + mean * (t[2] - t[1]) 
223 | 
224 |             elif (D == 'Brownian_bridge'):
225 |                 """ We build a dictionary from Brownian bridge.
226 |                 """
227 |                 mean = np.zeros(((self.X).shape[1]))
228 |                 sd = np.eye((self.X).shape[1],(self.X).shape[1])
229 |                 self.D = np.zeros((self.Dsize,(self.X).shape[1],len(self.time)))
230 |                 t = np.linspace(0, 1, len(self.time))
231 |                 for i in range(self.Dsize):
232 |                     for k in range((self.X).shape[1]):
233 |                         for j in range(1,(len(self.time)-1)):
234 |                              self.D[i,k,j] = self.D[i,k, j-1] +  np.random.normal(0, np.sqrt(t[2]-t[1])
235 |                                                         , size = 1) - self.D[i,k,j-1] * (t[2]-t[1]) / (1 - t[j])
236 |                     
237 |             elif (D == 'gaussian_wavelets'):  
238 |                 """ We build a dictionary from gaussian wavelets. We use a discretization on [-5,5]
239 |                 and add two random parameters to get an interesting dictionary. 
240 |                 The standard deviation sigma and a translationparameter K. The range of these 
241 |                 parameters are fixed.
242 |                 """
243 |                 t = np.linspace(-5,5,len(self.time))
244 |                 self.D = np.zeros((self.Dsize, (self.X).shape[1], len(self.time)))
245 |                 for i in range(self.Dsize):
246 |                     for j in range((self.X).shape[1]): 
247 |                         sigma = np.random.uniform(0.2,1)
248 |                         K = np.random.uniform(-4,4)
249 |                         for l in range(len(self.time)):
250 |                             self.D[i,j,l] = (-(2 / (np.power(np.pi,0.25) * np.sqrt(3 * sigma)) ) 
251 |                             * ((t[l]-K) ** 2 / (sigma ** 2) -1) * (np.exp(-(t[l] - K) ** 2 / (2 * sigma ** 2))))
252 |                         
253 |             elif (D == 'Dyadic_indicator'):
254 |                 """ We build a dictionary from the basis of the Haar wavelets using 
255 |                 only the father wavelets. We use a discretization on [0,1] since 
256 |                 we are interested only in the shape of the curves.
257 |                 """
258 |                 if (J_max == None):
259 |                     J_max = 7
260 |                 a =0
261 |                 t = np.linspace(0,1,len(self.time))
262 |                 self.D = np.zeros((np.sum(np.power(2,np.arange(J_max))) ** 2, (self.X).shape[1], len(self.time)))
263 |                 for J in range(J_max):
264 |                     for j in range((self.X).shape[1]):
265 |                         b = np.power(2,J)
266 |                         for k in range(0,b):
267 |                             for l in range(0,len(self.time)):
268 |                                 x = b * t[l] - k
269 |                                 self.D[a,j,l] = a*(0 <= x < 1)
270 |                             a += 1
271 |                         
272 |             elif (D == 'cosinus'):
273 |                 """ We build a cosinus dictionary with random amplitudes and frequences.
274 |                 Amplitudes are fixed by the user while freq are fixed by the algorithm 
275 |                 with a large range to avoid overloading parameters.
276 |                 """
277 |                 if (amplitude_min == None):
278 |                     amplitude_min = -1
279 |                     
280 |                 if (amplitude_max == None):
281 |                     amplitude_max = 1
282 |                     
283 |                 t = np.linspace(0,1,len(self.time))
284 |                 self.D = np.zeros((self.Dsize,(self.X).shape[1],len(self.time)))
285 |                 for i in range(self.Dsize):
286 |                     for j in range((self.X).shape[1]):
287 |                         freq = np.random.uniform(0, 10, 1)
288 |                         amp = np.random.uniform(amplitude_min, amplitude_max, 1)
289 |                         self.D[i,j,:] = amp * np.cos(2 * np.pi * freq * t)
290 |                         
291 |             elif (D == 'SinusCosinus'):
292 |                 """ We build a cosinus dictionary with random amplitudes and frequences.
293 |                 Amplitudes are fixed by the user while freq are fixed by the algorithm 
294 |                 with a large range to avoid overloading parameters.
295 |                 """
296 |                 if (amplitude_min == None):
297 |                     amplitude_min = -1
298 |                     
299 |                 if (amplitude_max == None):
300 |                     amplitude_max = 1
301 |                     
302 |                 t = np.linspace(0,1,len(self.time))
303 |                 self.D = np.zeros((self.Dsize,(self.X).shape[1],len(self.time)))
304 |                 for i in range(self.Dsize):
305 |                     for j in range((self.X).shape[1]):
306 |                         freq = np.random.uniform(0, 10, 1)
307 |                         amp = np.random.uniform(amplitude_min, amplitude_max, 1)
308 |                         choice = np.random.choice(np.array([0,1,]))
309 |                         if (choice == 0):
310 |                             self.D[i,j,:] = amp * np.cos(2 * np.pi * freq * t)
311 |                         else:
312 |                             self.D[i,j,:] = amp * np.sin(2 * np.pi * freq * t)
313 |             elif ( D == 'Self'):
314 |                 self.D = self.X.copy()
315 |             else: raise TypeError('This Dictionary is not pre-defined')
316 |         else: self.D = D
317 |             
318 |         self.alpha = alpha
319 |         self.step = np.diff(self.time)
320 |         self.deriv_D = None
321 |         self.deriv_X = None
322 |         
323 |         if not callable(innerproduct):
324 |             """ Some inner product implemented.
325 |             """                
326 |             if (innerproduct == 'auto1'):
327 |                
328 |                 
329 |                 if (self.alpha == None):
330 |                     self.alpha = 1
331 |                 if (self.alpha == 1):
332 |                     def innerproduct(x, y, xderiv = None, yderiv = None ):
333 |                         """We build the inner product in the paper with alpha = 1 which corresponds 
334 |                         to L2 dot product.
335 |                         """ 
336 |                         F1 = x * y 
337 |                         A = 0
338 |                         for i in range(F1.shape[0]):
339 |                             A += np.sum((self.step * (F1[i][((np.arange(len(F1[i])) + 1) % len(F1[i]))[:len(F1[i])-1]]
340 |                                 + F1[i][((np.arange(len(F1[i])) + -1) % len(F1[i]))[1:len(F1[i])]]) / 2))
341 |                         return A   
342 |                         
343 |                         
344 |                         
345 |                 elif (self.alpha == 0):
346 |                     self.deriv_X = np.zeros((self.X.shape[0], self.X.shape[1],self.X.shape[2]-1))
347 |                     self.deriv_D = np.zeros((self.D.shape[0], self.D.shape[1],self.D.shape[2]-1))
348 |                     for i in range(self.X.shape[0]):
349 |                         self.deriv_X[i] = derivateM(self.X[i], self.step)
350 |                     for i in range(self.D.shape[0]):
351 |                         self.deriv_D[i] = derivateM(self.D[i], self.step)
352 |                     def innerproduct(x,y, xderiv, yderiv):
353 |                         """We build the inner product in the paper with alpha = 0 which corresponds 
354 |                         to L2 of derivate dot product.
355 |                         """ 
356 |                         A = 0
357 |                         F1 = x * y
358 |                         F2 = xderiv * yderiv
359 |                         F11 = np.zeros((F1.shape[0],F1.shape[1]-1))
360 |                         F12 = np.zeros((F1.shape[0],F1.shape[1]-1))
361 |                         F21 = np.zeros((F2.shape[0],F2.shape[1]-1))
362 |                         F22 = np.zeros((F2.shape[0],F2.shape[1]-1))
363 |                         for i in range(F1.shape[0]):
364 |                             F11[i] = F1[i][((np.arange(len(F1[i])) + 1) % len(F1[i]))[:len(F1[i])-1]]
365 |                             F12[i] = F1[i][((np.arange(len(F1[i])) + -1) % len(F1[i]))[1:len(F1[i])]]
366 |                             F21[i] = F2[i][((np.arange(len(F2[i])) + 1) % len(F2[i]))[:len(F2[i])-1]]
367 |                             F22[i] = F2[i][((np.arange(len(F2[i])) + -1) % len(F2[i]))[1:len(F2[i])]]
368 |                             
369 |                         for i in range(F1.shape[0]):
370 |                             A += (self.alpha *np.sum(( self.step * (F11[i] + F12[i]) / 2))
371 |                             +(1-self.alpha) * np.sum((self.step[0:(len(self.step) - 1)]
372 |                                                       * (F21[i] + F22[i]) / 2)))
373 |                         return A
374 |                         
375 |                 else:
376 |                     self.deriv_X = np.zeros((self.X.shape[0], self.X.shape[1],self.X.shape[2]-1))
377 |                     self.deriv_D = np.zeros((self.D.shape[0], self.D.shape[1],self.D.shape[2]-1))
378 |                     for i in range(X.shape[0]):
379 |                         self.deriv_X[i] = derivateM(self.X[i], self.step)
380 |                     for i in range(self.D.shape[0]):
381 |                         self.deriv_D[i] = derivateM(self.D[i], self.step)
382 | 
383 |                     def innerproduct(x,y, xderiv, yderiv):
384 |                         """We build the inner product in the paper which is a compromise between 
385 |                         L2 scalar product and the L2 scalar product of derivate.
386 |                         The function that we use work only with if we have the observations 
387 |                         of curves at constant steps.
388 |                         """ 
389 |                         A = 0
390 |                         F1 = x * y
391 |                         F2 = xderiv * yderiv
392 |                         F11 = np.zeros((F1.shape[0],F1.shape[1]-1))
393 |                         F12 = np.zeros((F1.shape[0],F1.shape[1]-1))
394 |                         F21 = np.zeros((F2.shape[0],F2.shape[1]-1))
395 |                         F22 = np.zeros((F2.shape[0],F2.shape[1]-1))
396 |                         x11 = np.zeros((x.shape[0],x.shape[1]-1))
397 |                         x12 = np.zeros((x.shape[0],x.shape[1]-1))
398 |                         x21 = np.zeros((xderiv.shape[0],xderiv.shape[1]-1))
399 |                         x22 = np.zeros((xderiv.shape[0],xderiv.shape[1]-1))
400 |                         y11 = np.zeros((y.shape[0],y.shape[1]-1))
401 |                         y12 = np.zeros((y.shape[0],y.shape[1]-1))
402 |                         y21 = np.zeros((yderiv.shape[0],yderiv.shape[1]-1))
403 |                         y22 = np.zeros((yderiv.shape[0],yderiv.shape[1]-1))
404 |                         for i in range(F1.shape[0]):
405 |                             F11[i] = F1[i][((np.arange(len(F1[i])) + 1) % len(F1[i]))[:len(F1[i])-1]]
406 |                             F12[i] = F1[i][((np.arange(len(F1[i])) + -1) % len(F1[i]))[1:len(F1[i])]]
407 |                             F21[i] = F2[i][((np.arange(len(F2[i])) + 1) % len(F2[i]))[:len(F2[i])-1]]
408 |                             F22[i] = F2[i][((np.arange(len(F2[i])) + -1) % len(F2[i]))[1:len(F2[i])]]
409 |                             x11[i] = x[i][((np.arange(len(x[i])) + 1) % len(x[i]))[:len(x[i])-1]]
410 |                             x12[i] = x[i][((np.arange(len(x[i])) + -1) % len(x[i]))[1:len(x[i])]]
411 |                             x21[i] = xderiv[i][((np.arange(len(xderiv[i])) + 1) % len(xderiv[i]))[:len(xderiv[i])-1]]
412 |                             x22[i] = xderiv[i][((np.arange(len(xderiv[i])) + -1) % len(xderiv[i]))[1:len(xderiv[i])]]                             
413 |                             y11[i] = y[i][((np.arange(len(y[i])) + 1) % len(y[i]))[:len(y[i])-1]]
414 |                             y12[i] = y[i][((np.arange(len(y[i])) + -1) % len(y[i]))[1:len(y[i])]]
415 |                             y21[i] = yderiv[i][((np.arange(len(yderiv[i])) + 1) % len(yderiv[i]))[:len(yderiv[i])-1]]
416 |                             y22[i] = yderiv[i][((np.arange(len(yderiv[i])) + -1) % len(yderiv[i]))[1:len(yderiv[i])]]
417 | 
418 | 
419 | 
420 | 
421 | 
422 |                         for i in range(F1.shape[0]):
423 |                             A += (self.alpha * np.sum(F11[i] + F12[i]) / (np.sqrt(np.sum(x11[i] ** 2 +
424 |                                 x12[i] ** 2)) * np.sqrt(np.sum(y11[i] ** 2 + y12[i] ** 2)))
425 |                                + (1 - self.alpha) * np.sum(F21[i] + F22[i]) / (np.sqrt(np.sum(x21[i] ** 2 + 
426 |                                 x22[i] ** 2)) * np.sqrt(np.sum(y21[i] ** 2 + y22[i] ** 2))))
427 |                         return A
428 |             elif (innerproduct == 'auto2'):
429 |                 if (self.alpha == None or self.alpha !=1):
430 |                     self.alpha = 1
431 |                 def innerproduct(x, y, xderiv = None, yderiv = None):
432 |                     """ We build the second type of generalization of the dot product in 
433 |                          multivariate setting.
434 |                     """
435 |                     A = 0
436 |                     for i in range(x.shape[1]):
437 |                         A += np.inner(x[:,i],y[:,i])
438 |                     return A 
439 |                     
440 |                             
441 |             else: raise TypeError('This inner product is not pre-defined')
442 |         else: self.alpha = 1 
443 | 
444 |         self.innerproduct = innerproduct
445 |         self.limit = limit
446 |         if limit is None:
447 |             """Set limit to the default as specified by the paper
448 |             (average depth of unsuccesful search through a binary tree).
449 |             """ 
450 |             self.limit = int(np.ceil(np.log2(self.sample)))  
451 |             
452 |         self.c = c_factor(self.sample)
453 |         
454 |         if (self.alpha == 1):
455 |             for i in range(self.ntrees): 
456 |                 """This loop builds an ensemble of iTrees (the forest).
457 |                 """
458 |                 ix = np.random.choice(np.arange(self.nobjs), size = self.sample, replace = False)
459 |                 
460 |                 self.Trees.append(iTree(X[ix], self.step,  
461 |                                         0, self.limit, 
462 |                                         self.D, self.innerproduct, 
463 |                                         self.alpha, self.deriv_X, 
464 |                                         self.deriv_D))
465 |         else:
466 |             for i in range(self.ntrees): 
467 |                 """This loop builds an ensemble of iTrees (the forest).
468 |                 """
469 |                 ix = np.random.choice(np.arange(self.nobjs), size = self.sample, replace = False)
470 |                 
471 |                 self.Trees.append(iTree(X[ix], self.step, 
472 |                                         0, self.limit, 
473 |                                         self.D, self.innerproduct, 
474 |                                         self.alpha, self.deriv_X[ix], 
475 |                                         self.deriv_D))
476 |     
477 |     def compute_paths(self, X_in = None):
478 |         """
479 |         compute_paths(X_in = None) 
480 | 
481 |         Compute the anomaly score of an input sample is computed as
482 |         the mean anomaly score of the trees in the forest.
483 |         Parameters
484 |         ----------
485 |         X_in : Array-like
486 |                 Data to be scored. FIForest.Trees are used for computing the depth reached in 
487 |                 each tree by each data curve.
488 |         Returns
489 |         -------
490 |         float
491 |             Anomaly score for a given data curve.
492 |         """
493 |         if X_in is None:
494 |             X_in = self.X           
495 |             if(self.alpha != 1):
496 |                 deriv_X_in = self.deriv_X
497 |         else: 
498 |             if(self.alpha != 1):
499 |                 deriv_X_in = np.zeros((X_in.shape[0],X_in.shape[1],X_in.shape[2]-1))
500 |                 for i in range(X_in.shape[0]):
501 |                     deriv_X_in[i] = derivateM(X_in[i], self.step)
502 |         S = np.zeros(len(X_in))
503 |         
504 |         for i in  range(len(X_in)):
505 |             h_temp = 0
506 |             for j in range(self.ntrees):
507 |                 # Compute path length for each curve
508 |                 if(self.alpha != 1):
509 |                     h_temp += PathFactor(X_in[i], self.step, 
510 |                                          self.Trees[j], self.alpha,
511 |                                          deriv_X_in[i]).path * 1.0  
512 |                 else:
513 |                     h_temp += PathFactor(X_in[i], self.step, 
514 |                                          self.Trees[j],self.alpha).path * 1.0  
515 |                 
516 |             # Average of path length travelled by the point in all trees.
517 |             Eh = h_temp / self.ntrees
518 |             
519 |              # Anomaly Score
520 |             S[i] = 2.0 ** (- Eh / self.c)                                             
521 |         return S
522 |     def threshold(self, score_samples, contamination = 0.1):
523 |         """Compute the treshold to declare curves as anomalies or not.
524 |            The choice of 'lower' interpolation in the percentile function come from
525 |            the fact that it should be a little gap between the score of anomalies and the normal score. 
526 |            This choice could be different depending on the problem given.
527 |            
528 |         Parameters
529 |         ----------
530 |         
531 |         score_samples : Array
532 |             The score array for a dataset of curves.
533 |             
534 |         contamination : float, optional (default=0.1)
535 |             The amount of contamination of the data set, i.e. the proportion
536 |             of outliers in the data set. Used when fitting to define the threshold
537 |             on the decision function.
538 |             
539 |         """
540 |         return np.percentile(score_samples, 100 * (1-contamination), interpolation = 'lower')
541 |     
542 |     def predict_label(self, score, contamination = 0.1):
543 |          
544 |         """Compute the label vector of curves.  
545 |         
546 |         Parameters
547 |         ----------
548 |         
549 |         score : Array
550 |             The score array for a dataset of curves.
551 |             
552 |         contamination : float, optional (default=0.1)
553 |             The amount of contamination of the data set, i.e. the proportion
554 |             of outliers in the data set. Used when fitting to define the threshold
555 |             on the decision function.
556 |             
557 |         Returns
558 |         -------
559 |         
560 |         y_label : array
561 |             An array of predict label, -1 if the curve is considered as normal and +1 if not.
562 |         """
563 |         y_label = np.zeros((len(score)))
564 |         return -1 + 2.0 * (score > self.threshold(score, contamination))
565 | 
566 | 
class Node(object):
    """
    Plain record describing one node of an iTree: the data that reached it, the
    split that was recorded for it and the links to its two children.

    Attributes
    ----------
    e: int
        Depth value given at construction.

    size: int
        Number of items in X (len(X)).

    X: Array-like
        Data at the node.

    d: Array-like
        Direction function used to build the hyperplane that splits the data in the node.

    dd : int
        The index of the direction chosen at this node.

    q: Array
        Intercept point through which the hyperplane passes.

    left: Node object
        Left child node.

    right: Node object
        Right child node.

    ntype: str
        The type of the node: 'exNode', 'inNode'.
    """
    def __init__(self,
                 X,
                 d,
                 dd,
                 q,
                 e,
                 left,
                 right,
                 node_type='' ):
        """
        Node(X, d, dd, q, e, left, right, node_type = '' )
        Create a node in a given tree (iTree object).

        Parameters
        ----------
        X : Array-like
            Data available to the node; its length is stored as `size`.

        d : Array
            Direction (curve) used to build the hyperplane that splits the data in the node.

        dd : int
            Index of the direction `d`.

        q : Array
            Intercept point for the hyperplane used for splitting data.

        e : int
            Depth value stored on the node.

        left : Node object
            Left child node.

        right : Node object
            Right child node.

        node_type : str
            Specifies if the node is external or internal. Takes two values: 'exNode', 'inNode'.
        """
        # Data held by the node.
        self.X = X
        self.size = len(X)
        # Split description.
        self.d, self.dd, self.q = d, dd, q
        # Position in the tree.
        self.e = e
        self.left, self.right = left, right
        self.ntype = node_type
640 | 
class iTree(object):

    """
    A single tree in the forest that is build using a unique subsample.

    Attributes
    ----------
    e: int
        Depth of the node most recently visited by `make_tree`.

    X: list
        Data present at the root node of this tree.

    step : array
        Vector of the length of intervals of discretization.

    size: int
        Size of the dataset.

    l: int
        Maximum depth a tree can reach before its creation is terminated.

    d: list
        Direction drawn for the most recently built internal node.

    dd : int
        The index (in D) of the direction drawn for the most recently built internal node.

    q: list
        Threshold drawn for the most recently built internal node.

    exnodes : int
        Number of external nodes created so far.

    root: Node object
        Root node of the tree.

    D: Array like
        Dictionary of functions used as directions.

    innerproduct : function
        An inner product that we use for the construction of the tree. It is
        called as innerproduct(x, d) when alpha == 1 and as
        innerproduct(x, d, deriv_x, deriv_d) otherwise.

    alpha : float
        A float number between [0,1] used in the innerproduct of the paper.

    deriv_X : Array-like
        A matrix of derivate of X if needed for the scalar product.

    deriv_D : Array-like
        A matrix of derivate of D if needed for the scalar product.

    Methods
    -------
    make_tree(X, e, deriv_X = None)
        Builds the tree recursively from a given node. Returns a Node object.
    """

    def __init__(self,
                 X,
                 step,
                 e,
                 l,
                 D,
                 innerproduct,
                 alpha,
                 deriv_X=None,
                 deriv_D=None):

        self.e = e
        self.X = X
        self.step = step
        self.size = len(X)
        self.l = l
        self.q = None
        self.d = None
        self.dd = None
        self.exnodes = 0
        self.D = D
        self.innerproduct = innerproduct
        self.alpha = alpha
        self.deriv_X = deriv_X
        self.deriv_D = deriv_D
        self.root = self.make_tree(self.X, self.e, self.deriv_X)

    def make_tree(self, X, e, deriv_X=None):
        """
        make_tree(X, e, deriv_X = None)
        Builds the tree recursively from a given node. Returns a Node object.

        Parameters
        ----------
        X: Array like
            Subsample of training data reaching this node.

        e : int
            Depth of the tree as it is being traversed down. Integer. e <= l.

        deriv_X : Array like, optional (default=None)
            Derivatives of the curves in X, row for row. Only read when
            alpha != 1. When omitted, `self.deriv_X` is used, which is only
            correct for the root sample.

        Returns
        -------
        Node object
        """
        self.e = e
        if deriv_X is None:
            deriv_X = self.deriv_X

        # A curve is isolated in training data, or the depth limit has been reached.
        if e >= self.l or len(X) <= 1:
            self.exnodes += 1
            return Node(X, self.d, self.dd, self.q, e, None, None, node_type = 'exNode')

        # Building the tree continues. All these nodes are internal.
        use_derivative = (self.alpha != 1)
        sample_size = X.shape[0]
        idx = np.random.choice(range(0, (self.D).shape[0]), size=1)
        self.d = self.D[idx[0]]
        self.dd = idx[0]

        # Projection of every curve of this node on the drawn direction.
        Z = np.zeros((sample_size))
        if use_derivative:
            # The mask-based split below needs array indexing on the derivatives.
            deriv_X = np.asarray(deriv_X)
            for i in range(sample_size):
                Z[i] = self.innerproduct(X[i], self.d, deriv_X[i], self.deriv_D[idx[0]])
        else:
            for i in range(sample_size):
                Z[i] = self.innerproduct(X[i], self.d)

        # Picking a random threshold for the hyperplane splitting data.
        self.q = np.random.uniform(np.min(Z), np.max(Z))
        # Criteria that determines if a curve should go to the left or right child node.
        w = Z - self.q < 0

        # The recursive calls overwrite self.d / self.dd / self.q, so keep this node's split.
        d, dd, q = self.d, self.dd, self.q

        # Split the derivatives with the same mask as the curves so that row i of
        # the derivatives still belongs to row i of the data in each child.
        if use_derivative:
            deriv_left, deriv_right = deriv_X[w], deriv_X[~w]
        else:
            deriv_left = deriv_right = None

        left = self.make_tree(X[w], e+1, deriv_left)
        right = self.make_tree(X[~w], e+1, deriv_right)
        return Node(X, d, dd, q, e, left=left, right=right, node_type = 'inNode')
771 | 
class PathFactor(object):
    """
    Given a single tree (iTree object) and a curve x, compute the length of the path traversed
    by the curve on the tree when it reaches an external node.

    Attributes
    ----------
    path_list: list
        A list of strings 'L' or 'R' which traces the path a data curve travels down a tree.

    x: Array like (dimension, discretization)
        A single function, which is represented as an matrix of floats.

    deriv_x: Array like
        Derivative of x; only read when alpha != 1.

    e: int
        The depth reached so far while walking down the tree.

    path: float
        Result of find_path on the root of the tree.

    Methods
    -------
    find_path(T)
        Given a tree, it finds the path a single data curves takes.
    """
    def __init__(self,
                 x,
                 step,
                 itree,
                 alpha,
                 deriv_x=None):
        """
        PathFactor(x, step, itree, alpha, deriv_x = None)
        Given a single tree (iTree object) and a curve x, compute the length of the path traversed
        by the curve on the tree when it reaches an external node.

        Parameters
        ----------
        x : Array
            A single function x.

        step : array
            Discretization steps; stored on the object but not used for the projections.

        itree : iTree object
            A single tree. Its D, deriv_D, innerproduct and root are read.

        alpha : float
            When different from 1 the inner product is called with the derivatives as well.

        deriv_x : Array, optional (default=None)
            Derivative of x, required when alpha != 1.
        """
        self.path_list=[]
        self.x = x
        self.deriv_x = deriv_x
        self.e = 0
        self.alpha = alpha
        self.step = step
        self.D = itree.D
        self.deriv_D = itree.deriv_D
        self.innerproduct = itree.innerproduct
        self.path  = self.find_path(itree.root)

    def find_path(self, T):
        """
        find_path(T)
        Given a node, follow the splitting criteria stored at each node until an
        external node is reached.

        Parameters
        ----------
        T : Node object
            Node from which the descent starts (the root of an iTree in __init__).

        Returns
        -------
        float
            The depth reached by the data curve, plus c_factor(size) when the
            external node still holds more than one curve.
        """
        node = T
        while node.ntype != 'exNode':
            self.e += 1
            # The projection must be computed exactly as in iTree.make_tree, which
            # calls innerproduct(x, d) when alpha == 1: the stored thresholds q were
            # drawn from those values.
            if (self.alpha != 1):
                projection = self.innerproduct(self.x, node.d, self.deriv_x, self.deriv_D[node.dd])
            else:
                projection = self.innerproduct(self.x, node.d)

            # Same side rule as at training time: strictly below the threshold goes left.
            if projection - node.q < 0:
                self.path_list.append('L')
                node = node.left
            else:
                self.path_list.append('R')
                node = node.right

        # An external node holding several curves was cut by the depth limit:
        # add the adjustment given by c_factor for its size.
        if node.size > 1:
            self.e = self.e + c_factor(node.size)
        return self.e
862 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | FIF : Functional Isolation Forest
  2 | =========================================
  3 | 
  4 | This repository hosts Python code of the Functional Isolation Forest algorithm: https://proceedings.mlr.press/v101/staerman19a. Here we provide the source code using cython/c++ and the old version in python. The c++ code is heavily inspired by the awesome work of https://github.com/sahandha/eif and is much faster than the python version. Three dictionaries are implemented: Brownian motion, Gaussian wavelets and cosine; see the paper for more details. Feel free to suggest any additional dictionaries. 
  5 | 
  6 | 
  7 | =========================================
  8 | 
  9 | 
 10 | Installation
 11 | ------------
 12 | Download this repository and then run this python command in the folder:
 13 | 
 14 | .. code:: bash
 15 | 
 16 |    python setup.py build_ext --inplace
 17 |    
 18 | Further, you can import the algorithm with the following command in your python script:
 19 | 
 20 | .. code:: python
 21 | 
 22 |    import fif as FIF
 23 |   
 24 | NB: our algorithm is not the ``fif`` package available at https://pypi.org/project/fif/. Uninstall that package if you want to use Functional Isolation Forest, since both are named ``fif``.
 25 | 
 26 | 
 27 | Algorithm
 28 | ---------
 29 | Functional Isolation Forest is an anomaly detection (and anomaly ranking) algorithm for functional data (i.e., time-series).
 30 | It is flexible enough to detect most types of anomalies in functional data.
 31 | 
 32 | The algorithm returns the anomaly score of each sample through the method compute_paths(); see the notebooks for examples or the quick start below.
 33 | 
 34 | Some parameters have to be set by the user : 
 35 |                                     - X [numpy array of size (n,dim)]: 'n' functional data with 'dim' measurements. 
 36 |                                     - time [numpy array of size dim]: vector of the 'dim' measurement times.
 37 |                                     - sample_size [int]: the size of samples used for each tree.
 38 |                                     - ntrees [int]: the number of trees, default value is 100.
 39 |                                     - alpha [float between 0 and 1]: convex combination parameter for the innerproduct (as it is explained in the paper), default value is 1. 
 40 |                                     - dic_number [int: 0,1,2]: three dictionaries are implemented (0: Brownian motion; 1: Gaussian wavelets; 2: cosine), default value is 1.
 41 |                                                                    
 42 | 
 43 | Quick Start :
 44 | ------------
 45 | 
 46 | Create a toy dataset :
 47 | 
 48 | .. code:: python
 49 | 
 50 | 
 51 |   import numpy as np 
 52 |   np.random.seed(42)
 53 |   m =100;n =100;tps = np.linspace(0,1,m);v = np.linspace(1,1.4,n)
 54 |   X = np.zeros((n,m))
 55 |   for i in range(n):
 56 |       X[i] = 30 * ((1-tps) ** v[i]) * tps ** v[i]
 57 |   Z1 = np.zeros((m))
 58 |   for j in range(m):
 59 |       if (tps[j]<0.2 or tps[j]>0.8):
 60 |           Z1[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 
 61 |       else:
 62 |           Z1[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + np.random.normal(0,0.3,1)
 63 |   Z1[0] = 0
 64 |   Z1[m-1] = 0
 65 |   Z2 = 30 * ((1-tps) ** 1.6) * tps ** 1.6
 66 |   Z3 = np.zeros((m))
 67 |   for j in range(m):
 68 |       Z3[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + np.sin(2*np.pi*tps[j])
 69 | 
 70 |   Z4 = np.zeros((m))
 71 |   for j in range(m):
 72 |       Z4[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2
 73 | 
 74 |   for j in range(70,71):
 75 |       Z4[j] += 2
 76 | 
 77 |   Z5 = np.zeros((m))
 78 |   for j in range(m):
 79 |       Z5[j] = 30 * ((1-tps[j]) ** 1.2) * tps[j] ** 1.2 + 0.5*np.sin(10*np.pi*tps[j])
 80 | 
 81 |   X = np.concatenate((X,Z1.reshape(1,-1),Z2.reshape(1,-1),  
 82 |                        Z3.reshape(1,-1), Z4.reshape(1,-1), Z5.reshape(1,-1)), axis = 0)
 83 | 
 84 | 
 85 |    
 86 | And then use FIF to rank functional dataset :
 87 | 
 88 | .. code:: python
 89 | 
 90 |   import fif as FIF
 91 |   F  = FIF.FiForest(X, time=tps, ntrees=100, sample_size=64, dic_number=1, alpha=0.5, seed=0)
 92 |   Anomaly_score  = F.compute_paths()
 93 |     
 94 | The simulated dataset with the five introduced anomalies (top). The sorted dataset (middle), the darker the color, the more the curves are considered anomalies. The sorted anomaly score of the dataset (bottom). 
 95 | 
 96 | .. image:: Figures/anomaly_example-1.png
 97 | .. image:: Figures/anomaly_example_rank-1.png
 98 | .. image:: Figures/anomaly_example_score-1.png
 99 | 
100 | Dependencies
101 | ------------
102 | 
103 | These are the dependencies to use FIF:
104 | 
105 | * numpy 
106 | * cython
107 | 
108 | 
109 | Cite
110 | ----
111 | 
112 | If you use this code in your project, please cite::
113 | 
114 | 
115 |    @InProceedings{pmlr-v101-staerman19a,
116 |   title = 	 {Functional Isolation Forest},
117 |   author =       {Staerman, Guillaume and Mozharovskyi, Pavlo and Cl\'emen\c{c}on, Stephan and d'Alch\'e-Buc, Florence},
118 |   booktitle = 	 {Proceedings of The Eleventh Asian Conference on Machine Learning},
119 |   pages = 	 {332--347},
120 |   year = 	 {2019},
121 |   volume = 	 {101},
122 |   publisher =    {PMLR}
123 |    }
124 | 
125 | 
126 |   
127 | 


--------------------------------------------------------------------------------
/__fif.pxd:
--------------------------------------------------------------------------------
cdef extern from "fif.hxx":
    # C++ class wrapped by the `FiForest` extension type of _fif.pyx.
    # Every callable is declared `except +` so that a C++ exception is turned into a
    # Python exception by Cython instead of propagating through the interpreter
    # and terminating the process.
    cdef cppclass FiForest:
        # Depth limit of the trees; read by the wrapper right after construction.
        int limit
        # Called from _fif.pyx as (ntrees, sample_size, limit, seed, dic_number, alpha).
        FiForest (int, int, int, int,  int, double) except +
        # Called as (X data, time data, number of curves, number of measurements).
        void fit (double*, double*, int, int) except +
        # Called as (output scores, curves to score or NULL, number of curves).
        void predict (double*, double*,  int) except +
        # Same as predict, with the index of the tree as last argument.
        void predictSingleTree (double*, double*, int, int) except +
        # Called with the index of a tree.
        void OutputTreeNodes (int) except +
9 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | #gzgz


--------------------------------------------------------------------------------
/_fif.pyx:
--------------------------------------------------------------------------------
 1 | # Cython wrapper for Functional Isolation Forest
 2 | # This code is highly inspired from the code of 'Extended Isolation Forest' https://github.com/sahandha/eif.
 3 | 
 4 | # distutils: language = C++
 5 | # distutils: sources  = fif.cxx
 6 | # cython: language_level = 3
 7 | 
 8 | import cython
 9 | import numpy as np
10 | cimport numpy as np
11 | from version import __version__
12 | 
13 | cimport __fif
14 | 
15 | np.import_array()
16 | 
cdef class FiForest:
    """Python wrapper around the C++ ``FiForest`` class declared in ``__fif.pxd``.

    The forest is fitted at construction time on ``X`` (one curve per row) measured
    at the instants ``time``; ``compute_paths`` then returns one anomaly score per curve.
    """
    cdef int size_X
    cdef int dim
    cdef int _ntrees
    cdef int _limit
    cdef int sample
    cdef int tree_index
    cdef int dic_number
    cdef double alpha
    cdef __fif.FiForest* thisptr
    # fit() receives raw pointers into these arrays. Whether the C++ side copies the
    # data is not visible from this wrapper, so the arrays (possibly the contiguous
    # copies made in __cinit__) are kept alive as long as the forest exists.
    cdef object _X_ref
    cdef object _time_ref

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def __cinit__ (self, np.ndarray[double, ndim=2] X not None, np.ndarray[double, ndim=1] time not None,  int sample_size, int ntrees=100, int limit=0,  int seed=-1, int dic_number=1, double alpha=1.0):
        # The C++ code is told that `time` holds X.shape[1] values; refuse anything
        # else before any pointer is handed over.
        if time.shape[0] != X.shape[1]:
            raise ValueError("time must have one entry per measurement of X: expected %d, got %d"
                             % (X.shape[1], time.shape[0]))
        if not X.flags['C_CONTIGUOUS']:
            X = X.copy(order='C')
        if not time.flags['C_CONTIGUOUS']:
            time = time.copy(order='C')
        self._X_ref = X
        self._time_ref = time
        self.thisptr = new __fif.FiForest (ntrees, sample_size, limit, seed, dic_number, alpha)
        self.size_X = X.shape[0]
        self.dim = X.shape[1]
        self.sample = sample_size
        self._ntrees = ntrees
        self._limit = self.thisptr.limit
        self.alpha = alpha
        self.dic_number = dic_number
        self.thisptr.fit (<double*> np.PyArray_DATA(X), <double*> np.PyArray_DATA(time), self.size_X, self.dim)

    @property
    def ntrees(self):
        # Number of trees requested at construction.
        return self._ntrees

    @property
    def limit(self):
        # Depth limit read from the C++ object right after its construction.
        return self._limit

    def __dealloc__ (self):
        # thisptr is still NULL when __cinit__ raised before the allocation.
        if self.thisptr != NULL:
            del self.thisptr

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def compute_paths (self, np.ndarray[double, ndim=2] X_in=None):
        """Return one anomaly score per curve.

        With ``X_in=None`` the training curves are scored; otherwise ``X_in`` must
        have as many columns as the training data.
        """
        cdef np.ndarray[double, ndim=1, mode="c"] S
        if X_in is None:
            S = np.empty(self.size_X, dtype=np.float64, order='C')
            self.thisptr.predict (<double*> np.PyArray_DATA(S), NULL, 0)
        else:
            # Only the number of rows is passed to C++, so the row length must be
            # the one used at fit time.
            if X_in.shape[1] != self.dim:
                raise ValueError("X_in must have %d measurements per curve, got %d"
                                 % (self.dim, X_in.shape[1]))
            if not X_in.flags['C_CONTIGUOUS']:
                X_in = X_in.copy(order='C')
            S = np.empty(X_in.shape[0], dtype=np.float64, order='C')
            self.thisptr.predict (<double*> np.PyArray_DATA(S), <double*> np.PyArray_DATA(X_in), X_in.shape[0])
        return S

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def compute_paths_single_tree (self, np.ndarray[double, ndim=2] X_in=None, tree_index=0):
        """Same as ``compute_paths`` but using only the tree number ``tree_index``."""
        cdef np.ndarray[double, ndim=1, mode="c"] S
        if not 0 <= tree_index < self._ntrees:
            raise IndexError("tree_index must be in [0, %d), got %r" % (self._ntrees, tree_index))
        if X_in is None:
            S = np.empty(self.size_X, dtype=np.float64, order='C')
            self.thisptr.predictSingleTree (<double*> np.PyArray_DATA(S), NULL, 0, tree_index)
        else:
            if X_in.shape[1] != self.dim:
                raise ValueError("X_in must have %d measurements per curve, got %d"
                                 % (self.dim, X_in.shape[1]))
            if not X_in.flags['C_CONTIGUOUS']:
                X_in = X_in.copy(order='C')
            S = np.empty(X_in.shape[0], dtype=np.float64, order='C')
            self.thisptr.predictSingleTree (<double*> np.PyArray_DATA(S), <double*> np.PyArray_DATA(X_in), X_in.shape[0], tree_index)
        return S

    def output_tree_nodes (self, int tree_index):
        """Forward to the C++ ``OutputTreeNodes`` for the tree number ``tree_index``."""
        if not 0 <= tree_index < self._ntrees:
            raise IndexError("tree_index must be in [0, %d), got %r" % (self._ntrees, tree_index))
        self.thisptr.OutputTreeNodes (tree_index)
86 | 


--------------------------------------------------------------------------------
/fif.cxx:
--------------------------------------------------------------------------------
  1 | #include "fif.hxx"
  2 | 
  3 | 
  4 | /********************************
  5 | 	Utility functions
  6 |  ********************************/
  7 | 
  8 | inline std::vector<double> derivate (double* X1, double* time, int dim)
  9 | 	/* return the derivative of the function X1 whose have been measured at times time.*/
 10 | 	
 11 | {	std::vector<double> derivative (dim-1, 0.0);
 12 | 
 13 | 	for (int i=1; i<dim; i++) derivative[i-1] = (X1[i] - X1[i-1]) / (time[i] - time[i-1]);
 14 | 
 15 | 	return derivative;
 16 | 
 17 | }
 18 | inline std::vector<double> linspace(double  start, double end, int num)
 19 | 	/* return an vector of 'num' equispaced values between 'start' and 'end'. */ 
 20 | {
 21 |   std::vector<double> linspaced;
 22 |   double delta = (end - start) / (num - 1);
 23 | 
 24 |   if (num == 0) { return linspaced; }
 25 |   if (num == 1) 
 26 |     {
 27 |       linspaced.push_back(start);
 28 |       return linspaced;
 29 |     }
 30 | 
 31 |   for(int i=0; i < num-1; ++i)
 32 |     {
 33 |       linspaced.push_back(start + delta * i);
 34 |     }
 35 |   linspaced.push_back(end); 
 36 | 
 37 |   return linspaced;
 38 | }
 39 | 
 40 | inline std::vector<double> dictionary_function (int dim, int dic_number, RANDOM_ENGINE& random_engine_in)
 41 | 	/* return a function sampled from a dictionary. Three choices are possible: 
 42 | 	*
 43 | 	* 'dic_number=0' means Brownian motion
 44 | 	* 'dic_num=1' means gaussian wavelets 
 45 | 	* 'dic_number=2' means cosine dictionary.
 46 | 	*/
 47 | {	
 48 | 	std::vector<double> dic_function (dim, 0.0);
 49 | 	std::vector<double> t (dim, 0.0);
 50 | 	t = linspace(-5,5,dim);
 51 | 
 52 | 	if (dic_number == 0) // Standard Brownian motion
 53 | 		{
 54 | 		dic_function[0] = std::normal_distribution<double> (0.0, 1.0) (random_engine_in);
 55 | 		for (int i=1; i<dim; i++)
 56 | 			{
 57 | 				dic_function[i] = std::normal_distribution<double> (0.0, std::sqrt(t[i] - t[i-1]))(random_engine_in);
 58 | 			}
 59 | 		}
 60 | 	else if (dic_number == 1) // gaussian wavelets with various mean and std
 61 | 		{	
 62 | 
 63 | 		double sigma;
 64 | 		double K;
 65 | 
 66 | 
 67 | 		sigma = std::uniform_real_distribution<double> (0.2, 1)(random_engine_in);
 68 | 		K = std::uniform_real_distribution<double> (-4.0, 4.0)(random_engine_in);
 69 | 		for (int i=0; i<dim; i++)
 70 | 			{
 71 | 			dic_function[i] = -(2 / (std::pow(PI_CONSTANT, 0.25) * std::sqrt(3 * sigma))) * (std::pow(t[i] - K, 2.0) / std::pow(sigma,2.0) - 1) * (std::exp(-(std::pow(t[i]-K, 2.0) / (2 * std::pow(sigma, 2.0)))));
 72 | 			}
 73 | 		}
 74 | 	else if (dic_number == 2) // cosine with various frequencies and amplitude
 75 | 
 76 | 		{	
 77 | 
 78 | 		double ampl = 0.0;
 79 | 		double freq = 0.0;
 80 | 		ampl = std::uniform_real_distribution<double> (-1, 1)(random_engine_in);
 81 | 		freq = std::uniform_real_distribution<double> (0, 10)(random_engine_in);
 82 | 		
 83 | 		for (int i=0; i<dim; i++)
 84 | 			{
 85 | 
 86 | 
 87 | 			dic_function[i] = ampl * std::cos(2 * PI_CONSTANT * freq * t[i]);
 88 | 			}
 89 | 		}
 90 | 	else
 91 | 		{
 92 | 		std::cout << "this dictionary is not defined";
 93 | 		}
 94 | 
 95 | 	return dic_function;
 96 | 
 97 | 
 98 | }
 99 | 
100 | 
101 | inline double inner_product (double* X1, double* X2, double* time, double alpha, int dim)
102 | 	/* Return the innerproduct between X1 and X2 as a convex combination
103 | 	 *between L2 innerproduct and the L2 innerproduct of derivatives. 
104 | 	 *
105 | 	 * 'alpha=1' corresponds to L2 innerproduct
106 | 	 * 'alpha=0.5' corresponds to the Sobolev innerproduct
107 | 	 * 'alpha=0' corresponds to the derivative innerproduct. 
108 | 	*/
109 | {
110 | 	double result = 0.0;
111 | 	std::vector<double> prod (dim, 0.0);
112 | 
113 | 	for (int i=0; i<dim; i++);
114 | 
115 | 	if (alpha == 1)
116 | 		{
117 | 		prod[0] = X1[0] * X2[0];	
118 | 		for (int i=1; i<dim; i++)
119 | 			{ 
120 | 			prod[i] = X1[i] * X2[i];
121 | 			result += (time[i] - time[i-1]) * (prod[i] + prod[i-1]) / 2.0;
122 | 			}
123 | 
124 | 		}
125 | 	else if (alpha == 0)
126 | 		{
127 | 		std::vector<double> prod_derivate (dim-1, 0.0);
128 | 		std::vector<double> X1_derivate (dim-1, 0.0);
129 | 		std::vector<double> X2_derivate (dim-1, 0.0);
130 | 
131 | 		X1_derivate = derivate(X1, time, dim);
132 | 		X2_derivate = derivate(X2, time, dim);		
133 | 		prod_derivate[0] = X1_derivate[0] * X2_derivate[0];
134 | 
135 | 		for (int i=1; i<dim-1; i++) 
136 | 			{
137 | 			prod_derivate[i] = X1_derivate[i] * X2_derivate[i];
138 | 			result += (time[i] - time[i-1]) * (prod_derivate[i] + prod_derivate[i-1]) / 2.0;
139 | 			}
140 | 		}
141 | 
142 | 	else
143 | 		{
144 | 		std::vector<double> prod_derivate (dim-1, 0.0);
145 | 		std::vector<double> step_time (dim-1, 0.0);
146 | 		std::vector<double> X1_derivate (dim-1, 0.0);
147 | 		std::vector<double> X2_derivate (dim-1, 0.0);
148 | 		double inner = 0.0;
149 | 		double inner_derivate = 0.0;
150 | 		double norm_X1 = 0.0;
151 | 		double norm_X2 = 0.0;
152 | 		double norm_X1_derivate = 0.0;
153 | 		double norm_X2_derivate = 0.0;
154 | 
155 | 
156 | 		prod[0] = X1[0] * X2[0];
157 | 		for (int i=1; i<dim; i++)
158 | 			{ 
159 | 			prod[i] = X1[i] * X2[i];
160 | 			step_time[i-1] = time[i] - time[i-1];
161 | 			inner += step_time[i-1] * (prod[i] + prod[i-1]) / 2.0;
162 | 			norm_X1 += step_time[i-1] * (std::pow (X1[i], 2.0) + std::pow (X1[i-1], 2.0)) / 2.0;
163 | 			norm_X2 += step_time[i-1] * (std::pow (X2[i], 2.0) + std::pow (X2[i-1], 2.0)) / 2.0;
164 | 			}
165 | 
166 | 		X1_derivate = derivate(X1, time, dim);
167 | 		X2_derivate = derivate(X2, time, dim);
168 | 		prod_derivate[0] = X1_derivate[0] * X2_derivate[0];
169 | 
170 | 		for (int i=1; i<dim-1; i++) 
171 | 			{
172 | 			prod_derivate[i] = X1_derivate[i] * X2_derivate[i];
173 | 			inner_derivate += step_time[i-1]  * (prod_derivate[i] + prod_derivate[i-1]) / 2.0;
174 | 			norm_X1_derivate += step_time[i-1]  * (std::pow (X1_derivate[i], 2.0) + std::pow (X1_derivate[i-1], 2.0)) / 2.0;
175 | 			norm_X2_derivate += step_time[i-1]  * (std::pow (X2_derivate[i], 2.0) + std::pow (X2_derivate[i-1], 2.0)) / 2.0;
176 | 			}
177 | 		result = alpha * inner / (std::sqrt (norm_X1) * std::sqrt (norm_X2)) + (1 - alpha) * inner_derivate / (std::sqrt (norm_X1_derivate) * std::sqrt (norm_X2_derivate));
178 | 		}			
179 | 	
180 | 	return result;
181 | 
182 | }
183 | 
184 | 
185 | inline double c_factor (int N)
186 | 	/* Constant factor of the average depth of trees. */ 
187 | {
188 | 
189 | 	double Nd = (double) N;
190 | 	double result;
191 | 	result = 2.0*(log(Nd-1.0)+EULER_CONSTANT) - 2.0*(Nd-1.0)/Nd;
192 | 	return result;
193 | 
194 | }
195 | 
196 | inline std::vector<int> sample_without_replacement (int k, int N, RANDOM_ENGINE& gen)
197 | 	/* Sample k elements from the range [1, N] without replacement  */
198 | {
199 | 
200 |     // Create an unordered set to store the samples
201 |     std::unordered_set<int> samples;
202 | 
203 |     // Sample and insert values into samples
204 |     for (int r=N-k+1; r<N+1; ++r)
205 |     {
206 |         int v = std::uniform_int_distribution<>(1, r)(gen);
207 |         if (!samples.insert(v).second) samples.insert(r);
208 |     }
209 | 
210 |     // Copy samples into vector
211 |     std::vector<int> result(samples.begin(), samples.end());
212 | 
213 |     // Shuffle vector
214 |     std::shuffle(result.begin(), result.end(), gen);
215 | 
216 |     return result;
217 | 
218 | }
219 | 
220 | void output_tree_node (Node* node_in, std::string string_in)
221 | {
222 | 
223 | 	std::cout << "==== Node ====" << std::endl;
224 | 	std::cout << "path: " 	<< string_in << std::endl;
225 | 	std::cout << "e   : " 	<< node_in[0].e << std::endl;
226 | 	std::cout << "size: " 	<< node_in[0].size << std::endl;
227 | 	std::cout << "n   : [";
228 | 	int size_n = node_in[0].dic_vector.size(); 
229 | 	for (int i=0; i<size_n; i++)
230 | 	{
231 | 		std::cout << node_in[0].dic_vector[i];
232 | 		if (i<size_n-1) std::cout << ", ";
233 | 	}
234 | 	std::cout << "]" << std::endl;
235 | 	std::cout << node_in[0].treshold;
236 | 	std::cout << "]" << std::endl;
237 | 	std::cout << "type: " << node_in[0].node_type << std::endl;
238 | 
239 | 	if (node_in[0].node_type == "exNode") return;
240 | 	else
241 | 	{
242 | 		output_tree_node (node_in[0].left, string_in.append(" L"));
243 | 		string_in.pop_back();
244 | 		output_tree_node (node_in[0].right, string_in.append("R"));
245 | 	}
246 | 
247 | }
248 | 
249 | void delete_tree_node (Node* node_in)
250 | {
251 | 
252 | 	if (node_in[0].node_type == "exNode") delete node_in;
253 | 	else
254 | 	{
255 | 		delete_tree_node (node_in[0].left);
256 | 		delete_tree_node (node_in[0].right);
257 | 		delete node_in;
258 | 	}
259 | 
260 | }
261 | 
262 | 
263 | /****************************
264 |         Class Node
265 |  ****************************/
266 | Node::Node (int size_in, int dim_in, double* dic_vector_in, double treshold_in, int e_in, Node* left_in, Node* right_in, std::string node_type_in)
267 | {
268 | 
269 | 	e = e_in;
270 | 	size = size_in;	
271 | 	treshold = treshold_in;
272 | 	left = left_in;
273 | 	right = right_in;
274 | 	node_type = node_type_in;
275 | 	for (int i=0; i<dim_in; i++) dic_vector.push_back(dic_vector_in[i]);
276 | 
277 | }
278 | 
279 | Node::~Node ()
280 | {
281 | 
282 | }
283 | 
284 | 
285 | /****************************
286 |         Class FiTree
287 |  ****************************/
288 | FiTree::FiTree ()
289 | {
290 | 	root = NULL;
291 | }
292 | 
293 | FiTree::~FiTree ()
294 | {
295 | 
296 | }
297 | 
298 | void FiTree::build_tree (double* X_in, double* time_in, int size_in, int e_in, int limit_in, double alpha_in, int dic_number_in, int dim_in, RANDOM_ENGINE& random_engine_in)
299 | {
300 | 
301 | 	e = e_in;
302 | 	size = size_in;
303 | 	dim = dim_in;
304 | 	dic_number = dic_number_in;
305 | 	alpha = alpha_in;
306 | 	limit = limit_in;
307 | 	exnodes = 0;
308 | 	root = add_node (X_in, time_in, size_in, e_in, random_engine_in);
309 | 
310 | }
311 | 
312 | Node* FiTree::add_node (double* X_in, double* time_in, int size_in, int e_in, RANDOM_ENGINE& random_engine_in)
313 | {
314 | 
315 | 	e = e_in;
316 | 	double treshold=0.0;
317 | 	std::vector<double> dic_vector (dim, 0.0);
318 | 
319 | 	if (e_in >= limit || size_in <= 1) {
320 | 
321 | 		Node* left = NULL;
322 | 		Node* right = NULL;
323 | 		exnodes += 1;
324 | 		Node* node = new Node (size_in, dim, &dic_vector[0], treshold, e_in, left, right, "exNode");
325 | 		return node;
326 | 
327 | 	} else {
328 | 
329 | 		std::vector<double> innerprod (size_in, 0.0);
330 | 		std::vector<double> XL, XR;
331 | 		int sizeXL = 0;
332 | 		int sizeXR = 0;
333 | 
334 | 		dic_vector = dictionary_function(dim, dic_number, random_engine_in);
335 | 		for (int i=0; i<size_in; i++)
336 | 		{
337 | 			int index = i*dim;
338 | 			innerprod[i] = inner_product (&X_in[index], &dic_vector[0],time_in, alpha, dim);
339 | 		}
340 | 
341 | 		// Pick a random point between min and max of the projections
342 | 		double innermin; double innermax; 
343 | 
344 | 		innermin = *std::min_element(std::begin(innerprod), std::end(innerprod));
345 | 		innermax = *std::max_element(std::begin(innerprod), std::end(innerprod));
346 | 		treshold = std::uniform_real_distribution<double> (innermin, innermax)(random_engine_in);
347 | 
348 | 		// Assign data in left and right leaves.
349 | 		for (int i=0; i<size_in; i++)
350 | 		{ 	int index = i*dim;
351 | 			if (innerprod[i] < treshold) {
352 | 				for (int j=0; j<dim; j++) XL.push_back(X_in[j+index]);
353 | 				sizeXL += 1;
354 | 			} else {
355 | 				for (int j=0; j<dim; j++) XR.push_back(X_in[j+index]);
356 | 				sizeXR += 1;
357 | 			}
358 | 		
359 | 		}	
360 | 
361 | 		Node* left = add_node (&XL[0], time_in, sizeXL, e_in+1, random_engine_in);
362 | 		Node* right = add_node (&XR[0], time_in, sizeXR, e_in+1, random_engine_in);
363 | 
364 | 		Node* node = new Node (size_in, dim, &dic_vector[0], treshold, e_in, left, right, "inNode");
365 | 		return node;
366 | 
367 | 	}
368 | 
369 | }
370 | 
371 | 
372 | /*************************
373 |         Class Path
374 |  *************************/
375 | Path::Path (double* time_in, int dim_in, double alpha_in, double* x_in, FiTree fitree_in)
376 | {
377 | 
378 | 	dim = dim_in;
379 | 	alpha = alpha_in;
380 | 	x = x_in;
381 | 	time = time_in;
382 | 	e = 0.0;
383 | 	pathlength = find_path (fitree_in.root);
384 | 
385 | }
386 | 
387 | Path::~Path ()
388 | {
389 | 
390 | }
391 | 
392 | double Path::find_path (Node* node_in)
393 | {
394 | 
395 | 	if (node_in[0].node_type == "exNode") {
396 | 
397 | 		if (node_in[0].size <= 1) {
398 | 			return e;
399 | 		} else {
400 | 			e = e + c_factor (node_in[0].size);
401 | 			return e;
402 | 		}
403 | 
404 | 	} else {
405 | 
406 | 		e += 1.0;
407 | 
408 | 		double xdotn, treshold, plength;
409 | 		treshold = node_in[0].treshold;
410 | 		xdotn = inner_product (x, &node_in[0].dic_vector[0], time, alpha, dim);
411 | 		if (xdotn < treshold) {
412 | 			path_list.push_back('L');
413 | 			plength = find_path (node_in[0].left);
414 | 		} else {
415 | 			path_list.push_back('R');
416 | 			plength = find_path (node_in[0].right);
417 | 		}
418 | 		return plength;
419 | 
420 | 	}
421 | 
422 | }
423 | 
424 | 
425 | /****************************
426 |         Class FiForest
427 |  ****************************/
428 | FiForest::FiForest (int sample_in, int ntrees_in=100, int limit_in=0,  int random_seed_in=-1, int dic_number_in=1, double alpha_in=1.0)
429 | {
430 | 
431 | 	ntrees = ntrees_in;
432 | 	dic_number = dic_number_in;
433 | 	sample = sample_in;
434 | 	limit = limit_in;
435 | 	alpha = alpha_in;
436 | 	if (limit_in <= 0) limit = (int) ceil(log2(sample)); 
437 | 	c = c_factor (sample);
438 | 	Trees = new FiTree [ntrees];
439 | 	if (random_seed_in < 0) {
440 | 		RANDOM_SEED_GENERATOR random_seed_generator;
441 | 		random_seed = random_seed_generator();
442 | 	} else {
443 | 		random_seed = (unsigned) random_seed_in;
444 | 	}
445 | 
446 | }
447 | 
448 | FiForest::~FiForest ()
449 | {
450 | 
451 | 	for (int i=0; i<ntrees; i++)
452 | 		if (Trees[i].root != NULL) delete_tree_node (Trees[i].root);
453 | 	delete [] Trees;
454 | 
455 | }
456 | 
457 | 
458 | bool FiForest::CheckSampleSize ()
459 | {
460 | 
461 | 	if (sample < 1)
462 | 	{
463 | 		std::cout << "Subsample size must be an integer between 1 and " << nobjs << "." << std::endl;
464 | 		return false;
465 | 	}
466 | 	if (sample > nobjs)
467 | 	{
468 | 		std::cout << "No. of data points is " << nobjs << ". Subsample size cannot be larger than " << nobjs << "." << std::endl;
469 | 		return false;
470 | 	}
471 | 
472 | 	return true;
473 | 
474 | }
475 | 
476 | void FiForest::fit (double* X_in, double* time_in,  int nobjs_in, int dim_in)
477 | {
478 | 	std::vector<double> Xsubset;
479 | 
480 | 	X = X_in;
481 | 	time = time_in;
482 | 	nobjs = nobjs_in;
483 | 	dim = dim_in;
484 | 	if (!CheckSampleSize ()) return;
485 | 
486 | 
487 | 
488 | 	for (int i=0; i<ntrees; i++)
489 | 	{
490 | 		/* Select a random subset of X_in of size sample_in */
491 | 		RANDOM_ENGINE random_engine (random_seed+i);
492 | 		std::vector<int> sample_index = sample_without_replacement (sample, nobjs, random_engine);
493 | 		Xsubset.clear();
494 | 		for (int j=0; j<sample; j++)
495 | 		{
496 | 			for (int k=0; k<dim; k++)
497 | 			{
498 | 				int index = k+(sample_index[j]-1)*dim;
499 | 				Xsubset.push_back(X[index]);
500 | 			}
501 | 		}
502 | 
503 | 		Trees[i].build_tree (&Xsubset[0], time, sample, 0, limit, alpha, dic_number, dim, random_engine );
504 | 	}
505 | 
506 | }
507 | 
508 | void FiForest::predict (double* S,  double* X_in=NULL, int size_in=0)
509 | {
510 | 
511 | 	if (X_in == NULL)
512 | 	{
513 | 		X_in = X;
514 | 		size_in = nobjs;
515 | 	}
516 | 
517 | 	double htemp, havg;
518 | 	for (int i=0; i<size_in; i++)
519 | 	{
520 | 		htemp = 0.0;
521 | 		for (int j=0; j<ntrees; j++)
522 | 		{
523 | 			Path path (time, dim, alpha, &X_in[i*dim], Trees[j]);
524 | 			htemp += path.pathlength;
525 | 		}
526 | 		havg = htemp/ntrees;
527 | 		S[i] = std::pow(2.0, -havg/c);
528 | 	}
529 | 
530 | }
531 | void FiForest::predictSingleTree (double* S, double* X_in=NULL, int size_in=0, int FiTree_index=0)
532 | {
533 | 
534 | 	if (X_in == NULL)
535 | 	{
536 | 		X_in = X;
537 | 		size_in = nobjs;
538 | 	}
539 | 
540 | 	double htemp;
541 | 	for (int i=0; i<size_in; i++)
542 | 	{
543 | 		htemp = 0.0;
544 | 		Path path (time, dim, alpha,  &X_in[i*dim], Trees[FiTree_index]);
545 | 		htemp = path.pathlength;
546 | 		S[i] = htemp;
547 | 	}
548 | 
549 | }
550 | 
551 | void FiForest::OutputTreeNodes (int FiTree_index)
552 | {
553 | 
554 | 	output_tree_node (Trees[FiTree_index].root, "root");
555 | 
556 | }
557 | 


--------------------------------------------------------------------------------
/fif.hxx:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cstdlib>
  3 | #include <cmath>
  4 | #include <vector>
  5 | #include <random>
  6 | #include <algorithm>
  7 | #include <unordered_set>
  8 | 
/* Euler-Mascheroni constant (10 decimals); used by c_factor in fif.cxx. */
#define EULER_CONSTANT 0.5772156649
/* Pi (10 decimals). */
#define PI_CONSTANT 3.1415926535

/* Pseudo-random engine type passed to every routine that draws random numbers. */
#define RANDOM_ENGINE std::mt19937_64
/* Seed source used by FiForest when the caller supplies a negative seed. */
#define RANDOM_SEED_GENERATOR std::random_device
 14 | 
 15 | 
 16 | /****************************
 17 |         Class Node
 18 |  ****************************/
 19 | class Node
 20 | {
 21 | 
 22 |     private:
 23 | 
 24 |     protected:
 25 | 
 26 |     public:
 27 | 	    int e;
 28 |         int size;
 29 |         std::vector<double> dic_vector;
 30 |         double treshold;
 31 |         Node* left;
 32 |         Node* right;
 33 |         std::string node_type;
 34 | 
 35 |         Node (int, int, double*, double, int, Node*, Node*, std::string);
 36 |         ~Node ();
 37 | 
 38 | };
 39 | 
 40 | 
 41 | /****************************
 42 |         Class FiTree
 43 |  ****************************/
 44 | class FiTree
 45 | {
 46 | 
 47 |     private:
 48 |         int e;
 49 |         int size;
 50 |         double alpha;
 51 |         int dic_number;
 52 |         int dim;
 53 |         int limit;
 54 |         int exnodes;
 55 | 
 56 |     protected:
 57 | 
 58 |     public:
 59 |         Node* root;
 60 | 
 61 |         FiTree ();
 62 |         ~FiTree ();
 63 |         void build_tree (double*, double*,  int, int, int, double,int, int, RANDOM_ENGINE&);
 64 |         Node* add_node (double*, double*,  int, int, RANDOM_ENGINE&);
 65 | 
 66 | };
 67 | 
 68 | 
 69 | /*************************
 70 |         Class Path
 71 |  *************************/
 72 | class Path
 73 | {
 74 | 
 75 |     private:
 76 |         int dim;
 77 |         double alpha;
 78 |         double* time;
 79 |         double* x;
 80 |         double e;
 81 |     protected:
 82 | 
 83 |     public:
 84 |         std::vector<char> path_list;
 85 |         double pathlength;
 86 | 
 87 |         Path (double*, int, double, double*, FiTree);
 88 |         ~Path ();
 89 |         double find_path (Node*);
 90 | 
 91 | };
 92 | 
 93 | 
 94 | /****************************
 95 |         Class FiForest
 96 |  ****************************/
 97 | class FiForest
 98 | {
 99 | 
100 |     private:
101 |         int nobjs;
102 |         int dim;
103 |         int sample;
104 |         int ntrees;
105 |         int dic_number;
106 |         double alpha;
107 |         double* X;
108 |         double * time;
109 |         double c;
110 |         FiTree* Trees;
111 |         unsigned random_seed;
112 | 
113 | 	bool CheckSampleSize ();
114 |     protected:
115 | 
116 |     public:
117 |         int limit;
118 |         FiForest (int, int, int, int, int, double);
119 |         ~FiForest ();
120 |         void fit (double*, double*, int, int);
121 |         void predict (double*, double*, int);
122 |         void predictSingleTree (double*, double*, int, int);
123 | 	    void OutputTreeNodes (int);
124 | 
125 | };
126 | 
127 | 
128 | /********************************
129 |         Utility functions
130 |  ********************************/
131 | inline std::vector<double> derivate (double* , double*, int);
132 | inline std::vector<double> linspace(double, double, int);
133 | inline std::vector<double> dictionary_function (int , int, RANDOM_ENGINE&);
134 | inline std::vector<int> sample_without_replacement (int, int, RANDOM_ENGINE&);
135 | inline double inner_product (double*, double*, double*, double, int);
136 | inline double c_factor (int);
137 | void output_tree_node (Node*, std::string);
138 | void delete_tree_node (Node*);
139 | 


--------------------------------------------------------------------------------
/old_fif.py:
--------------------------------------------------------------------------------
  1 | """ Functional Isolation Forest
  2 | 
  3 |     Author : Guillaume Staerman
  4 | """
  5 | 
  6 | 
  7 | """Functional Isolation Forest Algorithm
  8 | 
  9 | This is the implementation of The Functional Isolation Forest which is an
 10 | extension of the original Isolation Forest applied to functional data.
 11 | 
It returns the anomaly score of each sample using the FIF algorithm.
 13 | The Functional Isolation Forest 'isolates' observations by 
 14 | randomly selecting a curve among a dictionary
 15 | and then randomly selecting a split value between the maximum 
 16 | and minimum values of the selected feature.
 17 | 
 18 | Since recursive partitioning can be represented by a tree structure, the
 19 | number of splittings required to isolate a sample is equivalent to the path
 20 | length from the root node to the terminating node.
 21 | 
 22 | This path length, averaged over a forest of such random trees, is a
 23 | measure of normality.
 24 | 
 25 | Random partitioning produces noticeably shorter paths for anomalies.
 26 | Hence, when a forest of random trees collectively produce shorter path
 27 | lengths for particular samples, they are highly likely to be anomalies.
 28 | 
Since the probability distribution nu defined in the paper is (in the interesting case)
continuous on an infinite-dimensional space, we do not represent it in this implementation.
Instead, many dictionaries are already defined (Brownian dictionaries, Brownian bridges, ...)
for which specifying the Wiener measure would be more difficult for the user. If one wants to use
a discrete measure nu, one has to 'replace it' with an appropriate dictionary.
Example: if nu is a discrete measure taking ten values with different probability weights
and you have a dictionary D of size 10, then you build a larger dictionary containing
the ten functions repeated according to their weights.
 37 | 
 38 | 
 39 | 
 40 | 
 41 | """
 42 | import numpy as np
 43 | 
 44 | 
 45 | def derivate(X, step):
 46 |     """Compute de derivative of each function in the matrix X w.r.t vector time."""
 47 |     step = step.astype(dtype=float)
 48 |     A = np.zeros((X.shape[0], X.shape[1] - 1))
 49 |     for i in range(X.shape[0]):
 50 |         A[i] = np.diff(X[i]) / step
 51 |     return A
 52 | def derivate_piecewise(X, step):
 53 |     """Compute de derivative of each piecewise function in the matrix X w.r.t vector time."""
 54 |     A = np.zeros((X.shape[0], X.shape[1] - 1))
 55 |     for i in range(X.shape[0]):
 56 |         a = np.where(X[i] != 0)[0]
 57 |         b = a[0 : (a.shape[0] - 1)]
 58 |         A[i, b] = np.diff(X[i,a]) / step[b]
 59 |     return A
 60 |     
 61 | def c_factor(n_samples_leaf) :
 62 |     """
 63 |     Average path length of unsuccesful search in a binary search tree given n points
 64 |     
 65 |     Parameters
 66 |     ----------
 67 |     n_samples_lead : int
 68 |         Number of curves for the BST.
 69 |     Returns
 70 |     -------
 71 |     float
 72 |         Average path length of unsuccesful search in a BST
 73 |         
 74 |     """
 75 |     return 2.0 * (np.log(n_samples_leaf - 1) + np.euler_gamma) - (2. * (
 76 |         n_samples_leaf - 1.) / (n_samples_leaf * 1.0))
 77 | 
 78 | 
 79 | class FIForest(object):
 80 |     """
 81 |     Functional Isolation Forest
 82 |     
 83 |     Creates an FIForest object. This object holds the data as well as the trained trees (iTree objects).
 84 |     
 85 |     Attributes
 86 |     ----------
 87 |     X : Array-like
 88 |         Data used for training.
 89 |         
 90 |     nobjs: int
 91 |         Size of the dataset.
 92 |         
 93 |     sample: int
 94 |         Size of the sample to be used for tree creation.
 95 |         
 96 |     Trees: list
 97 |         A list of tree objects.
 98 |         
 99 |     limit: int
100 |         Maximum depth a tree can have.
101 |         
102 |     c: float
103 |         Multiplicative factor used in computing the anomaly scores.
104 | 
105 |     step : array
106 |         Vector of the length of intervals of discretization.
107 | 
108 |     D : Array-like
109 |         Dictionnary of functions used as directions.
110 | 
111 |     Dsize : int
112 |         The size of the dictionary. It is the number of curves that we will use in our 
113 |         dictionary to build the forest.
114 |     
115 |     innerproduct : str or function  
        An inner product that we use for the construction of the tree. The inner product of the paper
        is already implemented: call it with 'auto' and fix alpha. If a function is given by 
118 |         the user, it should have three argument : (x, y, step) where x and y are curve (represented
119 |         by a vector of length of the discretization). "step" is a vector of length len(time)-1 which
120 |         represents the vector of length of step between the discretization.
121 |                 
122 |     alpha : float
123 |         a float number between [0,1] used in the innerproduct of the paper.
124 |             
125 |     deriv_X : Array like
126 |         A matrix of derivate of X if needed for the scalar product.
127 |         
128 |     deriv_dictionary : Array like
129 |         A matrix of derivate of D if needed for the scalar product.
130 |         
    Methods
132 |     -------
133 |     compute_paths(X_in) :
134 |         Computes the anomaly score for data X_in
135 |         
136 |     threshold(score_sample, contamination) :
137 |         Given the score returned by the fit function on training sample and a proportion 
138 |         of anomalies, compute the threshold which separates anomalies and normal data.
139 |         
140 |     predict_label(score, contamination) :
141 |         Given any score (training or testing) and the proportion of anomalies 
142 |         it return the labels predicted. The function return +1 for outliers and
143 |         -1 for inliers.
144 |     
145 |     
146 |     References
147 |     ----------
148 |     
149 |     .. [1] Staerman, G, Mozharovskyi, P, D'Alché-buc, F and Clémençon,S. "Functional Isolation forest."
150 | 
151 |     
152 |     """
153 | 
154 |     def __init__(self, 
155 |                  X,  
156 |                  D,
157 |                  time,
158 |                  innerproduct,
159 |                  criterion="naive",
160 |                  ntrees=None,
161 |                  subsample_size=None, 
162 |                  Dsize=None, 
163 |                  limit=None, 
164 |                  mean=None, 
165 |                  sd=None, 
166 |                  J_max=None,  
167 |                  alpha=None):
168 |       
169 |         self.X = X
170 |         self.nobjs = len(X)
171 |         self.Trees = []
172 |         self.time = time
173 |         self.criterion = criterion
174 |         self.mean = mean
175 |         self.sd = sd
176 |         self.D = D
177 | 
178 | 
179 |         if (ntrees == None):
180 |             self.ntrees = 100
181 |         else: self.ntrees = ntrees
182 | 
183 |         if (subsample_size == None):
184 |             if (self.nobjs > 500):
185 |                 self.sample = 256
186 |             else: self.sample = np.minimum(64, self.X.shape[0])
187 |         else : self.sample = subsample_size
188 | 
189 | 
190 |         if (Dsize == None):
191 |             self.Dsize = 1000
192 |         else: self.Dsize = Dsize 
193 |         
194 | 
195 |         if (type(D) == str):
196 |             """Finite dictionaries are pre-implemented.
197 |             """ 
198 |            
199 |             if (D == 'Dyadic_indicator'):
200 |                 """ We build a dictionary from the basis of the Haar wavelets using 
201 |                 only the father wavelets. We use a discretization on [0,1] since 
202 |                 we are interested only in the shape.
203 |                 """
204 |                 if (J_max == None):
205 |                     J_max = 7
206 |                 a =0
207 |                 t = np.linspace(0,1,len(self.time))
208 |                 self.D = np.zeros((np.sum(np.power(2, np.arange(J_max))), len(self.time)))
209 |                 for J in range(J_max):
210 |                     b = np.power(2, J)
211 |                     for k in range(b):
212 |                         for l in range(len(self.time)):
213 |                             x = b * t[l] - k
214 |                             self.D[a,l] = 1 * (0 <= x < 1)
215 |                         a += 1
216 |                         
217 |             elif (D == 'Multiresolution_linear'):
218 |                 """ We build a dictionary from the basis of the Haar wavelets using 
219 |                 only the father wavelets. We use a discretization on [0,1] since 
220 |                 we are interested only in the shape.
221 |                 """
222 |                 if (J_max == None):
223 |                     J_max = 7
224 |                 a =0
225 |                 t = np.linspace(0, 1, len(self.time))
226 |                 self.D = np.zeros((np.sum(np.power(2, np.arange(J_max))), len(self.time)))
227 |                 for J in range(J_max):
228 |                     b = np.power(2,J)
229 |                     for k in range(b):
230 |                         for l in range(len(self.time)):
231 |                             x = b * t[l] - k
232 |                             self.D[a,l] = t[l] * (0 <= x < 1)
233 |                         a += 1
234 | 
235 |             elif(D == 'Self_local'):
236 |                 """
237 |                 """
238 |                 self.D = np.zeros((self.Dsize, len(self.time)))
239 |                 for i in range(self.Dsize):
240 |                     a = (self.time[len(self.time) - 1] - self.time[0]) * np.random.random() + self.time[0]
241 |                     b = (self.time[len(self.time) - 1] - self.time[0]) * np.random.random() + self.time[0]
242 |                     for j in range(len(self.time)):
243 |                         k = np.random.randint(low=0, high=X.shape[0], size=1)
244 |                         self.D[i,j] = self.X.copy()[k,j] * (np.maximum(a, b) > self.time[j] > np.minimum(a, b))
245 | 
246 |             elif (D == 'Self'):
247 |                 self.D = self.X.copy()
248 | 
249 |         self.alpha = alpha
250 |         self.step = np.diff(self.time)
251 | 
252 |         if (type(D) == str):
253 | 
254 |             if (D == 'Self_local' or D == 'Self'):
255 |                 self.deriv_dictionary = derivate(self.D, self.step)
256 | 
257 |             elif(D == 'Multiresolution_linear' or D == 'Dyadic_indicator'):
258 |                 self.deriv_dictionary = derivate_piecewise(self.D, self.step)
259 | 
260 |             else: self.deriv_dictionary = []
261 | 
262 |         self.deriv_X = None
263 | 
264 |         if not callable(innerproduct):
265 |             """ Some inner product implemented.
266 |             """
267 |             if (innerproduct == 'auto'):
268 |                 
269 |                 if (self.alpha == None):
270 |                     self.alpha = 1
271 |                 
272 |                 if (self.alpha == 0):
273 |                     self.deriv_X = derivate(self.X, self.step)
274 | 
275 |                     def innerproduct(x, y, xderiv, yderiv):
276 |                         """We build the inner product in the paper with alpha = 0 which corresponds 
277 |                         to L2 of derivate dot product.
278 |                         """ 
279 |                         F1 = x * y
280 |                         F2 = xderiv * yderiv
281 | 
282 |                         F11 = F1[((np.arange(len(F1)) + 1) % len(F1))[:len(F1)-1]]
283 |                         F12 = F1[((np.arange(len(F1)) + -1) % len(F1))[1:len(F1)]]
284 |                         F21 = F2[((np.arange(len(F2)) + 1) % len(F2))[:len(F2)-1]]
285 |                         F22 = F2[((np.arange(len(F2)) + -1) % len(F2))[1:len(F2)]]
286 | 
287 |                         return  (self.alpha *np.sum(( self.step * (F11 + F12) / 2))
288 |                                 +(1-self.alpha) * np.sum((self.step[0:(len(self.step) - 1)]
289 |                                                           * (F21 + F22) / 2)))
290 | 
291 |                 elif (self.alpha == 1):
292 |                     def innerproduct(x, y, xderiv=None, yderiv=None ):
293 |                         """We build the inner product in the paper with alpha = 1 which corresponds 
294 |                         to L2 dot product.
295 |                         """ 
296 |                         F1 = x * y                        
297 |                         return  np.sum((self.step * (F1[((np.arange(len(F1)) + 1) % len(F1))[:len(F1)-1]]
298 |                                 + F1[((np.arange(len(F1)) + -1) % len(F1))[1:len(F1)]]) / 2)) 
299 |                                  
300 |                     
301 |                 else:
302 |                     self.deriv_X = derivate(self.X, self.step)
303 |                     def innerproduct(x, y, xderiv, yderiv):
304 |                         """We build the inner product in the paper which is a compromise between 
305 |                         L2 scalar product and the L2 scalar product of derivate.
306 |                         The function that we use work only with if we have the observations 
307 |                         of curves at constant steps.
308 |                         """ 
309 |                         F1 = x * y
310 |                         F2 = xderiv * yderiv
311 |                         
312 |                         F11 = F1[((np.arange(len(F1)) + 1) % len(F1))[:len(F1) - 1]]
313 |                         F12 = F1[((np.arange(len(F1)) - 1) % len(F1))[1:len(F1)]]
314 |                         F21 = F2[((np.arange(len(F2)) + 1) % len(F2))[:len(F2) - 1]]
315 |                         F22 = F2[((np.arange(len(F2)) - 1) % len(F2))[1:len(F2)]]
316 |                         
317 |                         x11 = x[((np.arange(len(x)) + 1) % len(x))[:len(x) - 1]]
318 |                         x12 = x[((np.arange(len(x)) - 1) % len(x))[1:len(x)]]
319 |                         x21 = xderiv[((np.arange(len(xderiv)) + 1) % len(xderiv))[:len(xderiv) - 1]]
320 |                         x22 = xderiv[((np.arange(len(xderiv)) - 1) % len(xderiv))[1:len(xderiv)]]
321 |                         
322 |                         y11 = y[((np.arange(len(y)) + 1) % len(y))[:len(y) - 1]]
323 |                         y12 = y[((np.arange(len(y)) - 1) % len(y))[1:len(y)]]
324 |                         y21 = yderiv[((np.arange(len(yderiv)) + 1) % len(yderiv))[:len(yderiv) - 1]]
325 |                         y22 = yderiv[((np.arange(len(yderiv)) - 1) % len(yderiv))[1:len(yderiv)]]
326 |                         return (self.alpha * np.sum(F11 + F12) / (np.sqrt(np.sum(x11 ** 2 + x12 ** 2)) * np.sqrt(np.sum(y11 ** 2 + y12 ** 2)))
327 |                                + (1 - self.alpha) * np.sum(F21 + F22) / (np.sqrt(np.sum(x21 ** 2 + 
328 |                                 x22 ** 2)) * np.sqrt(np.sum(y21 ** 2 + y22 ** 2))))
329 |                         
330 |                     
331 |             else: raise TypeError('This inner product is not pre-defined') 
332 |         else: self.alpha = 1 
333 | 
334 |         self.innerproduct = innerproduct
335 |         self.limit = limit
336 |         self.c = c_factor(self.sample)
337 | 
338 | 
339 |         if limit is None:
340 |             """Set limit to the default as specified by the original paper
341 |             (average depth of unsuccesful search through a binary tree).
342 |             """ 
343 |             self.limit = int(np.ceil(np.log2(self.sample))) 
344 |             
345 | 
346 |         if (self.alpha == 1):
347 |             for i in range(self.ntrees): 
348 |                 """This loop builds an ensemble of f-itrees (the forest).
349 |                 """
350 |                 ix = np.random.choice(np.arange(self.nobjs), size=self.sample, replace=False)
351 |                 
352 |                 self.Trees.append(iTree(X[ix], self.time, self.step,  
353 |                                         0, self.limit, 
354 |                                         self.D, self.innerproduct, 
355 |                                         self.alpha, self.deriv_X, 
356 |                                         None, self.sample, self.criterion, self.mean, self.sd))
357 |         else:
358 |             for i in range(self.ntrees): 
359 |                 """This loop builds an ensemble of f-itrees (the forest).
360 |                 """
361 |                 ix = np.random.choice(np.arange(self.nobjs), size=self.sample, replace=False)
362 |                 
363 |                 self.Trees.append(iTree(X[ix], self.time, self.step, 
364 |                                         0, self.limit, 
365 |                                         self.D, self.innerproduct, 
366 |                                         self.alpha, self.deriv_X[ix], 
367 |                                         self.deriv_dictionary, self.sample, self.criterion, self.mean, self.sd))
368 | 
369 | 
370 |     def compute_paths(self, X_in=None):
371 |         """
372 |         compute_paths(X_in = None) 
373 | 
374 |         Compute the anomaly score of an input sample is computed as
375 |         the mean anomaly score of the trees in the forest.
376 |         Parameters
377 |         ----------
378 |         X_in : Array-like
379 |                 Data to be scored. FIForest.Trees are used for computing the depth reached in 
380 |                 each tree by each data curve.
381 |         Returns
382 |         -------
383 |         float
384 |             Anomaly score for a given data curve.
385 |         """
386 |         if X_in is None:
387 |             X_in = self.X           
388 |             if(self.alpha != 1):
389 |                 deriv_X_in = self.deriv_X
390 |         else: 
391 |             if(self.alpha != 1):
392 |                 deriv_X_in = derivate(X_in, self.step)
393 | 
394 |         S = np.zeros(len(X_in))
395 |         
396 |         for i in  range(len(X_in)):
397 |             h_temp = 0
398 |             for j in range(self.ntrees):
399 |                 # Compute path length for each curve
400 |                 if(self.alpha != 1):
401 |                     h_temp += PathFactor(X_in[i], self.step, 
402 |                                          self.Trees[j], self.alpha,
403 |                                          deriv_X_in[i]).path * 1.0  
404 |                 else:
405 |                     h_temp += PathFactor(X_in[i], self.step, 
406 |                                          self.Trees[j], 
407 |                                          self.alpha).path * 1.0  
408 |                 
409 |             # Average of path length travelled by the point in all trees.
410 |             Eh = h_temp / self.ntrees
411 |             
412 |              # Anomaly Score
413 |             S[i] = 2.0 ** (- Eh / self.c)                                           
414 |         return S
415 |     def threshold(self, score_samples, contamination=0.1):
416 |         """Compute the treshold to declare curves as anomalies or not.
417 |            The choice of 'lower' interpolation in the percentile function come from
418 |            the fact that it should be a little gap between the score of anomalies and the normal score. 
419 |            This choice could be different depending on the problem given.
420 |            
421 |         Parameters
422 |         ----------
423 |         
424 |         score_samples : Array
425 |             The score array for a dataset of curves.
426 |             
427 |         contamination : float, optional (default=0.1)
428 |             The amount of contamination of the data set, i.e. the proportion
429 |             of outliers in the data set. Used when fitting to define the threshold
430 |             on the decision function.
431 |             
432 |         """
433 |         return np.percentile(score_samples, 100 * (1 - contamination), interpolation='lower')
434 |     
435 |     def predict_label(self, score, contamination=0.1):
436 |          
437 |         """Compute the label vector of curves.  
438 |         
439 |         Parameters
440 |         ----------
441 |         
442 |         score : Array
443 |             The score array for a dataset of curves (training or testing).
444 |             
445 |         contamination : float, optional (default=0.1)
446 |             The amount of contamination of the data set, i.e. the proportion
447 |             of outliers in the data set. Used when fitting to define the threshold
448 |             on the decision function.
449 |             
450 |         Returns
451 |         -------
452 |         
453 |         y_label : array
454 |             An array of predict label, -1 if the curve is considered as normal and +1 if not.
455 |         """
456 |         y_label = np.zeros((len(score)))
457 |         return 1- 2.0 * (score > self.threshold(score, contamination))
458 | 
459 |     def importance_feature(self):
460 |         IF = np.zeros((self.D.shape[0]))
461 | 
462 |         
463 |         for i in range(self.ntrees):
464 |             IF += self.Trees[i].IF
465 |     
466 | 
467 |         return IF
468 | 
469 | 
470 | 
class Node(object):
    """
    A single node of a tree (of an iTree object). A node holds the hyperplane
    used to divide the data, the data that reached it, its two children, and
    whether it is an external or an internal node.

    Attributes
    ----------
    e: int
        Depth of the node in the tree it belongs to.

    size: int
        Number of curves present at the node (``len(X)``).

    X: Array-like
        Data at the node.

    d: Array
        Direction function used to build the hyperplane that splits the data in the node.

    dd : int
        The index of the direction chosen at this node.

    q: Array
        Intercept point through which the hyperplane passes.

    left: Node object
        Left child node.

    right: Node object
        Right child node.

    ntype: str
        The type of the node: 'exNode', 'inNode'.
    """
    def __init__(self, X, d, dd, q, e, left, right, node_type=''):
        # Data reaching the node and its cardinality.
        self.X = X
        self.size = len(X)

        # Split description: direction, its dictionary index and the threshold.
        self.d = d
        self.dd = dd
        self.q = q

        # Position in the tree.
        self.e = e
        self.left = left
        self.right = right
        self.ntype = node_type
524 | 
class iTree(object):

    """
    A single tree in the forest that is built using a unique subsample.

    Attributes
    ----------
    e: int
        Depth of the node most recently visited while building the tree.

    X: Array-like
        Data present at the root node of this tree.

    time : array
        Discretization grid of the curves (used by the indicator dictionaries).

    step : array
        Vector of the length of intervals of discretization.

    size: int
        Size of the dataset.

    dim: int
        Dimension of the dataset.

    l: int
        Maximum depth a tree can reach before its creation is terminated.

    d: Array
        Direction most recently drawn, which is used in creating hyperplanes for
        splitting criteria.

    dd : int
        The index (in the dictionary, or in ``deriv_dictionary`` for sampled
        dictionaries) of the direction most recently drawn.

    q: float
        Intercept most recently drawn, through which the splitting hyperplane passes.

    exnodes : int
        Number of external nodes created so far.

    root: Node object
        Root of the tree; every other node is reachable from it.

    D: Array-like or str
        Dictionary of functions used as directions, or the name of a
        pre-defined dictionary to sample directions from.

    innerproduct : function
        An inner product that we use for the construction of the tree. It is
        called as ``innerproduct(x, d)`` when ``alpha == 1`` and as
        ``innerproduct(x, d, deriv_x, deriv_d)`` otherwise.

    alpha : float
        A float number between [0,1] used in the innerproduct of the paper.

    deriv_X : Array-like
        A matrix of derivate of X if needed for the scalar product
        (row ``i`` is the derivative of ``X[i]``).

    deriv_dictionary : Array-like
        A matrix of derivate of D if needed for the scalar product.

    IF : Array
        Per-direction importance counters (only for finite dictionaries).

    Methods
    -------
    make_tree(X, e, deriv_X=None)
        Builds the tree recursively from a given node. Returns a Node object.
    """

    def __init__(self,
                 X,
                 time,
                 step,
                 e,
                 l,
                 D,
                 innerproduct,
                 alpha,
                 deriv_X=None,
                 deriv_dictionary=None,
                 subsample_size=None,
                 criterion=None,
                 mean=None,
                 sd=None):

        self.e = e
        self.X = X
        self.step = step
        self.time = time
        self.size = len(X)
        self.dim = self.X.shape[1]
        self.l = l
        self.q = None
        self.d = None
        self.dd = None
        self.exnodes = 0
        self.D = D
        self.innerproduct = innerproduct
        self.alpha = alpha
        self.deriv_X = deriv_X
        self.mean = mean
        self.sd = sd

        self.deriv_dictionary = deriv_dictionary

        if (type(self.D) != str):
            # One importance counter per direction of the finite dictionary.
            self.IF = np.zeros((self.D.shape[0]))

        self.subsample_size = subsample_size
        self.criterion = criterion
        # At each node create a new tree, starting with root node.
        self.root = self.make_tree(self.X, self.e)

    def _draw_direction(self):
        """
        Draw the direction used to split the current node.

        Stores the direction in ``self.d``. For a finite dictionary ``self.dd``
        is the index of the chosen row. For a sampled (named) dictionary with
        ``alpha != 1`` the derivative of the direction is appended to
        ``self.deriv_dictionary`` and ``self.dd`` is its position there.

        Returns
        -------
        int or None
            Index of the chosen row for a finite dictionary, None otherwise.

        Raises
        ------
        TypeError
            If ``self.D`` is a string that names no pre-defined dictionary.
        """
        if (type(self.D) != str):
            # For finite dictionaries, we draw direction from them.
            idx = np.random.choice(np.arange((self.D).shape[0]), size=1)
            self.d = self.D[idx[0], :]
            self.dd = idx[0]
            return idx[0]

        t = np.linspace(0, 1, len(self.step) + 1)

        if (self.D == 'cosinus'):
            # Cosinus dictionary defined in the paper (random amplitude and
            # frequency). The amplitude is drawn first, then the frequency.
            amplitude = np.random.uniform(-1, 1, 1)
            frequency = np.random.uniform(0, 10, 1)
            self.d = amplitude * np.cos(2 * np.pi * frequency * t)

        elif (self.D == 'Brownian'):
            # Brownian motion dictionary defined in the paper.
            if self.mean is None:
                self.mean = 0

            if self.sd is None:
                self.sd = 1

            dt = t[1] - t[0]
            self.d = np.zeros((len(t)))
            self.d[0] = np.random.normal(self.mean, scale=self.sd)
            for i in range(1, len(t)):
                # Each value is the previous one plus a Gaussian increment and
                # the drift, so that the path actually accumulates.
                self.d[i] = (self.d[i - 1]
                             + self.sd * np.random.normal(0, scale=np.sqrt(dt))
                             + self.mean * dt)

        elif (self.D == 'gaussian_wavelets'):
            # Gaussian wavelets dictionary. We use a discretization on [-5,5]
            # and two random parameters with fixed ranges: the standard
            # deviation sigma and a translation parameter K.
            t = np.linspace(-5, 5, len(self.step) + 1)
            sigma = np.random.uniform(0.2, 1)
            K = np.random.uniform(-4, 4)
            self.d = (-(2 / (np.power(np.pi, 0.25) * np.sqrt(3 * sigma)))
                      * ((t - K) ** 2 / (sigma ** 2) - 1)
                      * (np.exp(-(t - K) ** 2 / (2 * sigma ** 2))))

        elif (self.D == 'Brownian_bridge'):
            # Brownian bridge dictionary defined in the paper. Both end points
            # stay at zero; interior points follow the previous value plus a
            # Gaussian increment and a pull back towards zero.
            dt = t[1] - t[0]
            self.d = np.zeros((len(t)))
            for i in range(1, (len(t) - 1)):
                self.d[i] = (self.d[i - 1]
                             + np.random.normal(0, np.sqrt(dt))
                             - self.d[i - 1] * dt / (1 - t[i]))

        elif (self.D == 'indicator_uniform'):
            # Indicator uniform dictionary defined in the paper: indicator of
            # a random open interval of the time grid.
            first, last = self.time[0], self.time[len(self.time) - 1]
            a = (last - first) * np.random.random() + first
            b = (last - first) * np.random.random() + first
            lower, upper = np.minimum(a, b), np.maximum(a, b)
            self.d = np.zeros((len(t)))
            for j in range(len(self.time)):
                self.d[j] = 1. * (upper > self.time[j] > lower)

        elif (self.D == 'linear_indicator_uniform'):
            # Linear indicator uniform dictionary defined in the paper: the
            # identity restricted to a random open interval of the time grid.
            first, last = self.time[0], self.time[len(self.time) - 1]
            a = (last - first) * np.random.random() + first
            b = (last - first) * np.random.random() + first
            lower, upper = np.minimum(a, b), np.maximum(a, b)
            self.d = np.zeros((len(t)))
            for j in range(len(self.time)):
                self.d[j] = self.time[j] * (upper > self.time[j] > lower)

        else:
            raise TypeError('This Dictionary is not pre-defined')

        if (self.alpha != 1):
            # Register the derivative of the freshly sampled direction so the
            # inner product (and later PathFactor, through Node.dd) can use it.
            self.deriv_dictionary.append(np.diff(self.d) / self.step)
            self.dd = len(self.deriv_dictionary) - 1

        return None

    def make_tree(self, X, e, deriv_X=None):
        """
        make_tree(X, e, deriv_X=None)
        Builds the tree recursively from a given node. Returns a Node object.

        Parameters
        ----------
        X: Array like
            Curves that reached the node (a subset of the training subsample).

        e : int
            Depth of the tree as it is being traversed down. Integer. e <= l.

        deriv_X : Array like, optional (default=None)
            Derivatives of the curves in ``X``, row for row. Only used when
            ``alpha != 1``; when omitted, ``self.deriv_X`` is used, which is
            the right choice for the root call where ``X`` is ``self.X``.

        Returns
        -------
        Node object
        """

        self.e = e
        # A curve is isolated in training data, or the depth limit has been reached.
        if e >= self.l or len(X) <= 1:
            self.exnodes += 1
            return Node(X, self.d, self.dd, self.q, e, None, None, node_type='exNode')

        # Building the tree continues. All these nodes are internal.
        sample_size = X.shape[0]
        idx = self._draw_direction()

        # Projection of every curve of the node on the drawn direction.
        Z = np.zeros((sample_size))

        if (self.alpha != 1):
            if deriv_X is None:
                deriv_X = self.deriv_X
            direction_derivative = self.deriv_dictionary[self.dd]
            for i in range(sample_size):
                Z[i] = self.innerproduct(X[i, :], self.d, deriv_X[i],
                                         direction_derivative)
        else:
            for i in range(sample_size):
                Z[i] = self.innerproduct(X[i, :], self.d)

        # Picking a random threshold for the hyperplane splitting data.
        self.q = np.random.uniform(np.min(Z), np.max(Z))

        # Criteria that determines if a curve should go to the left or right child node.
        w = Z - self.q < 0

        if idx is not None and sample_size > 2:
            n_left = np.sum(w)
            # The split isolated exactly one curve: credit the direction.
            if (n_left == 1 or n_left == sample_size - 1):
                if (self.criterion == "naive"):
                    self.IF[idx] += 1
                elif (self.criterion == "sample"):
                    self.IF[idx] += sample_size / self.subsample_size
                else:
                    self.IF[idx] += 1 / (e + 1)

        # Freeze this node's split before the recursion overwrites the
        # attributes with the splits of the descendants.
        d, dd, q = self.d, self.dd, self.q

        if (self.alpha != 1):
            # Keep derivatives aligned with the curves sent to each child.
            left_deriv, right_deriv = deriv_X[w], deriv_X[~w]
        else:
            left_deriv = right_deriv = None

        left = self.make_tree(X[w], e + 1, left_deriv)
        right = self.make_tree(X[~w], e + 1, right_deriv)

        return Node(X, d, dd, q, e, left=left, right=right, node_type='inNode')
797 | 
class PathFactor(object):
    """
    Given a single tree (iTree object) and a curve x, compute the length of the
    path travelled by the curve in the tree until it reaches an external node.

    Attributes
    ----------
    path_list: list
        A list of strings 'L' or 'R' which traces the path a data curve travels down a tree.

    x: list
        A single function, which is represented as an array of floats.

    e: int
        The depth reached so far while walking down the tree.

    deriv_x : Array
        The derivative of the curve if needed for the scalar product.

    step : array
        Vector of the length of intervals of discretization.

    D: Array-like
        Dictionary of functions used as directions (taken from the tree).

    innerproduct : function
        The inner product used for the construction of the tree (taken from the tree).

    alpha : float
        A float number between [0,1] used in the innerproduct of the paper.

    deriv_dictionary : Array-like
        A matrix of derivate of D if needed for the scalar product (taken from the tree).

    path : float
        Length of the path, as returned by find_path on the root of the tree.

    Methods
    -------
    find_path(T)
        Given a tree, it finds the path a single data curve takes.
    """
    def __init__(self, x, step, itree, alpha, deriv_x=None):
        # The curve to route and its optional derivative.
        self.x = x
        self.deriv_x = deriv_x
        self.step = step
        self.alpha = alpha

        # Everything needed to replay the splits comes from the tree.
        self.D = itree.D
        self.deriv_dictionary = itree.deriv_dictionary
        self.innerproduct = itree.innerproduct

        # Walk state, filled in by find_path.
        self.path_list = []
        self.e = 0
        self.path = self.find_path(itree.root)

    def find_path(self, T):
        """
        find_path(T)
        Given a node, follow the splitting criteria stored in the nodes until an
        external node is reached, recording 'L'/'R' in ``path_list`` on the way.

        Parameters
        ----------
        T : Node object
            Node to start from (the root of the tree for a full path).

        Returns
        -------
        int or float
            The depth reached by the data curve; when the external node holds
            more than one curve, ``c_factor`` of its size is added.
        """
        node = T

        # Descend while the current node is internal.
        while node.ntype != 'exNode':
            self.e += 1

            if (self.alpha != 1):
                projection = self.innerproduct(self.x, node.d, self.deriv_x,
                                               self.deriv_dictionary[node.dd])
            else:
                projection = self.innerproduct(self.x, node.d, self.step)

            if projection - node.q < 0:
                self.path_list.append('L')
                node = node.left
            else:
                self.path_list.append('R')
                node = node.right

        # External node: an isolated curve (or empty node) ends the path here.
        if node.size <= 1:
            return self.e

        # Otherwise account for the part of the tree that was not grown.
        self.e = self.e + c_factor(node.size)
        return self.e
903 | 
904 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
# setuptools deprecates dash-separated option names; use the underscore form.
description_file = README.rst


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import numpy
 4 | from Cython.Distutils import build_ext
 5 | try:
 6 |     from setuptools import setup, find_packages
 7 |     from setuptools.extension import Extension
 8 | except ImportError:
 9 |     from distutils.core import setup
10 |     from distutils.extension import Extension
11 | prjdir = os.path.dirname(__file__)
12 | 
13 | 
def read(filename):
    """Return the text content of ``filename``, resolved relative to ``prjdir``.

    The file is opened in a ``with`` block so that the handle is closed as
    soon as the content has been read.
    """
    with open(os.path.join(prjdir, filename)) as handle:
        return handle.read()
16 | 
17 | 
18 | 
19 | 
extra_link_args = []
libraries = []
library_dirs = []
include_dirs = []
# Run version.py in this namespace to define __version__ without importing
# the extension module, which is not built yet at this point. The file is
# located next to setup.py rather than in the current working directory.
exec(read('version.py'))
setup(
    name='pyt-fif',
    version=__version__,
    author='Guillaume Staerman',
    author_email='guillaume.staerman@telecom-paris.fr',
    cmdclass={'build_ext': build_ext},
    # Single C++ extension: the Cython wrapper plus the C++ implementation.
    ext_modules=[Extension("fif",
                 sources=["_fif.pyx", "fif.cxx"],
                 include_dirs=[numpy.get_include()],
                 extra_compile_args=['-std=c++11', '-Wcpp'],
                 language="c++")],
    scripts=[],
    py_modules=['version'],
    packages=[],
    # NOTE(review): `license` normally holds the licence name rather than a
    # file name -- confirm the intended value against License.txt.
    license='License.txt',
    include_package_data=True,
    description='Functional Isolation Forest',
    # The README is reStructuredText, so ship it as the long description
    # with the matching content type.
    long_description=read('README.rst'),
    long_description_content_type='text/x-rst',
    url='https://github.com/GuillaumeStaermanML/FIF',
    download_url='https://github.com/GuillaumeStaermanML/FIF/archive/refs/tags/1.0.2.tar.gz',
    install_requires=["numpy", "cython"],
)
47 | 


--------------------------------------------------------------------------------
/version.py:
--------------------------------------------------------------------------------
1 | """Functional Isolation Forest version"""
2 | 
# (major, minor, patch[, suffix]); an optional fourth entry is appended to the
# dotted version after a dash.
version_tag = (1, 0, 2)
__version__ = '.'.join(str(part) for part in version_tag[:3])

if len(version_tag) > 3:
    __version__ = '{0}-{1}'.format(__version__, version_tag[3])
8 | 


--------------------------------------------------------------------------------