├── .github
    └── workflows
    │   └── build.yml
├── .gitignore
├── .gitmodules
├── CHANGELOG.md
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── ext
    └── isotree
    │   ├── ext.cpp
    │   └── extconf.rb
├── isotree.gemspec
├── lib
    ├── isotree.rb
    └── isotree
    │   ├── dataset.rb
    │   ├── isolation_forest.rb
    │   └── version.rb
└── test
    ├── isolation_forest_test.rb
    ├── support
        ├── data.csv
        ├── import.py
        ├── model.bin
        ├── model.bin.metadata
        └── predict.py
    └── test_helper.rb


/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: build
 2 | on: [push, pull_request]
 3 | jobs:
 4 |   build:
 5 |     strategy:
 6 |       fail-fast: false
 7 |       matrix:
 8 |         os: [ubuntu-latest, macos-latest]
 9 |     runs-on: ${{ matrix.os }}
10 |     steps:
11 |       - uses: actions/checkout@v4
12 |         with:
13 |           submodules: recursive
14 |       - uses: ruby/setup-ruby@v1
15 |         with:
16 |           ruby-version: 3.4
17 |           bundler-cache: true
18 |       - run: bundle exec rake compile
19 |       - run: bundle exec rake test
20 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /.bundle/
 2 | /.yardoc
 3 | /_yardoc/
 4 | /coverage/
 5 | /doc/
 6 | /pkg/
 7 | /spec/reports/
 8 | /tmp/
 9 | *.lock
10 | *.bundle
11 | *.so
12 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "vendor/isotree"]
2 | 	path = vendor/isotree
3 | 	url = https://github.com/david-cortes/isotree
4 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | ## 0.4.0 (2024-06-11)
 2 | 
 3 | - Updated IsoTree to 0.6.1
 4 | - Dropped support for Ruby < 3.1
 5 | 
 6 | ## 0.3.1 (2023-12-19)
 7 | 
 8 | - Updated IsoTree to 0.5.25
 9 | 
10 | ## 0.3.0 (2022-06-13)
11 | 
12 | - Updated IsoTree to 0.5.16
13 | - Updated serialization format (exported models must be recreated)
14 | - Dropped support for Ruby < 2.7
15 | - Dropped support for Windows
16 | 
17 | ## 0.2.2 (2022-06-12)
18 | 
19 | - Fixed segfault when data is smaller than sample size
20 | 
21 | ## 0.2.1 (2021-05-23)
22 | 
23 | - Improved performance
24 | 
25 | ## 0.2.0 (2021-05-17)
26 | 
27 | - Updated to Rice 4
28 | - Dropped support for Ruby < 2.6
29 | 
30 | ## 0.1.5 (2021-03-14)
31 | 
32 | - Updated IsoTree to 0.1.25
33 | - Added support for exporting and importing models
34 | 
35 | ## 0.1.4 (2020-08-22)
36 | 
37 | - Added `missing_action`, `new_categ_action`, `categ_split_type`, `coefs`, `depth_imp`, and `weigh_imp_rows` options
38 | - Fixed signal handling
39 | 
40 | ## 0.1.3 (2020-08-13)
41 | 
42 | - Added support for categorical data
43 | - Added support for Rover data frames
44 | - Added `output` option to `predict` method
45 | 
46 | ## 0.1.2 (2020-08-11)
47 | 
48 | - Fixed outlier scores
49 | 
50 | ## 0.1.1 (2020-08-10)
51 | 
52 | - Fixed installation error when cereal not installed
53 | 
54 | ## 0.1.0 (2020-08-10)
55 | 
56 | - First release
57 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
 1 | source "https://rubygems.org"
 2 | 
 3 | gemspec
 4 | 
 5 | gem "rake"
 6 | gem "rake-compiler"
 7 | gem "minitest", ">= 5"
 8 | gem "numo-narray"
 9 | gem "rover-df"
10 | gem "csv"
11 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2019-2023, David Cortes
 4 | Copyright (c) 2020-2023, Andrew Kane
 5 | All rights reserved.
 6 | 
 7 | Redistribution and use in source and binary forms, with or without
 8 | modification, are permitted provided that the following conditions are met:
 9 | 
10 | 1. Redistributions of source code must retain the above copyright notice, this
11 |    list of conditions and the following disclaimer.
12 | 
13 | 2. Redistributions in binary form must reproduce the above copyright notice,
14 |    this list of conditions and the following disclaimer in the documentation
15 |    and/or other materials provided with the distribution.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # IsoTree Ruby
  2 | 
  3 | :evergreen_tree: [IsoTree](https://github.com/david-cortes/isotree) - outlier/anomaly detection using Isolation Forest - for Ruby
  4 | 
  5 | Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
  6 | 
  7 | :deciduous_tree: Check out [OutlierTree](https://github.com/ankane/outliertree-ruby) for human-readable explanations of outliers
  8 | 
  9 | [![Build Status](https://github.com/ankane/isotree-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/isotree-ruby/actions)
 10 | 
 11 | ## Installation
 12 | 
 13 | Add this line to your application’s Gemfile:
 14 | 
 15 | ```ruby
 16 | gem "isotree"
 17 | ```
 18 | 
 19 | Windows is not supported at the moment
 20 | 
 21 | ## Getting Started
 22 | 
 23 | Prep your data
 24 | 
 25 | ```ruby
 26 | data = [
 27 |   {department: "Books",  sale: false, price: 2.50},
 28 |   {department: "Books",  sale: true,  price: 3.00},
 29 |   {department: "Movies", sale: false, price: 5.00},
 30 |   # ...
 31 | ]
 32 | ```
 33 | 
 34 | Train a model
 35 | 
 36 | ```ruby
 37 | model = IsoTree::IsolationForest.new
 38 | model.fit(data)
 39 | ```
 40 | 
 41 | Get outlier scores
 42 | 
 43 | ```ruby
 44 | model.predict(data)
 45 | ```
 46 | 
 47 | Scores are between 0 and 1, with higher scores indicating outliers
 48 | 
 49 | Export the model
 50 | 
 51 | ```ruby
 52 | model.export_model("model.bin")
 53 | ```
 54 | 
 55 | Import a model
 56 | 
 57 | ```ruby
 58 | model = IsoTree::IsolationForest.import_model("model.bin")
 59 | ```
 60 | 
 61 | ## Parameters
 62 | 
 63 | Pass parameters - default values below
 64 | 
 65 | ```ruby
 66 | IsoTree::IsolationForest.new(
 67 |   sample_size: "auto",
 68 |   ntrees: 500,
 69 |   ndim: 3,
 70 |   ntry: 1,
 71 |   max_depth: "auto",
 72 |   ncols_per_tree: nil,
 73 |   prob_pick_pooled_gain: 0.0,
 74 |   prob_pick_avg_gain: 0.0,
 75 |   prob_pick_full_gain: 0.0,
 76 |   prob_pick_dens: 0.0,
 77 |   prob_pick_col_by_range: 0.0,
 78 |   prob_pick_col_by_var: 0.0,
 79 |   prob_pick_col_by_kurt: 0.0,
 80 |   min_gain: 0.0,
 81 |   missing_action: "auto",
 82 |   new_categ_action: "auto",
 83 |   categ_split_type: "auto",
 84 |   all_perm: false,
 85 |   coef_by_prop: false,
 86 |   sample_with_replacement: false,
 87 |   penalize_range: false,
 88 |   standardize_data: true,
 89 |   scoring_metric: "depth",
 90 |   fast_bratio: true,
 91 |   weigh_by_kurtosis: false,
 92 |   coefs: "uniform",
 93 |   assume_full_distr: true,
 94 |   min_imp_obs: 3,
 95 |   depth_imp: "higher",
 96 |   weigh_imp_rows: "inverse",
 97 |   random_seed: 1,
 98 |   use_long_double: false,
 99 |   nthreads: -1
100 | )
101 | ```
102 | 
103 | See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.IsolationForest)
104 | 
105 | ## Data
106 | 
107 | Data can be an array of hashes
108 | 
109 | ```ruby
110 | [
111 |   {department: "Books",  sale: false, price: 2.50},
112 |   {department: "Books",  sale: true,  price: 3.00},
113 |   {department: "Movies", sale: false, price: 5.00}
114 | ]
115 | ```
116 | 
117 | Or a Rover data frame
118 | 
119 | ```ruby
120 | Rover.read_csv("data.csv")
121 | ```
122 | 
123 | Or a Numo array
124 | 
125 | ```ruby
126 | Numo::NArray.cast([[1, 2, 3], [4, 5, 6]])
127 | ```
128 | 
129 | ## Performance
130 | 
131 | IsoTree uses OpenMP when possible for best performance. To enable OpenMP on Mac, run:
132 | 
133 | ```sh
134 | brew install libomp
135 | ```
136 | 
137 | Then reinstall the gem.
138 | 
139 | ```sh
140 | gem uninstall isotree --force
141 | bundle install
142 | ```
143 | 
144 | ## Deployment
145 | 
146 | Check out [Trove](https://github.com/ankane/trove) for deploying models.
147 | 
148 | ```sh
149 | trove push model.bin
150 | ```
151 | 
152 | ## Reference
153 | 
154 | Get the average isolation depth
155 | 
156 | ```ruby
157 | model.predict(data, output: "avg_depth")
158 | ```
159 | 
160 | ## History
161 | 
162 | View the [changelog](https://github.com/ankane/isotree-ruby/blob/master/CHANGELOG.md)
163 | 
164 | ## Contributing
165 | 
166 | Everyone is encouraged to help improve this project. Here are a few ways you can help:
167 | 
168 | - [Report bugs](https://github.com/ankane/isotree-ruby/issues)
169 | - Fix bugs and [submit pull requests](https://github.com/ankane/isotree-ruby/pulls)
170 | - Write, clarify, or fix documentation
171 | - Suggest or add new features
172 | 
173 | To get started with development:
174 | 
175 | ```sh
176 | git clone --recursive https://github.com/ankane/isotree-ruby.git
177 | cd isotree-ruby
178 | bundle install
179 | bundle exec rake compile
180 | bundle exec rake test
181 | ```
182 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require "bundler/gem_tasks"
 2 | require "rake/testtask"
 3 | require "rake/extensiontask"
 4 | 
 5 | task default: :test
 6 | Rake::TestTask.new do |t|
 7 |   t.libs << "test"
 8 |   t.pattern = "test/**/*_test.rb"
 9 | end
10 | 
11 | Rake::ExtensionTask.new("isotree") do |ext|
12 |   ext.name = "ext"
13 |   ext.lib_dir = "lib/isotree"
14 | end
15 | 
16 | task :check_license do
17 |   raise "Missing vendor license" unless File.exist?("vendor/isotree/LICENSE")
18 | end
19 | 
20 | task :remove_ext do
21 |   path = "lib/isotree/ext.bundle"
22 |   File.unlink(path) if File.exist?(path)
23 | end
24 | 
25 | Rake::Task["build"].enhance [:check_license, :remove_ext]
26 | 


--------------------------------------------------------------------------------
/ext/isotree/ext.cpp:
--------------------------------------------------------------------------------
  1 | // stdlib
  2 | #include <cmath>
  3 | #include <fstream>
  4 | #include <iostream>
  5 | 
  6 | // isotree
  7 | #include <isotree.hpp>
  8 | 
  9 | // rice
 10 | #include <rice/rice.hpp>
 11 | 
 12 | using Rice::Array;
 13 | using Rice::Hash;
 14 | using Rice::Module;
 15 | using Rice::Object;
 16 | using Rice::String;
 17 | using Rice::Symbol;
 18 | using Rice::define_class_under;
 19 | using Rice::define_module;
 20 | 
 21 | namespace Rice::detail
 22 | {
 23 |   template<>
 24 |   class From_Ruby<NewCategAction>
 25 |   {
 26 |   public:
 27 |     NewCategAction convert(VALUE x)
 28 |     {
 29 |       auto value = Object(x).to_s().str();
 30 |       if (value == "weighted" || value == "impute") return Weighted;
 31 |       if (value == "smallest") return Smallest;
 32 |       if (value == "random") return Random;
 33 |       throw std::runtime_error("Unknown new categ action: " + value);
 34 |     }
 35 |   };
 36 | 
 37 |   template<>
 38 |   class From_Ruby<MissingAction>
 39 |   {
 40 |   public:
 41 |     MissingAction convert(VALUE x)
 42 |     {
 43 |       auto value = Object(x).to_s().str();
 44 |       if (value == "divide") return Divide;
 45 |       if (value == "impute") return Impute;
 46 |       if (value == "fail") return Fail;
 47 |       throw std::runtime_error("Unknown missing action: " + value);
 48 |     }
 49 |   };
 50 | 
 51 |   template<>
 52 |   class From_Ruby<CategSplit>
 53 |   {
 54 |   public:
 55 |     CategSplit convert(VALUE x)
 56 |     {
 57 |       auto value = Object(x).to_s().str();
 58 |       if (value == "subset") return SubSet;
 59 |       if (value == "single_categ") return SingleCateg;
 60 |       throw std::runtime_error("Unknown categ split: " + value);
 61 |     }
 62 |   };
 63 | 
 64 |   template<>
 65 |   class From_Ruby<CoefType>
 66 |   {
 67 |   public:
 68 |     CoefType convert(VALUE x)
 69 |     {
 70 |       auto value = Object(x).to_s().str();
 71 |       if (value == "uniform") return Uniform;
 72 |       if (value == "normal") return Normal;
 73 |       throw std::runtime_error("Unknown coef type: " + value);
 74 |     }
 75 |   };
 76 | 
 77 |   template<>
 78 |   class From_Ruby<UseDepthImp>
 79 |   {
 80 |   public:
 81 |     UseDepthImp convert(VALUE x)
 82 |     {
 83 |       auto value = Object(x).to_s().str();
 84 |       if (value == "lower") return Lower;
 85 |       if (value == "higher") return Higher;
 86 |       if (value == "same") return Same;
 87 |       throw std::runtime_error("Unknown depth imp: " + value);
 88 |     }
 89 |   };
 90 | 
 91 |   template<>
 92 |   class From_Ruby<WeighImpRows>
 93 |   {
 94 |   public:
 95 |     WeighImpRows convert(VALUE x)
 96 |     {
 97 |       auto value = Object(x).to_s().str();
 98 |       if (value == "inverse") return Inverse;
 99 |       if (value == "prop") return Prop;
100 |       if (value == "flat") return Flat;
101 |       throw std::runtime_error("Unknown weight imp rows: " + value);
102 |     }
103 |   };
104 | 
105 |   template<>
106 |   class From_Ruby<ScoringMetric>
107 |   {
108 |   public:
109 |     ScoringMetric convert(VALUE x)
110 |     {
111 |       auto value = Object(x).to_s().str();
112 |       if (value == "depth") return Depth;
113 |       if (value == "adj_depth") return AdjDepth;
114 |       if (value == "density") return Density;
115 |       if (value == "adj_density") return AdjDensity;
116 |       if (value == "boxed_density") return BoxedDensity;
117 |       if (value == "boxed_density2") return BoxedDensity2;
118 |       if (value == "boxed_ratio") return BoxedRatio;
119 |       throw std::runtime_error("Unknown scoring metric: " + value);
120 |     }
121 |   };
122 | }
123 | 
124 | extern "C"
125 | void Init_ext()
126 | {
127 |   Module rb_mIsoTree = define_module("IsoTree");
128 | 
129 |   Module rb_mExt = define_module_under(rb_mIsoTree, "Ext");
130 |   define_class_under<ExtIsoForest>(rb_mExt, "ExtIsoForest");
131 | 
132 |   rb_mExt
133 |     .define_singleton_function(
134 |       "fit_iforest",
135 |       [](Hash options) {
136 |         // model
137 |         ExtIsoForest iso;
138 | 
139 |         // data
140 |         size_t nrows = options.get<size_t, Symbol>("nrows");
141 |         size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
142 |         size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
143 | 
144 |         real_t* numeric_data = NULL;
145 |         if (ncols_numeric > 0) {
146 |           numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
147 |         }
148 | 
149 |         int* categorical_data = NULL;
150 |         int* ncat = NULL;
151 |         if (ncols_categ > 0) {
152 |           categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
153 |           ncat = (int*) options.get<String, Symbol>("ncat").c_str();
154 |         }
155 | 
156 |         // not used (sparse matrices)
157 |         real_t* Xc = NULL;
158 |         sparse_ix* Xc_ind = NULL;
159 |         sparse_ix* Xc_indptr = NULL;
160 | 
161 |         // options
162 |         // Rice has limit of 14 arguments, so use hash
163 |         size_t sample_size = options.get<size_t, Symbol>("sample_size");
164 |         size_t ndim = options.get<size_t, Symbol>("ndim");
165 |         size_t ntrees = options.get<size_t, Symbol>("ntrees");
166 |         size_t ntry = options.get<size_t, Symbol>("ntry");
167 |         double prob_pick_by_gain_avg = options.get<double, Symbol>("prob_pick_avg_gain");
168 |         double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
169 |         double min_gain = options.get<double, Symbol>("min_gain");
170 |         MissingAction missing_action = options.get<MissingAction, Symbol>("missing_action");
171 |         CategSplit cat_split_type = options.get<CategSplit, Symbol>("categ_split_type");
172 |         NewCategAction new_cat_action = options.get<NewCategAction, Symbol>("new_categ_action");
173 |         bool all_perm = options.get<bool, Symbol>("all_perm");
174 |         bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
175 |         bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
176 |         bool penalize_range = options.get<bool, Symbol>("penalize_range");
177 |         bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
178 |         CoefType coef_type = options.get<CoefType, Symbol>("coefs");
179 |         size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
180 |         UseDepthImp depth_imp = options.get<UseDepthImp, Symbol>("depth_imp");
181 |         WeighImpRows weigh_imp_rows = options.get<WeighImpRows, Symbol>("weigh_imp_rows");
182 |         uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
183 |         bool use_long_double = options.get<bool, Symbol>("use_long_double");
184 |         int nthreads = options.get<int, Symbol>("nthreads");
185 | 
186 |         // TODO options
187 |         double* sample_weights = NULL;
188 |         bool weight_as_sample = options.get<bool, Symbol>("weights_as_sample_prob");
189 |         size_t max_depth = options.get<size_t, Symbol>("max_depth");
190 |         bool limit_depth = options.get<bool, Symbol>("limit_depth");
191 |         bool standardize_dist = false;
192 |         double* tmat = NULL;
193 |         double* output_depths = NULL;
194 |         bool standardize_depth = false;
195 |         real_t* col_weights = NULL;
196 |         Imputer* imputer = NULL;
197 |         bool impute_at_fit = false;
198 | 
199 |         int ncols_per_tree = options.get<int, Symbol>("ncols_per_tree");
200 |         bool standardize_data = options.get<bool, Symbol>("standardize_data");
201 |         ScoringMetric scoring_metric = options.get<ScoringMetric, Symbol>("scoring_metric");
202 |         bool fast_bratio = options.get<bool, Symbol>("fast_bratio");
203 |         double prob_pick_by_full_gain = options.get<double, Symbol>("prob_pick_full_gain");
204 |         double prob_pick_by_dens = options.get<double, Symbol>("prob_pick_dens");
205 |         double prob_pick_col_by_range = options.get<double, Symbol>("prob_pick_col_by_range");
206 |         double prob_pick_col_by_var = options.get<double, Symbol>("prob_pick_col_by_var");
207 |         double prob_pick_col_by_kurt = options.get<double, Symbol>("prob_pick_col_by_kurt");
208 | 
209 |         fit_iforest(
210 |           NULL,
211 |           &iso,
212 |           numeric_data,
213 |           ncols_numeric,
214 |           categorical_data,
215 |           ncols_categ,
216 |           ncat,
217 |           Xc,
218 |           Xc_ind,
219 |           Xc_indptr,
220 |           ndim,
221 |           ntry,
222 |           coef_type,
223 |           coef_by_prop,
224 |           sample_weights,
225 |           with_replacement,
226 |           weight_as_sample,
227 |           nrows,
228 |           sample_size,
229 |           ntrees,
230 |           max_depth,
231 |           ncols_per_tree,
232 |           limit_depth,
233 |           penalize_range,
234 |           standardize_data,
235 |           scoring_metric,
236 |           fast_bratio,
237 |           standardize_dist,
238 |           tmat,
239 |           output_depths,
240 |           standardize_depth,
241 |           col_weights,
242 |           weigh_by_kurt,
243 |           prob_pick_by_gain_pl,
244 |           prob_pick_by_gain_avg,
245 |           prob_pick_by_full_gain,
246 |           prob_pick_by_dens,
247 |           prob_pick_col_by_range,
248 |           prob_pick_col_by_var,
249 |           prob_pick_col_by_kurt,
250 |           min_gain,
251 |           missing_action,
252 |           cat_split_type,
253 |           new_cat_action,
254 |           all_perm,
255 |           imputer,
256 |           min_imp_obs,
257 |           depth_imp,
258 |           weigh_imp_rows,
259 |           impute_at_fit,
260 |           random_seed,
261 |           use_long_double,
262 |           nthreads
263 |         );
264 | 
265 |         return iso;
266 |       })
267 |     .define_singleton_function(
268 |       "predict_iforest",
269 |       [](ExtIsoForest& iso, Hash options) {
270 |         // data
271 |         size_t nrows = options.get<size_t, Symbol>("nrows");
272 |         size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
273 |         size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
274 | 
275 |         real_t* numeric_data = NULL;
276 |         if (ncols_numeric > 0) {
277 |           numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
278 |         }
279 | 
280 |         int* categorical_data = NULL;
281 |         if (ncols_categ > 0) {
282 |           categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
283 |         }
284 | 
285 |         // not used (sparse matrices)
286 |         real_t* Xc = NULL;
287 |         sparse_ix* Xc_ind = NULL;
288 |         sparse_ix* Xc_indptr = NULL;
289 |         real_t* Xr = NULL;
290 |         sparse_ix* Xr_ind = NULL;
291 |         sparse_ix* Xr_indptr = NULL;
292 | 
293 |         // options
294 |         int nthreads = options.get<int, Symbol>("nthreads");
295 |         bool standardize = options.get<bool, Symbol>("standardize");
296 |         std::vector<double> outlier_scores(nrows);
297 |         sparse_ix* tree_num = NULL;
298 |         bool is_col_major = true;
299 |         size_t ld_numeric = 0;
300 |         size_t ld_categ = 0;
301 |         double* per_tree_depths = NULL;
302 | 
303 |         predict_iforest(
304 |           numeric_data,
305 |           categorical_data,
306 |           is_col_major,
307 |           ld_numeric,
308 |           ld_categ,
309 |           Xc,
310 |           Xc_ind,
311 |           Xc_indptr,
312 |           Xr,
313 |           Xr_ind,
314 |           Xr_indptr,
315 |           nrows,
316 |           nthreads,
317 |           standardize,
318 |           NULL,
319 |           &iso,
320 |           outlier_scores.data(),
321 |           tree_num,
322 |           per_tree_depths,
323 |           NULL
324 |         );
325 | 
326 |         Array ret;
327 |         for (size_t i = 0; i < outlier_scores.size(); i++) {
328 |           ret.push(outlier_scores[i]);
329 |         }
330 |         return ret;
331 |       })
332 |     .define_singleton_function(
333 |       "serialize_combined",
334 |       [](ExtIsoForest& iso, String path, String metadata) {
335 |         #ifdef _MSC_VER
336 |         // TODO convert to wchar_t
337 |         throw std::runtime_error("Not supported on Windows yet");
338 |         #else
339 |         std::ofstream file;
340 |         file.open(path.c_str());
341 |         serialize_combined(
342 |           NULL,
343 |           &iso,
344 |           NULL,
345 |           NULL,
346 |           metadata.c_str(),
347 |           // returns bytesize (RSTRING_LEN)
348 |           metadata.length(),
349 |           file
350 |         );
351 |         file.close();
352 |         #endif
353 |       })
354 |     .define_singleton_function(
355 |       "deserialize_combined",
356 |       [](String path) {
357 |         #ifdef _MSC_VER
358 |         // TODO convert to wchar_t
359 |         throw std::runtime_error("Not supported on Windows yet");
360 |         #else
361 |         Array ret;
362 | 
363 |         std::ifstream file;
364 |         file.open(path.c_str(), std::ios_base::in | std::ios_base::binary);
365 |         if (!file) {
366 |           throw std::runtime_error("Cannot open file");
367 |         }
368 | 
369 |         bool is_isotree_model = false;
370 |         bool is_compatible = false;
371 |         bool has_combined_objects = false;
372 |         bool has_IsoForest = false;
373 |         bool has_ExtIsoForest = false;
374 |         bool has_Imputer = false;
375 |         bool has_Indexer = false;
376 |         bool has_metadata = false;
377 |         size_t size_metadata = 0;
378 | 
379 |         inspect_serialized_object(
380 |           file,
381 |           is_isotree_model,
382 |           is_compatible,
383 |           has_combined_objects,
384 |           has_IsoForest,
385 |           has_ExtIsoForest,
386 |           has_Imputer,
387 |           has_Indexer,
388 |           has_metadata,
389 |           size_metadata
390 |         );
391 | 
392 |         if (!is_isotree_model || !has_combined_objects) {
393 |           throw std::runtime_error("Input file is not a serialized isotree model");
394 |         }
395 |         if (!is_compatible) {
396 |           throw std::runtime_error("Model file format is incompatible");
397 |         }
398 |         if (size_metadata == 0) {
399 |           throw std::runtime_error("Input file does not contain metadata");
400 |         }
401 | 
402 |         IsoForest model = IsoForest();
403 |         ExtIsoForest model_ext = ExtIsoForest();
404 |         Imputer imputer = Imputer();
405 |         TreesIndexer indexer = TreesIndexer();
406 |         char *optional_metadata = (char*) calloc(size_metadata, sizeof(char));
407 |         if (optional_metadata == NULL) {
408 |           throw std::runtime_error("Cannot allocate memory");
409 |         }
410 | 
411 |         deserialize_combined(file, &model, &model_ext, &imputer, &indexer, optional_metadata);
412 |         file.close();
413 | 
414 |         ret.push(Object(Rice::detail::To_Ruby<ExtIsoForest>().convert(model_ext)));
415 |         ret.push(String(std::string(optional_metadata, size_metadata)));
416 | 
417 |         free(optional_metadata);
418 | 
419 |         return ret;
420 |         #endif
421 |       });
422 | }
423 | 


--------------------------------------------------------------------------------
/ext/isotree/extconf.rb:
--------------------------------------------------------------------------------
 1 | require "mkmf-rice"
 2 | 
 3 | $CXXFLAGS += " -std=c++17 $(optflags) -D_USE_XOSHIRO -DSUPPORTS_RESTRICT=1 -D_USE_ROBIN_MAP -DDONT_THROW_ON_INTERRUPT"
 4 | 
 5 | apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
 6 | 
 7 | # check omp first
 8 | if have_library("omp") || have_library("gomp")
 9 |   $CXXFLAGS += " -Xclang" if apple_clang
10 |   $CXXFLAGS += " -fopenmp"
11 | end
12 | 
13 | ext = File.expand_path(".", __dir__)
14 | isotree_src = File.expand_path("../../vendor/isotree/src", __dir__)
15 | isotree_inc = File.expand_path("../../vendor/isotree/include", __dir__)
16 | 
17 | exclude = %w(c_interface.cpp Rwrapper.cpp RcppExports.cpp)
18 | $srcs = Dir["{#{ext},#{isotree_src}}/*.{cc,cpp}"].reject { |f| exclude.include?(File.basename(f)) }
19 | $INCFLAGS << " -I#{isotree_inc}"
20 | $VPATH << isotree_src
21 | 
22 | create_makefile("isotree/ext")
23 | 


--------------------------------------------------------------------------------
/isotree.gemspec:
--------------------------------------------------------------------------------
 1 | require_relative "lib/isotree/version"
 2 | 
 3 | Gem::Specification.new do |spec|
 4 |   spec.name          = "isotree"
 5 |   spec.version       = IsoTree::VERSION
 6 |   spec.summary       = "Outlier/anomaly detection for Ruby using Isolation Forest"
 7 |   spec.homepage      = "https://github.com/ankane/isotree-ruby"
 8 |   spec.license       = "BSD-2-Clause"
 9 | 
10 |   spec.author        = "Andrew Kane"
11 |   spec.email         = "andrew@ankane.org"
12 | 
13 |   spec.files         = Dir["*.{md,txt}", "{ext,lib}/**/*", "vendor/isotree/{LICENSE,README.md}", "vendor/isotree/inst/COPYRIGHTS", "vendor/isotree/{include,src}/*.{cpp,hpp}", "vendor/isotree/src/robinmap/{LICENSE,README.md}", "vendor/isotree/src/robinmap/include/**/*"]
14 |   spec.require_path  = "lib"
15 |   spec.extensions    = ["ext/isotree/extconf.rb"]
16 | 
17 |   spec.required_ruby_version = ">= 3.1"
18 | 
19 |   spec.add_dependency "rice", ">= 4.3.3"
20 | end
21 | 


--------------------------------------------------------------------------------
/lib/isotree.rb:
--------------------------------------------------------------------------------
 1 | # ext
 2 | require "isotree/ext"
 3 | 
 4 | # stdlib
 5 | require "etc"
 6 | require "json"
 7 | 
 8 | # modules
 9 | require_relative "isotree/dataset"
10 | require_relative "isotree/isolation_forest"
11 | require_relative "isotree/version"
12 | 


--------------------------------------------------------------------------------
/lib/isotree/dataset.rb:
--------------------------------------------------------------------------------
 1 | module IsoTree
 2 |   class Dataset
 3 |     attr_reader :numeric_columns, :categorical_columns, :array_type
 4 | 
 5 |     def initialize(data)
 6 |       @data = data
 7 | 
 8 |       if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
 9 |         @vectors = data.vectors
10 |         @numeric_columns, @categorical_columns = data.keys.partition { |k, v| ![:object, :bool].include?(data[k].type) }
11 |         @array_type = false
12 |       elsif defined?(Numo::NArray) && data.is_a?(Numo::NArray)
13 |         raise ArgumentError, "Input must have 2 dimensions" if data.ndim != 2
14 | 
15 |         data = data.cast_to(Numo::DFloat)
16 |         ncols = data.shape[1]
17 | 
18 |         @numeric_columns = ncols.times.to_a
19 |         @categorical_columns = []
20 | 
21 |         @vectors = {}
22 |         @numeric_columns.each do |k|
23 |           @vectors[k] = data[true, k]
24 |         end
25 |         @array_type = true
26 |       else
27 |         data = data.to_a
28 | 
29 |         hashes = data.all? { |d| d.is_a?(Hash) }
30 |         arrays = !hashes && data.all? { |d| d.is_a?(Array) }
31 |         unless hashes || arrays
32 |           raise ArgumentError, "Array elements must be all hashes or arrays"
33 |         end
34 | 
35 |         ncols = data.first ? data.first.size : 0
36 |         if data.any? { |r| r.size != ncols }
37 |           raise ArgumentError, "All rows must have the same number of columns"
38 |         end
39 | 
40 |         keys =
41 |           if hashes
42 |             data.flat_map(&:keys).uniq
43 |           else
44 |             ncols.times.to_a
45 |           end
46 | 
47 |         @vectors = {}
48 |         keys.each do |k|
49 |           @vectors[k] = []
50 |         end
51 |         data.each do |d|
52 |           keys.each do |k|
53 |             @vectors[k] << d[k]
54 |           end
55 |         end
56 | 
57 |         @numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
58 |         @array_type = arrays
59 |       end
60 | 
61 |       raise ArgumentError, "No data" if size == 0
62 |     end
63 | 
64 |     def [](k) # returns the column vector stored under key k
65 |       @vectors[k]
66 |     end
67 | 
68 |     def size
69 |       @vectors.any? ? @vectors.values.first.size : 0
70 |     end
71 |   end
72 | end
73 | 


--------------------------------------------------------------------------------
/lib/isotree/isolation_forest.rb:
--------------------------------------------------------------------------------
  1 | module IsoTree
  2 |   class IsolationForest
  3 |     def initialize( # stores the hyperparameters only; nothing is trained until fit is called
  4 |       sample_size: "auto", # "auto" => min(nrows, 10000), resolved in fit
  5 |       ntrees: 500,
  6 |       ndim: 3, # update_params picks the "auto" defaults based on whether this is 1
  7 |       ntry: 1,
  8 |       # categ_cols: nil,
  9 |       max_depth: "auto", # "auto" => sent to the extension as 0 with limit_depth set (see fit_options)
 10 |       ncols_per_tree: nil, # nil => sent to the extension as 0 (see fit_options)
 11 |       prob_pick_pooled_gain: 0.0,
 12 |       prob_pick_avg_gain: 0.0,
 13 |       prob_pick_full_gain: 0.0,
 14 |       prob_pick_dens: 0.0,
 15 |       prob_pick_col_by_range: 0.0,
 16 |       prob_pick_col_by_var: 0.0,
 17 |       prob_pick_col_by_kurt: 0.0,
 18 |       min_gain: 0.0,
 19 |       missing_action: "auto", # "auto" is resolved in update_params when fit runs
 20 |       new_categ_action: "auto", # "auto" is resolved in update_params when fit runs
 21 |       categ_split_type: "auto", # "auto" is resolved in update_params when fit runs
 22 |       all_perm: false,
 23 |       coef_by_prop: false,
 24 |       # recode_categ: false,
 25 |       weights_as_sample_prob: true,
 26 |       sample_with_replacement: false,
 27 |       penalize_range: false,
 28 |       standardize_data: true,
 29 |       scoring_metric: "depth",
 30 |       fast_bratio: true,
 31 |       weigh_by_kurtosis: false,
 32 |       coefs: "uniform",
 33 |       assume_full_distr: true,
 34 |       # build_imputer: false,
 35 |       min_imp_obs: 3,
 36 |       depth_imp: "higher",
 37 |       weigh_imp_rows: "inverse",
 38 |       random_seed: 1,
 39 |       use_long_double: false,
 40 |       nthreads: -1 # any negative value => Etc.nprocessors
 41 |     )
 42 |       @sample_size = sample_size
 43 |       @ntrees = ntrees
 44 |       @ndim = ndim
 45 |       @ntry = ntry
 46 |       # @categ_cols = categ_cols
 47 |       @max_depth = max_depth
 48 |       @ncols_per_tree = ncols_per_tree
 49 |       @prob_pick_pooled_gain = prob_pick_pooled_gain
 50 |       @prob_pick_avg_gain = prob_pick_avg_gain
 51 |       @prob_pick_full_gain = prob_pick_full_gain
 52 |       @prob_pick_dens = prob_pick_dens
 53 |       @prob_pick_col_by_range = prob_pick_col_by_range
 54 |       @prob_pick_col_by_var = prob_pick_col_by_var
 55 |       @prob_pick_col_by_kurt = prob_pick_col_by_kurt
 56 |       @min_gain = min_gain
 57 |       @missing_action = missing_action
 58 |       @new_categ_action = new_categ_action
 59 |       @categ_split_type = categ_split_type
 60 |       @all_perm = all_perm
 61 |       @coef_by_prop = coef_by_prop
 62 |       # @recode_categ = recode_categ
 63 |       @weights_as_sample_prob = weights_as_sample_prob
 64 |       @sample_with_replacement = sample_with_replacement
 65 |       @penalize_range = penalize_range
 66 |       @standardize_data = standardize_data
 67 |       @scoring_metric = scoring_metric
 68 |       @fast_bratio = fast_bratio
 69 |       @weigh_by_kurtosis = weigh_by_kurtosis
 70 |       @coefs = coefs
 71 |       @assume_full_distr = assume_full_distr
 72 |       @min_imp_obs = min_imp_obs
 73 |       @depth_imp = depth_imp
 74 |       @weigh_imp_rows = weigh_imp_rows
 75 |       @random_seed = random_seed
 76 |       @use_long_double = use_long_double
 77 | 
 78 |       # Etc.nprocessors counts virtual (logical) cores, not physical ones
 79 |       nthreads = Etc.nprocessors if nthreads < 0
 80 |       @nthreads = nthreads
 81 |     end
 82 | 
 83 |     def fit(x)
 84 |       # make export consistent with Python library
 85 |       update_params
 86 | 
 87 |       x = Dataset.new(x)
 88 |       prep_fit(x)
 89 |       options = data_options(x).merge(fit_options)
 90 | 
 91 |       if options[:sample_size] == "auto"
 92 |         options[:sample_size] = [options[:nrows], 10000].min
 93 |       end
 94 | 
 95 |       # prevent segfault
 96 |       options[:sample_size] = options[:nrows] if options[:sample_size] > options[:nrows]
 97 | 
 98 |       @ext_iso_forest = Ext.fit_iforest(options)
 99 |     end
100 | 
101 |     def predict(x, output: "score")
102 |       check_fit
103 | 
104 |       x = Dataset.new(x)
105 |       prep_predict(x)
106 | 
107 |       options = data_options(x).merge(nthreads: @nthreads)
108 |       case output
109 |       when "score"
110 |         options[:standardize] = true
111 |       when "avg_depth"
112 |         options[:standardize] = false
113 |       else
114 |         raise ArgumentError, "Unknown output"
115 |       end
116 | 
117 |       Ext.predict_iforest(@ext_iso_forest, options)
118 |     end
119 | 
120 |     # same format as Python so models are compatible
121 |     def export_model(path, add_metada_file: false)
122 |       check_fit
123 | 
124 |       metadata = export_metadata
125 |       if add_metada_file
126 |         # indent 4 spaces like Python
127 |         File.write("#{path}.metadata", JSON.pretty_generate(metadata, indent: "    "))
128 |       end
129 |       Ext.serialize_combined(@ext_iso_forest, path, JSON.generate(metadata))
130 |     end
131 | 
132 |     def self.import_model(path)
133 |       model = new
134 |       ext_iso_forest, metadata = Ext.deserialize_combined(path)
135 |       model.instance_variable_set(:@ext_iso_forest, ext_iso_forest)
136 |       model.send(:import_metadata, JSON.parse(metadata))
137 |       model
138 |     end
139 | 
140 |     private
141 | 
142 |     def export_metadata
143 |       data_info = {
144 |         ncols_numeric: @numeric_columns.size,
145 |         ncols_categ: @categorical_columns.size,
146 |         cols_numeric: @numeric_columns,
147 |         cols_categ: @categorical_columns,
148 |         cat_levels: @categorical_columns.map { |v| @categories[v].keys },
149 |         categ_cols: [],
150 |         categ_max: []
151 |       }
152 | 
153 |       # Ruby-specific
154 |       data_info[:sym_numeric] = @numeric_columns.map { |v| v.is_a?(Symbol) }
155 |       data_info[:sym_categ] = @categorical_columns.map { |v| v.is_a?(Symbol) }
156 | 
157 |       model_info = {
158 |         ndim: @ndim,
159 |         nthreads: @nthreads,
160 |         use_long_double: @use_long_double,
161 |         build_imputer: false
162 |       }
163 | 
164 |       params = {}
165 |       PARAM_KEYS.each do |k|
166 |         params[k] = instance_variable_get("@#{k}")
167 |       end
168 | 
169 |       if params[:max_depth] == "auto"
170 |         params[:max_depth] = 0
171 |       end
172 | 
173 |       {
174 |         data_info: data_info,
175 |         model_info: model_info,
176 |         params: params
177 |       }
178 |     end
179 | 
180 |     def import_metadata(metadata)
181 |       data_info = metadata["data_info"]
182 |       model_info = metadata["model_info"]
183 |       params = metadata["params"]
184 | 
185 |       # Ruby-specific
186 |       sym_numeric = data_info["sym_numeric"].to_a
187 |       sym_categ = data_info["sym_categ"].to_a
188 | 
189 |       @numeric_columns = data_info["cols_numeric"].map.with_index { |v, i| sym_numeric[i] ? v.to_sym : v }
190 |       @categorical_columns = data_info["cols_categ"].map.with_index { |v, i| sym_categ[i] ? v.to_sym : v }
191 |       @categories = {}
192 |       @categorical_columns.zip(data_info["cat_levels"]) do |col, levels|
193 |         @categories[col] = levels.map.with_index.to_h
194 |       end
195 | 
196 |       @ndim = model_info["ndim"]
197 |       @nthreads = model_info["nthreads"]
198 |       @use_long_double = model_info["use_long_double"]
199 |       @build_imputer = model_info["build_imputer"]
200 | 
201 |       PARAM_KEYS.each do |k|
202 |         instance_variable_set("@#{k}", params[k.to_s])
203 |       end
204 |     end
205 | 
206 |     def check_fit # guard: raises RuntimeError "Not fit" until fit or import_model has set the native model
207 |       raise "Not fit" unless @ext_iso_forest
208 |     end
209 | 
210 |     def prep_fit(df)
211 |       @numeric_columns = df.numeric_columns
212 |       @categorical_columns = df.categorical_columns
213 |       @categories = {}
214 |       @categorical_columns.each do |k|
215 |         @categories[k] = df[k].uniq.to_a.compact.map.with_index.to_h
216 |       end
217 |     end
218 | 
219 |     # TODO handle column type mismatches
220 |     def prep_predict(df)
221 |       expected_columns = @numeric_columns + @categorical_columns
222 |       if df.array_type
223 |         if df.numeric_columns.size + df.categorical_columns.size != expected_columns.size
224 |           raise ArgumentError, "Input must have #{expected_columns.size} columns for this model"
225 |         end
226 |       end
227 |       expected_columns.each do |k|
228 |         raise ArgumentError, "Missing column: #{k}" unless df[k]
229 |       end
230 |     end
231 | 
232 |     def data_options(df)
233 |       options = {}
234 | 
235 |       # numeric
236 |       numeric_data = String.new
237 |       @numeric_columns.each do |k|
238 |         v = df[k]
239 |         v = v.to_numo if v.respond_to?(:to_numo) # Rover
240 |         binary_str =
241 |           if v.respond_to?(:to_binary) # Rover and Numo
242 |             v.cast_to(Numo::DFloat).to_binary
243 |           else
244 |             v.pack("d*")
245 |           end
246 |         numeric_data << binary_str
247 |       end
248 |       options[:numeric_data] = numeric_data
249 |       options[:ncols_numeric] = @numeric_columns.size
250 | 
251 |       # categorical
252 |       categorical_data = String.new
253 |       ncat = String.new
254 |       @categorical_columns.each do |k|
255 |         categories = @categories[k]
256 |         # for unseen values, set to categories.size
257 |         categories_size = categories.size
258 |         values = df[k].map { |v| v.nil? ? -1 : (categories[v] || categories_size) }
259 |         # TODO make more efficient
260 |         if values.any? { |v| v == categories_size }
261 |           warn "[isotree] Unseen values in column: #{k}"
262 |         end
263 | 
264 |         v = values
265 |         v = v.to_numo if v.respond_to?(:to_numo) # Rover
266 |         binary_str =
267 |           if v.respond_to?(:to_binary) # Rover and Numo
268 |             v.cast_to(Numo::Int32).to_binary
269 |           else
270 |             v.pack("i*")
271 |           end
272 |         categorical_data << binary_str
273 |         ncat << [categories.size].pack("i")
274 |       end
275 |       options[:categorical_data] = categorical_data
276 |       options[:ncols_categ] = @categorical_columns.size
277 |       options[:ncat] = ncat
278 | 
279 |       options[:nrows] = df.size
280 |       options
281 |     end
282 | 
283 |     PARAM_KEYS = %i(
284 |       sample_size ntrees ntry max_depth ncols_per_tree
285 |       prob_pick_avg_gain prob_pick_pooled_gain prob_pick_full_gain prob_pick_dens
286 |       prob_pick_col_by_range prob_pick_col_by_var prob_pick_col_by_kurt
287 |       min_gain missing_action new_categ_action categ_split_type coefs
288 |       depth_imp weigh_imp_rows min_imp_obs random_seed all_perm
289 |       coef_by_prop weights_as_sample_prob sample_with_replacement penalize_range standardize_data
290 |       scoring_metric fast_bratio weigh_by_kurtosis assume_full_distr
291 |     ) # keys of the "params" metadata section, read by export_metadata and import_metadata in this order
292 | 
293 |     def fit_options
294 |       keys = %i(
295 |         sample_size ntrees ndim ntry
296 |         categ_cols max_depth ncols_per_tree
297 |         prob_pick_pooled_gain prob_pick_avg_gain
298 |         prob_pick_full_gain prob_pick_dens
299 |         prob_pick_col_by_range prob_pick_col_by_var prob_pick_col_by_kurt
300 |         min_gain missing_action new_categ_action
301 |         categ_split_type all_perm coef_by_prop
302 |         weights_as_sample_prob
303 |         sample_with_replacement penalize_range standardize_data
304 |         scoring_metric fast_bratio
305 |         weigh_by_kurtosis coefs min_imp_obs depth_imp
306 |         weigh_imp_rows random_seed use_long_double nthreads
307 |       )
308 |       options = {}
309 |       keys.each do |k|
310 |         options[k] = instance_variable_get("@#{k}")
311 |       end
312 | 
313 |       if options[:max_depth] == "auto"
314 |         options[:max_depth] = 0
315 |         options[:limit_depth] = true
316 |       end
317 | 
318 |       if options[:ncols_per_tree].nil?
319 |         options[:ncols_per_tree] = 0
320 |       end
321 | 
322 |       options
323 |     end
324 | 
325 |     def update_params
326 |       if @missing_action == "auto"
327 |         if @ndim == 1
328 |           @missing_action = "divide"
329 |         else
330 |           @missing_action = "impute"
331 |         end
332 |       end
333 | 
334 |       if @new_categ_action == "auto"
335 |         if @ndim == 1
336 |           @new_categ_action = "weighted"
337 |         else
338 |           @new_categ_action = "impute"
339 |         end
340 |       end
341 | 
342 |       if @categ_split_type == "auto"
343 |         if @ndim == 1
344 |           @categ_split_type = "single_categ"
345 |         else
346 |           @categ_split_type = "subset"
347 |         end
348 |       end
349 |     end
350 |   end
351 | end
352 | 


--------------------------------------------------------------------------------
/lib/isotree/version.rb:
--------------------------------------------------------------------------------
1 | module IsoTree
2 |   VERSION = "0.4.0"
3 | end
4 | 


--------------------------------------------------------------------------------
/test/isolation_forest_test.rb:
--------------------------------------------------------------------------------
  1 | require_relative "test_helper"
  2 | 
  3 | class IsolationForestTest < Minitest::Test # fit/predict/export tests driven by test/support/data.csv
  4 |   def test_hashes
  5 |     data = test_data
  6 |     model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1)
  7 |     model.fit(data)
  8 |     predictions = model.predict(data)
  9 |     expected = [0.4816470280716818, 0.46655713161582574, 0.5363011880474468]
 10 |     assert_elements_in_delta expected, predictions.first(3)
 11 |     assert_equal 100, predictions.each_with_index.max[1] # the last fixture row (3.0, 3.0) scores highest
 12 |   end
 13 | 
 14 |   def test_array
 15 |     data = numeric_data
 16 |     model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1)
 17 |     model.fit(data)
 18 |     predictions = model.predict(data)
 19 |     expected = [0.454691875234909, 0.42805783155356797, 0.5460616479701705]
 20 |     assert_elements_in_delta expected, predictions.first(3)
 21 |     assert_equal 100, predictions.each_with_index.max[1]
 22 |   end
 23 | 
 24 |   def test_export
 25 |     skip "Not supported yet" if windows?
 26 | 
 27 |     data = test_data
 28 |     model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1)
 29 |     model.fit(data)
 30 |     original_predictions = model.predict(data)
 31 | 
 32 |     Tempfile.create do |tempfile| # block form closes and deletes the file on exit, even if an assertion fails
 33 |       model.export_model(tempfile.path)
 34 |       imported = IsoTree::IsolationForest.import_model(tempfile.path)
 35 |       assert_elements_in_delta original_predictions, imported.predict(data)
 36 |     end
 37 |   end
 38 | 
 39 |   def test_import_to_python
 40 |     skip if !ENV["TEST_PYTHON"] || windows?
 41 | 
 42 |     model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1)
 43 |     model.fit(test_data)
 44 |     model.export_model("/tmp/model.bin", add_metada_file: true) # path is hard-coded in test/support/import.py
 45 |     assert_match "Name: 100", %x[python3 test/support/import.py]
 46 | 
 47 |     expected = JSON.parse(File.read("test/support/model.bin.metadata"))
 48 |     metadata = JSON.parse(File.read("/tmp/model.bin.metadata"))
 49 |     metadata["data_info"].reject! { |k, _| ["sym_numeric", "sym_categ"].include?(k) } # Ruby-only keys
 50 |     assert_equal expected, metadata
 51 |   end
 52 | 
 53 |   def test_import_from_python
 54 |     skip "Not supported yet" if windows?
 55 | 
 56 |     model = IsoTree::IsolationForest.import_model("test/support/model.bin")
 57 |     predictions = model.predict(test_data.map { |v| v.transform_keys(&:to_s) }) # the fixture's column names are strings
 58 |     assert_equal 100, predictions.each_with_index.max[1]
 59 |   end
 60 | 
 61 |   def test_import_missing_file
 62 |     error = assert_raises do
 63 |       IsoTree::IsolationForest.import_model("missing.bin")
 64 |     end
 65 |     assert_equal "Cannot open file", error.message
 66 |   end
 67 | 
 68 |   def test_numo
 69 |     data = Numo::DFloat.cast(numeric_data)
 70 |     model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1)
 71 |     model.fit(data)
 72 |     predictions = model.predict(data)
 73 |     expected = [0.454691875234909, 0.42805783155356797, 0.5460616479701705]
 74 |     assert_elements_in_delta expected, predictions.first(3)
 75 |     assert_equal 100, predictions.each_with_index.max[1]
 76 |   end
 77 | 
 78 |   def test_rover
 79 |     require "rover"
 80 | 
 81 |     data = Rover::DataFrame.new(test_data)
 82 |     model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1)
 83 |     model.fit(data)
 84 |     predictions = model.predict(data)
 85 |     expected = [0.4816470280716818, 0.46655713161582574, 0.5363011880474468]
 86 |     assert_elements_in_delta expected, predictions.first(3)
 87 |     assert_equal 100, predictions.each_with_index.max[1]
 88 |   end
 89 | 
 90 |   def test_predict_output_avg_depth
 91 |     data = test_data
 92 |     model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1)
 93 |     model.fit(data)
 94 |     predictions = model.predict(data, output: "avg_depth")
 95 |     # different results on different platforms with same seed
 96 |     expected = [8.847458736905825, 9.23295785483866, 7.545738407619213]
 97 |     assert_elements_in_delta expected, predictions.first(3)
 98 |     assert_equal 100, predictions.each_with_index.min[1]
 99 |   end
100 | 
101 |   def test_not_fit
102 |     model = IsoTree::IsolationForest.new
103 |     error = assert_raises do
104 |       model.predict([])
105 |     end
106 |     assert_equal "Not fit", error.message
107 |   end
108 | 
109 |   def test_different_columns
110 |     x = Numo::DFloat.new(101, 2).rand_norm
111 |     model = IsoTree::IsolationForest.new
112 |     model.fit(x)
113 |     error = assert_raises(ArgumentError) do
114 |       model.predict(x.reshape(2, 101))
115 |     end
116 |     assert_equal "Input must have 2 columns for this model", error.message
117 |   end
118 | 
119 |   def test_no_data
120 |     model = IsoTree::IsolationForest.new
121 |     error = assert_raises(ArgumentError) do
122 |       model.fit([])
123 |     end
124 |     assert_equal "No data", error.message
125 |   end
126 | 
127 |   def test_bad_size
128 |     model = IsoTree::IsolationForest.new
129 |     error = assert_raises(ArgumentError) do
130 |       model.fit([[1, 2], [3]])
131 |     end
132 |     assert_equal "All rows must have the same number of columns", error.message
133 |   end
134 | 
135 |   def test_bad_dimensions
136 |     model = IsoTree::IsolationForest.new
137 |     error = assert_raises(ArgumentError) do
138 |       model.fit(Numo::DFloat.cast([[[1]]]))
139 |     end
140 |     assert_equal "Input must have 2 dimensions", error.message
141 |   end
142 | 
143 |   def test_bad_sample_size
144 |     data = test_data
145 |     model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1, sample_size: data.size * 2)
146 |     model.fit(data) # passes as long as fit does not raise; fit caps sample_size at the row count
147 |   end
148 | 
149 |   def test_data
150 |     CSV.table("test/support/data.csv").map(&:to_h)
151 |   end
152 | 
153 |   def numeric_data
154 |     test_data.map { |v| [v[:num1], v[:num2]] }
155 |   end
156 | 
157 |   def windows?
158 |     Gem.win_platform?
159 |   end
160 | end
161 | 


--------------------------------------------------------------------------------
/test/support/data.csv:
--------------------------------------------------------------------------------
  1 | num1,num2,cat1
  2 | 1.624345,-0.611756,C
  3 | -0.528172,-1.072969,B
  4 | 0.865408,-2.301539,C
  5 | 1.744812,-0.761207,C
  6 | 0.319039,-0.249370,B
  7 | 1.462108,-2.060141,C
  8 | -0.322417,-0.384054,B
  9 | 1.133769,-1.099891,A
 10 | -0.172428,-0.877858,C
 11 | 0.042214,0.582815,B
 12 | -1.100619,1.144724,A
 13 | 0.901591,0.502494,B
 14 | 0.900856,-0.683728,A
 15 | -0.122890,-0.935769,B
 16 | -0.267888,0.530355,C
 17 | -0.691661,-0.396754,C
 18 | -0.687173,-0.845206,B
 19 | -0.671246,-0.012665,B
 20 | -1.117310,0.234416,B
 21 | 1.659802,0.742044,C
 22 | -0.191836,-0.887629,B
 23 | -0.747158,1.692455,C
 24 | 0.050808,-0.636996,B
 25 | 0.190915,2.100255,A
 26 | 0.120159,0.617203,A
 27 | 0.300170,-0.352250,C
 28 | -1.142518,-0.349343,A
 29 | -0.208894,0.586623,A
 30 | 0.838983,0.931102,C
 31 | 0.285587,0.885141,A
 32 | -0.754398,1.252868,B
 33 | 0.512930,-0.298093,A
 34 | 0.488518,-0.075572,A
 35 | 1.131629,1.519817,C
 36 | 2.185575,-1.396496,C
 37 | -1.444114,-0.504466,C
 38 | 0.160037,0.876169,C
 39 | 0.315635,-2.022201,C
 40 | -0.306204,0.827975,A
 41 | 0.230095,0.762011,C
 42 | -0.222328,-0.200758,B
 43 | 0.186561,0.410052,A
 44 | 0.198300,0.119009,B
 45 | -0.670662,0.377564,A
 46 | 0.121821,1.129484,A
 47 | 1.198918,0.185156,B
 48 | -0.375285,-0.638730,C
 49 | 0.423494,0.077340,C
 50 | -0.343854,0.043597,C
 51 | -0.620001,0.698032,B
 52 | -0.447129,1.224508,A
 53 | 0.403492,0.593579,A
 54 | -1.094912,0.169382,A
 55 | 0.740556,-0.953701,C
 56 | -0.266219,0.032615,A
 57 | -1.373117,0.315159,A
 58 | 0.846161,-0.859516,A
 59 | 0.350546,-1.312283,C
 60 | -0.038696,-1.615772,C
 61 | 1.121418,0.408901,A
 62 | -0.024617,-0.775162,C
 63 | 1.273756,1.967102,A
 64 | -1.857982,1.236164,A
 65 | 1.627651,0.338012,A
 66 | -1.199268,0.863345,B
 67 | -0.180920,-0.603921,B
 68 | -1.230058,0.550537,A
 69 | 0.792807,-0.623531,B
 70 | 0.520576,-1.144341,C
 71 | 0.801861,0.046567,B
 72 | -0.186570,-0.101746,B
 73 | 0.868886,0.750412,A
 74 | 0.529465,0.137701,A
 75 | 0.077821,0.618380,B
 76 | 0.232495,0.682551,C
 77 | -0.310117,-2.434838,B
 78 | 1.038825,2.186980,B
 79 | 0.441364,-0.100155,B
 80 | -0.136445,-0.119054,B
 81 | 0.017409,-1.122019,C
 82 | -0.517094,-0.997027,A
 83 | 0.248799,-0.296641,A
 84 | 0.495211,-0.174703,A
 85 | 0.986335,0.213534,B
 86 | 2.190700,-1.896361,C
 87 | -0.646917,0.901487,C
 88 | 2.528326,-0.248635,B
 89 | 0.043669,-0.226314,B
 90 | 1.331457,-0.287308,A
 91 | 0.680070,-0.319802,B
 92 | -1.272559,0.313548,A
 93 | 0.503185,1.293226,A
 94 | -0.110447,-0.617362,B
 95 | 0.562761,0.240737,A
 96 | 0.280665,-0.073113,C
 97 | 1.160339,0.369493,B
 98 | 1.904659,1.111057,A
 99 | 0.659050,-1.627438,C
100 | 0.602319,0.420282,A
101 | 0.810952,1.044442,B
102 | 3.000000,3.000000,A
103 | 


--------------------------------------------------------------------------------
/test/support/import.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from isotree import IsolationForest
 4 | 
 5 | df = pd.read_csv('test/support/data.csv')
 6 | model = IsolationForest.import_model("/tmp/model.bin")
 7 | 
 8 | predictions = model.predict(df)
 9 | print(predictions[0:3].tolist())
10 | print('Point with highest outlier score: ', df.iloc[np.argsort(-predictions)[0]])
11 | 


--------------------------------------------------------------------------------
/test/support/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ankane/isotree-ruby/9e06d1923f134725fedb383c84e61fa53cd90669/test/support/model.bin


--------------------------------------------------------------------------------
/test/support/model.bin.metadata:
--------------------------------------------------------------------------------
 1 | {
 2 |     "data_info": {
 3 |         "ncols_numeric": 2,
 4 |         "ncols_categ": 1,
 5 |         "cols_numeric": [
 6 |             "num1",
 7 |             "num2"
 8 |         ],
 9 |         "cols_categ": [
10 |             "cat1"
11 |         ],
12 |         "cat_levels": [
13 |             [
14 |                 "C",
15 |                 "B",
16 |                 "A"
17 |             ]
18 |         ],
19 |         "categ_cols": [],
20 |         "categ_max": []
21 |     },
22 |     "model_info": {
23 |         "ndim": 3,
24 |         "nthreads": 1,
25 |         "use_long_double": false,
26 |         "build_imputer": false
27 |     },
28 |     "params": {
29 |         "sample_size": "auto",
30 |         "ntrees": 10,
31 |         "ntry": 1,
32 |         "max_depth": 0,
33 |         "ncols_per_tree": null,
34 |         "prob_pick_avg_gain": 0.0,
35 |         "prob_pick_pooled_gain": 0.0,
36 |         "prob_pick_full_gain": 0.0,
37 |         "prob_pick_dens": 0.0,
38 |         "prob_pick_col_by_range": 0.0,
39 |         "prob_pick_col_by_var": 0.0,
40 |         "prob_pick_col_by_kurt": 0.0,
41 |         "min_gain": 0.0,
42 |         "missing_action": "impute",
43 |         "new_categ_action": "impute",
44 |         "categ_split_type": "subset",
45 |         "coefs": "uniform",
46 |         "depth_imp": "higher",
47 |         "weigh_imp_rows": "inverse",
48 |         "min_imp_obs": 3,
49 |         "random_seed": 1,
50 |         "all_perm": false,
51 |         "coef_by_prop": false,
52 |         "weights_as_sample_prob": true,
53 |         "sample_with_replacement": false,
54 |         "penalize_range": false,
55 |         "standardize_data": true,
56 |         "scoring_metric": "depth",
57 |         "fast_bratio": true,
58 |         "weigh_by_kurtosis": false,
59 |         "assume_full_distr": true
60 |     }
61 | }


--------------------------------------------------------------------------------
/test/support/predict.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from isotree import IsolationForest
 4 | 
 5 | df = pd.read_csv('test/support/data.csv')
 6 | model = IsolationForest(ntrees=10, ndim=3, nthreads=1)
 7 | model.fit(df)
 8 | 
 9 | predictions = model.predict(df)
10 | print(predictions[0:3].tolist())
11 | print('Point with highest outlier score: ', df.iloc[np.argsort(-predictions)[0]])
12 | 
13 | print('avg_depth')
14 | print(model.predict(df, output='avg_depth')[0:3].tolist())
15 | 
16 | model.export_model('test/support/model.bin', add_metada_file=True)
17 | 


--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
 1 | require "bundler/setup"
 2 | Bundler.require(:default)
 3 | require "minitest/autorun"
 4 | require "minitest/pride"
 5 | require "csv"
 6 | 
 7 | class Minitest::Test
 8 |   def assert_elements_in_delta(expected, actual)
 9 |     assert_equal expected.size, actual.size
10 |     expected.zip(actual) do |exp, act|
11 |       assert_in_delta exp, act
12 |     end
13 |   end
14 | end
15 | 


--------------------------------------------------------------------------------