├── .github └── workflows │ └── build.yml ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── ext └── isotree │ ├── ext.cpp │ └── extconf.rb ├── isotree.gemspec ├── lib ├── isotree.rb └── isotree │ ├── dataset.rb │ ├── isolation_forest.rb │ └── version.rb └── test ├── isolation_forest_test.rb ├── support ├── data.csv ├── import.py ├── model.bin ├── model.bin.metadata └── predict.py └── test_helper.rb /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | os: [ubuntu-latest, macos-latest] 9 | runs-on: ${{ matrix.os }} 10 | steps: 11 | - uses: actions/checkout@v4 12 | with: 13 | submodules: recursive 14 | - uses: ruby/setup-ruby@v1 15 | with: 16 | ruby-version: 3.4 17 | bundler-cache: true 18 | - run: bundle exec rake compile 19 | - run: bundle exec rake test 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | *.lock 10 | *.bundle 11 | *.so 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendor/isotree"] 2 | path = vendor/isotree 3 | url = https://github.com/david-cortes/isotree 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.4.0 (2024-06-11) 2 | 3 | - Updated IsoTree to 0.6.1 4 | - Dropped support for Ruby < 3.1 5 | 6 | ## 0.3.1 (2023-12-19) 7 | 8 | - Updated IsoTree to 0.5.25 9 | 10 | ## 0.3.0 (2022-06-13) 11 | 12 | - Updated IsoTree to 0.5.16 13 | - Updated serialization format (exported models must be recreated) 14 | - Dropped support for Ruby < 2.7 15 | - Dropped support for Windows 16 | 17 | ## 0.2.2 (2022-06-12) 18 | 19 | - Fixed segfault when data is smaller than sample size 20 | 21 | ## 0.2.1 (2021-05-23) 22 | 23 | - Improved performance 24 | 25 | ## 0.2.0 (2021-05-17) 26 | 27 | - Updated to Rice 4 28 | - Dropped support for Ruby < 2.6 29 | 30 | ## 0.1.5 (2021-03-14) 31 | 32 | - Updated IsoTree to 0.1.25 33 | - Added support for exporting and importing models 34 | 35 | ## 0.1.4 (2020-08-22) 36 | 37 | - Added `missing_action`, `new_categ_action`, `categ_split_type`, `coefs`, `depth_imp`, and `weigh_imp_rows` options 38 | - Fixed signal handling 39 | 40 | ## 0.1.3 (2020-08-13) 41 | 42 | - Added support for categorical data 43 | - Added support for Rover data frames 44 | - Added `output` option to `predict` method 45 | 46 | ## 0.1.2 (2020-08-11) 47 | 48 | - Fixed outlier scores 49 | 50 | ## 0.1.1 (2020-08-10) 51 | 52 | - Fixed installation error when cereal not installed 53 | 54 | ## 0.1.0 (2020-08-10) 55 | 56 | - First release 57 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gemspec 4 | 5 | gem "rake" 6 | gem "rake-compiler" 7 | gem "minitest", ">= 5" 8 | gem "numo-narray" 9 | gem "rover-df" 10 | gem "csv" 11 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019-2023, David Cortes 4 | Copyright (c) 2020-2023, Andrew Kane 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IsoTree Ruby 2 | 3 | :evergreen_tree: [IsoTree](https://github.com/david-cortes/isotree) - outlier/anomaly detection using Isolation Forest - for Ruby 4 | 5 | Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works 6 | 7 | :deciduous_tree: Check out [OutlierTree](https://github.com/ankane/outliertree-ruby) for human-readable explanations of outliers 8 | 9 | [![Build Status](https://github.com/ankane/isotree-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/isotree-ruby/actions) 10 | 11 | ## Installation 12 | 13 | Add this line to your application’s Gemfile: 14 | 15 | ```ruby 16 | gem "isotree" 17 | ``` 18 | 19 | Windows is not supported at the moment 20 | 21 | ## Getting Started 22 | 23 | Prep your data 24 | 25 | ```ruby 26 | data = [ 27 | {department: "Books", sale: false, price: 2.50}, 28 | {department: "Books", sale: true, price: 3.00}, 29 | {department: "Movies", sale: false, price: 5.00}, 30 | # ... 31 | ] 32 | ``` 33 | 34 | Train a model 35 | 36 | ```ruby 37 | model = IsoTree::IsolationForest.new 38 | model.fit(data) 39 | ``` 40 | 41 | Get outlier scores 42 | 43 | ```ruby 44 | model.predict(data) 45 | ``` 46 | 47 | Scores are between 0 and 1, with higher scores indicating outliers 48 | 49 | Export the model 50 | 51 | ```ruby 52 | model.export_model("model.bin") 53 | ``` 54 | 55 | Import a model 56 | 57 | ```ruby 58 | model = IsoTree::IsolationForest.import_model("model.bin") 59 | ``` 60 | 61 | ## Parameters 62 | 63 | Pass parameters - default values below 64 | 65 | ```ruby 66 | IsoTree::IsolationForest.new( 67 | sample_size: "auto", 68 | ntrees: 500, 69 | ndim: 3, 70 | ntry: 1, 71 | max_depth: "auto", 72 | ncols_per_tree: nil, 73 | prob_pick_pooled_gain: 0.0, 74 | prob_pick_avg_gain: 0.0, 75 | prob_pick_full_gain: 0.0, 76 | prob_pick_dens: 0.0, 77 | prob_pick_col_by_range: 0.0, 78 | prob_pick_col_by_var: 0.0, 79 | prob_pick_col_by_kurt: 0.0, 80 | min_gain: 0.0, 81 | missing_action: "auto", 82 | new_categ_action: "auto", 83 | categ_split_type: "auto", 84 | all_perm: false, 85 | coef_by_prop: false, 86 | sample_with_replacement: false, 87 | penalize_range: false, 88 | standardize_data: true, 89 | scoring_metric: "depth", 90 | fast_bratio: true, 91 | weigh_by_kurtosis: false, 92 | coefs: "uniform", 93 | assume_full_distr: true, 94 | min_imp_obs: 3, 95 | depth_imp: "higher", 96 | weigh_imp_rows: "inverse", 97 | random_seed: 1, 98 | use_long_double: false, 99 | nthreads: -1 100 | ) 101 | ``` 102 | 103 | See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.IsolationForest) 104 | 105 | ## Data 106 | 107 | Data can be an array of hashes 108 | 109 | ```ruby 110 | [ 111 | {department: "Books", sale: false, price: 2.50}, 112 | {department: "Books", sale: true, price: 3.00}, 113 | {department: "Movies", sale: false, price: 5.00} 114 | ] 115 | ``` 116 | 117 | Or a Rover data frame 118 | 119 | ```ruby 120 | Rover.read_csv("data.csv") 121 | ``` 122 | 123 | Or a Numo array 124 | 125 | ```ruby 126 | Numo::NArray.cast([[1, 2, 3], [4, 5, 6]]) 127 | ``` 128 | 129 | ## Performance 130 | 131 | IsoTree uses OpenMP when possible for best performance. To enable OpenMP on Mac, run: 132 | 133 | ```sh 134 | brew install libomp 135 | ``` 136 | 137 | Then reinstall the gem. 138 | 139 | ```sh 140 | gem uninstall isotree --force 141 | bundle install 142 | ``` 143 | 144 | ## Deployment 145 | 146 | Check out [Trove](https://github.com/ankane/trove) for deploying models. 147 | 148 | ```sh 149 | trove push model.bin 150 | ``` 151 | 152 | ## Reference 153 | 154 | Get the average isolation depth 155 | 156 | ```ruby 157 | model.predict(data, output: "avg_depth") 158 | ``` 159 | 160 | ## History 161 | 162 | View the [changelog](https://github.com/ankane/isotree-ruby/blob/master/CHANGELOG.md) 163 | 164 | ## Contributing 165 | 166 | Everyone is encouraged to help improve this project. Here are a few ways you can help: 167 | 168 | - [Report bugs](https://github.com/ankane/isotree-ruby/issues) 169 | - Fix bugs and [submit pull requests](https://github.com/ankane/isotree-ruby/pulls) 170 | - Write, clarify, or fix documentation 171 | - Suggest or add new features 172 | 173 | To get started with development: 174 | 175 | ```sh 176 | git clone --recursive https://github.com/ankane/isotree-ruby.git 177 | cd isotree-ruby 178 | bundle install 179 | bundle exec rake compile 180 | bundle exec rake test 181 | ``` 182 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rake/testtask" 3 | require "rake/extensiontask" 4 | 5 | task default: :test 6 | Rake::TestTask.new do |t| 7 | t.libs << "test" 8 | t.pattern = "test/**/*_test.rb" 9 | end 10 | 11 | Rake::ExtensionTask.new("isotree") do |ext| 12 | ext.name = "ext" 13 | ext.lib_dir = "lib/isotree" 14 | end 15 | 16 | task :check_license do 17 | raise "Missing vendor license" unless File.exist?("vendor/isotree/LICENSE") 18 | end 19 | 20 | task :remove_ext do 21 | path = "lib/isotree/ext.bundle" 22 | File.unlink(path) if File.exist?(path) 23 | end 24 | 25 | Rake::Task["build"].enhance [:check_license, :remove_ext] 26 | -------------------------------------------------------------------------------- /ext/isotree/ext.cpp: -------------------------------------------------------------------------------- 1 | // stdlib 2 | #include 3 | #include 4 | #include 5 | 6 | // isotree 7 | #include 8 | 9 | // rice 10 | #include 11 | 12 | using Rice::Array; 13 | using Rice::Hash; 14 | using Rice::Module; 15 | using Rice::Object; 16 | using Rice::String; 17 | using Rice::Symbol; 18 | using Rice::define_class_under; 19 | using Rice::define_module; 20 | 21 | namespace Rice::detail 22 | { 23 | template<> 24 | class From_Ruby 25 | { 26 | public: 27 | NewCategAction convert(VALUE x) 28 | { 29 | auto value = Object(x).to_s().str(); 30 | if (value == "weighted" || value == "impute") return Weighted; 31 | if (value == "smallest") return Smallest; 32 | if (value == "random") return Random; 33 | throw std::runtime_error("Unknown new categ action: " + value); 34 | } 35 | }; 36 | 37 | template<> 38 | class From_Ruby 39 | { 40 | public: 41 | MissingAction convert(VALUE x) 42 | { 43 | auto value = Object(x).to_s().str(); 44 | if (value == "divide") return Divide; 45 | if (value == "impute") return Impute; 46 | if (value == "fail") return Fail; 47 | throw std::runtime_error("Unknown missing action: " + value); 48 | } 49 | }; 50 | 51 | template<> 52 | class From_Ruby 53 | { 54 | public: 55 | CategSplit convert(VALUE x) 56 | { 57 | auto value = Object(x).to_s().str(); 58 | if (value == "subset") return SubSet; 59 | if (value == "single_categ") return SingleCateg; 60 | throw std::runtime_error("Unknown categ split: " + value); 61 | } 62 | }; 63 | 64 | template<> 65 | class From_Ruby 66 | { 67 | public: 68 | CoefType convert(VALUE x) 69 | { 70 | auto value = Object(x).to_s().str(); 71 | if (value == "uniform") return Uniform; 72 | if (value == "normal") return Normal; 73 | throw std::runtime_error("Unknown coef type: " + value); 74 | } 75 | }; 76 | 77 | template<> 78 | class From_Ruby 79 | { 80 | public: 81 | UseDepthImp convert(VALUE x) 82 | { 83 | auto value = Object(x).to_s().str(); 84 | if (value == "lower") return Lower; 85 | if (value == "higher") return Higher; 86 | if (value == "same") return Same; 87 | throw std::runtime_error("Unknown depth imp: " + value); 88 | } 89 | }; 90 | 91 | template<> 92 | class From_Ruby 93 | { 94 | public: 95 | WeighImpRows convert(VALUE x) 96 | { 97 | auto value = Object(x).to_s().str(); 98 | if (value == "inverse") return Inverse; 99 | if (value == "prop") return Prop; 100 | if (value == "flat") return Flat; 101 | throw std::runtime_error("Unknown weight imp rows: " + value); 102 | } 103 | }; 104 | 105 | template<> 106 | class From_Ruby 107 | { 108 | public: 109 | ScoringMetric convert(VALUE x) 110 | { 111 | auto value = Object(x).to_s().str(); 112 | if (value == "depth") return Depth; 113 | if (value == "adj_depth") return AdjDepth; 114 | if (value == "density") return Density; 115 | if (value == "adj_density") return AdjDensity; 116 | if (value == "boxed_density") return BoxedDensity; 117 | if (value == "boxed_density2") return BoxedDensity2; 118 | if (value == "boxed_ratio") return BoxedRatio; 119 | throw std::runtime_error("Unknown scoring metric: " + value); 120 | } 121 | }; 122 | } 123 | 124 | extern "C" 125 | void Init_ext() 126 | { 127 | Module rb_mIsoTree = define_module("IsoTree"); 128 | 129 | Module rb_mExt = define_module_under(rb_mIsoTree, "Ext"); 130 | define_class_under(rb_mExt, "ExtIsoForest"); 131 | 132 | rb_mExt 133 | .define_singleton_function( 134 | "fit_iforest", 135 | [](Hash options) { 136 | // model 137 | ExtIsoForest iso; 138 | 139 | // data 140 | size_t nrows = options.get("nrows"); 141 | size_t ncols_numeric = options.get("ncols_numeric"); 142 | size_t ncols_categ = options.get("ncols_categ"); 143 | 144 | real_t* numeric_data = NULL; 145 | if (ncols_numeric > 0) { 146 | numeric_data = (double*) options.get("numeric_data").c_str(); 147 | } 148 | 149 | int* categorical_data = NULL; 150 | int* ncat = NULL; 151 | if (ncols_categ > 0) { 152 | categorical_data = (int*) options.get("categorical_data").c_str(); 153 | ncat = (int*) options.get("ncat").c_str(); 154 | } 155 | 156 | // not used (sparse matrices) 157 | real_t* Xc = NULL; 158 | sparse_ix* Xc_ind = NULL; 159 | sparse_ix* Xc_indptr = NULL; 160 | 161 | // options 162 | // Rice has limit of 14 arguments, so use hash 163 | size_t sample_size = options.get("sample_size"); 164 | size_t ndim = options.get("ndim"); 165 | size_t ntrees = options.get("ntrees"); 166 | size_t ntry = options.get("ntry"); 167 | double prob_pick_by_gain_avg = options.get("prob_pick_avg_gain"); 168 | double prob_pick_by_gain_pl = options.get("prob_pick_pooled_gain"); 169 | double min_gain = options.get("min_gain"); 170 | MissingAction missing_action = options.get("missing_action"); 171 | CategSplit cat_split_type = options.get("categ_split_type"); 172 | NewCategAction new_cat_action = options.get("new_categ_action"); 173 | bool all_perm = options.get("all_perm"); 174 | bool coef_by_prop = options.get("coef_by_prop"); 175 | bool with_replacement = options.get("sample_with_replacement"); 176 | bool penalize_range = options.get("penalize_range"); 177 | bool weigh_by_kurt = options.get("weigh_by_kurtosis"); 178 | CoefType coef_type = options.get("coefs"); 179 | size_t min_imp_obs = options.get("min_imp_obs"); 180 | UseDepthImp depth_imp = options.get("depth_imp"); 181 | WeighImpRows weigh_imp_rows = options.get("weigh_imp_rows"); 182 | uint64_t random_seed = options.get("random_seed"); 183 | bool use_long_double = options.get("use_long_double"); 184 | int nthreads = options.get("nthreads"); 185 | 186 | // TODO options 187 | double* sample_weights = NULL; 188 | bool weight_as_sample = options.get("weights_as_sample_prob"); 189 | size_t max_depth = options.get("max_depth"); 190 | bool limit_depth = options.get("limit_depth"); 191 | bool standardize_dist = false; 192 | double* tmat = NULL; 193 | double* output_depths = NULL; 194 | bool standardize_depth = false; 195 | real_t* col_weights = NULL; 196 | Imputer* imputer = NULL; 197 | bool impute_at_fit = false; 198 | 199 | int ncols_per_tree = options.get("ncols_per_tree"); 200 | bool standardize_data = options.get("standardize_data"); 201 | ScoringMetric scoring_metric = options.get("scoring_metric"); 202 | bool fast_bratio = options.get("fast_bratio"); 203 | double prob_pick_by_full_gain = options.get("prob_pick_full_gain"); 204 | double prob_pick_by_dens = options.get("prob_pick_dens"); 205 | double prob_pick_col_by_range = options.get("prob_pick_col_by_range"); 206 | double prob_pick_col_by_var = options.get("prob_pick_col_by_var"); 207 | double prob_pick_col_by_kurt = options.get("prob_pick_col_by_kurt"); 208 | 209 | fit_iforest( 210 | NULL, 211 | &iso, 212 | numeric_data, 213 | ncols_numeric, 214 | categorical_data, 215 | ncols_categ, 216 | ncat, 217 | Xc, 218 | Xc_ind, 219 | Xc_indptr, 220 | ndim, 221 | ntry, 222 | coef_type, 223 | coef_by_prop, 224 | sample_weights, 225 | with_replacement, 226 | weight_as_sample, 227 | nrows, 228 | sample_size, 229 | ntrees, 230 | max_depth, 231 | ncols_per_tree, 232 | limit_depth, 233 | penalize_range, 234 | standardize_data, 235 | scoring_metric, 236 | fast_bratio, 237 | standardize_dist, 238 | tmat, 239 | output_depths, 240 | standardize_depth, 241 | col_weights, 242 | weigh_by_kurt, 243 | prob_pick_by_gain_pl, 244 | prob_pick_by_gain_avg, 245 | prob_pick_by_full_gain, 246 | prob_pick_by_dens, 247 | prob_pick_col_by_range, 248 | prob_pick_col_by_var, 249 | prob_pick_col_by_kurt, 250 | min_gain, 251 | missing_action, 252 | cat_split_type, 253 | new_cat_action, 254 | all_perm, 255 | imputer, 256 | min_imp_obs, 257 | depth_imp, 258 | weigh_imp_rows, 259 | impute_at_fit, 260 | random_seed, 261 | use_long_double, 262 | nthreads 263 | ); 264 | 265 | return iso; 266 | }) 267 | .define_singleton_function( 268 | "predict_iforest", 269 | [](ExtIsoForest& iso, Hash options) { 270 | // data 271 | size_t nrows = options.get("nrows"); 272 | size_t ncols_numeric = options.get("ncols_numeric"); 273 | size_t ncols_categ = options.get("ncols_categ"); 274 | 275 | real_t* numeric_data = NULL; 276 | if (ncols_numeric > 0) { 277 | numeric_data = (double*) options.get("numeric_data").c_str(); 278 | } 279 | 280 | int* categorical_data = NULL; 281 | if (ncols_categ > 0) { 282 | categorical_data = (int*) options.get("categorical_data").c_str(); 283 | } 284 | 285 | // not used (sparse matrices) 286 | real_t* Xc = NULL; 287 | sparse_ix* Xc_ind = NULL; 288 | sparse_ix* Xc_indptr = NULL; 289 | real_t* Xr = NULL; 290 | sparse_ix* Xr_ind = NULL; 291 | sparse_ix* Xr_indptr = NULL; 292 | 293 | // options 294 | int nthreads = options.get("nthreads"); 295 | bool standardize = options.get("standardize"); 296 | std::vector outlier_scores(nrows); 297 | sparse_ix* tree_num = NULL; 298 | bool is_col_major = true; 299 | size_t ld_numeric = 0; 300 | size_t ld_categ = 0; 301 | double* per_tree_depths = NULL; 302 | 303 | predict_iforest( 304 | numeric_data, 305 | categorical_data, 306 | is_col_major, 307 | ld_numeric, 308 | ld_categ, 309 | Xc, 310 | Xc_ind, 311 | Xc_indptr, 312 | Xr, 313 | Xr_ind, 314 | Xr_indptr, 315 | nrows, 316 | nthreads, 317 | standardize, 318 | NULL, 319 | &iso, 320 | outlier_scores.data(), 321 | tree_num, 322 | per_tree_depths, 323 | NULL 324 | ); 325 | 326 | Array ret; 327 | for (size_t i = 0; i < outlier_scores.size(); i++) { 328 | ret.push(outlier_scores[i]); 329 | } 330 | return ret; 331 | }) 332 | .define_singleton_function( 333 | "serialize_combined", 334 | [](ExtIsoForest& iso, String path, String metadata) { 335 | #ifdef _MSC_VER 336 | // TODO convert to wchar_t 337 | throw std::runtime_error("Not supported on Windows yet"); 338 | #else 339 | std::ofstream file; 340 | file.open(path.c_str()); 341 | serialize_combined( 342 | NULL, 343 | &iso, 344 | NULL, 345 | NULL, 346 | metadata.c_str(), 347 | // returns bytesize (RSTRING_LEN) 348 | metadata.length(), 349 | file 350 | ); 351 | file.close(); 352 | #endif 353 | }) 354 | .define_singleton_function( 355 | "deserialize_combined", 356 | [](String path) { 357 | #ifdef _MSC_VER 358 | // TODO convert to wchar_t 359 | throw std::runtime_error("Not supported on Windows yet"); 360 | #else 361 | Array ret; 362 | 363 | std::ifstream file; 364 | file.open(path.c_str(), std::ios_base::in | std::ios_base::binary); 365 | if (!file) { 366 | throw std::runtime_error("Cannot open file"); 367 | } 368 | 369 | bool is_isotree_model = false; 370 | bool is_compatible = false; 371 | bool has_combined_objects = false; 372 | bool has_IsoForest = false; 373 | bool has_ExtIsoForest = false; 374 | bool has_Imputer = false; 375 | bool has_Indexer = false; 376 | bool has_metadata = false; 377 | size_t size_metadata = 0; 378 | 379 | inspect_serialized_object( 380 | file, 381 | is_isotree_model, 382 | is_compatible, 383 | has_combined_objects, 384 | has_IsoForest, 385 | has_ExtIsoForest, 386 | has_Imputer, 387 | has_Indexer, 388 | has_metadata, 389 | size_metadata 390 | ); 391 | 392 | if (!is_isotree_model || !has_combined_objects) { 393 | throw std::runtime_error("Input file is not a serialized isotree model"); 394 | } 395 | if (!is_compatible) { 396 | throw std::runtime_error("Model file format is incompatible"); 397 | } 398 | if (size_metadata == 0) { 399 | throw std::runtime_error("Input file does not contain metadata"); 400 | } 401 | 402 | IsoForest model = IsoForest(); 403 | ExtIsoForest model_ext = ExtIsoForest(); 404 | Imputer imputer = Imputer(); 405 | TreesIndexer indexer = TreesIndexer(); 406 | char *optional_metadata = (char*) calloc(size_metadata, sizeof(char)); 407 | if (optional_metadata == NULL) { 408 | throw std::runtime_error("Cannot allocate memory"); 409 | } 410 | 411 | deserialize_combined(file, &model, &model_ext, &imputer, &indexer, optional_metadata); 412 | file.close(); 413 | 414 | ret.push(Object(Rice::detail::To_Ruby().convert(model_ext))); 415 | ret.push(String(std::string(optional_metadata, size_metadata))); 416 | 417 | free(optional_metadata); 418 | 419 | return ret; 420 | #endif 421 | }); 422 | } 423 | -------------------------------------------------------------------------------- /ext/isotree/extconf.rb: -------------------------------------------------------------------------------- 1 | require "mkmf-rice" 2 | 3 | $CXXFLAGS += " -std=c++17 $(optflags) -D_USE_XOSHIRO -DSUPPORTS_RESTRICT=1 -D_USE_ROBIN_MAP -DDONT_THROW_ON_INTERRUPT" 4 | 5 | apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i 6 | 7 | # check omp first 8 | if have_library("omp") || have_library("gomp") 9 | $CXXFLAGS += " -Xclang" if apple_clang 10 | $CXXFLAGS += " -fopenmp" 11 | end 12 | 13 | ext = File.expand_path(".", __dir__) 14 | isotree_src = File.expand_path("../../vendor/isotree/src", __dir__) 15 | isotree_inc = File.expand_path("../../vendor/isotree/include", __dir__) 16 | 17 | exclude = %w(c_interface.cpp Rwrapper.cpp RcppExports.cpp) 18 | $srcs = Dir["{#{ext},#{isotree_src}}/*.{cc,cpp}"].reject { |f| exclude.include?(File.basename(f)) } 19 | $INCFLAGS << " -I#{isotree_inc}" 20 | $VPATH << isotree_src 21 | 22 | create_makefile("isotree/ext") 23 | -------------------------------------------------------------------------------- /isotree.gemspec: -------------------------------------------------------------------------------- 1 | require_relative "lib/isotree/version" 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "isotree" 5 | spec.version = IsoTree::VERSION 6 | spec.summary = "Outlier/anomaly detection for Ruby using Isolation Forest" 7 | spec.homepage = "https://github.com/ankane/isotree-ruby" 8 | spec.license = "BSD-2-Clause" 9 | 10 | spec.author = "Andrew Kane" 11 | spec.email = "andrew@ankane.org" 12 | 13 | spec.files = Dir["*.{md,txt}", "{ext,lib}/**/*", "vendor/isotree/{LICENSE,README.md}", "vendor/isotree/inst/COPYRIGHTS", "vendor/isotree/{include,src}/*.{cpp,hpp}", "vendor/isotree/src/robinmap/{LICENSE,README.md}", "vendor/isotree/src/robinmap/include/**/*"] 14 | spec.require_path = "lib" 15 | spec.extensions = ["ext/isotree/extconf.rb"] 16 | 17 | spec.required_ruby_version = ">= 3.1" 18 | 19 | spec.add_dependency "rice", ">= 4.3.3" 20 | end 21 | -------------------------------------------------------------------------------- /lib/isotree.rb: -------------------------------------------------------------------------------- 1 | # ext 2 | require "isotree/ext" 3 | 4 | # stdlib 5 | require "etc" 6 | require "json" 7 | 8 | # modules 9 | require_relative "isotree/dataset" 10 | require_relative "isotree/isolation_forest" 11 | require_relative "isotree/version" 12 | -------------------------------------------------------------------------------- /lib/isotree/dataset.rb: -------------------------------------------------------------------------------- 1 | module IsoTree 2 | class Dataset 3 | attr_reader :numeric_columns, :categorical_columns, :array_type 4 | 5 | def initialize(data) 6 | @data = data 7 | 8 | if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame) 9 | @vectors = data.vectors 10 | @numeric_columns, @categorical_columns = data.keys.partition { |k, v| ![:object, :bool].include?(data[k].type) } 11 | @array_type = false 12 | elsif defined?(Numo::NArray) && data.is_a?(Numo::NArray) 13 | raise ArgumentError, "Input must have 2 dimensions" if data.ndim != 2 14 | 15 | data = data.cast_to(Numo::DFloat) 16 | ncols = data.shape[1] 17 | 18 | @numeric_columns = ncols.times.to_a 19 | @categorical_columns = [] 20 | 21 | @vectors = {} 22 | @numeric_columns.each do |k| 23 | @vectors[k] = data[true, k] 24 | end 25 | @array_type = true 26 | else 27 | data = data.to_a 28 | 29 | hashes = data.all? { |d| d.is_a?(Hash) } 30 | arrays = !hashes && data.all? { |d| d.is_a?(Array) } 31 | unless hashes || arrays 32 | raise ArgumentError, "Array elements must be all hashes or arrays" 33 | end 34 | 35 | ncols = data.first ? data.first.size : 0 36 | if data.any? { |r| r.size != ncols } 37 | raise ArgumentError, "All rows must have the same number of columns" 38 | end 39 | 40 | keys = 41 | if hashes 42 | data.flat_map(&:keys).uniq 43 | else 44 | ncols.times.to_a 45 | end 46 | 47 | @vectors = {} 48 | keys.each do |k| 49 | @vectors[k] = [] 50 | end 51 | data.each do |d| 52 | keys.each do |k| 53 | @vectors[k] << d[k] 54 | end 55 | end 56 | 57 | @numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } } 58 | @array_type = arrays 59 | end 60 | 61 | raise ArgumentError, "No data" if size == 0 62 | end 63 | 64 | def [](k) 65 | @vectors[k] 66 | end 67 | 68 | def size 69 | @vectors.any? ? @vectors.values.first.size : 0 70 | end 71 | end 72 | end 73 | -------------------------------------------------------------------------------- /lib/isotree/isolation_forest.rb: -------------------------------------------------------------------------------- 1 | module IsoTree 2 | class IsolationForest 3 | def initialize( 4 | sample_size: "auto", 5 | ntrees: 500, 6 | ndim: 3, 7 | ntry: 1, 8 | # categ_cols: nil, 9 | max_depth: "auto", 10 | ncols_per_tree: nil, 11 | prob_pick_pooled_gain: 0.0, 12 | prob_pick_avg_gain: 0.0, 13 | prob_pick_full_gain: 0.0, 14 | prob_pick_dens: 0.0, 15 | prob_pick_col_by_range: 0.0, 16 | prob_pick_col_by_var: 0.0, 17 | prob_pick_col_by_kurt: 0.0, 18 | min_gain: 0.0, 19 | missing_action: "auto", 20 | new_categ_action: "auto", 21 | categ_split_type: "auto", 22 | all_perm: false, 23 | coef_by_prop: false, 24 | # recode_categ: false, 25 | weights_as_sample_prob: true, 26 | sample_with_replacement: false, 27 | penalize_range: false, 28 | standardize_data: true, 29 | scoring_metric: "depth", 30 | fast_bratio: true, 31 | weigh_by_kurtosis: false, 32 | coefs: "uniform", 33 | assume_full_distr: true, 34 | # build_imputer: false, 35 | min_imp_obs: 3, 36 | depth_imp: "higher", 37 | weigh_imp_rows: "inverse", 38 | random_seed: 1, 39 | use_long_double: false, 40 | nthreads: -1 41 | ) 42 | @sample_size = sample_size 43 | @ntrees = ntrees 44 | @ndim = ndim 45 | @ntry = ntry 46 | # @categ_cols = categ_cols 47 | @max_depth = max_depth 48 | @ncols_per_tree = ncols_per_tree 49 | @prob_pick_pooled_gain = prob_pick_pooled_gain 50 | @prob_pick_avg_gain = prob_pick_avg_gain 51 | @prob_pick_full_gain = prob_pick_full_gain 52 | @prob_pick_dens = prob_pick_dens 53 | @prob_pick_col_by_range = prob_pick_col_by_range 54 | @prob_pick_col_by_var = prob_pick_col_by_var 55 | @prob_pick_col_by_kurt = prob_pick_col_by_kurt 56 | @min_gain = min_gain 57 | @missing_action = missing_action 58 | @new_categ_action = new_categ_action 59 | @categ_split_type = categ_split_type 60 | @all_perm = all_perm 61 | @coef_by_prop = coef_by_prop 62 | # @recode_categ = recode_categ 63 | @weights_as_sample_prob = weights_as_sample_prob 64 | @sample_with_replacement = sample_with_replacement 65 | @penalize_range = penalize_range 66 | @standardize_data = standardize_data 67 | @scoring_metric = scoring_metric 68 | @fast_bratio = fast_bratio 69 | @weigh_by_kurtosis = weigh_by_kurtosis 70 | @coefs = coefs 71 | @assume_full_distr = assume_full_distr 72 | @min_imp_obs = min_imp_obs 73 | @depth_imp = depth_imp 74 | @weigh_imp_rows = weigh_imp_rows 75 | @random_seed = random_seed 76 | @use_long_double = use_long_double 77 | 78 | # etc module returns virtual cores 79 | nthreads = Etc.nprocessors if nthreads < 0 80 | @nthreads = nthreads 81 | end 82 | 83 | def fit(x) 84 | # make export consistent with Python library 85 | update_params 86 | 87 | x = Dataset.new(x) 88 | prep_fit(x) 89 | options = data_options(x).merge(fit_options) 90 | 91 | if options[:sample_size] == "auto" 92 | options[:sample_size] = [options[:nrows], 10000].min 93 | end 94 | 95 | # prevent segfault 96 | options[:sample_size] = options[:nrows] if options[:sample_size] > options[:nrows] 97 | 98 | @ext_iso_forest = Ext.fit_iforest(options) 99 | end 100 | 101 | def predict(x, output: "score") 102 | check_fit 103 | 104 | x = Dataset.new(x) 105 | prep_predict(x) 106 | 107 | options = data_options(x).merge(nthreads: @nthreads) 108 | case output 109 | when "score" 110 | options[:standardize] = true 111 | when "avg_depth" 112 | options[:standardize] = false 113 | else 114 | raise ArgumentError, "Unknown output" 115 | end 116 | 117 | Ext.predict_iforest(@ext_iso_forest, options) 118 | end 119 | 120 | # same format as Python so models are compatible 121 | def export_model(path, add_metada_file: false) 122 | check_fit 123 | 124 | metadata = export_metadata 125 | if add_metada_file 126 | # indent 4 spaces like Python 127 | File.write("#{path}.metadata", JSON.pretty_generate(metadata, indent: " ")) 128 | end 129 | Ext.serialize_combined(@ext_iso_forest, path, JSON.generate(metadata)) 130 | end 131 | 132 | def self.import_model(path) 133 | model = new 134 | ext_iso_forest, metadata = Ext.deserialize_combined(path) 135 | model.instance_variable_set(:@ext_iso_forest, ext_iso_forest) 136 | model.send(:import_metadata, JSON.parse(metadata)) 137 | model 138 | end 139 | 140 | private 141 | 142 | def export_metadata 143 | data_info = { 144 | ncols_numeric: @numeric_columns.size, 145 | ncols_categ: @categorical_columns.size, 146 | cols_numeric: @numeric_columns, 147 | cols_categ: @categorical_columns, 148 | cat_levels: @categorical_columns.map { |v| @categories[v].keys }, 149 | categ_cols: [], 150 | categ_max: [] 151 | } 152 | 153 | # Ruby-specific 154 | data_info[:sym_numeric] = @numeric_columns.map { |v| v.is_a?(Symbol) } 155 | data_info[:sym_categ] = @categorical_columns.map { |v| v.is_a?(Symbol) } 156 | 157 | model_info = { 158 | ndim: @ndim, 159 | nthreads: @nthreads, 160 | use_long_double: @use_long_double, 161 | build_imputer: false 162 | } 163 | 164 | params = {} 165 | PARAM_KEYS.each do |k| 166 | params[k] = instance_variable_get("@#{k}") 167 | end 168 | 169 | if params[:max_depth] == "auto" 170 | params[:max_depth] = 0 171 | end 172 | 173 | { 174 | data_info: data_info, 175 | model_info: model_info, 176 | params: params 177 | } 178 | end 179 | 180 | def import_metadata(metadata) 181 | data_info = metadata["data_info"] 182 | model_info = metadata["model_info"] 183 | params = metadata["params"] 184 | 185 | # Ruby-specific 186 | sym_numeric = data_info["sym_numeric"].to_a 187 | sym_categ = data_info["sym_categ"].to_a 188 | 189 | @numeric_columns = data_info["cols_numeric"].map.with_index { |v, i| sym_numeric[i] ? v.to_sym : v } 190 | @categorical_columns = data_info["cols_categ"].map.with_index { |v, i| sym_categ[i] ? v.to_sym : v } 191 | @categories = {} 192 | @categorical_columns.zip(data_info["cat_levels"]) do |col, levels| 193 | @categories[col] = levels.map.with_index.to_h 194 | end 195 | 196 | @ndim = model_info["ndim"] 197 | @nthreads = model_info["nthreads"] 198 | @use_long_double = model_info["use_long_double"] 199 | @build_imputer = model_info["build_imputer"] 200 | 201 | PARAM_KEYS.each do |k| 202 | instance_variable_set("@#{k}", params[k.to_s]) 203 | end 204 | end 205 | 206 | def check_fit 207 | raise "Not fit" unless @ext_iso_forest 208 | end 209 | 210 | def prep_fit(df) 211 | @numeric_columns = df.numeric_columns 212 | @categorical_columns = df.categorical_columns 213 | @categories = {} 214 | @categorical_columns.each do |k| 215 | @categories[k] = df[k].uniq.to_a.compact.map.with_index.to_h 216 | end 217 | end 218 | 219 | # TODO handle column type mismatches 220 | def prep_predict(df) 221 | expected_columns = @numeric_columns + @categorical_columns 222 | if df.array_type 223 | if df.numeric_columns.size + df.categorical_columns.size != expected_columns.size 224 | raise ArgumentError, "Input must have #{expected_columns.size} columns for this model" 225 | end 226 | end 227 | expected_columns.each do |k| 228 | raise ArgumentError, "Missing column: #{k}" unless df[k] 229 | end 230 | end 231 | 232 | def data_options(df) 233 | options = {} 234 | 235 | # numeric 236 | numeric_data = String.new 237 | @numeric_columns.each do |k| 238 | v = df[k] 239 | v = v.to_numo if v.respond_to?(:to_numo) # Rover 240 | binary_str = 241 | if v.respond_to?(:to_binary) # Rover and Numo 242 | v.cast_to(Numo::DFloat).to_binary 243 | else 244 | v.pack("d*") 245 | end 246 | numeric_data << binary_str 247 | end 248 | options[:numeric_data] = numeric_data 249 | options[:ncols_numeric] = @numeric_columns.size 250 | 251 | # categorical 252 | categorical_data = String.new 253 | ncat = String.new 254 | @categorical_columns.each do |k| 255 | categories = @categories[k] 256 | # for unseen values, set to categories.size 257 | categories_size = categories.size 258 | values = df[k].map { |v| v.nil? ? -1 : (categories[v] || categories_size) } 259 | # TODO make more efficient 260 | if values.any? { |v| v == categories_size } 261 | warn "[isotree] Unseen values in column: #{k}" 262 | end 263 | 264 | v = values 265 | v = v.to_numo if v.respond_to?(:to_numo) # Rover 266 | binary_str = 267 | if v.respond_to?(:to_binary) # Rover and Numo 268 | v.cast_to(Numo::Int32).to_binary 269 | else 270 | v.pack("i*") 271 | end 272 | categorical_data << binary_str 273 | ncat << [categories.size].pack("i") 274 | end 275 | options[:categorical_data] = categorical_data 276 | options[:ncols_categ] = @categorical_columns.size 277 | options[:ncat] = ncat 278 | 279 | options[:nrows] = df.size 280 | options 281 | end 282 | 283 | PARAM_KEYS = %i( 284 | sample_size ntrees ntry max_depth ncols_per_tree 285 | prob_pick_avg_gain prob_pick_pooled_gain prob_pick_full_gain prob_pick_dens 286 | prob_pick_col_by_range prob_pick_col_by_var prob_pick_col_by_kurt 287 | min_gain missing_action new_categ_action categ_split_type coefs 288 | depth_imp weigh_imp_rows min_imp_obs random_seed all_perm 289 | coef_by_prop weights_as_sample_prob sample_with_replacement penalize_range standardize_data 290 | scoring_metric fast_bratio weigh_by_kurtosis assume_full_distr 291 | ) 292 | 293 | def fit_options 294 | keys = %i( 295 | sample_size ntrees ndim ntry 296 | categ_cols max_depth ncols_per_tree 297 | prob_pick_pooled_gain prob_pick_avg_gain 298 | prob_pick_full_gain prob_pick_dens 299 | prob_pick_col_by_range prob_pick_col_by_var prob_pick_col_by_kurt 300 | min_gain missing_action new_categ_action 301 | categ_split_type all_perm coef_by_prop 302 | weights_as_sample_prob 303 | sample_with_replacement penalize_range standardize_data 304 | scoring_metric fast_bratio 305 | weigh_by_kurtosis coefs min_imp_obs depth_imp 306 | weigh_imp_rows random_seed use_long_double nthreads 307 | ) 308 | options = {} 309 | keys.each do |k| 310 | options[k] = instance_variable_get("@#{k}") 311 | end 312 | 313 | if options[:max_depth] == "auto" 314 | options[:max_depth] = 0 315 | options[:limit_depth] = true 316 | end 317 | 318 | if options[:ncols_per_tree].nil? 319 | options[:ncols_per_tree] = 0 320 | end 321 | 322 | options 323 | end 324 | 325 | def update_params 326 | if @missing_action == "auto" 327 | if @ndim == 1 328 | @missing_action = "divide" 329 | else 330 | @missing_action = "impute" 331 | end 332 | end 333 | 334 | if @new_categ_action == "auto" 335 | if @ndim == 1 336 | @new_categ_action = "weighted" 337 | else 338 | @new_categ_action = "impute" 339 | end 340 | end 341 | 342 | if @categ_split_type == "auto" 343 | if @ndim == 1 344 | @categ_split_type = "single_categ" 345 | else 346 | @categ_split_type = "subset" 347 | end 348 | end 349 | end 350 | end 351 | end 352 | -------------------------------------------------------------------------------- /lib/isotree/version.rb: -------------------------------------------------------------------------------- 1 | module IsoTree 2 | VERSION = "0.4.0" 3 | end 4 | -------------------------------------------------------------------------------- /test/isolation_forest_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class IsolationForestTest < Minitest::Test 4 | def test_hashes 5 | data = test_data 6 | model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1) 7 | model.fit(data) 8 | predictions = model.predict(data) 9 | expected = [0.4816470280716818, 0.46655713161582574, 0.5363011880474468] 10 | assert_elements_in_delta expected, predictions.first(3) 11 | assert_equal 100, predictions.each_with_index.max[1] 12 | end 13 | 14 | def test_array 15 | data = numeric_data 16 | model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1) 17 | model.fit(data) 18 | predictions = model.predict(data) 19 | expected = [0.454691875234909, 0.42805783155356797, 0.5460616479701705] 20 | assert_elements_in_delta expected, predictions.first(3) 21 | assert_equal 100, predictions.each_with_index.max[1] 22 | end 23 | 24 | def test_export 25 | skip "Not supported yet" if windows? 26 | 27 | data = test_data 28 | model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1) 29 | model.fit(data) 30 | original_predictions = model.predict(data) 31 | 32 | tempfile = Tempfile.new 33 | model.export_model(tempfile.path) 34 | model = IsoTree::IsolationForest.import_model(tempfile.path) 35 | predictions = model.predict(data) 36 | assert_elements_in_delta original_predictions, predictions 37 | end 38 | 39 | def test_import_to_python 40 | skip if !ENV["TEST_PYTHON"] || windows? 41 | 42 | model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1) 43 | model.fit(test_data) 44 | model.export_model("/tmp/model.bin", add_metada_file: true) 45 | assert_match "Name: 100", %x[python3 test/support/import.py] 46 | 47 | expected = JSON.parse(File.read("test/support/model.bin.metadata")) 48 | metadata = JSON.parse(File.read("/tmp/model.bin.metadata")) 49 | metadata["data_info"].reject! { |k, _| ["sym_numeric", "sym_categ"].include?(k) } 50 | assert_equal expected, metadata 51 | end 52 | 53 | def test_import_from_python 54 | skip "Not supported yet" if windows? 55 | 56 | model = IsoTree::IsolationForest.import_model("test/support/model.bin") 57 | predictions = model.predict(test_data.map { |v| v.transform_keys(&:to_s) }) 58 | assert_equal 100, predictions.each_with_index.max[1] 59 | end 60 | 61 | def test_import_missing_file 62 | error = assert_raises do 63 | IsoTree::IsolationForest.import_model("missing.bin") 64 | end 65 | assert_equal "Cannot open file", error.message 66 | end 67 | 68 | def test_numo 69 | data = Numo::DFloat.cast(numeric_data) 70 | model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1) 71 | model.fit(data) 72 | predictions = model.predict(data) 73 | expected = [0.454691875234909, 0.42805783155356797, 0.5460616479701705] 74 | assert_elements_in_delta expected, predictions.first(3) 75 | assert_equal 100, predictions.each_with_index.max[1] 76 | end 77 | 78 | def test_rover 79 | require "rover" 80 | 81 | data = Rover::DataFrame.new(test_data) 82 | model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1) 83 | model.fit(data) 84 | predictions = model.predict(data) 85 | expected = [0.4816470280716818, 0.46655713161582574, 0.5363011880474468] 86 | assert_elements_in_delta expected, predictions.first(3) 87 | assert_equal 100, predictions.each_with_index.max[1] 88 | end 89 | 90 | def test_predict_output_avg_depth 91 | data = test_data 92 | model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1) 93 | model.fit(data) 94 | predictions = model.predict(data, output: "avg_depth") 95 | # different results on different platforms with same seed 96 | expected = [8.847458736905825, 9.23295785483866, 7.545738407619213] 97 | assert_elements_in_delta expected, predictions.first(3) 98 | assert_equal 100, predictions.each_with_index.min[1] 99 | end 100 | 101 | def test_not_fit 102 | model = IsoTree::IsolationForest.new 103 | error = assert_raises do 104 | model.predict([]) 105 | end 106 | assert_equal "Not fit", error.message 107 | end 108 | 109 | def test_different_columns 110 | x = Numo::DFloat.new(101, 2).rand_norm 111 | model = IsoTree::IsolationForest.new 112 | model.fit(x) 113 | error = assert_raises(ArgumentError) do 114 | model.predict(x.reshape(2, 101)) 115 | end 116 | assert_equal "Input must have 2 columns for this model", error.message 117 | end 118 | 119 | def test_no_data 120 | model = IsoTree::IsolationForest.new 121 | error = assert_raises(ArgumentError) do 122 | model.fit([]) 123 | end 124 | assert_equal "No data", error.message 125 | end 126 | 127 | def test_bad_size 128 | model = IsoTree::IsolationForest.new 129 | error = assert_raises(ArgumentError) do 130 | model.fit([[1, 2], [3]]) 131 | end 132 | assert_equal "All rows must have the same number of columns", error.message 133 | end 134 | 135 | def test_bad_dimensions 136 | model = IsoTree::IsolationForest.new 137 | error = assert_raises(ArgumentError) do 138 | model.fit(Numo::DFloat.cast([[[1]]])) 139 | end 140 | assert_equal "Input must have 2 dimensions", error.message 141 | end 142 | 143 | def test_bad_sample_size 144 | data = test_data 145 | model = IsoTree::IsolationForest.new(ntrees: 10, ndim: 3, nthreads: 1, sample_size: data.size * 2) 146 | model.fit(data) 147 | end 148 | 149 | def test_data 150 | CSV.table("test/support/data.csv").map(&:to_h) 151 | end 152 | 153 | def numeric_data 154 | test_data.map { |v| [v[:num1], v[:num2]] } 155 | end 156 | 157 | def windows? 158 | Gem.win_platform? 159 | end 160 | end 161 | -------------------------------------------------------------------------------- /test/support/data.csv: -------------------------------------------------------------------------------- 1 | num1,num2,cat1 2 | 1.624345,-0.611756,C 3 | -0.528172,-1.072969,B 4 | 0.865408,-2.301539,C 5 | 1.744812,-0.761207,C 6 | 0.319039,-0.249370,B 7 | 1.462108,-2.060141,C 8 | -0.322417,-0.384054,B 9 | 1.133769,-1.099891,A 10 | -0.172428,-0.877858,C 11 | 0.042214,0.582815,B 12 | -1.100619,1.144724,A 13 | 0.901591,0.502494,B 14 | 0.900856,-0.683728,A 15 | -0.122890,-0.935769,B 16 | -0.267888,0.530355,C 17 | -0.691661,-0.396754,C 18 | -0.687173,-0.845206,B 19 | -0.671246,-0.012665,B 20 | -1.117310,0.234416,B 21 | 1.659802,0.742044,C 22 | -0.191836,-0.887629,B 23 | -0.747158,1.692455,C 24 | 0.050808,-0.636996,B 25 | 0.190915,2.100255,A 26 | 0.120159,0.617203,A 27 | 0.300170,-0.352250,C 28 | -1.142518,-0.349343,A 29 | -0.208894,0.586623,A 30 | 0.838983,0.931102,C 31 | 0.285587,0.885141,A 32 | -0.754398,1.252868,B 33 | 0.512930,-0.298093,A 34 | 0.488518,-0.075572,A 35 | 1.131629,1.519817,C 36 | 2.185575,-1.396496,C 37 | -1.444114,-0.504466,C 38 | 0.160037,0.876169,C 39 | 0.315635,-2.022201,C 40 | -0.306204,0.827975,A 41 | 0.230095,0.762011,C 42 | -0.222328,-0.200758,B 43 | 0.186561,0.410052,A 44 | 0.198300,0.119009,B 45 | -0.670662,0.377564,A 46 | 0.121821,1.129484,A 47 | 1.198918,0.185156,B 48 | -0.375285,-0.638730,C 49 | 0.423494,0.077340,C 50 | -0.343854,0.043597,C 51 | -0.620001,0.698032,B 52 | -0.447129,1.224508,A 53 | 0.403492,0.593579,A 54 | -1.094912,0.169382,A 55 | 0.740556,-0.953701,C 56 | -0.266219,0.032615,A 57 | -1.373117,0.315159,A 58 | 0.846161,-0.859516,A 59 | 0.350546,-1.312283,C 60 | -0.038696,-1.615772,C 61 | 1.121418,0.408901,A 62 | -0.024617,-0.775162,C 63 | 1.273756,1.967102,A 64 | -1.857982,1.236164,A 65 | 1.627651,0.338012,A 66 | -1.199268,0.863345,B 67 | -0.180920,-0.603921,B 68 | -1.230058,0.550537,A 69 | 0.792807,-0.623531,B 70 | 0.520576,-1.144341,C 71 | 0.801861,0.046567,B 72 | -0.186570,-0.101746,B 73 | 0.868886,0.750412,A 74 | 0.529465,0.137701,A 75 | 0.077821,0.618380,B 76 | 0.232495,0.682551,C 77 | -0.310117,-2.434838,B 78 | 1.038825,2.186980,B 79 | 0.441364,-0.100155,B 80 | -0.136445,-0.119054,B 81 | 0.017409,-1.122019,C 82 | -0.517094,-0.997027,A 83 | 0.248799,-0.296641,A 84 | 0.495211,-0.174703,A 85 | 0.986335,0.213534,B 86 | 2.190700,-1.896361,C 87 | -0.646917,0.901487,C 88 | 2.528326,-0.248635,B 89 | 0.043669,-0.226314,B 90 | 1.331457,-0.287308,A 91 | 0.680070,-0.319802,B 92 | -1.272559,0.313548,A 93 | 0.503185,1.293226,A 94 | -0.110447,-0.617362,B 95 | 0.562761,0.240737,A 96 | 0.280665,-0.073113,C 97 | 1.160339,0.369493,B 98 | 1.904659,1.111057,A 99 | 0.659050,-1.627438,C 100 | 0.602319,0.420282,A 101 | 0.810952,1.044442,B 102 | 3.000000,3.000000,A 103 | -------------------------------------------------------------------------------- /test/support/import.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from isotree import IsolationForest 4 | 5 | df = pd.read_csv('test/support/data.csv') 6 | model = IsolationForest.import_model("/tmp/model.bin") 7 | 8 | predictions = model.predict(df) 9 | print(predictions[0:3].tolist()) 10 | print('Point with highest outlier score: ', df.iloc[np.argsort(-predictions)[0]]) 11 | -------------------------------------------------------------------------------- /test/support/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ankane/isotree-ruby/9e06d1923f134725fedb383c84e61fa53cd90669/test/support/model.bin -------------------------------------------------------------------------------- /test/support/model.bin.metadata: -------------------------------------------------------------------------------- 1 | { 2 | "data_info": { 3 | "ncols_numeric": 2, 4 | "ncols_categ": 1, 5 | "cols_numeric": [ 6 | "num1", 7 | "num2" 8 | ], 9 | "cols_categ": [ 10 | "cat1" 11 | ], 12 | "cat_levels": [ 13 | [ 14 | "C", 15 | "B", 16 | "A" 17 | ] 18 | ], 19 | "categ_cols": [], 20 | "categ_max": [] 21 | }, 22 | "model_info": { 23 | "ndim": 3, 24 | "nthreads": 1, 25 | "use_long_double": false, 26 | "build_imputer": false 27 | }, 28 | "params": { 29 | "sample_size": "auto", 30 | "ntrees": 10, 31 | "ntry": 1, 32 | "max_depth": 0, 33 | "ncols_per_tree": null, 34 | "prob_pick_avg_gain": 0.0, 35 | "prob_pick_pooled_gain": 0.0, 36 | "prob_pick_full_gain": 0.0, 37 | "prob_pick_dens": 0.0, 38 | "prob_pick_col_by_range": 0.0, 39 | "prob_pick_col_by_var": 0.0, 40 | "prob_pick_col_by_kurt": 0.0, 41 | "min_gain": 0.0, 42 | "missing_action": "impute", 43 | "new_categ_action": "impute", 44 | "categ_split_type": "subset", 45 | "coefs": "uniform", 46 | "depth_imp": "higher", 47 | "weigh_imp_rows": "inverse", 48 | "min_imp_obs": 3, 49 | "random_seed": 1, 50 | "all_perm": false, 51 | "coef_by_prop": false, 52 | "weights_as_sample_prob": true, 53 | "sample_with_replacement": false, 54 | "penalize_range": false, 55 | "standardize_data": true, 56 | "scoring_metric": "depth", 57 | "fast_bratio": true, 58 | "weigh_by_kurtosis": false, 59 | "assume_full_distr": true 60 | } 61 | } -------------------------------------------------------------------------------- /test/support/predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from isotree import IsolationForest 4 | 5 | df = pd.read_csv('test/support/data.csv') 6 | model = IsolationForest(ntrees=10, ndim=3, nthreads=1) 7 | model.fit(df) 8 | 9 | predictions = model.predict(df) 10 | print(predictions[0:3].tolist()) 11 | print('Point with highest outlier score: ', df.iloc[np.argsort(-predictions)[0]]) 12 | 13 | print('avg_depth') 14 | print(model.predict(df, output='avg_depth')[0:3].tolist()) 15 | 16 | model.export_model('test/support/model.bin', add_metada_file=True) 17 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | Bundler.require(:default) 3 | require "minitest/autorun" 4 | require "minitest/pride" 5 | require "csv" 6 | 7 | class Minitest::Test 8 | def assert_elements_in_delta(expected, actual) 9 | assert_equal expected.size, actual.size 10 | expected.zip(actual) do |exp, act| 11 | assert_in_delta exp, act 12 | end 13 | end 14 | end 15 | --------------------------------------------------------------------------------