├── .github
│   └── workflows
│       └── compile-test.yml
├── .gitignore
├── DecisionTree.nimble
├── Makefile
├── benchmark
│   ├── rf.nim
│   └── rf.py
├── readme.MD
├── src
│   ├── DecisionTree.nim
│   ├── hyperparams.nim
│   ├── impurity.nim
│   ├── node
│   │   ├── constructors.nim
│   │   ├── inode.nim
│   │   ├── leaf.nim
│   │   ├── node.nim
│   │   └── traverse.nim
│   ├── random_forest
│   │   ├── parallel_rf.nim
│   │   └── random_forest.nim
│   ├── rule
│   │   ├── bagging.nim
│   │   ├── stop_rules.nim
│   │   └── tree_rules.nim
│   ├── task.nim
│   ├── train
│   │   ├── sons_gen.nim
│   │   ├── split.nim
│   │   ├── splitresult.nim
│   │   ├── stop.nim
│   │   ├── train.nim
│   │   └── values_selection.nim
│   ├── tree.nim
│   ├── utils.nim
│   └── view.nim
└── tests
    ├── data
    │   ├── X_data
    │   ├── X_iris.csv
    │   ├── iris.data
    │   ├── y_data
    │   └── y_iris.csv
    ├── test_fit_tree.nim
    ├── test_impurity.nim
    └── test_utils.nim

/.github/workflows/compile-test.yml:
--------------------------------------------------------------------------------
# This is a basic workflow to help you get started with Actions

name: compile-run

# Controls when the action will run.
on:
  # Triggers the workflow on push or pull request events but only for the master branch
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "build"
  build:
    # The type of runner that the job will run on
    runs-on: ubuntu-20.04

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v2

      - name: Pull choosenim
        run: curl https://nim-lang.org/choosenim/init.sh -sSf > install-nim.sh

      - name: Install nim
        run: chmod +x install-nim.sh && ./install-nim.sh -y

      # Runs a single command using the runner's shell
      - name: Run tests on 1.2
        run: export PATH=/home/runner/.nimble/bin:$PATH && choosenim 1.2.0 && nimble refresh && nimble install sequtils2 && nim c -r --threads:on tests/test*

      - name: Run tests on 1.4
        run: export PATH=/home/runner/.nimble/bin:$PATH && choosenim 1.4.0 && nimble refresh && nimble install sequtils2 && nim c -r --threads:on tests/test*
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
tests/test_fit_tree
tests/test_impurity
benchmark/rf
callgrind*
profile_results.txt
.idea
--------------------------------------------------------------------------------
/DecisionTree.nimble:
--------------------------------------------------------------------------------
# Package

version = "0.1.1"
author = "Michele De Vita"
description = "Decision tree and Random forest CART implementation for nim"
license = "GPL-3.0"
srcDir = "src"


# Dependencies

requires "nim >= 1.2.0"
requires "sequtils2 >= 1.1.0"
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
compile-benchmark:
	nim c --gc:orc --threads:on -d:release benchmark/rf.nim
run-benchmarks:
	echo "Python Random Forest"
	time python ./benchmark/rf.py
	echo "============================"
	echo "Nim Random Forest"
	time ./benchmark/rf
clean-compiled:
	rm benchmark/rf
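The workflow above doubles as local build documentation. Extracted as runnable commands (assuming a working Nim toolchain and nimble already on PATH, instead of the choosenim bootstrap the CI performs):

```
nimble refresh
nimble install sequtils2
nim c -r --threads:on tests/test*
```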
"============================" 7 | echo "Nim Random Forest" 8 | time ./benchmark/rf 9 | clean-compiled: 10 | rm benchmark/rf -------------------------------------------------------------------------------- /benchmark/rf.nim: -------------------------------------------------------------------------------- 1 | import ../src/random_forest/random_forest 2 | import ../tests/test_utils 3 | import times 4 | 5 | proc iris_bench() = 6 | let rf = new_random_forest_classifier(100, max_depth=10) 7 | let X_data = read_X_data("tests/data/X_iris.csv") 8 | let y_data = read_y_data("tests/data/y_iris.csv") 9 | rf.fit(X_data, y_data) 10 | echo "Successfull finished training of a Random Forest of 100 trees" 11 | let yhat = rf.predict(X_data) 12 | echo "accuracy train ", accuracy(y_data, yhat) 13 | 14 | if isMainModule: 15 | iris_bench() 16 | -------------------------------------------------------------------------------- /benchmark/rf.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier 2 | from sklearn.metrics import accuracy_score 3 | import pandas as pd 4 | from path import Path 5 | 6 | def main(): 7 | root = Path(__file__).parent.parent 8 | data_folder = root / 'tests' / 'data' 9 | X_train = pd.read_csv(data_folder / 'X_iris.csv', header=None).values 10 | y_train = pd.read_csv(data_folder / 'y_iris.csv', header=None).values.ravel() 11 | rf = RandomForestClassifier(100, max_depth=10) 12 | rf.fit(X_train, y_train) 13 | yhat = rf.predict(X_train) 14 | print('accuracy train', accuracy_score(y_train, yhat)) 15 | 16 | 17 | if __name__ == "__main__": 18 | main() -------------------------------------------------------------------------------- /readme.MD: -------------------------------------------------------------------------------- 1 | # Decision tree with nim 2 | 3 | [![nimble](https://raw.githubusercontent.com/yglukhov/nimble-tag/master/nimble_js.png)](https://github.com/Michedev/DecisionTreeNim) 4 | 5 | Nim package for decision trees and random forest 6 | 7 | ## How to install 8 | 9 | `nimble install decisiontree` 10 | 11 | ### Package features 12 | - Inspired by Scikit-learn api 13 | - Random forest can train and predict in parallel 14 | - Actually you feed the X matrix of size [n x m] as `seq[seq[float]]` RowMajor and the y array of size [n] as `seq[float]` 15 | 16 | #### Decision Tree 17 | 18 | ``` 19 | import DecisionTree 20 | 21 | let dt = DecisionTree.new_classification_tree(max_depth=10) 22 | dt.fit(X_train,y_train) 23 | let yhat = dt.predict(X_test) 24 | 25 | ``` 26 | 27 | #### Random forest 28 | 29 | ``` 30 | import DecisionTree 31 | 32 | let rf = DecisionTree.new_random_forest_classifier(n_trees=100, num_threads=4) #parallel training too! 33 | rf.fit(X_train, y_train) 34 | let yhat = rf.predict(X_test) 35 | ``` 36 | 37 | 38 | ## Benchmark with python 39 | 40 | Note: this benchmark is not done to see if nim is quicker than python, of course a good implementation in nim requires less time since it is a statically compiled language and use C compiler optimizations. 
## Benchmark with python

Note: this benchmark is not meant to check whether Nim is quicker than Python: of course a good Nim implementation takes less time, since Nim is a statically compiled language and benefits from C compiler optimizations. The purpose of this benchmark is to measure HOW MUCH quicker it is, so that the user can choose the language and library to use: Python is slower but has more libraries for data science.


##### Iris Dataset

- Dataset: [iris dataset](https://www.kaggle.com/arshid/iris-flower-dataset)
- Nim Code:

      import ../src/random_forest/random_forest
      import ../tests/test_utils

      proc iris_bench() =
        let rf = new_random_forest_classifier(100, max_depth=10)
        let X_data = read_X_data("tests/data/X_iris.csv")
        let y_data = read_y_data("tests/data/y_iris.csv")
        rf.fit(X_data, y_data)
        echo "Successfully finished training of a Random Forest of 100 trees"
        let yhat = rf.predict(X_data)
        echo "accuracy train ", accuracy(y_data, yhat)

      if isMainModule:
        iris_bench()

  multitime -n 30 results

                Mean        Std.Dev.    Min         Median      Max
        real    0.205       0.046       0.124       0.190       0.302
        user    0.191       0.042       0.110       0.180       0.297
        sys     0.005       0.004       0.000       0.003       0.016


- Python code:

      from sklearn.ensemble import RandomForestClassifier
      from sklearn.metrics import accuracy_score
      import pandas as pd
      from path import Path

      def main():
          root = Path(__file__).parent.parent
          data_folder = root / 'tests' / 'data'
          X_train = pd.read_csv(data_folder / 'X_iris.csv', header=None).values
          y_train = pd.read_csv(data_folder / 'y_iris.csv', header=None).values.ravel()
          rf = RandomForestClassifier(100, max_depth=10)
          rf.fit(X_train, y_train)
          yhat = rf.predict(X_train)
          print('accuracy train', accuracy_score(y_train, yhat))


      if __name__ == "__main__":
          main()

  multitime -n 30 results:

                Mean        Std.Dev.    Min         Median      Max
        real    2.426       0.251       2.131       2.346       3.390
        user    2.275       0.158       2.026       2.224       2.711
        sys     0.541       0.058       0.431       0.553       0.668
--------------------------------------------------------------------------------
/src/DecisionTree.nim:
--------------------------------------------------------------------------------
# This is just an example to get you started. A typical library package
# exports the main API in this file. Note that you cannot rename this file
# but you can remove it if you wish.
import tree
import random_forest/random_forest
import task

export tree
export random_forest
export task
--------------------------------------------------------------------------------
/src/hyperparams.nim:
--------------------------------------------------------------------------------
type Hyperparams* = tuple[max_depth: int, min_samples_split: int, max_features: float32, min_impurity_decrease: float32, bagging: float32]

template hyperparams_binding*(T: typed) =
  ## Generates forwarding getters so that `t.max_depth` etc. read from `t.hyperparams`
  proc bagging*(t: T): auto {.inline.} = t.hyperparams.bagging
  proc max_depth*(t: T): auto {.inline.} = t.hyperparams.max_depth
  proc min_samples_split*(t: T): auto {.inline.} = t.hyperparams.min_samples_split
  proc min_impurity_decrease*(t: T): auto {.inline.} = t.hyperparams.min_impurity_decrease
  proc max_features*(t: T): auto {.inline.} = t.hyperparams.max_features
--------------------------------------------------------------------------------
/src/impurity.nim:
--------------------------------------------------------------------------------
import tables
import math
import sequtils
import utils


type Impurity* = enum
  Entropy
  Gini
  Mse
  Default

proc entropy*(y: seq[float32]): float32 {.gcsafe.} =
  ## Shannon entropy of a vector of class probabilities
  result = 0.0
  for p in y:
    result += - (p * ln(p))


proc gini*(y: seq[float32]): float32 {.gcsafe.} =
  ## Gini impurity of a vector of class probabilities
  result = 0.0
  for p in y:
    result += p * (1 - p)


proc mse_from_mean*(y: seq[float32], y_mean: float32): float32 {.gcsafe.} =
  ## Sum of squared deviations of y from its (precomputed) mean
  result = 0.0
  for value in y:
    result += (value - y_mean) * (value - y_mean)
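A quick sanity check of the two classification impurities (a hypothetical snippet, not library code; the import path assumes compiling from the repository root). Both take a vector of class probabilities, not raw labels, and both reach zero when one class holds all the mass:

```
import src/impurity   # assumed path, importing from the repository root

let pure = @[1.0'f32, 0.0]    # all mass on one class
let even = @[0.5'f32, 0.5]    # maximal two-class uncertainty
echo gini(pure)      # 1*(1-1) + 0*(1-0) = 0.0
echo gini(even)      # 0.5*0.5 + 0.5*0.5 = 0.5
echo entropy(even)   # -2 * 0.5 * ln(0.5) ≈ 0.693
# caution: entropy(pure) would be NaN, since 0 * ln(0) is undefined
```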
--------------------------------------------------------------------------------
/src/node/constructors.nim:
--------------------------------------------------------------------------------
import node
import leaf
import ../task
import ../impurity
import ../rule/tree_rules
import ../view


proc new_leaf*(father: Node, X: MatrixView[float32], y: VectorView[float32]): Leaf =
  result = new(Leaf)
  result.level = father.level + 1
  result.tree_task = father.tree_task
  father.num_sons += 1
  result.max_features = father.max_features
  result.impurity_f = father.impurity_f
  result.stop_rules = father.stop_rules
  result.leaf_f = result.get_leaf_func(X, y)
  result.leaf_proba = result.get_leaf_proba_func(X, y)
  result.num_sons = 0
  result.father = father

proc new_son*(father: Node): Node =
  result = new(Node)
  result.level = father.level + 1
  result.impurity_f = father.impurity_f
  result.max_features = father.max_features
  result.tree_task = father.tree_task
  result.stop_rules = father.stop_rules
  result.father = father
  result.num_sons = 0
  father.num_sons += 1


proc new_root*(task: Task, impurity_f: Impurity = Default, stop_rules: TreeStopRules = nil, max_features: float32 = 1.0): Node =
  result = new(Node)
  result.level = 0
  result.max_features = max_features
  result.num_sons = 0
  result.tree_task = task
  result.impurity_value = Inf
  if impurity_f == Default:
    if task == Classification:
      result.impurity_f = Gini
    else:
      result.impurity_f = Mse
  else:
    result.impurity_f = impurity_f
  if not stop_rules.is_nil():
    result.stop_rules = stop_rules
  else:
    result.stop_rules = new_tree_stop_rules()

proc new_root_leaf*(X: MatrixView[float32], y: VectorView[float32]): Leaf =
  result = new(Leaf)
  result.leaf_f = result.get_leaf_func(X, y)
  result.num_sons = 0
--------------------------------------------------------------------------------
/src/node/inode.nim:
--------------------------------------------------------------------------------
import ../task
import ../impurity

type INode* = ref object of RootObj
  tree_task*: Task
  impurity_f*: Impurity
  level*: Natural
  split_value*: float32
  split_column*: int
  impurity_value*: float32
--------------------------------------------------------------------------------
/src/node/leaf.nim:
--------------------------------------------------------------------------------
import ../task
import node
import tables
import math
import ../view


proc get_leaf_func*(n: Node, X: MatrixView[float32], y: VectorView[float32]): auto {.gcsafe.} =
  ## Builds the leaf prediction closure: the label mode for classification, the mean of y for regression
  if n.tree_task == Classification:
    var ctable = toCountTable y.to_seq()
    let mode: float32 = ctable.largest.key
    return proc(x: seq[float32]): float32 {.gcsafe.} = mode
  else:
    var tot: float32 = 0.0
    for v in y:
      tot += v
    let m: float32 = tot / y.len.float32
    return proc(x: seq[float32]): float32 {.gcsafe.} = m

# TODO export and add field on leaf type
proc get_leaf_proba_func*(n: Node, X: MatrixView[float32], y: VectorView[float32]): auto {.gcsafe.} =
  if n.tree_task == Classification:
    var count_table = toCountTable y.to_seq
    var probs = newSeq[float32](count_table.len)
    var i = 0
    for (k, freq) in mpairs(count_table):
      probs[i] = freq / y.len
      inc i
    return proc(x: seq[float32]): seq[float32] {.gcsafe.} = probs
  elif n.tree_task == Regression:
    raise newException(Exception, "Cannot estimate probability in a regression tree")
--------------------------------------------------------------------------------
/src/node/node.nim:
--------------------------------------------------------------------------------
import ../task
import ../rule/tree_rules
import inode

type
  Node* = ref object of INode
    sons*: array[2, Node]
    num_sons*: int
    father*: Node
    max_features*: float32
    stop_rules*: TreeStopRules
  Leaf* = ref object of Node
    leaf_f*: proc(x: seq[float32]): float32 {.gcsafe.}
    leaf_proba*: proc(x: seq[float32]): seq[float32] {.gcsafe.}
  RootIsLeaf* = object of Exception

proc is_leaf*(n: Node): bool = n of Leaf  # `of` checks the runtime type; `is` would be a compile-time type test and always false here
--------------------------------------------------------------------------------
/src/node/traverse.nim:
--------------------------------------------------------------------------------
import node


method get_value*(n: Node, x: sink seq[float32]): float32 {.base, gcsafe.} =
  let value = x[n.split_column]
  # >= so that ties route to the same son they were assigned to in training, where the split test is `v < split_value`
  let i = (value >= n.split_value).int
  return n.sons[i].get_value(x)

method get_value*(n: Leaf, x: sink seq[float32]): float32 {.gcsafe.} =
  n.leaf_f(x)

method get_proba*(n: Node, x: sink seq[float32]): seq[float32] {.base, gcsafe.} =
  let value = x[n.split_column]
  let i = (value >= n.split_value).int  # same tie handling as get_value
  return n.sons[i].get_proba(x)

method get_proba*(n: Leaf, x: sink seq[float32]): seq[float32] {.gcsafe.} =
  n.leaf_proba(x)
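A hypothetical standalone sketch (not library code) of how the `get_value` dispatch above walks a hand-built stump; the field names come from `node.nim` and `traverse.nim`, and the import path assumes compiling from the repository root:

```
import src/node/node, src/node/traverse   # assumed import paths

let left = new(Leaf)
left.leaf_f = proc(x: seq[float32]): float32 {.gcsafe.} = 0.0   # constant leaf
let right = new(Leaf)
right.leaf_f = proc(x: seq[float32]): float32 {.gcsafe.} = 1.0
var root = new(Node)
root.split_column = 0
root.split_value = 2.5
root.sons = [Node(left), Node(right)]
echo root.get_value(@[3.0'f32])   # 3.0 >= 2.5, so the right son answers: 1.0
```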
--------------------------------------------------------------------------------
/src/random_forest/parallel_rf.nim:
--------------------------------------------------------------------------------
import sequtils
import threadpool

# for parallel use only
proc tree_fit(tree: DecisionTree, X: ptr seq[seq[float32]], y: ptr seq[float32]): DecisionTree {.thread.} =
  tree.fit(X[], y[])
  return tree

proc fit_parallel*(forest: RandomForest, X: seq[seq[float32]], y: seq[float32]) =
  let trees_per_thread: int = (forest.num_trees.float32 / forest.num_threads.float32).int
  echo "trees per thread: ", trees_per_thread
  var threads = newSeq[FlowVar[DecisionTree]](forest.num_trees)
  let X_addr = unsafeAddr(X)
  let y_addr = unsafeAddr(y)
  for i, tree in forest.trees:
    threads[i] = spawn(tree_fit(tree, X_addr, y_addr))
  for i in 0..
--------------------------------------------------------------------------------
/src/random_forest/random_forest.nim:
--------------------------------------------------------------------------------
proc fit*(rf: RandomForest, X: sink seq[seq[float32]], y: sink seq[float32]) {.gcsafe.} =
  if rf.num_threads > 1:
    rf.fit_parallel(X, y)
  else:
    for tree in rf.trees:
      tree.fit(X, y)



proc predict*(rf: RandomForest, x: sink seq[float32]): float32 {.gcsafe.} =
  let predictions = rf.trees.map_it(it.predict(x))
  case rf.task:
  of Classification:
    return predictions.toCountTable().largest.key
  of Regression:
    return predictions.mean()

proc predict*(rf: RandomForest, X: sink seq[seq[float32]]): seq[float32] {.gcsafe.} =
  result = new_seq[float32](X.len)
  for i, row in X:
    result[i] = rf.predict(row)

proc predict_proba*(forest: RandomForest, x: sink seq[float32]): seq[float32] {.gcsafe.} =
  result = new_seq[float32](forest.num_classes)
  for t in forest.trees:
    let p_y = t.predict_proba(x)
    for i_class in 0..
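The tail of `fit_parallel` and parts of `random_forest.nim` are cut off in this dump. As a hedged, standalone sketch of the stdlib `threadpool` pattern the code above relies on (illustrative names, not library code):

```
import threadpool   # compile with --threads:on

proc square(x: int): int = x * x

var futures = newSeq[FlowVar[int]](4)
for i in 0 .. 3:
  futures[i] = spawn square(i)
for i in 0 .. 3:
  echo ^futures[i]   # `^` blocks until the corresponding task finishes
```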
--------------------------------------------------------------------------------
/src/rule/stop_rules.nim:
--------------------------------------------------------------------------------
proc max_depth_rule*(max_depth: int): Rule =
  return proc(n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} = (n.level > max_depth)

proc unique_class_rule*(): Rule =
  proc is_unique_class(n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} =
    if y.len == 0:
      return true
    let first_el = y[0]
    for el in y:
      if el != first_el:
        return false
    return true
  return is_unique_class

proc min_impurity_decrease_rule*(threshold: float32): PostSplitRule =
  proc is_impurity_less(n: INode, X: MatrixView[float32], y: VectorView[float32], X1: MatrixView[float32], y1: VectorView[float32], X2: MatrixView[float32], y2: VectorView[float32], split: SplitResult): bool {.gcsafe.} =
    ## Stop if the decrease of split impurity is less than the threshold
    let perc_1 = X1.len.float32 / X.len.float32
    let perc_2 = X2.len.float32 / X.len.float32

    let decrease = n.impurity_value - (perc_1 * split.impurity_1 + perc_2 * split.impurity_2)
    return decrease < threshold
  return is_impurity_less
--------------------------------------------------------------------------------
/src/rule/tree_rules.nim:
--------------------------------------------------------------------------------
import ../node/inode
import ../view
import ../train/splitresult


type
  Rule* = proc(n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.}
  PostSplitRule* = proc(n: INode, X: MatrixView[float32], y: VectorView[float32], X1: MatrixView[float32], y1: VectorView[float32], X2: MatrixView[float32], y2: VectorView[float32], split: SplitResult): bool {.gcsafe.}
  TreeStopRules* = ref object
    ## Rules checked when a new son node is created. If one or more stop rules are true, a leaf is made instead of an internal node
    creation_rules: seq[Rule]
    ## Rules checked before creating a split
    pre_split_rules: seq[Rule]
    ## Rules checked after the best split for an internal node has been found.
    ## The type is different from the other two because these rules also receive the split information as input
    post_split_rules: seq[PostSplitRule]


proc new_tree_stop_rules*(): TreeStopRules =
  result = new(TreeStopRules)
  result.creation_rules = @[]
  result.pre_split_rules = @[]
  result.post_split_rules = @[]


proc add_creation_rule*(tr: TreeStopRules, rule: Rule) =
  tr.creation_rules.add rule

proc add_pre_split_rule*(tr: TreeStopRules, rule: Rule) =
  tr.pre_split_rules.add rule

proc add_post_split_rule*(tr: TreeStopRules, rule: PostSplitRule) =
  tr.post_split_rules.add rule

proc add_creation_rules*(tr: TreeStopRules, rules: sink seq[Rule]) =
  for rule in rules:
    tr.add_creation_rule rule

proc add_pre_split_rules*(tr: TreeStopRules, rules: sink seq[Rule]) =
  for rule in rules:
    tr.add_pre_split_rule rule

proc add_post_split_rules*(tr: TreeStopRules, rules: seq[PostSplitRule]) =
  for rule in rules:
    tr.add_post_split_rule rule

proc any_true(rules: sink seq[Rule], n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe, inline.} =
  for rule in rules:
    if rule(n, X, y):
      # echo "stop for rule ", rule
      return true
  return false

proc any_true(rules: seq[PostSplitRule], n: INode, X: MatrixView[float32], y: VectorView[float32], X1: MatrixView[float32], y1: VectorView[float32], X2: MatrixView[float32], y2: VectorView[float32], split: SplitResult): bool {.gcsafe, inline.} =
  for rule in rules:
    if rule(n, X, y, X1, y1, X2, y2, split):
      return true
  return false

proc on_creation*(tsr: TreeStopRules, n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} =
  any_true(tsr.creation_rules, n, X, y)

proc on_pre_split*(tsr: TreeStopRules, n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} =
  any_true(tsr.pre_split_rules, n, X, y)

proc on_post_split*(tsr: TreeStopRules, n: INode, X: MatrixView[float32], y: VectorView[float32], X1: MatrixView[float32], y1: VectorView[float32], X2: MatrixView[float32], y2: VectorView[float32], split: SplitResult): bool {.gcsafe.} =
  any_true(tsr.post_split_rules, n, X, y, X1, y1, X2, y2, split)
--------------------------------------------------------------------------------
/src/task.nim:
--------------------------------------------------------------------------------
type Task* = enum
  Classification, Regression
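As a sketch of how the rule types compose (hypothetical code, with import paths assumed from the tree above): a custom creation rule that forces a leaf once fewer than `min_leaf` samples remain, plugged into a fresh `TreeStopRules`:

```
import src/rule/tree_rules, src/node/inode, src/view   # assumed import paths

proc min_samples_leaf_rule(min_leaf: int): Rule =
  return proc(n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} =
    y.len < min_leaf

let rules = new_tree_stop_rules()
rules.add_creation_rule min_samples_leaf_rule(5)   # any rule returning true turns the son into a leaf
```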
--------------------------------------------------------------------------------
/src/train/sons_gen.nim:
--------------------------------------------------------------------------------
import ../node/node, ../node/constructors, ../node/leaf
import split, stop, splitresult
import typetraits
import ../rule/tree_rules
import options
import ../view

type Sons = tuple[first, second: Node, X1, X2: MatrixView[float32], y1, y2: VectorView[float32]]

proc generate_sons*(n: Node, X: MatrixView[float32], y: VectorView[float32]): Option[Sons] {.gcsafe.} =
  let split: SplitResult = best_split(n.tree_task, n.impurity_f, X, y, n.max_features)
  # echo "Split on ", split.col, " with value ", split.split_value
  # echo "tot split value: ", split.impurity, " and single values ", split.impurity_1, " - ", split.impurity_2
  n.split_column = split.col
  n.split_value = split.split_value
  let
    x1_len = split.index[0].len
    x2_len = split.index[1].len
  var
    X1 = new_matrix_view(X, split.index[0])
    y1 = new_vector_view(y, split.index[0])
    X2 = new_matrix_view(X, split.index[1])
    y2 = new_vector_view(y, split.index[1])
  if n.stop_rules.on_post_split(n, X, y, X1, y1, X2, y2, split):
    # echo "Split block on depth ", n.level
    return options.none[Sons]()

  if n.stop_rules.on_creation(n, X1, y1):
    n.sons[0] = new_leaf(n, X1, y1)
  else:
    n.sons[0] = new_son(n)
  if n.stop_rules.on_creation(n, X2, y2):
    n.sons[1] = new_leaf(n, X2, y2)
  else:
    n.sons[1] = new_son(n)
  n.sons[0].impurity_value = split.impurity_1
  n.sons[1].impurity_value = split.impurity_2

  return some((n.sons[0], n.sons[1], X1, X2, y1, y2))
--------------------------------------------------------------------------------
/src/train/split.nim:
--------------------------------------------------------------------------------
import ../node/node
import algorithm
import system
import math
import ../utils
import random
import splitresult
import ../view
import tables
import sugar
import ../task
import values_selection
import ../impurity



proc split_y_by_value_classification(x_col: ColumnMatrixView[float32], y: VectorView[float32], split_value: float32): tuple[y1, y2: CountTableRef[float32], i1, i2: seq[int]] =
  var
    y1 = newCountTable[float32]((y.len / 2).int.nextPowerOfTwo)
    y2 = newCountTable[float32]((y.len / 2).int.nextPowerOfTwo)
    i1 = new_seq[int](0)
    i2 = new_seq[int](0)
  for i, v in x_col:
    if v < split_value:
      y1.inc y.get_raw(i)
      i1.add i
    else:
      y2.inc y.get_raw(i)
      i2.add i
  return (y1, y2, i1, i2)

proc split_y_by_value_regression(x_col: ColumnMatrixView[float32], y: VectorView[float32], split_value: float32): tuple[y1, y2: seq[float32], m1, m2: float32, i1, i2: seq[int]] =
  var
    mean_1: float32 = 0.0
    mean_2: float32 = 0.0
    y1: seq[float32] = new_seq[float32](0)
    y2: seq[float32] = new_seq[float32](0)
    i1 = new_seq[int](0)
    i2 = new_seq[int](0)
  for i, v in x_col:
    # partition on the feature value, but accumulate the target values, which feed mse_from_mean
    let target = y.get_raw(i)
    if v < split_value:
      mean_1 += target
      y1.add target
      i1.add i
    else:
      mean_2 += target
      y2.add target
      i2.add i
  mean_1 /= i1.len.float32
  mean_2 /= i2.len.float32
  return (y1, y2, mean_1, mean_2, i1, i2)


proc best_split_col(t: Task, impurity_f: Impurity, x_col: ColumnMatrixView[float32], y: VectorView[float32]): SplitResult {.gcsafe.} =
  assert x_col.len == y.len
  let splits = percentiles(x_col, 10)
  var min_impurity = Inf
  var best_split = 0.0
  var best_i1: seq[int]
  var best_i2: seq[int]
  var min_impurity_1: float32 = 0.0
  var min_impurity_2: float32 = 0.0
  for split in splits:
    var
      impurity_y1 = Inf
      impurity_y2 = Inf
      split_i1: seq[int] = @[]
      split_i2: seq[int] = @[]
    if t == Classification:
      let (count_y1, count_y2, i1, i2) = split_y_by_value_classification(x_col, y, split)
      if i1.len == 0 or i2.len == 0:
        continue
      let freq_y1 = collect(newSeq):
        for x in count_y1.values:
          x.float32 / y.len.float32
      let freq_y2 = collect(newSeq):
        for x in count_y2.values:
          x.float32 / y.len.float32
      var true_impurity_f: proc(p_y: seq[float32]): float32 {.gcsafe.} = nil
      if impurity_f == Gini:
        true_impurity_f = gini
      elif impurity_f == Entropy:
        true_impurity_f = entropy
      impurity_y1 = true_impurity_f(freq_y1)
      impurity_y2 = true_impurity_f(freq_y2)
      split_i1 := i1
      split_i2 := i2
    else:
      let (y1, y2, mean_1, mean_2, i1, i2) = split_y_by_value_regression(x_col, y, split)
      if i1.len == 0 or i2.len == 0:
        continue
      if impurity_f == Mse:
        impurity_y1 = mse_from_mean(y1, mean_1)
        impurity_y2 = mse_from_mean(y2, mean_2)
        split_i1 := i1
        split_i2 := i2
      else:
        raise newException(ValueError, "impurity_f in regression must be Mse")
    let tot_impurity: float32 = impurity_y1 + impurity_y2
    if min_impurity > tot_impurity and split_i1.len > 0 and split_i2.len > 0:
      min_impurity = tot_impurity
      best_split = split
      best_i1 := split_i1
      best_i2 := split_i2
      min_impurity_1 = impurity_y1
      min_impurity_2 = impurity_y2
  return new_split_result(best_split, min_impurity, -1, [best_i1, best_i2], min_impurity_1, min_impurity_2)



proc random_features(num_features: int, max_features: float32): seq[int] =
  result = newSeq[int](0)
  for i in 0..

    if best_split.impurity > j_split.impurity:
      best_split = j_split
      best_split.col = j
  return best_split
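A small hypothetical illustration (not library code) of the partition step performed by `split_y_by_value_classification`: rows whose column value falls below the threshold feed one label counter, the rest feed the other.

```
import tables

let x_col = @[1.0'f32, 3.0, 2.0, 5.0]
let y     = @[0.0'f32, 1.0, 0.0, 1.0]
var y1 = newCountTable[float32]()
var y2 = newCountTable[float32]()
for i, v in x_col:
  if v < 2.5: y1.inc y[i]
  else: y2.inc y[i]
echo y1   # {0.0: 2} -> the left group holds only class 0
echo y2   # {1.0: 2} -> the right group holds only class 1
```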
--------------------------------------------------------------------------------
/src/train/splitresult.nim:
--------------------------------------------------------------------------------
type
  SplitResult* = ref object
    split_value*: float32
    impurity*: float32
    col*: int
    index*: array[2, seq[int]]
    impurity_1*: float32
    impurity_2*: float32


proc new_split_result*(split_value, impurity: float32): SplitResult =
  result = new(SplitResult)
  result.split_value = split_value
  result.impurity = impurity

proc new_split_result*(split_value, impurity: float32, col: int, index: array[2, seq[int]], impurity_1: float32 = 0.0, impurity_2: float32 = 0.0): SplitResult =
  result = new(SplitResult)
  result.split_value = split_value
  result.impurity = impurity
  result.col = col
  result.index = index
  result.impurity_1 = impurity_1
  result.impurity_2 = impurity_2
--------------------------------------------------------------------------------
/src/train/stop.nim:
--------------------------------------------------------------------------------
import ../node/node
import ../view
import ../task

proc unique_class(y: VectorView[float32]): bool =
  for value in y:
    if value != y[0]:
      return false
  # echo y, " has one class"
  return true

proc on_creating_new_node*(n: Node, X: MatrixView[float32], y: VectorView[float32]): bool =
  if n.tree_task == Classification:
    if unique_class(y):
      return true
  return n.level >= 3 or y.len <= 1
--------------------------------------------------------------------------------
/src/train/train.nim:
--------------------------------------------------------------------------------
import ../node/[node, constructors]
import sons_gen
import options
import ../view

type NodeWithData = tuple[n: Node, X: MatrixView[float32], y: VectorView[float32]]

## Train function of decision tree
proc fit*(root: Node, X: MatrixView[float32], y: VectorView[float32]) {.gcsafe.} =
  assert X.len == y.len
  var border = new_seq[NodeWithData](1)  # explicit stack of nodes still to be split (depth-first)
  border[0] = (root, X, y)
  while border.len > 0:
    let (node, X_data, y_data) = border.pop()
    let sons_opt = node.generate_sons(X_data, y_data)
    if sons_opt.is_some():
      let sons = sons_opt.get()
      if not(sons.first of Leaf):
        border.add((sons.first, sons.X1, sons.y1))
      if not(sons.second of Leaf):
        border.add((sons.second, sons.X2, sons.y2))
    elif sons_opt.is_none() and not node.father.isNil():
      # the split was rejected by a post-split rule: attach a leaf to the father instead
      let father = node.father
      if father.num_sons == 0:
        father.sons[0] = new_leaf(father, X_data, y_data)
      else:
        for i in 0..
--------------------------------------------------------------------------------
/src/train/values_selection.nim:
--------------------------------------------------------------------------------
    if v > max_value:
      max_value = v
  for i in 1..(n-1):
    result[i-1] = min_value + (max_value - min_value).float32 * i.float32 / n.float32

proc percentiles*(data: ColumnMatrixView[float32], n: int): seq[float32] =
  let data_sorted = data.to_seq.sorted
  if data.len < n:
    result = new_seq[float32](data.len - 1)
    for i in 0..

    if data_index >= data.len - 1:
      continue
    result[i] = (data_sorted[data_index] + data_sorted[data_index+1]) / 2.0
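Part of `percentiles` is truncated above; as a hedged reading of what survives, the candidate thresholds handed to `best_split_col` are midpoints between consecutive sorted column values at evenly spaced positions (assumed semantics):

```
# Illustrative only, not library code: every candidate threshold actually
# separates two observed values, e.g. the midpoint between neighbours.
let data_sorted = @[1.0'f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
echo (data_sorted[1] + data_sorted[2]) / 2.0   # 2.5, one candidate threshold
```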
--------------------------------------------------------------------------------
/src/tree.nim:
--------------------------------------------------------------------------------
import node/[node, constructors, traverse]
import train/[split, sons_gen, train]
import task
import typetraits
import sequtils
import rule/[tree_rules, stop_rules, bagging]
import hyperparams

type
  DecisionTree* = ref object
    root: Node
    stop_rules: TreeStopRules
    hyperparams: Hyperparams

proc assert_int_hp(value: int, msg: string = "") =
  assert value == -1 or value > 0, msg

proc assert_0_1_float32_hp(value: float32, msg: string = "") =
  assert (value >= 0.0 and value <= 1.0) or value == -1.0, msg  # `and`: with `or` the assert would always hold

proc assert_positive_float32_hp(value: float32, msg: string = "") =
  assert value == -1.0 or value > 0.0, msg

hyperparams_binding(DecisionTree)


proc add_rules(tree: DecisionTree, max_depth: int, min_samples_split: int, max_features: float32, min_impurity_decrease: float32) =
  if max_depth != -1:
    tree.stop_rules.add_creation_rule max_depth_rule(max_depth)
  if min_samples_split != -1:
    tree.stop_rules.add_pre_split_rule min_samples_split_rule(min_samples_split)
  if min_impurity_decrease != -1.0:
    tree.stop_rules.add_post_split_rule min_impurity_decrease_rule(min_impurity_decrease)
  tree.stop_rules.add_creation_rule unique_class_rule()


proc new_tree*(task: Task, h: Hyperparams,
               custom_creation_rules: seq[Rule] = @[],
               custom_pre_split_rules: seq[Rule] = @[],
               custom_post_split_rules: seq[PostSplitRule] = @[]): DecisionTree =
  result = new(DecisionTree)
  assert_int_hp(h.max_depth)
  assert_int_hp(h.min_samples_split)
  assert_0_1_float32_hp(h.max_features)
  assert_positive_float32_hp(h.min_impurity_decrease)
  assert_0_1_float32_hp(h.bagging)
  result.stop_rules = new_tree_stop_rules()

  result.add_rules(h.max_depth, h.min_samples_split, h.max_features, h.min_impurity_decrease)
  result.root = new_root(task, stop_rules=result.stop_rules)
  result.hyperparams = h


proc new_classification_tree*(max_depth: int = -1, min_samples_split: int = -1, max_features: float32 = 1.0, min_impurity_decrease: float32 = 1e-6,
                              bagging: float32 = 1.0): DecisionTree =
  new_tree(task=Classification, (max_depth, min_samples_split, max_features, min_impurity_decrease, bagging))

proc new_regression_tree*(max_depth: int = -1, min_samples_split: int = -1, max_features: float32 = 1.0, min_impurity_decrease: float32 = 1e-6,
                          bagging: float32 = 1.0): DecisionTree =
  new_tree(task=Regression, (max_depth, min_samples_split, max_features, min_impurity_decrease, bagging))



## Train function of decision tree
proc fit*(t: DecisionTree, X: sink seq[seq[float32]], y: sink seq[float32]) {.gcsafe.} =
  let (X_train, y_train) = bagging(X, y, t.bagging)
  fit(t.root, X_train, y_train)

proc print_root_split(t: DecisionTree) =
  if t.root of Leaf:
    echo "Root is a leaf"
  else:
    echo "Root node, split in column ", t.root.split_column, " with value ", t.root.split_value

proc predict*(tree: DecisionTree, x: sink seq[float32]): float32 {.gcsafe.} =
  tree.root.get_value(x)

proc predict*(tree: DecisionTree, X: sink seq[seq[float32]]): seq[float32] {.gcsafe.} =
  result = newSeq[float32](X.len)
  for i, row in X:
    result[i] = tree.predict(row)

proc predict_proba*(tree: DecisionTree, x: seq[float32]): seq[float32] {.gcsafe.} =
  tree.root.get_proba(x)

proc predict_proba*(tree: DecisionTree, X: seq[seq[float32]]): seq[seq[float32]] {.gcsafe.} =
  result = newSeq[seq[float32]](X.len)
  for i, row in X:
    result[i] = tree.predict_proba(row)
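A short usage sketch for the probability API (hedged: `X_train` and `y_train` are placeholders, and per the TODO in `leaf.nim` the class ordering of the returned vector follows the leaf's count table):

```
import DecisionTree

let dt = new_classification_tree(max_depth=5)
dt.fit(X_train, y_train)                        # X_train: seq[seq[float32]], y_train: seq[float32]
let proba: seq[seq[float32]] = dt.predict_proba(X_train)
echo proba[0]                                   # per-class probabilities for the first row
```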
--------------------------------------------------------------------------------
/src/utils.nim:
--------------------------------------------------------------------------------
import sequtils
import math
import random

proc sample_wo_reins*(start, finish: int, perc: float32): seq[int] =
  ## Sample a fraction `perc` of the indices in start..finish without reinsertion
  assert 0.0 <= perc and perc <= 1.0
  if perc == 1.0:
    return (start..finish).to_seq
  else:
    var series = (start..finish).to_seq
    let len_sub = (series.len.float32 * perc).round.int
    result = new_seq[int](len_sub)
    for i in 0..
--------------------------------------------------------------------------------
/tests/test_fit_tree.nim:
--------------------------------------------------------------------------------
    require(accuracy_iris > 0.95)
    echo "accuracy on iris train set is ", accuracy_iris
  test "Random forest should overfit when predict on Iris train set":
    let X_iris = read_X_data("tests/data/X_iris.csv")
    let y_iris = read_y_data("tests/data/y_iris.csv")
    let rf = new_random_forest_classifier(100, num_threads=1)
    var start = now()
    rf.fit(X_iris, y_iris)
    let time_fit = now() - start
    start = now()
    let yhat = rf.predict(X_iris)
    let time_predict = now() - start
    let accuracy_iris = accuracy(y_iris, yhat)
    require(accuracy_iris > 0.95)
    echo "accuracy on iris train set is ", accuracy_iris
    echo "seconds fit: ", time_fit.inMilliseconds.float32 / 1000, " seconds predict: ", time_predict.inMilliseconds.float32 / 1000
  test "Random forest with parallel training should overfit when predict on Iris train set":
    let X_iris = read_X_data("tests/data/X_iris.csv")
    let y_iris = read_y_data("tests/data/y_iris.csv")
    let rf = new_random_forest_classifier(100, num_threads=4)
    var start = now()
    rf.fit(X_iris, y_iris)
    let time_fit = now() - start
    echo "seconds fit: ", time_fit.inMilliseconds.float32 / 1000
    start = now()
    let yhat = rf.predict(X_iris)
    let time_predict = now() - start
    let accuracy_iris = accuracy(y_iris, yhat)
    require(accuracy_iris > 0.95)
    echo "accuracy on iris train set is ", accuracy_iris
    echo " seconds predict: ", time_predict.inMilliseconds.float32 / 1000
  test "Random forest should overfit when predict on Iris train set x 10":
    let X_iris = read_X_data("tests/data/X_iris.csv", times=10)
    let y_iris = read_y_data("tests/data/y_iris.csv", times=10)
    let rf = new_random_forest_classifier(100, num_threads=1)
    var start = now()
    rf.fit(X_iris, y_iris)
    let time_fit = now() - start
    start = now()
    let yhat = rf.predict(X_iris)
    let time_predict = now() - start
    let accuracy_iris = accuracy(y_iris, yhat)
    require(accuracy_iris > 0.95)
    echo "accuracy on iris train set is ", accuracy_iris
    echo "seconds fit: ", time_fit.inMilliseconds.float32 / 1000, " seconds predict: ", time_predict.inMilliseconds.float32 / 1000
  test "Random forest with parallel training should overfit when predict on Iris train set x 10":
    let X_iris = read_X_data("tests/data/X_iris.csv", times=10)
    let y_iris = read_y_data("tests/data/y_iris.csv", times=10)
    let rf = new_random_forest_classifier(100, num_threads=4)
    var start = now()
    rf.fit(X_iris, y_iris)
    let time_fit = now() - start
    start = now()
    let yhat = rf.predict(X_iris)
    let time_predict = now() - start
    let accuracy_iris = accuracy(y_iris, yhat)
    require(accuracy_iris > 0.95)
    echo "accuracy on iris train set is ", accuracy_iris
    echo "seconds fit: ", time_fit.inMilliseconds.float32 / 1000, " seconds predict: ", time_predict.inMilliseconds.float32 / 1000
--------------------------------------------------------------------------------
/tests/test_impurity.nim:
--------------------------------------------------------------------------------
import ../src/impurity
import unittest
import math


proc equals(a, b, delta: float32 = 10e-4): bool =
  return abs(a - b) < delta

suite "Test impurity functions":
  setup:
    let no_uncertainity = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    let max_uncertainity = @[0.0, 0.0, 0.0, 1.0, 1.0, 1.0]
    let random_seq: seq[float32] = @[1.0, 9.0, 1.0, 9.0, 8.0, 0.0, 7.0, 8.0, 1.0, 9.0, 4.0, 6.0, 7.0, 3.0, 0.0, 9.0, 6.0, 9.0, 5.0, 3.0]
  test "Test gini_index":
    let gini_no_unc = gini(no_uncertainity)
    require gini_no_unc.equals(0.0)
    let gini_max_unc = gini(max_uncertainity)
    require gini_max_unc.equals(0.5)
    let gini_random = gini(random_seq)
    require gini_random.equals(0.86)
  test "Test entropy":
    let entropy_no_unc = entropy(no_uncertainity)
    require entropy_no_unc.equals(0.0)
    let entropy_max_unc = entropy(max_uncertainity)
    require entropy_max_unc.equals(0.693)
    let entropy_random = entropy(random_seq)
    require entropy_random.equals(2.08, delta=10e-2)
--------------------------------------------------------------------------------
/tests/test_utils.nim:
--------------------------------------------------------------------------------
import strutils

proc read_X_data*(path: string, sep: char = ',', times: int = 1): seq[seq[float32]] =
  result = new_seq[seq[float32]](0)
  for i in 0..