├── .github
    └── workflows
    │   └── compile-test.yml
├── .gitignore
├── DecisionTree.nimble
├── Makefile
├── benchmark
    ├── rf.nim
    └── rf.py
├── readme.MD
├── src
    ├── DecisionTree.nim
    ├── hyperparams.nim
    ├── impurity.nim
    ├── node
    │   ├── constructors.nim
    │   ├── inode.nim
    │   ├── leaf.nim
    │   ├── node.nim
    │   └── traverse.nim
    ├── random_forest
    │   ├── parallel_rf.nim
    │   └── random_forest.nim
    ├── rule
    │   ├── bagging.nim
    │   ├── stop_rules.nim
    │   └── tree_rules.nim
    ├── task.nim
    ├── train
    │   ├── sons_gen.nim
    │   ├── split.nim
    │   ├── splitresult.nim
    │   ├── stop.nim
    │   ├── train.nim
    │   └── values_selection.nim
    ├── tree.nim
    ├── utils.nim
    └── view.nim
└── tests
    ├── data
        ├── X_data
        ├── X_iris.csv
        ├── iris.data
        ├── y_data
        └── y_iris.csv
    ├── test_fit_tree.nim
    ├── test_impurity.nim
    └── test_utils.nim


/.github/workflows/compile-test.yml:
--------------------------------------------------------------------------------
 1 | # This is a basic workflow to help you get started with Actions
 2 | 
 3 | name: compile-run
 4 | 
 5 | # Controls when the action will run. 
 6 | on:
 7 |   # Triggers the workflow on push or pull request events but only for the master branch
 8 |   push:
 9 |     branches: [ master ]
10 |   pull_request:
11 |     branches: [ master ]
12 | 
13 |   # Allows you to run this workflow manually from the Actions tab
14 |   workflow_dispatch:
15 | 
16 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
17 | jobs:
18 |   # This workflow contains a single job called "build"
19 |   build:
20 |     # The type of runner that the job will run on
21 |     runs-on: ubuntu-20.04
22 | 
23 |     # Steps represent a sequence of tasks that will be executed as part of the job
24 |     steps:
25 |       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
26 |       - uses: actions/checkout@v2
27 |       
28 |       - name: Pull choosenim
29 |         run: curl https://nim-lang.org/choosenim/init.sh -sSf > install-nim.sh
30 |         
31 |         
32 |       - name: Install nim
33 |         run: chmod +x install-nim.sh && ./install-nim.sh -y
34 |         
35 |       # Runs a single command using the runners shell
36 |       - name: Run tests on 1.2
37 |         run: export PATH=/home/runner/.nimble/bin:$PATH && choosenim 1.2.0 && nimble refresh && nimble install sequtils2 && nim c -r --threads:on tests/test*
38 |         
39 |       - name: Run tests on 1.4
40 |         run: export PATH=/home/runner/.nimble/bin:$PATH && choosenim 1.4.0 && nimble refresh && nimble install sequtils2 && nim c -r --threads:on tests/test*
41 | 
42 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tests/test_fit_tree
2 | tests/test_impurity
3 | benchmark/rf
4 | callgrind*
5 | profile_results.txt
6 | .idea


--------------------------------------------------------------------------------
/DecisionTree.nimble:
--------------------------------------------------------------------------------
 1 | # Package
 2 | 
 3 | version       = "0.1.1"
 4 | author        = "Michele De Vita"
 5 | description   = "Decision tree and Random forest CART implementation for nim"
 6 | license       = "GPL-3.0"
 7 | srcDir        = "src"
 8 | 
 9 | 
10 | 
11 | # Dependencies
12 | 
13 | requires "nim >= 1.2.0"
14 | requires "sequtils2 >= 1.1.0"
15 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | compile-benchmark:
 2 | 	nim c --gc:orc --threads:on  -d:release benchmark/rf.nim
 3 | run-benchmarks:
 4 | 	echo "Python Random Forest"
 5 | 	time python ./benchmark/rf.py
 6 | 	echo "============================"
 7 | 	echo "Nim Random Forest"
 8 | 	time ./benchmark/rf
 9 | clean-compiled:
10 | 	rm benchmark/rf


--------------------------------------------------------------------------------
/benchmark/rf.nim:
--------------------------------------------------------------------------------
 1 | import ../src/random_forest/random_forest
 2 | import ../tests/test_utils
 3 | import times
 4 | 
 5 | proc iris_bench() =
 6 |     let rf = new_random_forest_classifier(100, max_depth=10)
 7 |     let X_data = read_X_data("tests/data/X_iris.csv")
 8 |     let y_data = read_y_data("tests/data/y_iris.csv")
 9 |     rf.fit(X_data, y_data)
10 |     echo "Successfull finished training of a Random Forest of 100 trees"
11 |     let yhat = rf.predict(X_data)
12 |     echo "accuracy train ", accuracy(y_data, yhat)
13 |     
14 | if isMainModule:
15 |     iris_bench()
16 | 


--------------------------------------------------------------------------------
/benchmark/rf.py:
--------------------------------------------------------------------------------
 1 | from sklearn.ensemble import RandomForestClassifier
 2 | from sklearn.metrics import accuracy_score
 3 | import pandas as pd
 4 | from path import Path
 5 | 
 6 | def main():
 7 |     root = Path(__file__).parent.parent 
 8 |     data_folder = root / 'tests' / 'data'
 9 |     X_train = pd.read_csv(data_folder / 'X_iris.csv', header=None).values
10 |     y_train = pd.read_csv(data_folder / 'y_iris.csv', header=None).values.ravel()
11 |     rf = RandomForestClassifier(100, max_depth=10)
12 |     rf.fit(X_train, y_train)
13 |     yhat = rf.predict(X_train)
14 |     print('accuracy train', accuracy_score(y_train, yhat))
15 | 
16 | 
17 | if __name__ == "__main__":
18 |     main()


--------------------------------------------------------------------------------
/readme.MD:
--------------------------------------------------------------------------------
 1 | # Decision tree with nim
 2 | 
 3 | [![nimble](https://raw.githubusercontent.com/yglukhov/nimble-tag/master/nimble_js.png)](https://github.com/Michedev/DecisionTreeNim)
 4 | 
 5 | Nim package for decision trees and random forest
 6 | 
 7 | ## How to install
 8 | 
 9 | `nimble install decisiontree`
10 | 
11 | ### Package features
12 | - Inspired by Scikit-learn api
13 | - Random forest can train and predict in parallel
14 | - Actually you feed the X matrix of size [n x m] as `seq[seq[float]]` RowMajor and the y array of size [n] as `seq[float]`
15 | 
16 | #### Decision Tree
17 | 
18 | ```
19 | import DecisionTree
20 | 
21 | let dt = DecisionTree.new_classification_tree(max_depth=10)
22 | dt.fit(X_train,y_train)
23 | let yhat = dt.predict(X_test)
24 | 
25 | ```
26 | 
27 | #### Random forest
28 | 
29 | ```
30 | import DecisionTree
31 | 
32 | let rf = DecisionTree.new_random_forest_classifier(n_trees=100, num_threads=4) #parallel training too!
33 | rf.fit(X_train, y_train)
34 | let yhat = rf.predict(X_test)
35 | ```
36 | 
37 | 
38 | ## Benchmark with python
39 | 
40 | Note: this benchmark is not done to see if nim is quicker than python, of course a good implementation in nim requires less time since it is a statically compiled language and use C compiler optimizations. The purpuose of this benchamrk is to understand HOW MUCH is quicker so the user can decide the language and the library to use because python is slower but has more libraries for data science
41 | 
42 | 
43 | ##### Iris Dataset
44 | 
45 | - Dataset: [iris dataset](https://www.kaggle.com/arshid/iris-flower-dataset)
46 | - Nim Code:
47 | 
48 | 
49 |         import ../src/random_forest/random_forest
50 |         import ../tests/test_utils
51 | 
52 |         proc iris_bench() =
53 |             let rf = new_random_forest_classifier(100, max_depth=10)
54 |             let X_data = read_X_data("tests/data/X_iris.csv")
55 |             let y_data = read_y_data("tests/data/y_iris.csv")
56 |             rf.fit(X_data, y_data)
57 |             echo "Successfull finished training of a Random Forest of 100 trees"
58 |             let yhat = rf.predict(X_data)
59 |             echo "accuracy train ", accuracy(y_data, yhat)
60 |             
61 |         if isMainModule:
62 |             iris_bench()
63 | 
64 |     multitime -n 30 results
65 | 
66 |                     Mean        Std.Dev.    Min         Median      Max
67 |         real        0.205       0.046       0.124       0.190       0.302       
68 |         user        0.191       0.042       0.110       0.180       0.297       
69 |         sys         0.005       0.004       0.000       0.003       0.016       
70 | 
71 | 
72 | - Python code:
73 | 
74 |         from sklearn.ensemble import RandomForestClassifier
75 |         from sklearn.metrics import accuracy_score
76 |         import pandas as pd
77 |         from path import Path
78 | 
79 |         def main():
80 |             root = Path(__file__).parent.parent 
81 |             data_folder = root / 'tests' / 'data'
82 |             X_train = pd.read_csv(data_folder / 'X_iris.csv', header=None).values
83 |             y_train = pd.read_csv(data_folder / 'y_iris.csv', header=None).values.ravel()
84 |             rf = RandomForestClassifier(100, max_depth=10)
85 |             rf.fit(X_train, y_train)
86 |             yhat = rf.predict(X_train)
87 |             print('accuracy train', accuracy_score(y_train, yhat))
88 | 
89 | 
90 |         if __name__ == "__main__":
91 |             main()
92 | 
93 |     multitime -n 30 results:
94 | 
95 |                     Mean        Std.Dev.    Min         Median      Max
96 |         real        2.426       0.251       2.131       2.346       3.390       
97 |         user        2.275       0.158       2.026       2.224       2.711       
98 |         sys         0.541       0.058       0.431       0.553       0.668       
99 | 


--------------------------------------------------------------------------------
/src/DecisionTree.nim:
--------------------------------------------------------------------------------
 1 | # This is just an example to get you started. A typical library package
 2 | # exports the main API in this file. Note that you cannot rename this file
 3 | # but you can remove it if you wish.
 4 | import tree
 5 | import random_forest/random_forest
 6 | import task
 7 | 
 8 | export tree
 9 | export random_forest
10 | export task
11 | 


--------------------------------------------------------------------------------
/src/hyperparams.nim:
--------------------------------------------------------------------------------
1 | type Hyperparams* = tuple[max_depth: int, min_samples_split: int, max_features: float32, min_impurity_decrease: float32, bagging: float32]
2 | 
3 | template hyperparams_binding*(T: typed) =
4 |     proc bagging*(t: T): auto {.inline.} = t.hyperparams.bagging
5 |     proc max_depth*(t: T): auto {.inline.} = t.hyperparams.max_depth
6 |     proc min_samples_split*(t: T): auto {.inline.} = t.hyperparams.min_samples_split
7 |     proc min_impurity_decrease*(t: T): auto {.inline.} = t.hyperparams.min_impurity_decrease
8 |     proc max_features*(t: T): auto {.inline.} = t.hyperparams.max_features
9 | 


--------------------------------------------------------------------------------
/src/impurity.nim:
--------------------------------------------------------------------------------
 1 | import tables
 2 | import math
 3 | import sequtils
 4 | import utils
 5 | 
 6 | 
 7 | type Impurity* = enum
 8 |     Entropy
 9 |     Gini
10 |     Mse
11 |     Default
12 | 
13 | proc entropy*(y: seq[float32]): float32 {.gcsafe.} =
14 |     result = 0.0
15 |     for p in y:
16 |         result +=  - (p * ln(p))
17 | 
18 | 
19 | proc gini*(y: seq[float32]): float32 {.gcsafe.} =
20 |     result = 0.0
21 |     for p in y:
22 |         result +=  p * (1 - p)
23 | 
24 | 
25 | proc mse_from_mean*(y: seq[float32], y_mean: float32): float32 {.gcsafe.} =
26 |     result = 0.0
27 |     for value in y:
28 |         result += (value - y_mean) * (value - y_mean)


--------------------------------------------------------------------------------
/src/node/constructors.nim:
--------------------------------------------------------------------------------
 1 | import node
 2 | import leaf
 3 | import ../task
 4 | import ../impurity
 5 | import ../rule/tree_rules
 6 | import ../view
 7 | 
 8 | 
 9 | proc new_leaf*(father: Node, X: MatrixView[float32], y: VectorView[float32]): Leaf =
10 |     result = new(Leaf)
11 |     result.level = father.level + 1
12 |     result.tree_task = father.tree_task
13 |     father.num_sons += 1
14 |     result.max_features = father.max_features
15 |     result.impurity_f = father.impurity_f
16 |     result.stop_rules = father.stop_rules
17 |     result.leaf_f = result.get_leaf_func(X, y)
18 |     result.leaf_proba = result.get_leaf_proba_func(X, y)
19 |     result.num_sons = 0
20 |     result.father = father
21 | 
22 | proc new_son*(father: Node): Node =
23 |     result = new(Node)
24 |     result.level = father.level + 1
25 |     result.impurity_f = father.impurity_f
26 |     result.max_features = father.max_features
27 |     result.tree_task = father.tree_task
28 |     result.stop_rules = father.stop_rules
29 |     result.father = father
30 |     result.num_sons = 0
31 |     father.num_sons += 1
32 | 
33 | 
34 | proc new_root*(task: Task, impurity_f: Impurity = Default, stop_rules: TreeStopRules = nil, max_features: float32 = 1.0): Node =
35 |     result = new(Node)
36 |     result.level = 0
37 |     result.max_features = max_features
38 |     result.num_sons = 0
39 |     result.tree_task = task
40 |     result.impurity_value = Inf
41 |     if impurity_f == Default:
42 |         if task == Classification:
43 |             result.impurity_f = Gini
44 |         else:
45 |             result.impurity_f = Mse
46 |     else:
47 |         result.impurity_f = impurity_f
48 |     if not stop_rules.is_nil():
49 |         result.stop_rules = stop_rules
50 |     else:
51 |         result.stop_rules = new_tree_stop_rules()
52 | 
53 | proc new_root_leaf*(X: MatrixView[float32], y: VectorView[float32]): Leaf =
54 |     result = new(Leaf)
55 |     result.leaf_f = result.get_leaf_func(X, y)
56 |     result.num_sons = 0
57 | 


--------------------------------------------------------------------------------
/src/node/inode.nim:
--------------------------------------------------------------------------------
 1 | import ../task
 2 | import ../impurity
 3 | 
 4 | type INode* = ref object of RootObj
 5 |     tree_task*: Task
 6 |     impurity_f*: Impurity
 7 |     level*: Natural
 8 |     split_value*: float32
 9 |     split_column*: int
10 |     impurity_value*: float32
11 | 


--------------------------------------------------------------------------------
/src/node/leaf.nim:
--------------------------------------------------------------------------------
 1 | import ../task
 2 | import node
 3 | import tables
 4 | import math
 5 | import ../view
 6 | 
 7 | 
 8 | proc get_leaf_func*(n: Node, X: MatrixView[float32], y: VectorView[float32]) : auto {.gcsafe.} =
 9 |     if n.tree_task == Classification:
10 |         var ctable = toCountTable y.to_seq()
11 |         let mode: float32 = ctable.largest.key
12 |         return proc(x: seq[float32]): float32 {.gcsafe.} = mode
13 |     else:
14 |         var tot: float32 = 0.0
15 |         for v in y:
16 |             tot += v
17 |         let m: float32 = tot / y.len.float32
18 |         return proc(x: seq[float32]): float32 {.gcsafe.} = m
19 | 
20 | #TODO export and add field on leaf type
21 | proc get_leaf_proba_func*(n: Node, X: MatrixView[float32], y: VectorView[float32]) : auto {.gcsafe.} =
22 |     if n.tree_task == Classification:
23 |         var count_table = toCountTable y.to_seq
24 |         var probs = newSeq[float32](count_table.len)
25 |         var i = 0
26 |         for (k, freq) in mpairs(count_table):
27 |             probs[i] = freq / y.len
28 |             inc i
29 |         return proc(x: seq[float32]): seq[float32] {.gcsafe.} = probs
30 |     elif n.tree_task == Regression:
31 |         raise newException(Exception, "Cannot estimate probability in a regression tree")
32 | 


--------------------------------------------------------------------------------
/src/node/node.nim:
--------------------------------------------------------------------------------
 1 | import ../task
 2 | import ../rule/tree_rules
 3 | import inode
 4 | 
 5 | type 
 6 |         Node* = ref object of INode
 7 |                 sons*: array[2, Node]
 8 |                 num_sons* : int
 9 |                 father*: Node
10 |                 max_features*: float32
11 |                 stop_rules*: TreeStopRules
12 |         Leaf* = ref object of Node
13 |                 leaf_f*: proc(x: seq[float32]): float32 {.gcsafe.}
14 |                 leaf_proba*: proc(x: seq[float32]): seq[float32] {.gcsafe.}
15 |         RootIsLeaf* = object of Exception
16 | 
17 | proc is_leaf*(n: Node): bool = n is Leaf
18 | 
19 | import inode
20 | 
21 | 


--------------------------------------------------------------------------------
/src/node/traverse.nim:
--------------------------------------------------------------------------------
 1 | import node
 2 | 
 3 | 
 4 | method get_value*(n: Node, x: sink seq[float32]): float32 {.base, gcsafe.} =
 5 |     let value = x[n.split_column]
 6 |     let i = (value > n.split_value).int
 7 |     return n.sons[i].get_value(x)
 8 | 
 9 | method get_value*(n: Leaf, x: sink seq[float32]): float32 {.gcsafe.} =
10 |     n.leaf_f(x)
11 | 
12 | method get_proba*(n: Node, x: sink seq[float32]): seq[float32] {.base, gcsafe.} =
13 |     let value = x[n.split_column]
14 |     let i = (value > n.split_value).int
15 |     return n.sons[i].get_proba(x)
16 | 
17 | method get_proba*(n: Leaf, x: sink seq[float32]): seq[float32] {.gcsafe.} =
18 |     n.leaf_proba(x)


--------------------------------------------------------------------------------
/src/random_forest/parallel_rf.nim:
--------------------------------------------------------------------------------
 1 | import sequtils
 2 | import threadpool
 3 | 
 4 | #for parallel use only
 5 | proc tree_fit(tree: DecisionTree, X: ptr seq[seq[float32]], y: ptr seq[float32]): DecisionTree {.thread.} =
 6 |     tree.fit(X[], y[])
 7 |     return tree
 8 | 
 9 | proc fit_parallel*(forest: RandomForest, X: seq[seq[float32]], y: seq[float32]) =
10 |     let trees_per_thread: int = (forest.num_trees.float32 / forest.num_threads.float32).int
11 |     echo "trees per thread: ", trees_per_thread
12 |     var threads = newSeq[FlowVar[DecisionTree]](forest.num_trees)
13 |     let X_addr = unsafeAddr(X)
14 |     let y_addr = unsafeAddr(y)
15 |     for i, tree in forest.trees:
16 |         threads[i] = spawn(tree_fit(tree, X_addr, y_addr))
17 |     for i in 0..<forest.num_trees:
18 |         forest.trees[i] = ^threads[i]


--------------------------------------------------------------------------------
/src/random_forest/random_forest.nim:
--------------------------------------------------------------------------------
 1 | import ../tree
 2 | import ../task
 3 | import sequtils
 4 | import tables
 5 | import random
 6 | import stats
 7 | import sequtils2
 8 | import ../utils
 9 | import ../hyperparams
10 | 
11 | type RandomForest* = ref object
12 |     trees: seq[DecisionTree]
13 |     task: Task
14 |     hyperparams: Hyperparams
15 |     num_classes*: int
16 |     num_threads*: int
17 | 
18 | proc num_trees*(forest: RandomForest): Natural =
19 |     forest.trees.len
20 |     
21 | 
22 | proc new_random_forest*(task: Task, n_trees, num_threads: int, h: Hyperparams): RandomForest =
23 |     result = new(RandomForest)
24 |     result.hyperparams = h
25 |     result.trees = newSeq[DecisionTree](n_trees)
26 |     result.num_threads = num_threads
27 | 
28 | proc new_random_forest(trees: seq[DecisionTree]): RandomForest =
29 |     result = new(RandomForest)
30 |     result.trees = trees
31 | 
32 | hyperparams_binding(RandomForest)
33 | 
34 | 
35 | proc new_random_forest_classifier*(n_trees: int = 100, max_depth: int = -1, min_samples_split: int = -1,
36 |                                    max_features: float32 = 0.7, min_impurity_decrease: float32 = 1e-6,
37 |                                    bagging: float32 = 0.7, num_threads: int = 1,): RandomForest =
38 |     new_random_forest(Classification, n_trees=n_trees, num_threads=num_threads, h=(max_depth, min_samples_split, max_features, min_impurity_decrease, bagging))
39 | 
40 | proc new_random_forest_regressor*(n_trees: int = 100, max_depth: int = -1, min_samples_split: int = -1,
41 |                                   max_features: float32 = 0.7, min_impurity_decrease: float32 = 1e-6,
42 |                                   bagging: float32 = 0.7, num_threads: int = 1): RandomForest =
43 |     new_random_forest(Regression, n_trees=n_trees, num_threads=num_threads, h=(max_depth, min_samples_split, max_features, min_impurity_decrease, bagging))
44 | 
45 | include parallel_rf
46 |                 
47 | proc fit* (rf: RandomForest, X: seq[seq[float32]], y: seq[float32]) =
48 |     rf.num_classes = y.uniques(preserve_order=false).len
49 |     for i in 0..<rf.num_trees:
50 |         rf.trees[i] = new_tree(rf.task, rf.hyperparams)
51 |     if rf.num_threads > 1:
52 |         rf.fit_parallel(X,y)
53 |     else:
54 |         for tree in rf.trees:
55 |             tree.fit(X, y)
56 |     
57 |                 
58 |     
59 | proc predict*(rf: RandomForest, x: sink seq[float32]): float32 {.gcsafe.} =
60 |     let predictions = rf.trees.map_it(it.predict(x))
61 |     case rf.task:
62 |         of Classification:
63 |             return predictions.toCountTable().largest.key
64 |         of Regression:
65 |             return predictions.mean()
66 | 
67 | proc predict*(rf: RandomForest, X: sink seq[seq[float32]]): seq[float32] {.gcsafe.} =
68 |     result = new_seq[float32](X.len)
69 |     for i, row in X:
70 |         result[i] = rf.predict(row)
71 |     
72 | proc predict_proba*(forest: RandomForest, x: sink seq[float32]): seq[float32] {.gcsafe.} =
73 |     result = new_seq[float32](forest.num_classes)
74 |     for t in forest.trees:
75 |         let p_y = t.predict_proba(x)
76 |         for i_class in 0..<forest.num_classes:
77 |             result[i_class] += p_y[i_class]
78 |     for i_class in 0..<forest.num_classes:
79 |         result[i_class] /= forest.num_trees.float32
80 | 
81 | proc predict_proba*(forest: RandomForest, X: sink seq[seq[float32]]): seq[seq[float32]]  =
82 |     result = newSeq[seq[float32]](X.len)
83 |     for i, row in X:
84 |         result[i] = forest.predict_proba(row)


--------------------------------------------------------------------------------
/src/rule/bagging.nim:
--------------------------------------------------------------------------------
1 | import random
2 | import ../view
3 | import ../utils
4 | 
5 | type Xy* = tuple[X: MatrixView[float32], y: VectorView[float32]]
6 | 
7 | proc bagging*(X: sink seq[seq[float32]], y: sink seq[float32], perc_bagging:float32): Xy =
8 |     let b_index = sample_wo_reins(0, y.len-1, perc_bagging)
9 |     return (new_matrix_view(X, b_index), new_vector_view(y, b_index))


--------------------------------------------------------------------------------
/src/rule/stop_rules.nim:
--------------------------------------------------------------------------------
 1 | import tree_rules
 2 | import ../train/splitresult
 3 | import ../node/inode
 4 | import ../view
 5 | 
 6 | proc min_samples_split_rule*(min_samples_split: int): Rule =
 7 |     (proc(n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} = len(X) <= min_samples_split)
 8 | 
 9 | proc max_depth_rule*(max_depth: int): Rule =
10 |     (proc(n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} = n.level > max_depth)
11 | 
12 | proc unique_class_rule*(): Rule =
13 |     proc is_unique_class(n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} =
14 |         if y.len == 0:
15 |             return true
16 |         let first_el = y[0]
17 |         for el in y:
18 |             if el != first_el:
19 |                 return false
20 |         return true
21 |     return is_unique_class
22 | 
23 | proc min_impurity_decrease_rule*(threshold: float32): PostSplitRule =
24 |     proc is_impurity_less(n: INode, X: MatrixView[float32], y: VectorView[float32], X1: MatrixView[float32], y1: VectorView[float32], X2: MatrixView[float32], y2: VectorView[float32], split: SplitResult): bool {.gcsafe.} =
25 |         ## Stop if the decrease of split impurity is less than threshold
26 |         let perc_1 = X1.len.float32 / X.len.float32
27 |         let perc_2 = X2.len.float32 / X.len.float32
28 | 
29 |         let decrease = n.impurity_value - (perc_1 * split.impurity_1 + perc_2 * split.impurity_2)
30 |         return decrease < threshold        
31 |     return is_impurity_less


--------------------------------------------------------------------------------
/src/rule/tree_rules.nim:
--------------------------------------------------------------------------------
 1 | import ../node/inode
 2 | import ../view
 3 | import ../train/splitresult
 4 | 
 5 | 
 6 | type 
 7 |     Rule* = proc(n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.}
 8 |     PostSplitRule* = proc(n: INode, X: MatrixView[float32], y: VectorView[float32], X1: MatrixView[float32], y1: VectorView[float32], X2: MatrixView[float32], y2: VectorView[float32], split: SplitResult): bool {.gcsafe.}
 9 |     TreeStopRules*  = ref object
10 |         ## Rules checked when a new son node is created. If one or more stop rules are true, then makes a leaf instead of internal node 
11 |         creation_rules: seq[Rule]
12 |         ## Rules checked before creating a split
13 |         pre_split_rules: seq[Rule]
14 |         ## Rules checked after found the best split for a internal node. 
15 |         ## The type is different from the other two because receive in input also the split informations
16 |         post_split_rules: seq[PostSplitRule]
17 | 
18 | 
19 | proc new_tree_stop_rules*(): TreeStopRules =
20 |     result = new(TreeStopRules)
21 |     result.creation_rules = @[]
22 |     result.pre_split_rules = @[]
23 |     result.post_split_rules = @[]
24 | 
25 | 
26 | proc add_creation_rule* (tr: TreeStopRules, rule: Rule) =
27 |     tr.creation_rules.add rule
28 | 
29 | proc add_pre_split_rule* (tr: TreeStopRules, rule: Rule) =
30 |     tr.pre_split_rules.add rule
31 | 
32 | proc add_post_split_rule* (tr: TreeStopRules, rule: PostSplitRule) =
33 |     tr.post_split_rules.add rule
34 | 
35 | proc add_creation_rules* (tr: TreeStopRules, rules: sink seq[Rule]) =
36 |     for rule in rules:
37 |         tr.add_creation_rule rule
38 | 
39 | proc add_pre_split_rules* (tr: TreeStopRules, rules: sink seq[Rule]) =
40 |     for rule in rules:
41 |         tr.add_pre_split_rule rule
42 | 
43 | proc add_post_split_rules* (tr: TreeStopRules, rules: seq[PostSplitRule]) =
44 |     for rule in rules:
45 |         tr.add_post_split_rule rule
46 | 
47 | proc any_true(rules: sink seq[Rule], n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe, inline.} =
48 |     for rule in rules:
49 |         if rule(n, X, y):
50 |             # echo "stop for rule ", rule
51 |             return true
52 |     return false
53 | 
54 | proc any_true(rules: seq[PostSplitRule], n: INode, X: MatrixView[float32], y: VectorView[float32], X1: MatrixView[float32], y1: VectorView[float32], X2: MatrixView[float32], y2: VectorView[float32], split: SplitResult): bool {.gcsafe, inline.} =
55 |     for rule in rules:
56 |         if rule(n, X, y, X1, y1, X2, y2, split):
57 |             return true
58 |     return false
59 | 
60 | proc on_creation*(tsr: TreeStopRules, n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} =
61 |     any_true(tsr.creation_rules, n, X, y)
62 | 
63 | proc on_pre_split*(tsr: TreeStopRules, n: INode, X: MatrixView[float32], y: VectorView[float32]): bool {.gcsafe.} =
64 |     any_true(tsr.pre_split_rules, n, X, y)
65 | 
66 | proc on_post_split*(tsr: TreeStopRules, n: INode, X: MatrixView[float32], y: VectorView[float32], X1: MatrixView[float32], y1: VectorView[float32], X2: MatrixView[float32], y2: VectorView[float32], split: SplitResult): bool {.gcsafe.} =
67 |     any_true(tsr.post_split_rules, n, X, y, X1, y1, X2, y2, split)


--------------------------------------------------------------------------------
/src/task.nim:
--------------------------------------------------------------------------------
1 | type Task* = enum 
2 |     Classification, Regression


--------------------------------------------------------------------------------
/src/train/sons_gen.nim:
--------------------------------------------------------------------------------
 1 | import ../node/node, ../node/constructors, ../node/leaf
 2 | import split, stop, splitresult
 3 | import typetraits
 4 | import ../rule/tree_rules
 5 | import options
 6 | import ../view
 7 | 
 8 | type Sons = tuple[first, second: Node, X1, X2: MatrixView[float32], y1, y2: VectorView[float32]]
 9 | 
10 | proc generate_sons*(n: Node, X: MatrixView[float32], y: VectorView[float32]): Option[Sons] {.gcsafe.} =
11 |     let split: SplitResult = best_split(n.tree_task, n.impurity_f, X, y, n.max_features)
12 |     # echo "Split on ", split.col, " with value ",  split.split_value
13 |     # echo "tot split value: ", split.impurity, " and single values " , split.impurity_1, " - ", split.impurity_2
14 |     n.split_column = split.col
15 |     n.split_value  = split.split_value
16 |     let
17 |         x1_len = split.index[0].len
18 |         x2_len = split.index[1].len
19 |     var 
20 |         X1 = new_matrix_view(X, split.index[0])
21 |         y1 = new_vector_view(y, split.index[0])
22 |         X2 = new_matrix_view(X, split.index[1])
23 |         y2 = new_vector_view(y, split.index[1])
24 |     if n.stop_rules.on_post_split(n, X, y, X1, y1, X2, y2, split):
25 |         # echo "Split block on depth ", n.level
26 |         return options.none[Sons]()     
27 |     
28 |     if n.stop_rules.on_creation(n, X1, y1):
29 |         n.sons[0] = new_leaf(n, X1, y1)
30 |     else:
31 |         n.sons[0] = new_son(n)
32 |     if n.stop_rules.on_creation(n, X2, y2):
33 |         n.sons[1] = new_leaf(n, X2, y2)
34 |     else:
35 |         n.sons[1] = new_son(n)
36 |     n.sons[0].impurity_value = split.impurity_1
37 |     n.sons[1].impurity_value = split.impurity_2
38 | 
39 |     return some((n.sons[0], n.sons[1], X1, X2, y1, y2))


--------------------------------------------------------------------------------
/src/train/split.nim:
--------------------------------------------------------------------------------
  1 | import ../node/node
  2 | import algorithm
  3 | import system
  4 | import math
  5 | import ../utils
  6 | import random
  7 | import splitresult
  8 | import ../view
  9 | import tables
 10 | import sugar
 11 | import ../task
 12 | import values_selection
 13 | import ../impurity
 14 |    
 15 | 
 16 | 
 17 | proc split_y_by_value_classification(x_col: ColumnMatrixView[float32], y: VectorView[float32], split_value: float32): tuple[y1, y2: CountTableRef[float32], i1, i2: seq[int]] =
 18 |     var 
 19 |         y1 = newCountTable[float32]((y.len / 2).int.nextPowerOfTwo)
 20 |         y2 = newCountTable[float32]((y.len / 2).int.nextPowerOfTwo)
 21 |         i1 = new_seq[int](0)
 22 |         i2 = new_seq[int](0)
 23 |     for i, v in x_col:
 24 |         if v < split_value:
 25 |             y1.inc y.get_raw(i)
 26 |             i1.add i
 27 |         else:
 28 |             y2.inc y.get_raw(i)
 29 |             i2.add i
 30 |     return (y1, y2, i1, i2)
 31 | 
 32 | proc split_y_by_value_regression(x_col: ColumnMatrixView[float32], y: VectorView[float32], split_value: float32): tuple[y1, y2: seq[float32], m1, m2: float32, i1, i2: seq[int]] =
 33 |     var 
 34 |         mean_1: float32 = 0.0
 35 |         mean_2: float32 = 0.0
 36 |         y1: seq[float32] = new_seq[float32](0)
 37 |         y2: seq[float32] = new_seq[float32](0)
 38 |         i1 = new_seq[int](0)
 39 |         i2 = new_seq[int](0)
 40 |     for i, v in x_col:
 41 |         if v < split_value:
 42 |             mean_1 += v
 43 |             y1.add v
 44 |             i1.add i
 45 |         else:
 46 |             mean_2 += v
 47 |             y2.add v
 48 |             i2.add i
 49 |     mean_1 /= i1.len.float32
 50 |     mean_2 /= i2.len.float32
 51 |     return (y1, y2, mean_1, mean_2, i1, i2)
 52 | 
 53 | 
 54 | proc best_split_col(t: Task, impurity_f: Impurity, x_col: ColumnMatrixView[float32], y: VectorView[float32]): SplitResult {.gcsafe.} =
 55 |     ## 
 56 |     assert x_col.len == y.len
 57 |     let splits = percentiles(x_col, 10)
 58 |     var min_impurity = Inf
 59 |     var best_split = 0.0
 60 |     var best_i1: seq[int]
 61 |     var best_i2: seq[int]
 62 |     var min_impurity_1: float32 = 0.0
 63 |     var min_impurity_2: float32 = 0.0
 64 |     for split in splits:
 65 |         var
 66 |             impurity_y1 = Inf
 67 |             impurity_y2 = Inf
 68 |             split_i1: seq[int] = @[]
 69 |             split_i2: seq[int] = @[]
 70 |         if t == Classification:
 71 |             let (count_y1, count_y2, i1, i2) = split_y_by_value_classification(x_col, y, split)
 72 |             if i1.len == 0 or i2.len == 0:
 73 |                 continue
 74 |             let freq_y1 = collect(newSeq):
 75 |                 for x in count_y1.values:
 76 |                     x.float32 / y.len.float32
 77 |             let freq_y2 = collect(newSeq):
 78 |                 for x in count_y2.values:
 79 |                     x.float32 / y.len.float32
 80 |             var true_impurity_f: proc(p_y: seq[float32]): float32 {.gcsafe.} = nil
 81 |             if impurity_f == Gini:
 82 |                 true_impurity_f = gini
 83 |             elif impurity_f == Entropy:
 84 |                 true_impurity_f = entropy
 85 |             impurity_y1 = true_impurity_f(freq_y1)
 86 |             impurity_y2 = true_impurity_f(freq_y2)
 87 |             split_i1 := i1
 88 |             split_i2 := i2
 89 |         else:
 90 |             let (y1, y2, mean_1, mean_2, i1, i2) = split_y_by_value_regression(x_col, y, split)
 91 |             if i1.len == 0 or i2.len == 0:
 92 |                 continue
 93 |             if impurity_f == Mse:
 94 |                 impurity_y1 = mse_from_mean(y1, mean_1)
 95 |                 impurity_y2 = mse_from_mean(y2, mean_2)
 96 |                 split_i1 := i1
 97 |                 split_i2 := i2
 98 |             else:
 99 |                 raise newException(ValueError, "impurity_f in regression must be Mse")
100 |         let tot_impurity: float32 = impurity_y1 + impurity_y2
101 |         if min_impurity > tot_impurity and split_i1.len > 0 and split_i2.len > 0:
102 |             min_impurity = tot_impurity
103 |             best_split = split
104 |             best_i1 := split_i1
105 |             best_i2 := split_i2
106 |             min_impurity_1 = impurity_y1
107 |             min_impurity_2 = impurity_y2
108 |     return new_split_result(best_split, min_impurity, -1, [best_i1, best_i2], min_impurity_1, min_impurity_2)
109 | 
110 | 
111 | 
112 | proc random_features(num_features: int, max_features: float32): seq[int] =
113 |     result = newSeq[int](0)
114 |     for i in 0..<num_features:
115 |         if rand(1.0) < max_features:
116 |             result.add i
117 | 
118 | proc best_split*(task: Task, impurity_f: Impurity, X: MatrixView[float32], y: VectorView[float32], max_features: float32 = 1.0): SplitResult {.gcsafe.} =
119 |     var 
120 |         best_split: SplitResult = new_split_result(-1, Inf)
121 |     for j in random_features(X.ncols, max_features):
122 |         let j_split = best_split_col(task, impurity_f, X.column(j), y)
123 |         if best_split.impurity > j_split.impurity:
124 |             best_split = j_split
125 |             best_split.col = j
126 |     return best_split
127 | 


--------------------------------------------------------------------------------
/src/train/splitresult.nim:
--------------------------------------------------------------------------------
 1 | type 
 2 |     SplitResult* = ref object
 3 |         split_value*: float32
 4 |         impurity*: float32
 5 |         col*: int
 6 |         index*: array[2, seq[int]]
 7 |         impurity_1*: float32
 8 |         impurity_2*: float32
 9 | 
10 | 
11 | proc new_split_result*(split_value, impurity: float32): SplitResult =
12 |     result = new(SplitResult)
13 |     result.split_value = split_value
14 |     result.impurity = impurity
15 |         
16 | proc new_split_result*(split_value, impurity: float32, col: int, index: array[2, seq[int]], impurity_1: float32 = 0.0, impurity_2: float32 = 0.0): SplitResult =
17 |     result = new(SplitResult)
18 |     result.split_value = split_value
19 |     result.impurity = impurity
20 |     result.col = col
21 |     result.index = index
22 |     result.impurity_1 = impurity_1
23 |     result.impurity_2 = impurity_2
24 | 


--------------------------------------------------------------------------------
/src/train/stop.nim:
--------------------------------------------------------------------------------
 1 | import ../node/node
 2 | import ../view
 3 | import ../task
 4 | 
 5 | proc unique_class(y: VectorView[float32]): bool =
 6 |     for value in y:
 7 |         if value != y[0]:
 8 |             return false
 9 |     # echo y, " has one class"
10 |     return true
11 | 
12 | proc on_creating_new_node*(n: Node, X: MatrixView[float32], y: VectorView[float32]): bool =
13 |     if n.tree_task == Classification:
14 |         if unique_class(y):
15 |             return true
16 |     return n.level >= 3 or y.len <= 1
17 | 
18 | 


--------------------------------------------------------------------------------
/src/train/train.nim:
--------------------------------------------------------------------------------
 1 | import ../node/[node, constructors]
 2 | import sons_gen
 3 | import options
 4 | import ../view
 5 | 
 6 | type NodeWithData = tuple[n: Node, X: MatrixView[float32], y: VectorView[float32]]
 7 | 
 8 | ## Train function of decision tree
 9 | proc fit* (root: Node, X: MatrixView[float32], y: VectorView[float32]) {.gcsafe.} =
10 |     assert X.len == y.len
11 |     var border = new_seq[NodeWithData](1)
12 |     border[0] = (root, X, y)
13 |     while border.len > 0:
14 |         let (node, X_data, y_data) = border.pop()
15 |         let sons_opt = node.generate_sons(X_data, y_data)
16 |         if sons_opt.is_some():
17 |             let sons = sons_opt.get()
18 |             if not(sons.first of Leaf):
19 |                 border.add((sons.first, sons.X1, sons.y1))
20 |             if not(sons.second of Leaf):
21 |                 border.add((sons.second, sons.X2, sons.y2))
22 |         elif sons_opt.is_none() and not node.father.isNil():
23 |             let father = node.father
24 |             if father.num_sons == 0:
25 |                 father.sons[0] = new_leaf(father, X_data, y_data)
26 |             else:
27 |                 for i in 0..<father.num_sons:
28 |                     let son = father.sons[i]
29 |                     if son == node:
30 |                         father.num_sons -= 1
31 |                         father.sons[i] = new_leaf(father, X_data, y_data)
32 |         else:
33 |             raise newException(RootIsLeaf, "Root is leaf")
34 | 


--------------------------------------------------------------------------------
/src/train/values_selection.nim:
--------------------------------------------------------------------------------
 1 | import ../view
 2 | import algorithm
 3 | import math
 4 | 
 5 | ##Strategies to get split values
 6 | 
 7 | proc uniform*(data: ColumnMatrixView[float32], n: int): seq[float32] =
 8 |     ## Search min and max on data and then generate n values uniformly between min and max
 9 |     result = newSeq[float32](n-1)
10 |     var
11 |         min_value = Inf
12 |         max_value = - Inf
13 |     for v in data:
14 |         if v < min_value:
15 |             min_value = v
16 |         if v > max_value:
17 |             max_value = v
18 |     for i in 1..(n-1):
19 |         result[i-1] = min_value + (max_value - min_value).float32 * i.float32 / n.float32
20 | 
21 | proc percentiles*(data: ColumnMatrixView[float32], n: int): seq[float32] =
22 |     let data_sorted = data.to_seq.sorted
23 |     if data.len < n:
24 |         result = new_seq[float32](data.len - 1)
25 |         for i in 0..<data_sorted.len-1:
26 |             result[i] = (data_sorted[i] + data_sorted[i+1]) / 2.0
27 |         return result
28 | 
29 |     result = new_seq[float32](n-1)
30 |     for i in 0..<(n-1):
31 |         let data_index = ((i + 1).float32 / n.float32 * data_sorted.len.float32).round.int
32 |         if data_index >= data.len - 1:
33 |             continue
34 |         result[i] = (data_sorted[data_index] + data_sorted[data_index+1]) / 2.0
35 | 


--------------------------------------------------------------------------------
/src/tree.nim:
--------------------------------------------------------------------------------
 1 | import node/[node, constructors, traverse]
 2 | import train/[split, sons_gen, train]
 3 | import task
 4 | import typetraits
 5 | import sequtils
 6 | import rule/[tree_rules, stop_rules, bagging]
 7 | import hyperparams
 8 | 
 9 | type 
10 |     DecisionTree*  = ref object
11 |         root: Node
12 |         stop_rules: TreeStopRules
13 |         hyperparams: Hyperparams
14 | 
15 | proc assert_int_hp(value: int, msg: string = "") =
16 |     assert value == -1 or value > 0, msg
17 | 
18 | proc assert_0_1_float32_hp(value: float32, msg: string = "") = 
19 |     assert value <= 1.0 or value >= 0.0 or value == -1.0, msg
20 | 
21 | proc assert_positive_float32_hp(value: float32, msg: string = "") =
22 |     assert value == -1.0 or value > 0.0, msg
23 | 
24 | hyperparams_binding(DecisionTree)
25 | 
26 | 
27 | proc add_rules(tree: DecisionTree, max_depth: int, min_samples_split: int, max_features: float32, min_impurity_decrease: float32) =
28 |     if max_depth != -1:
29 |         tree.stop_rules.add_creation_rule max_depth_rule(max_depth)
30 |     if min_samples_split != -1:
31 |         tree.stop_rules.add_pre_split_rule min_samples_split_rule(min_samples_split)
32 |     if min_impurity_decrease != -1.0:
33 |         tree.stop_rules.add_post_split_rule min_impurity_decrease_rule(min_impurity_decrease)
34 |     tree.stop_rules.add_creation_rule unique_class_rule()
35 | 
36 | 
37 | proc new_tree*(task: Task, h: Hyperparams,
38 |                custom_creation_rules: seq[Rule] = @[],
39 |                custom_pre_split_rules: seq[Rule] = @[],
40 |                custom_post_split_rules: seq[PostSplitRule] = @[]): DecisionTree =
41 |     result = new(DecisionTree)
42 |     assert_int_hp(h.max_depth)
43 |     assert_int_hp(h.min_samples_split)
44 |     assert_0_1_float32_hp(h.max_features)
45 |     assert_positive_float32_hp(h.min_impurity_decrease)
46 |     assert_0_1_float32_hp(h.bagging)
47 |     result.stop_rules = new_tree_stop_rules()
48 |     
49 |     result.add_rules(h.max_depth, h.min_samples_split, h.max_features, h.min_impurity_decrease)
50 |     result.root = new_root(task, stop_rules=result.stop_rules)
51 |     result.hyperparams = h
52 | 
53 | 
54 | proc new_classification_tree* (max_depth: int = -1, min_samples_split: int = -1, max_features: float32 = 1.0, min_impurity_decrease: float32 = 1e-6,
55 |                                bagging: float32 = 1.0): DecisionTree = 
56 |     new_tree(task=Classification, (max_depth, min_samples_split, max_features, min_impurity_decrease, bagging)) 
57 | 
58 | proc new_regression_tree* (max_depth: int = -1, min_samples_split: int = -1, max_features: float32 = 1.0, min_impurity_decrease: float32 = 1e-6,
59 |                            bagging: float32 = 1.0): DecisionTree = 
60 |     new_tree(task=Regression, (max_depth, min_samples_split, max_features, min_impurity_decrease, bagging)) 
61 | 
62 | 
63 | 
64 | ## Train function of decision tree
65 | proc fit* (t: DecisionTree, X: sink seq[seq[float32]], y: sink seq[float32]) {.gcsafe.} =
66 |     let (X_train, y_train) = bagging(X, y, t.bagging)
67 |     fit(t.root, X_train, y_train)
68 | 
69 | proc print_root_split(t: DecisionTree) =
70 |     if t.root of Leaf:
71 |         echo "Root is a leaf"
72 |     else:
73 |         echo "Root node, split in column ", t.root.split_column, " with value ", t.root.split_value
74 |             
75 | proc predict*(tree: DecisionTree, x: sink seq[float32]): float32 {.gcsafe.} =
76 |     tree.root.get_value(x)
77 | 
78 | proc predict*(tree: DecisionTree, X: sink seq[seq[float32]]): seq[float32] {.gcsafe.} =
79 |     result = newSeq[float32](X.len)
80 |     for i, row in X:
81 |         result[i] = tree.predict(row)
82 | 
83 | proc predict_proba*(tree: DecisionTree, x: seq[float32]): seq[float32] {.gcsafe.} =
84 |     tree.root.get_proba(x)
85 | 
86 | proc predict_proba*(tree: DecisionTree, X: seq[seq[float32]]): seq[seq[float32]] {.gcsafe.} =
87 |     result = newSeq[seq[float32]](X.len)
88 |     for i, row in X:
89 |         result[i] = tree.predict_proba(row)


--------------------------------------------------------------------------------
/src/utils.nim:
--------------------------------------------------------------------------------
 1 | import sequtils
 2 | import math
 3 | import random
 4 | 
 5 | proc sample_wo_reins*(start, finish: int, perc: float32): seq[int] =
 6 |     assert 0.0 <= perc and perc <= 1.0
 7 |     if perc == 1.0:
 8 |         return (start..finish).to_seq
 9 |     else:
10 |         var series = (start..finish).to_seq
11 |         let len_sub = (series.len.float32 * perc).round.int
12 |         result = new_seq[int](len_sub)
13 |         for i in 0..<len_sub:
14 |             let i_sample = rand(series.len-1)
15 |             let element = series[i_sample]
16 |             series.del i_sample
17 |             result[i] = element
18 | 
19 | proc mean*(s: seq[float32]): float32 =
20 |     result = 0.0
21 |     for v in s:
22 |         result += v
23 |     result /= s.len.float32
24 | 
25 | template `:=`*[T](a,b: seq[T]): untyped =
26 |     shallowCopy(a,b)
27 |                


--------------------------------------------------------------------------------
/src/view.nim:
--------------------------------------------------------------------------------
 1 | import sequtils
 2 | import utils
 3 | 
 4 | type
 5 |     VectorView*[T: int|float32|float32] = ref object
 6 |         data: seq[T]
 7 |         index: seq[int]
 8 |     MatrixView*[T: int|float32|float32] = ref object
 9 |         data: seq[seq[T]]
10 |         index: seq[int]
11 |         columns: seq[int]
12 |     ColumnMatrixView*[T: int|float32|float32] = ref object
13 |         data: seq[seq[T]]
14 |         index: seq[int]
15 |         fixed: int
16 | 
17 | 
18 | proc new_vector_view*[T: int|float32|float32](data: seq[T], index: seq[int]): VectorView[T] =
19 |     result = new(VectorView[T])
20 |     result.data := data
21 |     result.index = index
22 | 
23 | proc new_vector_view*[T: int|float32|float32](v: VectorView[T], index: seq[int]): VectorView[T] =
24 |     new_vector_view(v.data, index)
25 |     
26 | 
27 | proc new_matrix_view*[T: int|float32|float32](data: seq[seq[T]], index, columns: seq[int]): MatrixView[T] =
28 |     result = new(MatrixView[T])
29 |     result.data := data
30 |     result.index = index
31 |     result.columns = columns
32 | 
33 | proc new_matrix_view*[T: int|float32|float32](data: seq[seq[T]], index: seq[int]): MatrixView[T] =
34 |     new_matrix_view(data, index, (0..<data[0].len).to_seq)
35 |     
36 | proc new_matrix_view*[T: int|float32|float32](m: MatrixView[T], index: seq[int]): MatrixView[T] =
37 |     new_matrix_view(m.data, index, m.columns)
38 |         
39 | proc new_matrix_view*[T: int|float32|float32](m: MatrixView[T], index, columns: seq[int]): MatrixView[T] =
40 |     new_matrix_view(m.data, index, columns)
41 |     
42 | proc set_index*[T: int|float32|float32, V: VectorView[T] | MatrixView[T]](m: V, index: seq[int]) =
43 |     m.index = index
44 | 
45 | proc column*[T: int | float32](m: MatrixView[T], col: int): ColumnMatrixView[T] =
46 |     result = new(ColumnMatrixView[T])
47 |     result.data := m.data
48 |     result.index = m.index
49 |     result.fixed = col
50 | 
51 | proc get_raw*[T: int | float32](v: VectorView[T], i: int): T =
52 |     v.data[i]
53 | 
54 | proc len*[T: int|float32|float32](m: MatrixView[T]): int = m.index.len
55 | proc ncols*[T: int|float32|float32](m: MatrixView[T]): int = m.data[0].len
56 | proc len*[T: int|float32|float32](m: VectorView[T]): int = m.index.len
57 | proc len*[T: int|float32|float32](m: ColumnMatrixView[T]): int = m.index.len
58 | 
59 | iterator items*[T: int|float32|float32](c: ColumnMatrixView[T]): T =
60 |     for i in c.index:
61 |         yield c.data[i][c.fixed]
62 | 
63 | iterator pairs*[T: int|float32|float32](c: ColumnMatrixView[T]): tuple[i: int, v: T] =
64 |     for i in c.index:
65 |         yield (i, c.data[i][c.fixed])
66 | 
67 | iterator items*[T: int|float32|float32](c: VectorView[T]): T =
68 |     for i in c.index:
69 |         yield c.data[i]
70 | 
71 | iterator pairs*[T: int|float32|float32](c: VectorView[T]): tuple[i: int, v: T] =
72 |     for i in c.index:
73 |         yield (i, c.data[i])
74 | 
75 | proc `[]`*[T: int|float32|float32](v: VectorView[T], i: int): T =
76 |     v.data[v.index[i]]
77 | 
78 | 
79 | 
80 | proc to_seq*[T: int | float32](c: ColumnMatrixView[T]): seq[T] =
81 |     result = new_seq[T](c.len)
82 |     var i = 0
83 |     for v in c:
84 |         result[i] = v
85 |         inc i
86 | 
87 | proc to_seq*[T: int | float32](c: VectorView[T]): seq[T] =
88 |     result = new_seq[T](c.len)
89 |     var i = 0
90 |     for v in c:
91 |         result[i] = v
92 |         inc i
93 |         


--------------------------------------------------------------------------------
/tests/data/X_data:
--------------------------------------------------------------------------------
  1 | -0.36641793842156156,-0.429159485487535
  2 | -0.4061166079080828,0.39102705855784003
  3 | 0.7140186830265457,-0.13948940677597385
  4 | 0.057861742030653335,-0.5248159587904203
  5 | 0.8310881195754597,0.4835960956801931
  6 | 0.1184956385183408,0.8302753428850052
  7 | 0.3791225205461498,-1.4493378549241858
  8 | 0.5763428847451009,0.15382235556727616
  9 | -0.5353349357686806,0.15885491763007292
 10 | 0.5635628200391326,-0.11838805842707455
 11 | -0.35814346975095057,0.8141077306881301
 12 | -0.5584283396804465,0.4150490389720745
 13 | 0.3489685364472359,0.5125766661828991
 14 | 0.2917747381159952,-0.8095282854523855
 15 | 0.28164678566847684,0.17881606544549059
 16 | 0.5168569765791015,-0.029789321047778473
 17 | 0.2954034181172027,-0.10323942233851054
 18 | -0.7696170272760169,0.6170996039734966
 19 | 0.30541601770294025,0.2408989005727877
 20 | 0.5972850110470284,1.4071244358877628
 21 | -0.47041674985839665,-0.4361712435108494
 22 | -0.03601609170374967,0.6146445737100239
 23 | 0.2015487364460171,0.30749810926945065
 24 | 0.6496272436576279,-0.10307399261622491
 25 | -0.02115912637177074,0.19702032239152056
 26 | 0.4689337447825168,0.10598860888247766
 27 | -0.4658426528692915,0.3145962622461056
 28 | 0.6268309896887099,-0.33057691507977943
 29 | 0.5850045281793533,-0.2708371588251595
 30 | -0.12113120035097272,-0.3284302603968388
 31 | -0.5456721155019113,0.09633537737910214
 32 | 0.787331786159895,0.03556125089135423
 33 | 0.04736899988190577,-0.34824202771699175
 34 | 0.37648965039107807,0.19786576727166838
 35 | -0.6978147575454167,-0.3581669295162409
 36 | 0.41374298217906813,-0.8695346776733162
 37 | -0.5459272060513359,0.008273425026697364
 38 | 0.09794222472624782,0.010586135600705914
 39 | 0.7408531441161239,-0.020236575179099544
 40 | -0.2581705968986483,-0.7976070808011977
 41 | -0.8169668389921012,-0.10531790012253879
 42 | -0.05604199976914878,0.26473409156972877
 43 | -0.6697455384706206,0.04197342214423673
 44 | 0.6568166070868406,0.7063190527583685
 45 | 0.13976101506353272,-0.15692754389313865
 46 | 0.909744982834952,0.28932376462575227
 47 | 0.2842045327837953,-0.23466980654109817
 48 | 0.6752777807368546,0.3200283665295217
 49 | 0.8147990639419572,0.5897914495108915
 50 | -0.30619180483505415,0.21253996034472925
 51 | 0.748968430684262,-0.627120686591937
 52 | 0.5304065061498819,0.37020127902655187
 53 | 0.3193042711615959,0.2933421948780569
 54 | -0.2476000093343488,-0.05669851286996653
 55 | -0.06889684143790184,-0.02801550286539519
 56 | 0.11296797934452468,-0.6867366213294736
 57 | -0.1525298228463791,0.1154370623223742
 58 | 1.538808154604028,-0.6556768954368469
 59 | -0.4393406466278966,-0.6110900962342881
 60 | -0.4285254851003676,0.13549421740165435
 61 | 0.014034667405065537,-0.06913401849947612
 62 | 0.11294422159067374,0.06198506495677632
 63 | 0.9379790224737846,0.22980618848092185
 64 | -0.31050530093738604,-0.1524386628195747
 65 | 0.0787296356116825,0.22038187736402212
 66 | 0.15343032307917512,0.005688406341872599
 67 | 0.48606832176693826,0.11498407014561708
 68 | 0.08243205976679799,0.3937410749617549
 69 | -0.6073642961147239,0.030471632010655572
 70 | 0.5988403898910182,0.7898641031378761
 71 | -0.6682959316250493,-0.6525310020514372
 72 | 0.05426321984856326,0.09552533224519963
 73 | -0.05889981666256658,-0.7038463055279309
 74 | 0.5903159867826197,-0.25222616485084026
 75 | 0.576205916576858,0.8657434494051687
 76 | -0.19776117742607255,-1.0646949523370532
 77 | -0.16727514346708564,-0.375589072060782
 78 | 0.8309755021234423,0.9576194923310715
 79 | -0.14164412031430135,0.3202922967629782
 80 | 0.44121032421455164,0.25769569039730184
 81 | -0.8632285987873524,-0.36659693061238957
 82 | 0.47619185749651316,0.43320278212782093
 83 | -0.5724451123772991,-0.5497525849258834
 84 | 0.170249502395943,0.5922094450937404
 85 | -0.0269991013740849,-0.21318341597510682
 86 | 0.3327581769782026,-0.017307044340087483
 87 | -0.006287423226618516,-0.3397786203860869
 88 | -0.07167355018205243,0.8064968691788647
 89 | 0.01072399803443487,-0.7208192895531926
 90 | -0.10411544116180063,0.21120894774142998
 91 | -0.39338525775144473,0.15967074343373217
 92 | -0.047748633761903,-0.38902998937278754
 93 | -0.8810653760395565,-0.1028080025022986
 94 | -0.8456486781937496,-0.17762243731854008
 95 | 0.5077280154601116,-0.18792991523079997
 96 | -0.05950242927127622,0.3603327193255425
 97 | 0.26464757251771,-0.29931599402594544
 98 | 1.0483144332787566,-0.04463079405462066
 99 | 0.22668733171824573,0.4200467855783969
100 | 0.3648048723531465,-0.27490372657923784
101 | 3.17573962746139,0.26572477440019965
102 | 3.141978013664265,-0.5756602460629697
103 | 3.596450722167308,-0.06741881516472495
104 | 2.402474311237391,0.44052091098380464
105 | 2.6595135984405696,-0.24556268473760037
106 | 2.897864441752506,0.3185629085429191
107 | 2.7838382572077722,0.5923804814515521
108 | 3.8556463836480686,-0.44314130164296067
109 | 2.3906250890702716,-0.11877627292716403
110 | 2.194968555000227,-0.3695123887681875
111 | 2.273348856368706,-0.4708546027848647
112 | 3.3107687873317593,0.16536015017746386
113 | 2.501353934437842,-0.45845127755370835
114 | 3.2124609080526625,-0.2850460914131571
115 | 2.4219556910487245,0.08562793408959096
116 | 3.216449926517657,0.23129606244084414
117 | 2.8816929894782675,0.13158634714027576
118 | 2.2840645568131386,-0.41329212245358943
119 | 3.8295037774834912,-0.08257377481215251
120 | 3.661044047079203,0.6480644262150818
121 | 2.452990932932072,0.08226825097827634
122 | 3.51621954825505,-0.485568441569125
123 | 3.698075393482856,0.5823923621097492
124 | 3.1712607337213075,-0.26249355849240635
125 | 2.6373918192077226,-0.3616926018716566
126 | 2.8890185338758436,-0.34683203587583966
127 | 2.4035286412999795,-0.6541777644580316
128 | 2.6133118300352107,-1.0142883075827231
129 | 3.359108676986657,0.17870249845003622
130 | 2.811513429839873,-0.20056312727864173
131 | 3.22311014515164,-1.0774002205594795
132 | 3.4974966728044627,-0.41094796107845294
133 | 2.8748654024294917,-0.19383270698031765
134 | 2.7340672854221095,0.6943397301729091
135 | 2.3274072242844945,0.2550594254496289
136 | 3.430772040306046,-0.10836033790219188
137 | 3.008551430204315,-0.7060570059999528
138 | 2.7397607891621463,0.2512121914759478
139 | 3.247610573105846,-0.07108938302522194
140 | 3.9115246162921324,0.728513323583129
141 | 2.887559675895798,0.47200095258539954
142 | 3.161461278782903,0.567619308062183
143 | 3.2513837141921744,-0.5599877191524206
144 | 2.401213585809474,-0.14746310643015723
145 | 3.1031089025028864,-0.22570755765795705
146 | 3.4861796618104286,0.6625569830513529
147 | 2.673467889714713,0.8261677365555756
148 | 3.381142232174305,-0.11349500338870376
149 | 3.1506946652599197,-0.5776668556251585
150 | 3.4174185396407837,-0.3145327258053042
151 | 3.463451934652102,0.07319240579793035
152 | 2.5677447729138416,-0.015213867586538646
153 | 2.607328106333609,-0.10208022557805266
154 | 3.4162163729309514,-0.2820815230371721
155 | 2.6502485732641503,-0.03538678952587996
156 | 3.0864137420426276,0.6283775620866703
157 | 3.281362838088838,0.21212713269906688
158 | 3.5008548344436847,-0.5122549617159493
159 | 3.0570948773427813,0.06827671493491068
160 | 3.426697249010866,-0.46817658050348554
161 | 2.06405883385865,-0.12334961873563037
162 | 2.7615906034024293,-0.2930681661629582
163 | 2.391817139745516,-0.2789308718595137
164 | 2.4010157601832436,-0.4622386769307986
165 | 2.7504870329126776,-0.17765628296870198
166 | 2.5398500712345595,-0.17360229575418032
167 | 2.077303205569366,-0.5517421639118327
168 | 2.7704267939060183,-0.15090167792076095
169 | 4.0316104851937205,-0.22294854654819757
170 | 2.990948946848891,0.4025241684598176
171 | 3.5973341755606265,-0.12826845964323338
172 | 2.670663286352112,-0.17435668832450063
173 | 3.216149482387293,0.5405625906792471
174 | 3.304755798695259,-0.1142619126955857
175 | 2.2846306001315777,0.317721669603206
176 | 2.7943081366395646,0.1949019546826775
177 | 2.155720655663028,-0.5748444331602837
178 | 2.8397318213103664,-0.41453419805779224
179 | 2.5078054139863153,0.22145395948362345
180 | 3.6652311151305854,0.14329044966478946
181 | 2.3234151765564723,-0.4456966657163773
182 | 2.7220907467530937,-0.43784453314747984
183 | 3.3762454915579503,0.03842764764251685
184 | 2.9063027864228594,0.7222285449196914
185 | 3.2189477078531223,0.2638922434121966
186 | 3.37668324524433,0.5930271875998191
187 | 4.096014573185081,0.2351635585953516
188 | 3.9177222055515766,0.08314859771135891
189 | 2.4285111313688965,0.18204944772186124
190 | 3.8821027850072785,-1.0241735071161544
191 | 2.588578063002479,0.1361086742709238
192 | 3.621626544048108,-0.02503893907498763
193 | 3.293912729854787,1.1164202958767029
194 | 2.737310803128809,-0.09904114854642386
195 | 2.9570245196560947,-0.5340105871010896
196 | 2.238944398335324,-0.36838544651183386
197 | 4.470094347316829,0.24874565447863423
198 | 2.9585828428902015,0.09813822540179816
199 | 3.8010803966661117,-0.0029368295077830896
200 | 3.428776373760252,-0.4276051759087078


--------------------------------------------------------------------------------
/tests/data/X_iris.csv:
--------------------------------------------------------------------------------
  1 | 5.1,3.5,1.4,0.2
  2 | 4.9,3.0,1.4,0.2
  3 | 4.7,3.2,1.3,0.2
  4 | 4.6,3.1,1.5,0.2
  5 | 5.0,3.6,1.4,0.2
  6 | 5.4,3.9,1.7,0.4
  7 | 4.6,3.4,1.4,0.3
  8 | 5.0,3.4,1.5,0.2
  9 | 4.4,2.9,1.4,0.2
 10 | 4.9,3.1,1.5,0.1
 11 | 5.4,3.7,1.5,0.2
 12 | 4.8,3.4,1.6,0.2
 13 | 4.8,3.0,1.4,0.1
 14 | 4.3,3.0,1.1,0.1
 15 | 5.8,4.0,1.2,0.2
 16 | 5.7,4.4,1.5,0.4
 17 | 5.4,3.9,1.3,0.4
 18 | 5.1,3.5,1.4,0.3
 19 | 5.7,3.8,1.7,0.3
 20 | 5.1,3.8,1.5,0.3
 21 | 5.4,3.4,1.7,0.2
 22 | 5.1,3.7,1.5,0.4
 23 | 4.6,3.6,1.0,0.2
 24 | 5.1,3.3,1.7,0.5
 25 | 4.8,3.4,1.9,0.2
 26 | 5.0,3.0,1.6,0.2
 27 | 5.0,3.4,1.6,0.4
 28 | 5.2,3.5,1.5,0.2
 29 | 5.2,3.4,1.4,0.2
 30 | 4.7,3.2,1.6,0.2
 31 | 4.8,3.1,1.6,0.2
 32 | 5.4,3.4,1.5,0.4
 33 | 5.2,4.1,1.5,0.1
 34 | 5.5,4.2,1.4,0.2
 35 | 4.9,3.1,1.5,0.1
 36 | 5.0,3.2,1.2,0.2
 37 | 5.5,3.5,1.3,0.2
 38 | 4.9,3.1,1.5,0.1
 39 | 4.4,3.0,1.3,0.2
 40 | 5.1,3.4,1.5,0.2
 41 | 5.0,3.5,1.3,0.3
 42 | 4.5,2.3,1.3,0.3
 43 | 4.4,3.2,1.3,0.2
 44 | 5.0,3.5,1.6,0.6
 45 | 5.1,3.8,1.9,0.4
 46 | 4.8,3.0,1.4,0.3
 47 | 5.1,3.8,1.6,0.2
 48 | 4.6,3.2,1.4,0.2
 49 | 5.3,3.7,1.5,0.2
 50 | 5.0,3.3,1.4,0.2
 51 | 7.0,3.2,4.7,1.4
 52 | 6.4,3.2,4.5,1.5
 53 | 6.9,3.1,4.9,1.5
 54 | 5.5,2.3,4.0,1.3
 55 | 6.5,2.8,4.6,1.5
 56 | 5.7,2.8,4.5,1.3
 57 | 6.3,3.3,4.7,1.6
 58 | 4.9,2.4,3.3,1.0
 59 | 6.6,2.9,4.6,1.3
 60 | 5.2,2.7,3.9,1.4
 61 | 5.0,2.0,3.5,1.0
 62 | 5.9,3.0,4.2,1.5
 63 | 6.0,2.2,4.0,1.0
 64 | 6.1,2.9,4.7,1.4
 65 | 5.6,2.9,3.6,1.3
 66 | 6.7,3.1,4.4,1.4
 67 | 5.6,3.0,4.5,1.5
 68 | 5.8,2.7,4.1,1.0
 69 | 6.2,2.2,4.5,1.5
 70 | 5.6,2.5,3.9,1.1
 71 | 5.9,3.2,4.8,1.8
 72 | 6.1,2.8,4.0,1.3
 73 | 6.3,2.5,4.9,1.5
 74 | 6.1,2.8,4.7,1.2
 75 | 6.4,2.9,4.3,1.3
 76 | 6.6,3.0,4.4,1.4
 77 | 6.8,2.8,4.8,1.4
 78 | 6.7,3.0,5.0,1.7
 79 | 6.0,2.9,4.5,1.5
 80 | 5.7,2.6,3.5,1.0
 81 | 5.5,2.4,3.8,1.1
 82 | 5.5,2.4,3.7,1.0
 83 | 5.8,2.7,3.9,1.2
 84 | 6.0,2.7,5.1,1.6
 85 | 5.4,3.0,4.5,1.5
 86 | 6.0,3.4,4.5,1.6
 87 | 6.7,3.1,4.7,1.5
 88 | 6.3,2.3,4.4,1.3
 89 | 5.6,3.0,4.1,1.3
 90 | 5.5,2.5,4.0,1.3
 91 | 5.5,2.6,4.4,1.2
 92 | 6.1,3.0,4.6,1.4
 93 | 5.8,2.6,4.0,1.2
 94 | 5.0,2.3,3.3,1.0
 95 | 5.6,2.7,4.2,1.3
 96 | 5.7,3.0,4.2,1.2
 97 | 5.7,2.9,4.2,1.3
 98 | 6.2,2.9,4.3,1.3
 99 | 5.1,2.5,3.0,1.1
100 | 5.7,2.8,4.1,1.3
101 | 6.3,3.3,6.0,2.5
102 | 5.8,2.7,5.1,1.9
103 | 7.1,3.0,5.9,2.1
104 | 6.3,2.9,5.6,1.8
105 | 6.5,3.0,5.8,2.2
106 | 7.6,3.0,6.6,2.1
107 | 4.9,2.5,4.5,1.7
108 | 7.3,2.9,6.3,1.8
109 | 6.7,2.5,5.8,1.8
110 | 7.2,3.6,6.1,2.5
111 | 6.5,3.2,5.1,2.0
112 | 6.4,2.7,5.3,1.9
113 | 6.8,3.0,5.5,2.1
114 | 5.7,2.5,5.0,2.0
115 | 5.8,2.8,5.1,2.4
116 | 6.4,3.2,5.3,2.3
117 | 6.5,3.0,5.5,1.8
118 | 7.7,3.8,6.7,2.2
119 | 7.7,2.6,6.9,2.3
120 | 6.0,2.2,5.0,1.5
121 | 6.9,3.2,5.7,2.3
122 | 5.6,2.8,4.9,2.0
123 | 7.7,2.8,6.7,2.0
124 | 6.3,2.7,4.9,1.8
125 | 6.7,3.3,5.7,2.1
126 | 7.2,3.2,6.0,1.8
127 | 6.2,2.8,4.8,1.8
128 | 6.1,3.0,4.9,1.8
129 | 6.4,2.8,5.6,2.1
130 | 7.2,3.0,5.8,1.6
131 | 7.4,2.8,6.1,1.9
132 | 7.9,3.8,6.4,2.0
133 | 6.4,2.8,5.6,2.2
134 | 6.3,2.8,5.1,1.5
135 | 6.1,2.6,5.6,1.4
136 | 7.7,3.0,6.1,2.3
137 | 6.3,3.4,5.6,2.4
138 | 6.4,3.1,5.5,1.8
139 | 6.0,3.0,4.8,1.8
140 | 6.9,3.1,5.4,2.1
141 | 6.7,3.1,5.6,2.4
142 | 6.9,3.1,5.1,2.3
143 | 5.8,2.7,5.1,1.9
144 | 6.8,3.2,5.9,2.3
145 | 6.7,3.3,5.7,2.5
146 | 6.7,3.0,5.2,2.3
147 | 6.3,2.5,5.0,1.9
148 | 6.5,3.0,5.2,2.0
149 | 6.2,3.4,5.4,2.3
150 | 5.9,3.0,5.1,1.8
151 | 


--------------------------------------------------------------------------------
/tests/data/iris.data:
--------------------------------------------------------------------------------
  1 | 5.1,3.5,1.4,0.2,Iris-setosa
  2 | 4.9,3.0,1.4,0.2,Iris-setosa
  3 | 4.7,3.2,1.3,0.2,Iris-setosa
  4 | 4.6,3.1,1.5,0.2,Iris-setosa
  5 | 5.0,3.6,1.4,0.2,Iris-setosa
  6 | 5.4,3.9,1.7,0.4,Iris-setosa
  7 | 4.6,3.4,1.4,0.3,Iris-setosa
  8 | 5.0,3.4,1.5,0.2,Iris-setosa
  9 | 4.4,2.9,1.4,0.2,Iris-setosa
 10 | 4.9,3.1,1.5,0.1,Iris-setosa
 11 | 5.4,3.7,1.5,0.2,Iris-setosa
 12 | 4.8,3.4,1.6,0.2,Iris-setosa
 13 | 4.8,3.0,1.4,0.1,Iris-setosa
 14 | 4.3,3.0,1.1,0.1,Iris-setosa
 15 | 5.8,4.0,1.2,0.2,Iris-setosa
 16 | 5.7,4.4,1.5,0.4,Iris-setosa
 17 | 5.4,3.9,1.3,0.4,Iris-setosa
 18 | 5.1,3.5,1.4,0.3,Iris-setosa
 19 | 5.7,3.8,1.7,0.3,Iris-setosa
 20 | 5.1,3.8,1.5,0.3,Iris-setosa
 21 | 5.4,3.4,1.7,0.2,Iris-setosa
 22 | 5.1,3.7,1.5,0.4,Iris-setosa
 23 | 4.6,3.6,1.0,0.2,Iris-setosa
 24 | 5.1,3.3,1.7,0.5,Iris-setosa
 25 | 4.8,3.4,1.9,0.2,Iris-setosa
 26 | 5.0,3.0,1.6,0.2,Iris-setosa
 27 | 5.0,3.4,1.6,0.4,Iris-setosa
 28 | 5.2,3.5,1.5,0.2,Iris-setosa
 29 | 5.2,3.4,1.4,0.2,Iris-setosa
 30 | 4.7,3.2,1.6,0.2,Iris-setosa
 31 | 4.8,3.1,1.6,0.2,Iris-setosa
 32 | 5.4,3.4,1.5,0.4,Iris-setosa
 33 | 5.2,4.1,1.5,0.1,Iris-setosa
 34 | 5.5,4.2,1.4,0.2,Iris-setosa
 35 | 4.9,3.1,1.5,0.1,Iris-setosa
 36 | 5.0,3.2,1.2,0.2,Iris-setosa
 37 | 5.5,3.5,1.3,0.2,Iris-setosa
 38 | 4.9,3.1,1.5,0.1,Iris-setosa
 39 | 4.4,3.0,1.3,0.2,Iris-setosa
 40 | 5.1,3.4,1.5,0.2,Iris-setosa
 41 | 5.0,3.5,1.3,0.3,Iris-setosa
 42 | 4.5,2.3,1.3,0.3,Iris-setosa
 43 | 4.4,3.2,1.3,0.2,Iris-setosa
 44 | 5.0,3.5,1.6,0.6,Iris-setosa
 45 | 5.1,3.8,1.9,0.4,Iris-setosa
 46 | 4.8,3.0,1.4,0.3,Iris-setosa
 47 | 5.1,3.8,1.6,0.2,Iris-setosa
 48 | 4.6,3.2,1.4,0.2,Iris-setosa
 49 | 5.3,3.7,1.5,0.2,Iris-setosa
 50 | 5.0,3.3,1.4,0.2,Iris-setosa
 51 | 7.0,3.2,4.7,1.4,Iris-versicolor
 52 | 6.4,3.2,4.5,1.5,Iris-versicolor
 53 | 6.9,3.1,4.9,1.5,Iris-versicolor
 54 | 5.5,2.3,4.0,1.3,Iris-versicolor
 55 | 6.5,2.8,4.6,1.5,Iris-versicolor
 56 | 5.7,2.8,4.5,1.3,Iris-versicolor
 57 | 6.3,3.3,4.7,1.6,Iris-versicolor
 58 | 4.9,2.4,3.3,1.0,Iris-versicolor
 59 | 6.6,2.9,4.6,1.3,Iris-versicolor
 60 | 5.2,2.7,3.9,1.4,Iris-versicolor
 61 | 5.0,2.0,3.5,1.0,Iris-versicolor
 62 | 5.9,3.0,4.2,1.5,Iris-versicolor
 63 | 6.0,2.2,4.0,1.0,Iris-versicolor
 64 | 6.1,2.9,4.7,1.4,Iris-versicolor
 65 | 5.6,2.9,3.6,1.3,Iris-versicolor
 66 | 6.7,3.1,4.4,1.4,Iris-versicolor
 67 | 5.6,3.0,4.5,1.5,Iris-versicolor
 68 | 5.8,2.7,4.1,1.0,Iris-versicolor
 69 | 6.2,2.2,4.5,1.5,Iris-versicolor
 70 | 5.6,2.5,3.9,1.1,Iris-versicolor
 71 | 5.9,3.2,4.8,1.8,Iris-versicolor
 72 | 6.1,2.8,4.0,1.3,Iris-versicolor
 73 | 6.3,2.5,4.9,1.5,Iris-versicolor
 74 | 6.1,2.8,4.7,1.2,Iris-versicolor
 75 | 6.4,2.9,4.3,1.3,Iris-versicolor
 76 | 6.6,3.0,4.4,1.4,Iris-versicolor
 77 | 6.8,2.8,4.8,1.4,Iris-versicolor
 78 | 6.7,3.0,5.0,1.7,Iris-versicolor
 79 | 6.0,2.9,4.5,1.5,Iris-versicolor
 80 | 5.7,2.6,3.5,1.0,Iris-versicolor
 81 | 5.5,2.4,3.8,1.1,Iris-versicolor
 82 | 5.5,2.4,3.7,1.0,Iris-versicolor
 83 | 5.8,2.7,3.9,1.2,Iris-versicolor
 84 | 6.0,2.7,5.1,1.6,Iris-versicolor
 85 | 5.4,3.0,4.5,1.5,Iris-versicolor
 86 | 6.0,3.4,4.5,1.6,Iris-versicolor
 87 | 6.7,3.1,4.7,1.5,Iris-versicolor
 88 | 6.3,2.3,4.4,1.3,Iris-versicolor
 89 | 5.6,3.0,4.1,1.3,Iris-versicolor
 90 | 5.5,2.5,4.0,1.3,Iris-versicolor
 91 | 5.5,2.6,4.4,1.2,Iris-versicolor
 92 | 6.1,3.0,4.6,1.4,Iris-versicolor
 93 | 5.8,2.6,4.0,1.2,Iris-versicolor
 94 | 5.0,2.3,3.3,1.0,Iris-versicolor
 95 | 5.6,2.7,4.2,1.3,Iris-versicolor
 96 | 5.7,3.0,4.2,1.2,Iris-versicolor
 97 | 5.7,2.9,4.2,1.3,Iris-versicolor
 98 | 6.2,2.9,4.3,1.3,Iris-versicolor
 99 | 5.1,2.5,3.0,1.1,Iris-versicolor
100 | 5.7,2.8,4.1,1.3,Iris-versicolor
101 | 6.3,3.3,6.0,2.5,Iris-virginica
102 | 5.8,2.7,5.1,1.9,Iris-virginica
103 | 7.1,3.0,5.9,2.1,Iris-virginica
104 | 6.3,2.9,5.6,1.8,Iris-virginica
105 | 6.5,3.0,5.8,2.2,Iris-virginica
106 | 7.6,3.0,6.6,2.1,Iris-virginica
107 | 4.9,2.5,4.5,1.7,Iris-virginica
108 | 7.3,2.9,6.3,1.8,Iris-virginica
109 | 6.7,2.5,5.8,1.8,Iris-virginica
110 | 7.2,3.6,6.1,2.5,Iris-virginica
111 | 6.5,3.2,5.1,2.0,Iris-virginica
112 | 6.4,2.7,5.3,1.9,Iris-virginica
113 | 6.8,3.0,5.5,2.1,Iris-virginica
114 | 5.7,2.5,5.0,2.0,Iris-virginica
115 | 5.8,2.8,5.1,2.4,Iris-virginica
116 | 6.4,3.2,5.3,2.3,Iris-virginica
117 | 6.5,3.0,5.5,1.8,Iris-virginica
118 | 7.7,3.8,6.7,2.2,Iris-virginica
119 | 7.7,2.6,6.9,2.3,Iris-virginica
120 | 6.0,2.2,5.0,1.5,Iris-virginica
121 | 6.9,3.2,5.7,2.3,Iris-virginica
122 | 5.6,2.8,4.9,2.0,Iris-virginica
123 | 7.7,2.8,6.7,2.0,Iris-virginica
124 | 6.3,2.7,4.9,1.8,Iris-virginica
125 | 6.7,3.3,5.7,2.1,Iris-virginica
126 | 7.2,3.2,6.0,1.8,Iris-virginica
127 | 6.2,2.8,4.8,1.8,Iris-virginica
128 | 6.1,3.0,4.9,1.8,Iris-virginica
129 | 6.4,2.8,5.6,2.1,Iris-virginica
130 | 7.2,3.0,5.8,1.6,Iris-virginica
131 | 7.4,2.8,6.1,1.9,Iris-virginica
132 | 7.9,3.8,6.4,2.0,Iris-virginica
133 | 6.4,2.8,5.6,2.2,Iris-virginica
134 | 6.3,2.8,5.1,1.5,Iris-virginica
135 | 6.1,2.6,5.6,1.4,Iris-virginica
136 | 7.7,3.0,6.1,2.3,Iris-virginica
137 | 6.3,3.4,5.6,2.4,Iris-virginica
138 | 6.4,3.1,5.5,1.8,Iris-virginica
139 | 6.0,3.0,4.8,1.8,Iris-virginica
140 | 6.9,3.1,5.4,2.1,Iris-virginica
141 | 6.7,3.1,5.6,2.4,Iris-virginica
142 | 6.9,3.1,5.1,2.3,Iris-virginica
143 | 5.8,2.7,5.1,1.9,Iris-virginica
144 | 6.8,3.2,5.9,2.3,Iris-virginica
145 | 6.7,3.3,5.7,2.5,Iris-virginica
146 | 6.7,3.0,5.2,2.3,Iris-virginica
147 | 6.3,2.5,5.0,1.9,Iris-virginica
148 | 6.5,3.0,5.2,2.0,Iris-virginica
149 | 6.2,3.4,5.4,2.3,Iris-virginica
150 | 5.9,3.0,5.1,1.8,Iris-virginica
151 | 
152 | 


--------------------------------------------------------------------------------
/tests/data/y_data:
--------------------------------------------------------------------------------
  1 | 0.0
  2 | 0.0
  3 | 0.0
  4 | 0.0
  5 | 0.0
  6 | 0.0
  7 | 0.0
  8 | 0.0
  9 | 0.0
 10 | 0.0
 11 | 0.0
 12 | 0.0
 13 | 0.0
 14 | 0.0
 15 | 0.0
 16 | 0.0
 17 | 0.0
 18 | 0.0
 19 | 0.0
 20 | 0.0
 21 | 0.0
 22 | 0.0
 23 | 0.0
 24 | 0.0
 25 | 0.0
 26 | 0.0
 27 | 0.0
 28 | 0.0
 29 | 0.0
 30 | 0.0
 31 | 0.0
 32 | 0.0
 33 | 0.0
 34 | 0.0
 35 | 0.0
 36 | 0.0
 37 | 0.0
 38 | 0.0
 39 | 0.0
 40 | 0.0
 41 | 0.0
 42 | 0.0
 43 | 0.0
 44 | 0.0
 45 | 0.0
 46 | 0.0
 47 | 0.0
 48 | 0.0
 49 | 0.0
 50 | 0.0
 51 | 0.0
 52 | 0.0
 53 | 0.0
 54 | 0.0
 55 | 0.0
 56 | 0.0
 57 | 0.0
 58 | 0.0
 59 | 0.0
 60 | 0.0
 61 | 0.0
 62 | 0.0
 63 | 0.0
 64 | 0.0
 65 | 0.0
 66 | 0.0
 67 | 0.0
 68 | 0.0
 69 | 0.0
 70 | 0.0
 71 | 0.0
 72 | 0.0
 73 | 0.0
 74 | 0.0
 75 | 0.0
 76 | 0.0
 77 | 0.0
 78 | 0.0
 79 | 0.0
 80 | 0.0
 81 | 0.0
 82 | 0.0
 83 | 0.0
 84 | 0.0
 85 | 0.0
 86 | 0.0
 87 | 0.0
 88 | 0.0
 89 | 0.0
 90 | 0.0
 91 | 0.0
 92 | 0.0
 93 | 0.0
 94 | 0.0
 95 | 0.0
 96 | 0.0
 97 | 0.0
 98 | 0.0
 99 | 0.0
100 | 0.0
101 | 1.0
102 | 1.0
103 | 1.0
104 | 1.0
105 | 1.0
106 | 1.0
107 | 1.0
108 | 1.0
109 | 1.0
110 | 1.0
111 | 1.0
112 | 1.0
113 | 1.0
114 | 1.0
115 | 1.0
116 | 1.0
117 | 1.0
118 | 1.0
119 | 1.0
120 | 1.0
121 | 1.0
122 | 1.0
123 | 1.0
124 | 1.0
125 | 1.0
126 | 1.0
127 | 1.0
128 | 1.0
129 | 1.0
130 | 1.0
131 | 1.0
132 | 1.0
133 | 1.0
134 | 1.0
135 | 1.0
136 | 1.0
137 | 1.0
138 | 1.0
139 | 1.0
140 | 1.0
141 | 1.0
142 | 1.0
143 | 1.0
144 | 1.0
145 | 1.0
146 | 1.0
147 | 1.0
148 | 1.0
149 | 1.0
150 | 1.0
151 | 1.0
152 | 1.0
153 | 1.0
154 | 1.0
155 | 1.0
156 | 1.0
157 | 1.0
158 | 1.0
159 | 1.0
160 | 1.0
161 | 1.0
162 | 1.0
163 | 1.0
164 | 1.0
165 | 1.0
166 | 1.0
167 | 1.0
168 | 1.0
169 | 1.0
170 | 1.0
171 | 1.0
172 | 1.0
173 | 1.0
174 | 1.0
175 | 1.0
176 | 1.0
177 | 1.0
178 | 1.0
179 | 1.0
180 | 1.0
181 | 1.0
182 | 1.0
183 | 1.0
184 | 1.0
185 | 1.0
186 | 1.0
187 | 1.0
188 | 1.0
189 | 1.0
190 | 1.0
191 | 1.0
192 | 1.0
193 | 1.0
194 | 1.0
195 | 1.0
196 | 1.0
197 | 1.0
198 | 1.0
199 | 1.0
200 | 1.0


--------------------------------------------------------------------------------
/tests/data/y_iris.csv:
--------------------------------------------------------------------------------
  1 | 0
  2 | 0
  3 | 0
  4 | 0
  5 | 0
  6 | 0
  7 | 0
  8 | 0
  9 | 0
 10 | 0
 11 | 0
 12 | 0
 13 | 0
 14 | 0
 15 | 0
 16 | 0
 17 | 0
 18 | 0
 19 | 0
 20 | 0
 21 | 0
 22 | 0
 23 | 0
 24 | 0
 25 | 0
 26 | 0
 27 | 0
 28 | 0
 29 | 0
 30 | 0
 31 | 0
 32 | 0
 33 | 0
 34 | 0
 35 | 0
 36 | 0
 37 | 0
 38 | 0
 39 | 0
 40 | 0
 41 | 0
 42 | 0
 43 | 0
 44 | 0
 45 | 0
 46 | 0
 47 | 0
 48 | 0
 49 | 0
 50 | 0
 51 | 1
 52 | 1
 53 | 1
 54 | 1
 55 | 1
 56 | 1
 57 | 1
 58 | 1
 59 | 1
 60 | 1
 61 | 1
 62 | 1
 63 | 1
 64 | 1
 65 | 1
 66 | 1
 67 | 1
 68 | 1
 69 | 1
 70 | 1
 71 | 1
 72 | 1
 73 | 1
 74 | 1
 75 | 1
 76 | 1
 77 | 1
 78 | 1
 79 | 1
 80 | 1
 81 | 1
 82 | 1
 83 | 1
 84 | 1
 85 | 1
 86 | 1
 87 | 1
 88 | 1
 89 | 1
 90 | 1
 91 | 1
 92 | 1
 93 | 1
 94 | 1
 95 | 1
 96 | 1
 97 | 1
 98 | 1
 99 | 1
100 | 1
101 | 2
102 | 2
103 | 2
104 | 2
105 | 2
106 | 2
107 | 2
108 | 2
109 | 2
110 | 2
111 | 2
112 | 2
113 | 2
114 | 2
115 | 2
116 | 2
117 | 2
118 | 2
119 | 2
120 | 2
121 | 2
122 | 2
123 | 2
124 | 2
125 | 2
126 | 2
127 | 2
128 | 2
129 | 2
130 | 2
131 | 2
132 | 2
133 | 2
134 | 2
135 | 2
136 | 2
137 | 2
138 | 2
139 | 2
140 | 2
141 | 2
142 | 2
143 | 2
144 | 2
145 | 2
146 | 2
147 | 2
148 | 2
149 | 2
150 | 2


--------------------------------------------------------------------------------
/tests/test_fit_tree.nim:
--------------------------------------------------------------------------------
  1 | import ../src/tree
  2 | import unittest
  3 | import strutils
  4 | import ../src/random_forest/random_forest
  5 | import test_utils
  6 | import times
  7 | 
  8 | suite "Test fit classification tree":
  9 |     setup:
 10 |         let t = new_classification_tree()
 11 |         let X1 = @[@[1.0, 4.2],
 12 |                    @[5.0, 120.4],
 13 |                    @[1.0, 3212.3],
 14 |                    @[110.0, 329.12]]
 15 |         let y1 = @[0.0, 0.0, 0.0, 1.0]
 16 | 
 17 |     test "Perfect single split":
 18 |         t.fit(X1, y1)
 19 |         var y_pred = t.predict(X1)
 20 |         t.print_root_split()
 21 |         require(y_pred == y1)
 22 |     test "Perfect single split with a lot of data":
 23 |         let X_long = read_X_data("tests/data/X_data")
 24 |         let y_long = read_y_data("tests/data/y_data")
 25 |         t.fit(X_long, y_long)
 26 |         var y_pred = t.predict(X_long)
 27 |         require(y_pred.len == y_long.len)
 28 |         require(y_pred == y_long)
 29 |     test "Test fit tree on iris dataset":
 30 |         let X_iris = read_X_data("tests/data/X_iris.csv")
 31 |         let y_iris = read_y_data("tests/data/y_iris.csv")
 32 |         t.fit(X_iris, y_iris)
 33 |         let yhat = t.predict(X_iris)
 34 |         require(yhat.len == y_iris.len)
 35 |         require(0.0 in yhat)
 36 |         require(1.0 in yhat)
 37 |         require(2.0 in yhat)
 38 |     test "Decision tree should overfit when predict on Iris train set":
 39 |         let X_iris = read_X_data("tests/data/X_iris.csv")
 40 |         let y_iris = read_y_data("tests/data/y_iris.csv")
 41 |         t.fit(X_iris, y_iris)
 42 |         let yhat = t.predict(X_iris)
 43 |         let accuracy_iris = accuracy(y_iris, yhat)
 44 |         require(accuracy_iris > 0.95)
 45 |         echo "accuracy on iris train set is ", accuracy_iris
 46 |     test "Random forest should overfit when predict on Iris train set":
 47 |         let X_iris = read_X_data("tests/data/X_iris.csv")
 48 |         let y_iris = read_y_data("tests/data/y_iris.csv")
 49 |         let rf = new_random_forest_classifier(100, num_threads=1)
 50 |         var start = now()
 51 |         rf.fit(X_iris, y_iris)
 52 |         let time_fit = now() - start
 53 |         start = now()
 54 |         let yhat = rf.predict(X_iris)
 55 |         let time_predict = now() - start
 56 |         let accuracy_iris = accuracy(y_iris, yhat)
 57 |         require(accuracy_iris > 0.95)
 58 |         echo "accuracy on iris train set is ", accuracy_iris
 59 |         echo "seconds fit: ", time_fit.inMilliseconds.float32 / 1000, " seconds predict: ", time_predict.inMilliseconds.float32 / 1000
 60 |     test "Random forest with parallel training should overfit when predict on Iris train set":
 61 |         let X_iris = read_X_data("tests/data/X_iris.csv")
 62 |         let y_iris = read_y_data("tests/data/y_iris.csv")
 63 |         let rf = new_random_forest_classifier(100, num_threads=4)
 64 |         var start = now()
 65 |         rf.fit(X_iris, y_iris)
 66 |         let time_fit = now() - start
 67 |         echo "seconds fit: ", time_fit.inMilliseconds.float32 / 1000
 68 |         start = now()
 69 |         let yhat = rf.predict(X_iris)
 70 |         let time_predict = now() - start
 71 |         let accuracy_iris = accuracy(y_iris, yhat)
 72 |         require(accuracy_iris > 0.95)
 73 |         echo "accuracy on iris train set is ", accuracy_iris
 74 |         echo " seconds predict: ", time_predict.inMilliseconds.float32 / 1000
 75 |     test "Random forest should overfit when predict on Iris train set x 10":
 76 |         let X_iris = read_X_data("tests/data/X_iris.csv", times=10)
 77 |         let y_iris = read_y_data("tests/data/y_iris.csv", times=10)
 78 |         let rf = new_random_forest_classifier(100, num_threads=1)
 79 |         var start = now()
 80 |         rf.fit(X_iris, y_iris)
 81 |         let time_fit = now() - start
 82 |         start = now()
 83 |         let yhat = rf.predict(X_iris)
 84 |         let time_predict = now() - start
 85 |         let accuracy_iris = accuracy(y_iris, yhat)
 86 |         require(accuracy_iris > 0.95)
 87 |         echo "accuracy on iris train set is ", accuracy_iris
 88 |         echo "seconds fit: ", time_fit.inMilliseconds.float32 / 1000, " seconds predict: ", time_predict.inMilliseconds.float32 / 1000
 89 |     test "Random forest with parallel training should overfit when predict on Iris train set x 10":
 90 |         let X_iris = read_X_data("tests/data/X_iris.csv", times=10)
 91 |         let y_iris = read_y_data("tests/data/y_iris.csv", times=10)
 92 |         let rf = new_random_forest_classifier(100, num_threads=4)
 93 |         var start = now()
 94 |         rf.fit(X_iris, y_iris)
 95 |         let time_fit = now() - start
 96 |         start = now()
 97 |         let yhat = rf.predict(X_iris)
 98 |         let time_predict = now() - start
 99 |         let accuracy_iris = accuracy(y_iris, yhat)
100 |         require(accuracy_iris > 0.95)
101 |         echo "accuracy on iris train set is ", accuracy_iris
102 |         echo "seconds fit: ", time_fit.inMilliseconds.float32 / 1000, " seconds predict: ", time_predict.inMilliseconds.float32 / 1000
103 |     


--------------------------------------------------------------------------------
/tests/test_impurity.nim:
--------------------------------------------------------------------------------
 1 | import ../src/impurity
 2 | import unittest
 3 | import math
 4 | 
 5 | 
 6 | proc equals(a,b,delta: float32 = 10e-4): bool =
 7 |     return abs(a - b) < delta
 8 | 
 9 | suite "Test impurity functions":
10 |     setup:
11 |         let no_uncertainity = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
12 |         let max_uncertainity = @[0.0, 0.0, 0.0, 1.0, 1.0, 1.0]
13 |         let random_seq: seq[float32] = @[1.0, 9.0, 1.0, 9.0, 8.0, 0.0, 7.0, 8.0, 1.0, 9.0, 4.0, 6.0, 7.0, 3.0, 0.0, 9.0, 6.0, 9.0, 5.0, 3.0]
14 |     test "Test gini_index":
15 |         let gini_no_unc = gini(no_uncertainity)
16 |         require gini_no_unc.equals(0.0)
17 |         let gini_max_unc = gini(max_uncertainity)
18 |         require gini_max_unc.equals(0.5)
19 |         let gini_random = gini(random_seq)
20 |         require gini_random.equals(0.86)
21 |     test "Test entropy":
22 |         let entropy_no_unc = entropy(no_uncertainity)
23 |         require entropy_no_unc.equals(0.0)
24 |         let entropy_max_unc = entropy(max_uncertainity)
25 |         require entropy_max_unc.equals(0.693)
26 |         let entropy_random = entropy(random_seq)
27 |         require entropy_random.equals(2.08, delta=10e-2)


--------------------------------------------------------------------------------
/tests/test_utils.nim:
--------------------------------------------------------------------------------
 1 | import strutils
 2 | 
 3 | proc read_X_data*(path: string, sep: char = ',', times: int = 1) : seq[seq[float32]] =
 4 |     result = new_seq[seq[float32]](0)
 5 |     for i in 0..<times:
 6 |         let f: File = open path
 7 |         var i = 0
 8 |         for l in f.lines():
 9 |             let splitted = l.split sep
10 |             var row = new_seq[float32](len(splitted))
11 |             for i, el in splitted:
12 |                 row[i] = el.parseFloat()
13 |             result.add row
14 |             inc i
15 |         f.close()
16 |     
17 | proc read_y_data*(path: string, times: int = 1): seq[float32] =
18 |     result = new_seq[float32](0)
19 |     for i in 0..<times:
20 |         let f: File = open path
21 |         var i = 0
22 |         for l in f.lines():
23 |             result.add l.parseFloat()
24 |             inc i
25 |         f.close()
26 | 
27 | 
28 | proc accuracy*[IntFloat: int|float32](ytrue, ypred: seq[IntFloat]): float32 =
29 |     assert ytrue.len == ypred.len
30 |     var correct = 0
31 |     for i in 0..<ytrue.len:
32 |         if ytrue[i] == ypred[i]:
33 |             inc correct
34 |     return correct.float32 / ytrue.len.float32


--------------------------------------------------------------------------------