├── .gitignore
├── .idea
├── .name
├── misc.xml
├── modules.xml
├── random-forest-c.iml
└── vcs.xml
├── CMakeLists.txt
├── LICENSE
├── README.md
├── data.py
├── eval
├── eval.c
└── eval.h
├── main.c
├── model
├── forest.c
├── forest.h
├── tree.c
└── tree.h
└── utils
├── argparse.h
├── data.c
├── data.h
├── utils.c
└── utils.h
/.gitignore:
--------------------------------------------------------------------------------
1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
3 |
4 | # Project-specific
5 | *.csv
6 | .DS_Store
7 | /build
8 | /.vscode
9 |
10 | # User-specific stuff
11 | .idea/**/workspace.xml
12 | .idea/**/tasks.xml
13 | .idea/**/usage.statistics.xml
14 | .idea/**/dictionaries
15 | .idea/**/shelf
16 |
17 | # Generated files
18 | .idea/**/contentModel.xml
19 |
20 | # Sensitive or high-churn files
21 | .idea/**/dataSources/
22 | .idea/**/dataSources.ids
23 | .idea/**/dataSources.local.xml
24 | .idea/**/sqlDataSources.xml
25 | .idea/**/dynamic.xml
26 | .idea/**/uiDesigner.xml
27 | .idea/**/dbnavigator.xml
28 |
29 | # Gradle
30 | .idea/**/gradle.xml
31 | .idea/**/libraries
32 |
33 | # Gradle and Maven with auto-import
34 | # When using Gradle or Maven with auto-import, you should exclude module files,
35 | # since they will be recreated, and may cause churn. Uncomment if using
36 | # auto-import.
37 | # .idea/modules.xml
38 | # .idea/*.iml
39 | # .idea/modules
40 | # *.iml
41 | # *.ipr
42 |
43 | # CMake
44 | cmake-build-*/
45 |
46 | # Mongo Explorer plugin
47 | .idea/**/mongoSettings.xml
48 |
49 | # File-based project format
50 | *.iws
51 |
52 | # IntelliJ
53 | out/
54 |
55 | # mpeltonen/sbt-idea plugin
56 | .idea_modules/
57 |
58 | # JIRA plugin
59 | atlassian-ide-plugin.xml
60 |
61 | # Cursive Clojure plugin
62 | .idea/replstate.xml
63 |
64 | # Crashlytics plugin (for Android Studio and IntelliJ)
65 | com_crashlytics_export_strings.xml
66 | crashlytics.properties
67 | crashlytics-build.properties
68 | fabric.properties
69 |
70 | # Editor-based Rest Client
71 | .idea/httpRequests
72 |
73 | # Android studio 3.1+ serialized cache file
74 | .idea/caches/build_file_checksums.ser
75 |
76 | # Prerequisites
77 | *.d
78 |
79 | # Object files
80 | *.o
81 | *.ko
82 | *.obj
83 | *.elf
84 |
85 | # Linker output
86 | *.ilk
87 | *.map
88 | *.exp
89 |
90 | # Precompiled Headers
91 | *.gch
92 | *.pch
93 |
94 | # Libraries
95 | *.lib
96 | *.a
97 | *.la
98 | *.lo
99 |
100 | # Shared objects (inc. Windows DLLs)
101 | *.dll
102 | *.so
103 | *.so.*
104 | *.dylib
105 |
106 | # Executables
107 | *.exe
108 | *.out
109 | *.app
110 | *.i*86
111 | *.x86_64
112 | *.hex
113 |
114 | # Debug files
115 | *.dSYM/
116 | *.su
117 | *.idb
118 | *.pdb
119 |
120 | # Kernel Module Compile Results
121 | *.mod*
122 | *.cmd
123 | .tmp_versions/
124 | modules.order
125 | Module.symvers
126 | Mkfile.old
127 | dkms.conf
128 |
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | random_forest_c
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/random-forest-c.iml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build configuration for the random forest proof-of-concept.
# 3.13 is required for target_link_options().
cmake_minimum_required(VERSION 3.13)
project(random_forest_c C)

set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD_REQUIRED ON)

# AddressSanitizer was previously forced on for every build by appending to the
# global CMAKE_C_FLAGS string; keep it as the default but make it switchable
# (e.g. -DRF_ENABLE_ASAN=OFF for a release-style build).
option(RF_ENABLE_ASAN "Build with AddressSanitizer (-fsanitize=address -O1)" ON)

add_executable(random-forest
    main.c
    utils/utils.c
    utils/utils.h
    utils/data.c
    utils/data.h
    model/tree.c
    model/tree.h
    model/forest.c
    model/forest.h
    eval/eval.c
    eval/eval.h)

if(RF_ENABLE_ASAN)
    # Target-scoped instead of mutating global flag strings; the sanitizer
    # must be passed at both compile and link time.
    target_compile_options(random-forest PRIVATE -fsanitize=address -O1)
    target_link_options(random-forest PRIVATE -fsanitize=address)
endif()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Random Forests - C
2 | A proof of concept basic implementation of random forests for classification and accompanying decision trees in C.
3 |
4 | ## Running the code
5 |
6 | Fastest way to start experimenting is to
7 | - (1) run the `data.py` script to generate some random CSV data
8 | - (2) compile as preferred (optionally using the `CMakeLists.txt` provided)
9 | - (3) run `./random-forest` or `./random-forest --help` to see which arguments are available to configure (the executable is named `random-forest` by the provided `CMakeLists.txt`).
10 |
11 | The [`main.c`](./main.c) file contains an example configuration of a random forest and code to run `cross_validate()` which will both train and evaluate a model.
12 |
13 | ### Training
14 |
15 | The `cross_validate()` function runs k-fold cross validation on whatever data is provided -- it first trains the model and then evaluates it on each of the testing folds.
16 |
17 | The main function that handles model training is `train_model()`
18 | ```c
19 | const DecisionTreeNode **train_model(double **data,
20 | const RandomForestParameters *params,
21 | const struct dim *csv_dim,
22 | const ModelContext *ctx);
23 | ```
24 | It returns an array of `DecisionTreeNode` pointers to roots of decision trees comprising the forest, and the parameters are
25 |
26 | - `**data` - training data (equivalent to a DataFrame in Python).
27 | - `*params` - pointer to struct that holds the configuration of a random forest model.
28 | - `*csv_dim` - pointer to a struct holding row x col dimensions of the read data.
29 | - `*ctx` - pointer to a context object that holds some optional data that can be used for training / evaluation.
30 |
31 | For example:
32 | ```c
33 | const ModelContext ctx = (ModelContext){
34 | testingFoldIdx : foldIdx /* Fold to use for evaluation. */,
35 | rowsPerFold : csv_dim->rows / k_folds /* Number of rows per fold. */
36 | };
37 |
38 | const DecisionTreeNode **random_forest = (const DecisionTreeNode **)train_model(
39 | data,
40 | params,
41 | csv_dim,
42 | &ctx);
43 | ```
44 |
45 | ### Evaluation
46 |
47 | After training we can evaluate the model with `eval_model()` which returns an accuracy measure for model performance.
48 | For example:
49 |
50 | ```c
51 | // Evaluate the model that was just trained. We use the fold identified by 'foldIdx' in 'ctx' to evaluate the model.
52 | double accuracy = eval_model(
53 | random_forest /* Model to evaluate. */,
54 | data,
55 | params,
56 | csv_dim,
57 | &ctx);
58 | ```
59 |
60 | ## Code structure
61 |
62 | - `model` -- random forest and decision trees.
63 | - `eval` -- evaluation code for running `cross_validate()` or `hyperparameter_search()` to test the model.
64 | - `utils` -- utilities for data management, argument parsing, etc.
65 |
66 | The optional arguments to the program (can be viewed by running with a `--help` flag)
67 | ```
68 | -c, --num_cols=number Optional number of cols in the input CSV_FILE, if
69 | known
70 | -r, --num_rows=number Optional number of rows in the input CSV_FILE, if
71 | known
72 | -l, --log_level=number Optional debug logging level [0-3]. Level 0 is no
73 | output, 3 is most verbose. Defaults to 1.
74 | -s, --seed=number Optional random number seed.
75 | ```
76 |
77 | ## Reference
78 | Breiman, Leo. "Random forests." Machine learning 45.1 (2001): 5-32.
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
"""Generate a random CSV dataset for exercising the random forest.

Writes NUM_ROWS rows x NUM_COLS columns to data.csv: the first
NUM_COLS - 1 columns are uniform floats in [0, 1] and the last column is
a binary target stored as a float in {0.0, 1.0}.
"""
import csv  # NOTE(review): unused here (pandas handles the CSV writing); consider removing.
import random
import pandas as pd

NUM_ROWS = 100
NUM_COLS = 100

# Build the rows: feature columns first, then the target as the last column.
data = []
for _ in range(NUM_ROWS):
    row = [random.uniform(0, 1) for _ in range(NUM_COLS - 1)]

    # Append the target variable as a random integer of two categories [0, 1],
    # multiplied by 1.0 so the whole row is homogeneous floats.
    row.append(random.randint(0, 1) * 1.0)

    data.append(row)

df = pd.DataFrame.from_records(data)

# Write the dataframe to csv (no index column; header is the default 0..N-1).
file_name = "data.csv"
df.to_csv(file_name, index=False)

# Derive the count from the data itself rather than a separate counter.
rows_written = len(data)
print(f'wrote dataframe of {rows_written} rows to: {file_name}')
--------------------------------------------------------------------------------
/eval/eval.c:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #include
6 | #include
7 | #include "eval.h"
8 |
9 | void hyperparameter_search(double **data, struct dim *csv_dim)
10 | {
11 | // Init the options for number of trees to: 10, 100, 1000.
12 | size_t n = 3;
13 |
14 | size_t *estimators = malloc(sizeof(size_t) * n);
15 | estimators[0] = 10;
16 | estimators[1] = 50;
17 | estimators[2] = 100;
18 |
19 | // Init the options for the max depth for a tree to: 3, 7, 10.
20 | size_t *max_depths = malloc(sizeof(size_t) * n);
21 | max_depths[0] = 3;
22 | max_depths[1] = 7;
23 | max_depths[2] = 10;
24 |
25 | // Defaults based on SKLearn's defaults / hand picked in order to compare performance
26 | // with the same parameters.
27 | size_t max_features = 3;
28 | size_t min_samples_leaf = 2;
29 | size_t max_depth = 7;
30 |
31 | // Number of folds for cross validation.
32 | size_t k_folds = 5;
33 |
34 | // Best params computed from running the hyperparameter search.
35 | size_t best_n_estimators = -1;
36 | double best_accuracy = -1;
37 |
38 | for (size_t i = 0; i < n; ++i)
39 | {
40 | size_t n_estimators = estimators[i]; /* Number of trees in the forest. */
41 |
42 | for (size_t j = 0; j < n; ++j)
43 | {
44 | size_t max_depth = max_depths[j];
45 |
46 | RandomForestParameters params = {
47 | n_estimators : n_estimators,
48 | max_depth : max_depth,
49 | min_samples_leaf : min_samples_leaf,
50 | max_features : max_features
51 | };
52 |
53 | if (log_level > 0)
54 | {
55 | printf("[hyperparameter search] running cross_validate\n");
56 | printf("[hyperparameter search] ");
57 | print_params(¶ms);
58 | }
59 |
60 | double cv_accuracy = cross_validate(data,
61 | ¶ms,
62 | csv_dim,
63 | k_folds);
64 |
65 | if (log_level > 0)
66 | printf("[hyperparameter search] cross validation accuracy: %f%% (%ld%%)\n",
67 | (cv_accuracy * 100),
68 | (long)(cv_accuracy * 100));
69 |
70 | // Update best accuracy and best parameters found so far from the hyperparameter search.
71 | if (cv_accuracy > best_accuracy)
72 | {
73 | best_accuracy = cv_accuracy;
74 | best_n_estimators = n_estimators;
75 | }
76 | }
77 | }
78 |
79 | // Free auxillary buffers.
80 | free(estimators);
81 | free(max_depths);
82 |
83 | printf("[hyperparameter search] run complete\n best_accuracy: %f\n best_n_estimators (trees): %ld\n",
84 | best_accuracy, best_n_estimators);
85 | }
86 |
87 | double eval_model(const DecisionTreeNode **random_forest,
88 | double **data,
89 | const RandomForestParameters *params,
90 | const struct dim *csv_dim,
91 | const ModelContext *ctx)
92 | {
93 | // Keeping track of how many predictions have been correct. Accuracy can be
94 | // computed with 'num_correct' / 'rowsPerFold' (or how many predictions we make).
95 | long num_correct = 0;
96 |
97 | // Since we are evaluating the model on a single fold (to control overfitting), we start
98 | // iterating the rows for which we are getting predictions at an offset that can be computed
99 | // as 'testingFoldIdx * rowsPerFold' and make predictions for 'rowsPerFold' number of rows
100 | size_t row_id_offset = ctx->testingFoldIdx * ctx->rowsPerFold;
101 | for (size_t row_id = row_id_offset; row_id < row_id_offset + ctx->rowsPerFold; ++row_id)
102 | {
103 | int prediction = predict_model(&random_forest,
104 | params->n_estimators,
105 | data[row_id]);
106 | int ground_truth = (int)data[row_id][csv_dim->cols - 1];
107 |
108 | if (log_level > 1)
109 | printf("majority vote: %d | %d ground truth\n", prediction, ground_truth);
110 |
111 | if (prediction == ground_truth)
112 | ++num_correct;
113 | }
114 | return (double)num_correct / (double)ctx->rowsPerFold;
115 | }
116 |
117 | double cross_validate(double **data,
118 | const RandomForestParameters *params,
119 | const struct dim *csv_dim,
120 | const int k_folds)
121 | {
122 | // Sum of all accuracies on every evaluated fold.
123 | double sumAccuracy = 0;
124 |
125 | // Iterate through the fold indeces and fit models on the selections. The current 'foldIdx' is the index
126 | // of the fold in the array of all loaded data that is the fold that's currently the test fold, with all of
127 | // the other folds being used for training.
128 | for (size_t foldIdx = 0; foldIdx < k_folds; ++foldIdx)
129 | {
130 | const ModelContext ctx = (ModelContext){
131 | testingFoldIdx : foldIdx /* Fold to use for evaluation. */,
132 | rowsPerFold : csv_dim->rows / k_folds /* Number of rows per fold. */
133 | };
134 |
135 | // Train an instance of the model with every fold of data except of the fold indentified by
136 | // 'foldIdx' used for training the the 'foldIdx' fold withheld from training in order to be
137 | // used for evaluation.
138 | const DecisionTreeNode **random_forest = (const DecisionTreeNode **)train_model(
139 | data,
140 | params,
141 | csv_dim,
142 | &ctx);
143 |
144 | // Evaluate the model that was just trained. We use the fold identified by 'foldIdx' to evaluate
145 | // the model.
146 | const double accuracy = eval_model(
147 | random_forest /* Model to evaluate. */,
148 | data,
149 | params,
150 | csv_dim,
151 | &ctx);
152 | sumAccuracy += accuracy;
153 |
154 | // Free memory that was used to store the model.
155 | free_random_forest(&random_forest, params->n_estimators);
156 | }
157 |
158 | return sumAccuracy / k_folds;
159 | }
160 |
--------------------------------------------------------------------------------
/eval/eval.h:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #ifndef eval_h
6 | #define eval_h
7 |
8 | #include "../model/tree.h"
9 | #include "../model/forest.h"
10 | #include "../utils/utils.h"
11 | #include "../utils/data.h"
12 |
13 | /*
14 | Runs a hyperparameter search across a number of pre-defined parameters for the random forest model and
15 | reports the best parameters. Calls 'cross_validate' on each parameter configuration to get the cross validation
16 | accuracy for each set-up. Can be adjusted to run across as many parameters as needed.
17 | */
18 | void hyperparameter_search(double **data, struct dim *csv_dim);
19 |
20 | /*
21 | Runs k-fold cross validation on the 'data' and returns the accuracy. In the process builds up a random
22 | forest model for each iteration and evaluates on a separate test fold.
23 | */
24 | double cross_validate(double **data,
25 | const RandomForestParameters *params,
26 | const struct dim *csv_dim,
27 | const int k_folds);
28 |
29 | #endif // eval_h
30 |
--------------------------------------------------------------------------------
/main.c:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #include
6 | #include
7 | #include
8 | #include "eval/eval.h"
9 | #include "utils/argparse.h"
10 | #include "utils/data.h"
11 | #include "utils/utils.h"
12 |
13 | /* Our argp parser. */
14 | static struct argp argp = {options, parse_opt, args_doc, doc};
15 |
16 | int main(int argc, char **argv)
17 | {
18 | struct arguments arguments;
19 |
20 | // Default argument values.
21 | arguments.log_level = 1;
22 | arguments.rows = 0;
23 | arguments.cols = 0;
24 |
25 | /* Parse our arguments; every option seen by parse_opt will
26 | be reflected in arguments. */
27 | argp_parse(&argp, argc, argv, 0, 0, &arguments);
28 |
29 | // Set the log level to whatever was parsed from the arguments or the default value.
30 | set_log_level(arguments.log_level);
31 |
32 | // Optionally set the random seed if a specific random seed was provided via an argument.
33 | if (arguments.random_seed)
34 | srand(arguments.random_seed);
35 | else
36 | srand(time(NULL));
37 |
38 | // Read the csv file from args which must be parsed now.
39 | const char *file_name = arguments.args[0];
40 |
41 | // If the values for rows and cols were provided as arguments, then use them for the
42 | // 'dim' struct, otherwise call 'parse_csv_dims()' to parse the csv file provided to
43 | // compute the size of the csv file.
44 | struct dim csv_dim;
45 |
46 | if (arguments.rows && arguments.cols)
47 | csv_dim = (struct dim){rows : arguments.rows, cols : arguments.cols};
48 | else
49 | csv_dim = parse_csv_dims(file_name);
50 |
51 | if (log_level > 0)
52 | printf("using:\n verbose log level: %d\n rows: %ld, cols: %ld\nreading from csv file:\n \"%s\"\n",
53 | log_level,
54 | csv_dim.rows,
55 | csv_dim.cols,
56 | file_name);
57 |
58 | // Allocate memory for the data coming from the .csv and read in the data.
59 | double *data = malloc(sizeof(double) * csv_dim.rows * csv_dim.cols);
60 | parse_csv(file_name, &data, csv_dim);
61 |
62 | // Compute a checksum of the data to verify that loaded correctly.
63 | if (log_level > 1)
64 | printf("data checksum = %f\n", _1d_checksum(data, csv_dim.rows * csv_dim.cols));
65 |
66 | const int k_folds = 1;
67 |
68 | if (log_level > 0)
69 | printf("using:\n k_folds: %d\n", k_folds);
70 |
71 | // Example configuration for a random forest model.
72 | const RandomForestParameters params = {
73 | n_estimators : 3 /* Number of trees in the random forest model. */,
74 | max_depth : 7 /* Maximum depth of a tree in the model. */,
75 | min_samples_leaf : 3,
76 | max_features : 3
77 | };
78 |
79 | // Print random forest parameters.
80 | if (log_level > 0)
81 | print_params(¶ms);
82 |
83 | // Pivot the csv file data into a two dimensional array.
84 | double **pivoted_data;
85 | pivot_data(data, csv_dim, &pivoted_data);
86 |
87 | if (log_level > 1)
88 | printf("checksum of pivoted 2d array: %f\n", _2d_checksum(pivoted_data, csv_dim.rows, csv_dim.cols));
89 |
90 | // Start the clock for timing.
91 | clock_t begin_clock = clock();
92 |
93 | double cv_accuracy = cross_validate(pivoted_data, ¶ms, &csv_dim, k_folds);
94 | printf("cross validation accuracy: %f%% (%ld%%)\n",
95 | (cv_accuracy * 100),
96 | (long)(cv_accuracy * 100));
97 |
98 | // Record and output the time taken to run.
99 | clock_t end_clock = clock();
100 | printf("(time taken: %fs)\n", (double)(end_clock - begin_clock) / CLOCKS_PER_SEC);
101 |
102 | // Free loaded csv file data.
103 | free(data);
104 | free(pivoted_data);
105 | }
106 |
--------------------------------------------------------------------------------
/model/forest.c:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #include "forest.h"
6 |
7 | const DecisionTreeNode *train_model_tree(double **data,
8 | const RandomForestParameters *params,
9 | const struct dim *csv_dim,
10 | long *nodeId /* Ascending node ID generator */,
11 | const ModelContext *ctx)
12 | {
13 | DecisionTreeNode *root = empty_node(nodeId);
14 | DecisionTreeDataSplit data_split = calculate_best_data_split(data,
15 | params->max_features,
16 | csv_dim->rows,
17 | csv_dim->cols,
18 | ctx);
19 |
20 | if (log_level > 1)
21 | printf("calculated best split for the dataset in train_model_tree\n"
22 | "half1: %ld\nhalf2: %ld\nbest gini: %f\nbest value: %f\nbest index: %d\n",
23 | data_split.data[0].length,
24 | data_split.data[1].length,
25 | data_split.gini,
26 | data_split.value,
27 | data_split.index);
28 |
29 | populate_split_data(root, &data_split);
30 |
31 | // Start building the tree recursively.
32 | grow(root,
33 | params->max_depth,
34 | params->min_samples_leaf,
35 | params->max_features,
36 | 1 /* Current depth. */,
37 | csv_dim->rows,
38 | csv_dim->cols,
39 | nodeId,
40 | ctx);
41 |
42 | // Free any temp memory.
43 | free(data_split.data);
44 |
45 | return root;
46 | }
47 |
48 | const DecisionTreeNode **train_model(double **data,
49 | const RandomForestParameters *params,
50 | const struct dim *csv_dim,
51 | const ModelContext *ctx)
52 | {
53 | // Random forest model which is stored as a contigious list of pointers to DecisionTreeNode structs.
54 | const DecisionTreeNode **random_forest = (const DecisionTreeNode **)
55 | malloc(sizeof(DecisionTreeNode *) * params->n_estimators);
56 |
57 | // Node ID generator. We use this such that every node in the tree gets assigned a strictly
58 | // increasing ID for debugging.
59 | long nodeId = 0;
60 |
61 | // Populate the array with allocated memory for the random forest with pointers to individual decision
62 | // trees.
63 | for (size_t i = 0; i < params->n_estimators; ++i)
64 | {
65 | random_forest[i] = train_model_tree(data, params, csv_dim, &nodeId, ctx);
66 | }
67 | return random_forest;
68 | }
69 |
70 | int predict_model(const DecisionTreeNode ***random_forest, size_t n_estimators, double *row)
71 | {
72 | int zeroes = 0;
73 | int ones = 0;
74 | for (size_t i = 0; i < n_estimators; ++i)
75 | {
76 | int prediction;
77 | make_prediction((*random_forest)[i] /* root of the tree */,
78 | row,
79 | &prediction);
80 |
81 | if (prediction == 0)
82 | zeroes++;
83 | else if (prediction == 1)
84 | ones++;
85 | else
86 | {
87 | printf("Error: currently only support binary classification, i.e. prediction values 0/1, got: %d\n",
88 | prediction);
89 | exit(1);
90 | }
91 | }
92 | if (ones > zeroes)
93 | return 1;
94 | else
95 | return 0;
96 | }
97 |
98 | void free_random_forest(const DecisionTreeNode ***random_forest, const size_t length)
99 | {
100 | long freeCount = 0;
101 | for (size_t idx = 0; idx < length; ++idx)
102 | {
103 | // Recursively free this DecisionTree rooted at the current node.
104 | free_decision_tree_node((*random_forest)[idx], &freeCount);
105 | }
106 | // Free the actual array of pointers to the nodes.
107 | free(*random_forest);
108 |
109 | if (log_level > 2)
110 | printf("total DecisionTreeNode freed: %ld\n", freeCount);
111 | }
112 |
/*
    Prints every field of a RandomForestParameters struct to stdout, one
    hyper-parameter per line, for debugging / run logging.
*/
void print_params(const RandomForestParameters *params)
{
    printf("using RandomForestParameters:\n n_estimators: %ld\n max_depth: %ld\n min_samples_leaf: %ld\n max_features: %ld\n",
           params->n_estimators,
           params->max_depth,
           params->min_samples_leaf,
           params->max_features);
}
121 |
--------------------------------------------------------------------------------
/model/forest.h:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #ifndef forest_h
6 | #define forest_h
7 |
8 | #include
9 | #include "tree.h"
10 |
11 | extern int log_level;
12 |
13 | /*
14 | Parameters for a Random Forest model.
15 | */
16 | struct RandomForestParameters
17 | {
18 | size_t n_estimators; // Number of trees in a forest.
19 | size_t max_depth; // Maximum depth of a tree.
20 | size_t min_samples_leaf; // Minimum number of data samples at a leaf node.
21 | size_t max_features; // Number of features considered when calculating the best data split.
22 | };
23 |
24 | typedef struct RandomForestParameters RandomForestParameters;
25 |
26 | /*
27 | Function to print a RandomForestParameters struct for debugging.
28 | */
29 | void print_params(const RandomForestParameters *params);
30 |
31 | /*
32 | Trains a single decision tree on the provided data and returns a pointer to the root DecisionTreeNode
33 | of the tree stored on the heap.
34 | */
35 | const DecisionTreeNode *
36 | train_model_tree(double **data,
37 | const RandomForestParameters *params,
38 | const struct dim *csv_dim,
39 | long *nodeId /* Ascending node ID generator */,
40 | const ModelContext *ctx);
41 |
42 | /*
43 | Trains a random forest model that is comprised of individually built decision trees. Returns an array
44 | of pointers to DecisionTreeNode's that are the roots of the decision trees in the random forest model.
45 | */
46 | const DecisionTreeNode **train_model(double **data,
47 | const RandomForestParameters *params,
48 | const struct dim *csv_dim,
49 | const ModelContext *ctx);
50 |
51 | /*
52 | Given a single row, gets predictions from every decision tree in the 'random_forest' model
53 | for the class target that the row should be classified into and returns the class target value
54 | that is the majority vote.
55 | */
56 | int predict_model(const DecisionTreeNode ***random_forest, size_t n_estimators, double *row);
57 |
58 | /*
59 | Frees memory for a given random forest model (array of pointers to DecisionTreeNode's).
60 | */
61 | void free_random_forest(const DecisionTreeNode ***random_forest, const size_t length);
62 |
63 | #endif // forest_h
64 |
--------------------------------------------------------------------------------
/model/tree.c:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #include "tree.h"
6 |
7 | /*
8 | Allocates memory for an empty DecisionTreeNode and returns a pointer to the node.
9 | */
10 | DecisionTreeNode *empty_node(long *id)
11 | {
12 | DecisionTreeNode *node = malloc(sizeof(DecisionTreeNode));
13 |
14 | node->id = (*id);
15 | node->leftChild = NULL;
16 | node->rightChild = NULL;
17 |
18 | node->half1 = NULL;
19 | node->half2 = NULL;
20 | node->size_half1 = 0;
21 | node->size_half2 = 0;
22 |
23 | node->split_index = -1;
24 | node->split_value = -1;
25 | node->split_data_halves = NULL;
26 |
27 | (*id)++;
28 |
29 | if (log_level > 2)
30 | printf("created a DecisionTreeNode with id %ld stored at address %p \n", node->id, node);
31 |
32 | return node;
33 | }
34 |
35 | /*
36 | Populates a given DecisionTreeNode with data from the DecisionTreeDataSplit struct
37 | pointed to by 'data_split'.
38 | */
39 | void populate_split_data(DecisionTreeNode *node, DecisionTreeDataSplit *data_split)
40 | {
41 | node->split_index = (*data_split).index;
42 | node->split_value = (*data_split).value;
43 | node->split_data_halves = (*data_split).data;
44 | }
45 |
46 | /*
47 | Given a two dimensional array of data finds and returns a DecisionTreeTargetClasses
48 | struct with unique target classes found in the dataset at column with index 'cols - 1'.
49 | */
50 | DecisionTreeTargetClasses get_target_class_values(double **data, size_t rows, size_t cols, const ModelContext *ctx)
51 | {
52 | if (log_level > 1)
53 | printf("generating class value set...\n");
54 |
55 | size_t count = 0;
56 | int *target_class_values = malloc(count * sizeof(int));
57 |
58 | for (size_t i = 0; i < rows; ++i)
59 | {
60 | // Skip rows that we are withholding from training for evaluation.
61 | if (is_row_part_of_testing_fold(i, ctx))
62 | {
63 | if (log_level > 1)
64 | printf(" skipping row %ld which is part of testing fold %ld\n", i, ctx->testingFoldIdx);
65 | continue;
66 | }
67 |
68 | int class_target = (int)data[i][cols - 1];
69 | if (!contains_int(target_class_values, count, class_target))
70 | {
71 | if (log_level > 1)
72 | printf("adding %d \n", class_target);
73 | count++;
74 | int *temp = realloc(target_class_values, count * sizeof(int));
75 | if (temp != NULL)
76 | target_class_values = temp;
77 | target_class_values[count - 1] = class_target;
78 | }
79 | }
80 | if (log_level > 1)
81 | printf("-------------------------------\ncount of unique classes: %ld\n", count);
82 | return (DecisionTreeTargetClasses){count, target_class_values};
83 | }
84 |
/*
    Given a two dimensional array of data returns the leaf node class value:
    whichever binary class (0/1) labels the majority of the rows in column
    'cols - 1'. Ties resolve to class 1. Aborts on any non-binary label.
*/
int get_leaf_node_class_value(double **data, size_t rows, size_t cols)
{
    // votes[c] counts rows labeled with class c.
    int votes[2] = {0, 0};
    for (size_t r = 0; r < rows; ++r)
    {
        int label = (int)data[r][cols - 1];
        if (label != 0 && label != 1)
        {
            printf("Error: currently only support binary classification, i.e. class target values 0/1, got: %d\n",
                   label);
            exit(1);
        }
        ++votes[label];
    }
    // Ties go to class 1, matching 'ones >= zeroes'.
    return votes[1] >= votes[0] ? 1 : 0;
}
113 |
114 | /*
115 | Given a two dimensional array of data and parameters for a split, splits the data into two halves and
116 | returns a pointer to an array of two DecisionTreeData for the two halves of the split.
117 | */
118 | DecisionTreeData *split_dataset(int feature_index,
119 | double value,
120 | double **data,
121 | size_t rows,
122 | size_t cols)
123 | {
124 | if (log_level > 1)
125 | printf("splitting dataset into two halves...\n");
126 |
127 | // Buffers to hold rows of data as we are distributing rows based on the split.
128 | double **left = (double **)malloc(1 * sizeof(double) * cols);
129 | double **right = (double **)malloc(1 * sizeof(double) * cols);
130 |
131 | size_t left_count = 0;
132 | size_t right_count = 0;
133 |
134 | for (size_t i = 0; i < rows; ++i)
135 | {
136 | double *row = data[i];
137 | if (row[feature_index] < value)
138 | {
139 | // Copy the row into the left half and resize the buffer.
140 | left[left_count++] = row;
141 | double **temp = realloc(left, left_count * sizeof(double) * cols);
142 | if (temp != NULL)
143 | left = temp;
144 | }
145 | else
146 | {
147 | // Copy the row into the right half and resize the buffer.
148 | right[right_count++] = row;
149 | double **temp = realloc(right, right_count * sizeof(double) * cols);
150 | if (temp != NULL)
151 | right = temp;
152 | }
153 | }
154 | DecisionTreeData *data_split = malloc(sizeof(DecisionTreeData) * 2);
155 | data_split[0] = (DecisionTreeData){left_count, left};
156 | data_split[1] = (DecisionTreeData){right_count, right};
157 |
158 | if (log_level > 1)
159 | printf("split dataset into: %ld | %ld\n", left_count, right_count);
160 |
161 | return data_split;
162 | }
163 |
164 | double calculate_gini_index(DecisionTreeData *data_split,
165 | int *class_labels,
166 | size_t class_labels_count,
167 | size_t cols)
168 | {
169 | if (log_level > 1)
170 | printf("calculating gini index based on split...\n");
171 |
172 | // DecisionTreeData data split should consist of two halves.
173 | int count = 2;
174 | size_t n_instances = data_split[0].length + data_split[1].length;
175 | double gini = 0.0;
176 | for (size_t i = 0; i < count; ++i)
177 | {
178 | DecisionTreeData group = data_split[i];
179 |
180 | size_t size = group.length;
181 | if (size == 0)
182 | continue;
183 |
184 | double sum = 0.0;
185 | for (size_t j = 0; j < class_labels_count; ++j)
186 | {
187 | int class = class_labels[j];
188 | int occurences = 0;
189 | for (size_t k = 0; k < size; ++k)
190 | {
191 | int label = (int)group.data[k][cols - 1];
192 | if (label == class)
193 | occurences += 1;
194 | }
195 | double p_class = (double)occurences / (double)size;
196 | sum += (p_class * p_class);
197 | }
198 | gini += (1.0 - sum) * ((double)size / (double)n_instances);
199 | }
200 |
201 | if (log_level > 1)
202 | {
203 | printf("gini: %f\n", gini);
204 | printf("-----------------------------------------\n");
205 | }
206 |
207 | return gini;
208 | }
209 |
210 | DecisionTreeDataSplit calculate_best_data_split(double **data,
211 | size_t max_features,
212 | size_t rows,
213 | size_t cols,
214 | const ModelContext *ctx)
215 | {
216 | if (log_level > 1)
217 | {
218 | printf("calculating best split for dataset...\n");
219 | printf("rows: %ld\ncols: %ld\n", rows, cols);
220 | }
221 |
222 | // Target classes available in this dataset.
223 | DecisionTreeTargetClasses classes = get_target_class_values(data, rows, cols, ctx);
224 |
225 | // Keeping track of best data split available along with best parameters associated with
226 | // that data split.
227 | DecisionTreeData *best_data_split = NULL;
228 | double best_value = DBL_MAX;
229 | double best_gini = DBL_MAX;
230 | int best_index = INT_MAX;
231 |
232 | // Create a features array and initialize to avoid non-set memory.
233 | int *features = malloc(max_features * sizeof(int));
234 | for (size_t i = 0; i < max_features; ++i)
235 | features[i] = -1;
236 |
237 | size_t count = 0;
238 | while (count < max_features)
239 | {
240 | // Maximum index for a feature which should not include the class target column index
241 | // which is 'cols - 1'.
242 | int max = cols - 2;
243 | int min = 0;
244 | int index = rand() % (max + 1 - min) + min;
245 | if (!contains_int(features, max_features /* size of 'features' array */, index))
246 | {
247 | if (log_level > 1)
248 | printf("adding unique index: %d\n", index);
249 | features[count++] = index;
250 | }
251 | }
252 | if (log_level > 1)
253 | printf("-----------------------------------------\n");
254 |
255 | for (size_t i = 0; i < max_features; ++i)
256 | {
257 | int feature_index = features[i];
258 | for (size_t j = 0; j < rows; ++j)
259 | {
260 | DecisionTreeData *data_split = split_dataset(feature_index,
261 | data[j][feature_index],
262 | data,
263 | rows,
264 | cols);
265 | double gini = calculate_gini_index(data_split, classes.labels, classes.count, cols);
266 |
267 | if (gini < best_gini)
268 | {
269 | best_index = feature_index;
270 | best_value = data[j][feature_index];
271 | best_gini = gini;
272 |
273 | // First free the memory that was previously allocated for the 'data_split' and pointer
274 | // which was assigned to 'best_data_split' since now we have found a new better split.
275 | if (best_data_split)
276 | free_decision_tree_data(best_data_split);
277 |
278 | best_data_split = data_split;
279 | }
280 | else
281 | {
282 | free_decision_tree_data(data_split);
283 | }
284 | }
285 | }
286 |
287 | // Free any other memory.
288 | free(features);
289 | free(classes.labels);
290 |
291 | return (DecisionTreeDataSplit){best_index, best_value, best_gini, best_data_split};
292 | }
293 |
/*
    Recursively grows the decision tree rooted at 'decision_tree'.

    The node's 'split_data_halves' (attached earlier by populate_split_data)
    is consumed here: each half either becomes a child subtree via a fresh
    best split, or collapses into a leaf class value. The row-pointer buffers
    of both halves are freed on every path out of this function, and
    'split_data_halves' is detached (set to NULL) so free_decision_tree_node
    does not free them again. Recursion stops when a half is missing, when
    'max_depth' is reached, or when a half has at most 'min_samples_leaf' rows.
*/
void grow(DecisionTreeNode *decision_tree,
          size_t max_depth,
          size_t min_samples_leaf,
          size_t max_features,
          int depth,
          size_t rows,
          size_t cols,
          long *nodeId,
          const ModelContext *ctx)
{
    // Take ownership of the two halves computed for this node.
    DecisionTreeData left_half = decision_tree->split_data_halves[0];
    DecisionTreeData right_half = decision_tree->split_data_halves[1];

    double **left = left_half.data;
    double **right = right_half.data;

    // Detach so the destructor cannot double-free the halves consumed here.
    decision_tree->split_data_halves = NULL;

    if (left == NULL || right == NULL)
    {
        // If we are at the leaf node, then combine both left and right side data and compute get the leaf
        // node class for the combined data.
        // NOTE(review): 'rows' is the caller's row count while 'combined_data'
        // holds left_half.length + right_half.length rows — confirm these
        // always match on this path.
        double **combined_data = combine_arrays(left, right, left_half.length, right_half.length, cols);
        int leaf = get_leaf_node_class_value(combined_data, rows, cols);

        decision_tree->left_leaf = leaf;
        decision_tree->right_leaf = leaf;

        free(left);
        free(right);
        free(combined_data);

        return;
    }
    if (depth >= max_depth)
    {
        // Depth limit reached: both halves collapse to leaf class values.
        decision_tree->left_leaf = get_leaf_node_class_value(left, left_half.length /* rows */, cols);
        decision_tree->right_leaf = get_leaf_node_class_value(right, right_half.length /* rows */, cols);

        free(left);
        free(right);

        return;
    }
    if (left_half.length <= min_samples_leaf)
    {
        // Too few rows to split further: the left side becomes a leaf.
        decision_tree->left_leaf = get_leaf_node_class_value(left, left_half.length /* rows */, cols);
    }
    else
    {
        DecisionTreeDataSplit data_split = calculate_best_data_split(left,
                                                                     max_features,
                                                                     left_half.length /* rows */,
                                                                     cols,
                                                                     ctx);

        // Create the left child of the current node and populate with data from the data split.
        decision_tree->leftChild = empty_node(nodeId);
        populate_split_data(decision_tree->leftChild, &data_split);

        grow(decision_tree->leftChild,
             max_depth,
             min_samples_leaf,
             max_features,
             depth + 1 /* since we are now at the next 'level' in the tree */,
             rows,
             cols,
             nodeId,
             ctx);

        // The recursive call consumed the halves; only the pair array remains.
        free(data_split.data);
    }
    if (right_half.length <= min_samples_leaf)
    {
        // Too few rows to split further: the right side becomes a leaf.
        decision_tree->right_leaf = get_leaf_node_class_value(right, right_half.length, cols);
    }
    else
    {
        DecisionTreeDataSplit data_split = calculate_best_data_split(right,
                                                                     max_features,
                                                                     right_half.length /* rows */,
                                                                     cols,
                                                                     ctx);

        // Create the right child of the current node and populate with data from the data split.
        decision_tree->rightChild = empty_node(nodeId);
        populate_split_data(decision_tree->rightChild, &data_split);

        grow(decision_tree->rightChild,
             max_depth,
             min_samples_leaf,
             max_features,
             depth + 1 /* since we are now at the next 'level' in the tree */,
             rows,
             cols,
             nodeId,
             ctx);

        free(data_split.data);
    }

    // Release this node's half buffers; the rows they point at belong to the
    // original dataset and are not freed here.
    free(left);
    free(right);
}
398 |
399 | void make_prediction(const DecisionTreeNode *decision_tree, double *row, int *prediction_val)
400 | {
401 | if (row[decision_tree->split_index] < decision_tree->split_value)
402 | {
403 | if (decision_tree->leftChild != NULL)
404 | make_prediction(decision_tree->leftChild, row, prediction_val);
405 | else
406 | (*prediction_val) = decision_tree->left_leaf;
407 | }
408 | else
409 | {
410 | if (decision_tree->rightChild != NULL)
411 | make_prediction(decision_tree->rightChild, row, prediction_val);
412 | else
413 | (*prediction_val) = decision_tree->right_leaf;
414 | }
415 | }
416 |
417 | /*
418 | Frees memory for a given DecisionTreeNode.
419 | */
420 | void free_decision_tree_node(const DecisionTreeNode *node, long *freeCount)
421 | {
422 | (*freeCount)++;
423 | if (node->leftChild)
424 | free_decision_tree_node(node->leftChild, freeCount);
425 | if (node->rightChild)
426 | free_decision_tree_node(node->rightChild, freeCount);
427 |
428 | if (log_level > 2)
429 | printf("freeing DecisionTreeNode with id=%ld\n", node->id);
430 |
431 | if (node && node->split_data_halves && node->split_data_halves->length)
432 | {
433 | free(node->split_data_halves[0].data);
434 | free(node->split_data_halves[1].data);
435 | free(node->split_data_halves);
436 | }
437 |
438 | free((void *)node);
439 | }
440 |
441 | /*
442 | Frees memory for a given DecisionTreeData.
443 | */
444 | void free_decision_tree_data(DecisionTreeData *data_split)
445 | {
446 | free(data_split[0].data);
447 | free(data_split[1].data);
448 | free(data_split);
449 | }
450 |
--------------------------------------------------------------------------------
/model/tree.h:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #ifndef tree_h
6 | #define tree_h
7 |
8 | #include
9 | #include
10 | #include "../utils/utils.h"
11 |
12 | typedef struct DecisionTreeData DecisionTreeData;
13 | typedef struct DecisionTreeNode DecisionTreeNode;
14 | typedef struct DecisionTreeDataSplit DecisionTreeDataSplit;
15 | typedef struct DecisionTreeTargetClasses DecisionTreeTargetClasses;
16 |
17 | /*
18 | Represents a single node in a decision tree that comprise a random forest.
19 | */
20 | struct DecisionTreeNode
21 | {
22 | long id;
23 | struct DecisionTreeNode *leftChild;
24 | struct DecisionTreeNode *rightChild;
25 |
26 | double *half1;
27 | double *half2;
28 | size_t size_half1;
29 | size_t size_half2;
30 |
31 | double split_value;
32 | long split_index;
33 | DecisionTreeData *split_data_halves;
34 |
35 | // if the node is a leaf
36 | int left_leaf;
37 | int right_leaf;
38 | };
39 |
40 | struct DecisionTreeData
41 | {
42 | size_t length;
43 | double **data;
44 | };
45 |
46 | struct DecisionTreeDataSplit
47 | {
48 | int index;
49 | double value;
50 | double gini;
51 | DecisionTreeData *data;
52 | };
53 |
54 | struct DecisionTreeTargetClasses
55 | {
56 | size_t count;
57 | int *labels;
58 | };
59 |
60 | /*
61 | Functions to free memory allocated for the structs.
62 | */
63 | void free_decision_tree_data(DecisionTreeData *data_split);
64 | void free_decision_tree_node(const DecisionTreeNode *node, long *freeCount);
65 |
66 | /*
67 | Creates a new empry DecisionTreeNode with id from the strictly increasing
68 | id generator.
69 | */
70 | DecisionTreeNode *empty_node(long *id);
71 |
72 | /*
73 | Function to recursively grow a DecisionTreeNode by splitting the dataset and creating
74 | left / right children until fully splitting the rows across all nodes.
75 | */
76 | void grow(DecisionTreeNode *decision_tree,
77 | size_t max_depth,
78 | size_t min_samples_leaf,
79 | size_t max_features,
80 | int depth,
81 | size_t rows,
82 | size_t cols,
83 | long *nodeId,
84 | const ModelContext *ctx);
85 |
86 | /*
87 | Calculates the best split for the 'data' given a number of randomly selected features from the data
88 | (columns) up to the number of maximum number of features 'max_features'.
89 | */
90 | DecisionTreeDataSplit calculate_best_data_split(double **data,
91 | size_t max_features,
92 | size_t rows,
93 | size_t cols,
94 | const ModelContext *ctx);
95 |
96 | /*
97 | Populates a given DecisionTreeNode with data from the DecisionTreeDataSplit struct
98 | pointed to by 'data_split'.
99 | */
100 | void populate_split_data(DecisionTreeNode *node, DecisionTreeDataSplit *data_split);
101 | /*
102 | Given a row of data and a trained decision tree, computes the predicted class target value for the row
103 | and writes the prediction into the variable pointed to 'prediction_val'.
104 | */
105 | void make_prediction(const DecisionTreeNode *decision_tree, double *row, int *prediction_val);
106 |
107 | #endif // tree_h
108 |
--------------------------------------------------------------------------------
/utils/argparse.h:
--------------------------------------------------------------------------------
#ifndef argparse_h
#define argparse_h

#include <stdlib.h> /* restored: atol / atoi */
#include <argp.h>   /* restored: argp option, state and error types */

/* How many arguments we accept. */
#define COUNT_ARGS 1

const char *argp_program_version =
    "random-forests-c 1.0";
const char *argp_program_bug_address =
    "https://github.com/dobroshynski/random-forests-c";

/* Program documentation. */
static char doc[] =
    "random-forests-c -- Basic implementation of random forests and accompanying decision trees in C";

/* A description of the arguments we accept. */
static char args_doc[] = "CSV_FILE";

/* The options we understand. */
static struct argp_option options[] = {
    {"num_rows", 'r', "number", 0, "Optional number of rows in the input CSV_FILE, if known", 0},
    {"num_cols", 'c', "number", 0, "Optional number of cols in the input CSV_FILE, if known", 0},
    {"log_level", 'l', "number", 0, "Optional debug logging level [0-3]. Level 0 is no output, 3 is most verbose. Defaults to 1.", 1},
    {"seed", 's', "number", 0, "Optional random number seed.", 2},
    {0}};

/* Used by main to communicate with parse_opt. */
struct arguments
{
    char *args[COUNT_ARGS]; /* CSV file argument. */

    long rows, cols;
    int log_level;
    int random_seed;
};

/* Parse a single option. Invoked by argp once per recognized key. */
static error_t
parse_opt(int key, char *arg, struct argp_state *state)
{
    /* Get the input argument from argp_parse, which we
       know is a pointer to our arguments structure. */
    struct arguments *arguments = state->input;

    switch (key)
    {
    case 'r':
        arguments->rows = atol(arg);
        break;
    case 'c':
        arguments->cols = atol(arg);
        break;
    case 'l':
        arguments->log_level = atoi(arg);
        break;
    case 's':
        arguments->random_seed = atoi(arg);
        break;

    case ARGP_KEY_ARG:
        if (state->arg_num >= COUNT_ARGS)
            /* Too many arguments. */
            argp_usage(state);

        arguments->args[state->arg_num] = arg;

        break;

    case ARGP_KEY_END:
        if (state->arg_num < COUNT_ARGS)
            /* Not enough arguments. */
            argp_usage(state);
        break;

    default:
        return ARGP_ERR_UNKNOWN;
    }
    return 0;
}

#endif // argparse_h
85 |
--------------------------------------------------------------------------------
/utils/data.c:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #include "data.h"
6 |
7 | struct dim parse_csv_dims(const char *file_name)
8 | {
9 | FILE *csv_file;
10 | csv_file = fopen(file_name, "r");
11 |
12 | if (csv_file == NULL)
13 | {
14 | printf("Error: can't open file: %s\n", file_name);
15 | exit(-1);
16 | }
17 |
18 | const char *delimiter = ",";
19 |
20 | char *buffer = malloc(BUFSIZ);
21 | char *token;
22 |
23 | // Keeping track of how many rows and columns there are.
24 | int rows = 0;
25 | int cols = 0;
26 |
27 | // Reach each line of the file into the buffer.
28 | while (fgets(buffer, BUFSIZ, csv_file) != NULL)
29 | {
30 | ++rows;
31 |
32 | // We use a second counter for columns in order to count the number of columns
33 | // for each row we read and will trigger an assert if there is a mismatch in the
34 | // size of the cols, i.e. not all rows have the same number of columns which would
35 | // cause undefined behavior.
36 | int curr_cols = 0;
37 |
38 | // Get every token and print it.
39 | token = strtok(buffer, delimiter);
40 | while (token != NULL)
41 | {
42 | ++curr_cols;
43 | // printf("%s\n", token);
44 |
45 | // Get the next token.
46 | token = strtok(NULL, delimiter);
47 | }
48 | if (cols == 0)
49 | {
50 | cols = curr_cols;
51 | }
52 | else
53 | {
54 | assert(curr_cols == cols && "Error: every row must have the same amount of columns");
55 | }
56 | }
57 | // We read one extra row for the csv header, so adjust here.
58 | --rows;
59 |
60 | fclose(csv_file);
61 | free(buffer);
62 |
63 | // Make sure that the dimensions are valid.
64 | assert(rows > 0 && "# of rows in csv must be > 0");
65 | assert(cols > 0 && "# of cols in csv must be > 0");
66 |
67 | return (struct dim){rows : rows, cols : cols};
68 | }
69 |
70 | void parse_csv(const char *file_name, double **data_p, const struct dim csv_dim)
71 | {
72 | FILE *csv_file;
73 | csv_file = fopen(file_name, "r");
74 |
75 | if (csv_file == NULL)
76 | {
77 | printf("Error: can't open file: %s\n", file_name);
78 | exit(-1);
79 | }
80 |
81 | const char *delimiter = ",";
82 |
83 | char *buffer = malloc(BUFSIZ);
84 | char *token;
85 |
86 | // Keeping track which row we are on.
87 | int row = 0;
88 | // Keeping track of the index in the data array.
89 | int idx = 0;
90 |
91 | // Reach each line of the file into the buffer.
92 | while (row <= csv_dim.rows && fgets(buffer, BUFSIZ, csv_file) != NULL)
93 | {
94 | if (++row == 1)
95 | continue;
96 |
97 | // Get every token and print it.
98 | token = strtok(buffer, delimiter);
99 | while (token != NULL)
100 | {
101 | // if (1)
102 | // printf("%s\n", token);
103 |
104 | (*data_p)[idx] = atof(token);
105 | // printf("%f\n", (*data)[idx]);
106 | ++idx;
107 |
108 | // Get the next token.
109 | token = strtok(NULL, delimiter);
110 | }
111 | }
112 |
113 | if (log_level > 1)
114 | printf("read %d rows from file %s\n", row - 1, file_name);
115 |
116 | fclose(csv_file);
117 | free(buffer);
118 | }
119 |
120 | void pivot_data(double *data, const struct dim csv_dim, double ***pivoted_data_p)
121 | {
122 | (*pivoted_data_p) = _2d_calloc(csv_dim.rows, csv_dim.cols);
123 |
124 | for (size_t i = 0; i < csv_dim.rows; ++i)
125 | for (size_t j = 0; j < csv_dim.cols; ++j)
126 | (*pivoted_data_p)[i][j] = data[(i * csv_dim.cols) + j];
127 | }
128 |
--------------------------------------------------------------------------------
/utils/data.h:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #ifndef data_h
6 | #define data_h
7 |
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include "utils.h"
15 |
16 | /*
17 | Struct for parsed data dimensions.
18 | */
19 | struct dim
20 | {
21 | size_t rows;
22 | size_t cols;
23 | };
24 |
25 | /*
26 | Attempts to read a csv file at path given by 'file_name', and if successfull, records the
27 | dimensions of the csv file, asserts that all rows have the same number of columns, and returns
28 | the dimensions of the file in a struct of type dim.
29 |
30 | If the numbers for rows and cols are given to the program as command line arguments, then calling
31 | this function can be skipped, as we already would know how much memory to allocate to fit the csv
32 | file of the given dimensions.
33 | */
34 | struct dim parse_csv_dims(const char *file_name);
35 |
36 | /*
37 | Attempts to read a csv file at path given by 'file_name' and write the values one-by-one into 'data'.
38 | If an argument for '--num_rows' was provided to the program and is less than the actual number of
39 | rows in the csv file, the function will stop reading at that row. This allows to read only the top
40 | 'num_rows' in the input data file if needed.
41 | */
42 | void parse_csv(const char *file_name, double **data_p, const struct dim csv_dim);
43 |
44 | /*
45 | Pivots and transforms the data in 'data' array into a two-dimensional array of size
46 | 'csv_dim.rows' * 'csv_dim.cols' pointed to by 'pivoted_data_p'.
47 | */
48 | void pivot_data(double *data, const struct dim csv_dim, double ***pivoted_data_p);
49 |
50 | #endif // data_h
51 |
--------------------------------------------------------------------------------
/utils/utils.c:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #include "utils.h"
6 |
7 | int get_log_level()
8 | {
9 | return log_level;
10 | }
11 |
12 | void set_log_level(int selected_log_level)
13 | {
14 | if (selected_log_level < 0 || selected_log_level > 3)
15 | {
16 | printf("Error: log_level must be in range [0, 3] got: %d\n", selected_log_level);
17 | exit(1);
18 | }
19 | log_level = selected_log_level;
20 | }
21 |
/*
    Linear scan over the first 'n' entries of 'arr'; returns 1 as soon as
    'val' is found, 0 when it is absent (including when n == 0).
*/
int contains_int(int *arr, size_t n, int val)
{
    size_t i = 0;
    while (i < n)
    {
        if (arr[i] == val)
            return 1;
        ++i;
    }
    return 0;
}
31 |
32 | int is_row_part_of_testing_fold(int row, const ModelContext *ctx)
33 | {
34 | size_t lower_bound = ctx->testingFoldIdx * ctx->rowsPerFold;
35 | size_t upper_bound = lower_bound + ctx->rowsPerFold;
36 |
37 | if (row >= lower_bound && row <= upper_bound)
38 | return 1;
39 | else
40 | return 0;
41 | }
42 |
/*
    Merges two arrays of row pointers into one newly allocated array of
    n1 + n2 pointers, 'first' rows followed by 'second' rows. The rows are
    aliased, not copied; the caller owns (and frees) only the returned array.
*/
double **combine_arrays(double **first, double **second, size_t n1, size_t n2, size_t cols)
{
    // Unused: rows are aliased, not copied; kept for interface compatibility.
    (void)cols;

    // Fix: the original allocated (n1 + n2) * sizeof(double) * cols bytes —
    // a factor of 'cols' more than the pointer array needs.
    double **combined = (double **)malloc((n1 + n2) * sizeof(double *));

    size_t out = 0;
    for (size_t i = 0; i < n1; ++i)
        combined[out++] = first[i];
    for (size_t j = 0; j < n2; ++j)
        combined[out++] = second[j];

    return combined;
}
59 |
/*
    Allocates a rows x cols matrix of doubles as one contiguous block laid
    out as [rows pointers][rows * cols doubles], so a single free() on the
    returned pointer releases everything. Contents are uninitialized.
*/
double **_2d_malloc(const size_t rows, const size_t cols)
{
    double **data;
    double *cells;

    // Fix: the byte count was computed in an 'int', which overflows for
    // large matrices; keep the arithmetic in size_t throughout.
    size_t len = sizeof(double *) * rows + sizeof(double) * cols * rows;
    data = (double **)malloc(len);

    // The cell storage begins immediately after the row-pointer table.
    cells = (double *)(data + rows);

    for (size_t i = 0; i < rows; ++i)
        data[i] = cells + cols * i;

    return data;
}
75 |
/*
    Allocates a zero-initialized rows x cols matrix of doubles as one
    contiguous block laid out as [rows pointers][rows * cols doubles]; a
    single free() on the returned pointer releases everything.
*/
double **_2d_calloc(const size_t rows, const size_t cols)
{
    double **data;
    double *cells;

    // Fixes: the original passed this *byte* count as calloc's element
    // count (over-allocating by a factor of sizeof(double)), and computed
    // it in an 'int', which overflows for large matrices.
    size_t len = sizeof(double *) * rows + sizeof(double) * cols * rows;
    data = (double **)calloc(1, len);

    // The cell storage begins immediately after the row-pointer table.
    cells = (double *)(data + rows);

    for (size_t i = 0; i < rows; ++i)
        data[i] = cells + cols * i;

    return data;
}
91 |
/*
    Plain sum of all 'size' elements of 'data'; used as a sanity checksum
    for loaded datasets. Returns 0.0 for an empty array.
*/
double _1d_checksum(double *data, size_t size)
{
    double total = 0.0;
    for (size_t idx = 0; idx < size; ++idx)
        total += data[idx];
    return total;
}
101 |
/*
 Computes a checksum of a two-dimensional like array by summing all
 rows*cols elements row by row, left to right. Used to verify
 consistency of data. Returns 0 when rows or cols is 0.
*/
double _2d_checksum(double **data, size_t rows, size_t cols)
{
    double total = 0.0;
    for (size_t r = 0; r < rows; ++r)
    {
        double *row = data[r]; /* hoist the row pointer out of the inner loop */
        for (size_t c = 0; c < cols; ++c)
            total += row[c];
    }
    return total;
}
114 |
--------------------------------------------------------------------------------
/utils/utils.h:
--------------------------------------------------------------------------------
1 | /*
2 | @author andrii dobroshynski
3 | */
4 |
5 | #ifndef utils_h
6 | #define utils_h
7 |
8 | #include
9 | #include "data.h"
10 |
11 | /*
12 | Struct to hold information about a current model training run.
13 | */
14 | struct ModelContext
15 | {
16 | const size_t testingFoldIdx;
17 | const size_t rowsPerFold;
18 | };
19 |
20 | typedef struct ModelContext ModelContext;
21 |
22 | /*
23 | The debug log level that can be adjusted via an argument.
24 | */
25 | int log_level;
26 |
27 | /*
28 | Given a pointer to a buffer array of integers returns whether or not a given integer 'n'
29 | is present in the array.
30 | */
31 | int contains_int(int *arr, size_t n, int val);
32 |
33 | /*
34 | Given two two-dimensional like arrays 'first' and 'second', merges them into one and returns a
35 | new pointer.
36 | */
37 | double **combine_arrays(double **first, double **second, size_t n1, size_t n2, size_t cols);
38 |
39 | /*
40 | Given a row number and a model context returns whether or not the particular
41 | row belongs to a fold that is designated as the evaluation / testing fold.
42 | */
43 | int is_row_part_of_testing_fold(int row, const ModelContext *ctx);
44 |
45 | /*
46 | Sets the 'log_level' to the 'selected_log_level'.
47 | */
48 | void set_log_level(int selected_log_level);
49 |
50 | /*
51 | Returns whatever the current 'log_level' is.
52 | */
53 | int get_log_level();
54 |
55 | /*
56 | Allocates memory for a two-dimensional like array of size 'rows' * 'cols'.
57 | */
58 | double **_2d_malloc(const size_t rows, const size_t cols);
59 |
60 | /*
61 | Allocates memory for a two-dimensional like array of size 'rows' * 'cols' with all elements initialized
62 | to zero.
63 | */
64 | double **_2d_calloc(const size_t rows, const size_t cols);
65 |
66 | /*
67 | Computes a checksum of a one-dimensional like array. Used to verify consistency of data.
68 | */
69 | double _1d_checksum(double *data, size_t size);
70 |
71 | /*
72 | Computes a checksum of a two-dimensional like array. Used to verify consistency of data.
73 | */
74 | double _2d_checksum(double **data, size_t rows, size_t cols);
75 |
76 | #endif // utils_h
77 |
--------------------------------------------------------------------------------