├── .gitignore ├── .idea ├── .name ├── misc.xml ├── modules.xml ├── random-forest-c.iml └── vcs.xml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── data.py ├── eval ├── eval.c └── eval.h ├── main.c ├── model ├── forest.c ├── forest.h ├── tree.c └── tree.h └── utils ├── argparse.h ├── data.c ├── data.h ├── utils.c └── utils.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 3 | 4 | # Project-specific 5 | *.csv 6 | .DS_Store 7 | /build 8 | /.vscode 9 | 10 | # User-specific stuff 11 | .idea/**/workspace.xml 12 | .idea/**/tasks.xml 13 | .idea/**/usage.statistics.xml 14 | .idea/**/dictionaries 15 | .idea/**/shelf 16 | 17 | # Generated files 18 | .idea/**/contentModel.xml 19 | 20 | # Sensitive or high-churn files 21 | .idea/**/dataSources/ 22 | .idea/**/dataSources.ids 23 | .idea/**/dataSources.local.xml 24 | .idea/**/sqlDataSources.xml 25 | .idea/**/dynamic.xml 26 | .idea/**/uiDesigner.xml 27 | .idea/**/dbnavigator.xml 28 | 29 | # Gradle 30 | .idea/**/gradle.xml 31 | .idea/**/libraries 32 | 33 | # Gradle and Maven with auto-import 34 | # When using Gradle or Maven with auto-import, you should exclude module files, 35 | # since they will be recreated, and may cause churn. Uncomment if using 36 | # auto-import. 
37 | # .idea/modules.xml 38 | # .idea/*.iml 39 | # .idea/modules 40 | # *.iml 41 | # *.ipr 42 | 43 | # CMake 44 | cmake-build-*/ 45 | 46 | # Mongo Explorer plugin 47 | .idea/**/mongoSettings.xml 48 | 49 | # File-based project format 50 | *.iws 51 | 52 | # IntelliJ 53 | out/ 54 | 55 | # mpeltonen/sbt-idea plugin 56 | .idea_modules/ 57 | 58 | # JIRA plugin 59 | atlassian-ide-plugin.xml 60 | 61 | # Cursive Clojure plugin 62 | .idea/replstate.xml 63 | 64 | # Crashlytics plugin (for Android Studio and IntelliJ) 65 | com_crashlytics_export_strings.xml 66 | crashlytics.properties 67 | crashlytics-build.properties 68 | fabric.properties 69 | 70 | # Editor-based Rest Client 71 | .idea/httpRequests 72 | 73 | # Android studio 3.1+ serialized cache file 74 | .idea/caches/build_file_checksums.ser 75 | 76 | # Prerequisites 77 | *.d 78 | 79 | # Object files 80 | *.o 81 | *.ko 82 | *.obj 83 | *.elf 84 | 85 | # Linker output 86 | *.ilk 87 | *.map 88 | *.exp 89 | 90 | # Precompiled Headers 91 | *.gch 92 | *.pch 93 | 94 | # Libraries 95 | *.lib 96 | *.a 97 | *.la 98 | *.lo 99 | 100 | # Shared objects (inc. 
Windows DLLs) 101 | *.dll 102 | *.so 103 | *.so.* 104 | *.dylib 105 | 106 | # Executables 107 | *.exe 108 | *.out 109 | *.app 110 | *.i*86 111 | *.x86_64 112 | *.hex 113 | 114 | # Debug files 115 | *.dSYM/ 116 | *.su 117 | *.idb 118 | *.pdb 119 | 120 | # Kernel Module Compile Results 121 | *.mod* 122 | *.cmd 123 | .tmp_versions/ 124 | modules.order 125 | Module.symvers 126 | Mkfile.old 127 | dkms.conf 128 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | random_forest_c -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/random-forest-c.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(random_forest_c C) 3 | 4 | set(CMAKE_C_STANDARD 99) 5 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -O1") 6 | 7 | add_executable(random-forest main.c utils/utils.c utils/utils.h utils/data.c utils/data.h model/tree.c model/tree.h model/forest.c model/forest.h eval/eval.c eval/eval.h) 8 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Random Forests - C 2 | A proof of concept basic implementation of random forests for classification and accompanying decision trees in C. 3 | 4 | ## Running the code 5 | 6 | Fastest way to start experimenting is to 7 | - (1) run the `data.py` script to generate some random CSV data 8 | - (2) compile as preferred (optionally using the `CMakeLists.txt` provided) 9 | - (3) run `./random-forests-c ` or `./random-forests-c --help` to see which arguments are available to configure. 10 | 11 | The [`main.c`](./main.c) file contains an example configuration of a random forest and code to run `cross_validate()` which will both train and evaluate a model. 12 | 13 | ### Training 14 | 15 | The `cross_validate()` function runs k-fold cross validation on whatever data is provided -- first trains the model and then evaluates it on every of the testing folds. 
16 | 17 | The main function that handles model training is `train_model()` 18 | ```c 19 | const DecisionTreeNode **train_model(double **data, 20 | const RandomForestParameters *params, 21 | const struct dim *csv_dim, 22 | const ModelContext *ctx); 23 | ``` 24 | It returns an array of `DecisionTreeNode` pointers to roots of decision trees comprising the forest, and the parameters are 25 | 26 | - `**training_data` - training data (equivalent to a DataFrame in Python). 27 | - `*params` - pointer to struct that holds the configuration of a random forest model. 28 | - `*csv_dim` - pointer to a struct holding row x col dimensions of the read data. 29 | - `*ctx` - pointer to a context object that holds some optional data that can be used for training / evaluation. 30 | 31 | For example: 32 | ```c 33 | const ModelContext ctx = (ModelContext){ 34 | testingFoldIdx : foldIdx /* Fold to use for evaluation. */, 35 | rowsPerFold : csv_dim->rows / k_folds /* Number of rows per fold. */ 36 | }; 37 | 38 | const DecisionTreeNode **random_forest = (const DecisionTreeNode **)train_model( 39 | data, 40 | params, 41 | csv_dim, 42 | &ctx); 43 | ``` 44 | 45 | ### Evaluation 46 | 47 | After training we can evaluate the model with `eval_model()` which returns an accuracy measure for model performance. 48 | For example: 49 | 50 | ```c 51 | // Evaluate the model that was just trained. We use the fold identified by 'foldIdx' in 'ctx' to evaluate the model. 52 | double accuracy = eval_model( 53 | random_forest /* Model to evaluate. */, 54 | data, 55 | params, 56 | csv_dim, 57 | &ctx); 58 | ``` 59 | 60 | ## Code structure 61 | 62 | - `model` -- random forest and decision trees. 63 | - `eval` -- evaluation code for running `cross_validate()` or `hyperparameter_search()` to test the model. 64 | - `utils` -- utilities for data management, argument parsing, etc. 
65 | 66 | The optional arguments to the program (can be viewed by running with a `--help` flag) 67 | ``` 68 | -c, --num_cols=number Optional number of cols in the input CSV_FILE, if 69 | known 70 | -r, --num_rows=number Optional number of rows in the input CSV_FILE, if 71 | known 72 | -l, --log_level=number Optional debug logging level [0-3]. Level 0 is no 73 | output, 3 is most verbose. Defaults to 1. 74 | -s, --seed=number Optional random number seed. 75 | ``` 76 | 77 | ## Reference 78 | Breiman, Leo. "Random forests." Machine learning 45.1 (2001): 5-32. -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import random 3 | import pandas as pd 4 | 5 | NUM_ROWS = 100 6 | NUM_COLS = 100 7 | 8 | rows_written = 0 9 | data = [] 10 | 11 | for i in range(NUM_ROWS): 12 | row = [] 13 | for j in range(NUM_COLS - 1): 14 | row.append(random.uniform(0, 1)) 15 | 16 | # append the target variable as a random integer of two categories [0, 1] 17 | row.append(random.randint(0, 1) * 1.0) 18 | 19 | data.append(row) 20 | rows_written += 1 21 | 22 | df = pd.DataFrame.from_records(data) 23 | 24 | # write the dataframe with rows to csv 25 | file_name = "data.csv" 26 | df.to_csv(file_name, index=False) 27 | 28 | print(f'wrote dataframe of {rows_written} rows to: {file_name}') 29 | -------------------------------------------------------------------------------- /eval/eval.c: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #include 6 | #include 7 | #include "eval.h" 8 | 9 | void hyperparameter_search(double **data, struct dim *csv_dim) 10 | { 11 | // Init the options for number of trees to: 10, 100, 1000. 
12 | size_t n = 3; 13 | 14 | size_t *estimators = malloc(sizeof(size_t) * n); 15 | estimators[0] = 10; 16 | estimators[1] = 50; 17 | estimators[2] = 100; 18 | 19 | // Init the options for the max depth for a tree to: 3, 7, 10. 20 | size_t *max_depths = malloc(sizeof(size_t) * n); 21 | max_depths[0] = 3; 22 | max_depths[1] = 7; 23 | max_depths[2] = 10; 24 | 25 | // Defaults based on SKLearn's defaults / hand picked in order to compare performance 26 | // with the same parameters. 27 | size_t max_features = 3; 28 | size_t min_samples_leaf = 2; 29 | size_t max_depth = 7; 30 | 31 | // Number of folds for cross validation. 32 | size_t k_folds = 5; 33 | 34 | // Best params computed from running the hyperparameter search. 35 | size_t best_n_estimators = -1; 36 | double best_accuracy = -1; 37 | 38 | for (size_t i = 0; i < n; ++i) 39 | { 40 | size_t n_estimators = estimators[i]; /* Number of trees in the forest. */ 41 | 42 | for (size_t j = 0; j < n; ++j) 43 | { 44 | size_t max_depth = max_depths[j]; 45 | 46 | RandomForestParameters params = { 47 | n_estimators : n_estimators, 48 | max_depth : max_depth, 49 | min_samples_leaf : min_samples_leaf, 50 | max_features : max_features 51 | }; 52 | 53 | if (log_level > 0) 54 | { 55 | printf("[hyperparameter search] running cross_validate\n"); 56 | printf("[hyperparameter search] "); 57 | print_params(¶ms); 58 | } 59 | 60 | double cv_accuracy = cross_validate(data, 61 | ¶ms, 62 | csv_dim, 63 | k_folds); 64 | 65 | if (log_level > 0) 66 | printf("[hyperparameter search] cross validation accuracy: %f%% (%ld%%)\n", 67 | (cv_accuracy * 100), 68 | (long)(cv_accuracy * 100)); 69 | 70 | // Update best accuracy and best parameters found so far from the hyperparameter search. 71 | if (cv_accuracy > best_accuracy) 72 | { 73 | best_accuracy = cv_accuracy; 74 | best_n_estimators = n_estimators; 75 | } 76 | } 77 | } 78 | 79 | // Free auxillary buffers. 
80 | free(estimators); 81 | free(max_depths); 82 | 83 | printf("[hyperparameter search] run complete\n best_accuracy: %f\n best_n_estimators (trees): %ld\n", 84 | best_accuracy, best_n_estimators); 85 | } 86 | 87 | double eval_model(const DecisionTreeNode **random_forest, 88 | double **data, 89 | const RandomForestParameters *params, 90 | const struct dim *csv_dim, 91 | const ModelContext *ctx) 92 | { 93 | // Keeping track of how many predictions have been correct. Accuracy can be 94 | // computed with 'num_correct' / 'rowsPerFold' (or how many predictions we make). 95 | long num_correct = 0; 96 | 97 | // Since we are evaluating the model on a single fold (to control overfitting), we start 98 | // iterating the rows for which we are getting predictions at an offset that can be computed 99 | // as 'testingFoldIdx * rowsPerFold' and make predictions for 'rowsPerFold' number of rows 100 | size_t row_id_offset = ctx->testingFoldIdx * ctx->rowsPerFold; 101 | for (size_t row_id = row_id_offset; row_id < row_id_offset + ctx->rowsPerFold; ++row_id) 102 | { 103 | int prediction = predict_model(&random_forest, 104 | params->n_estimators, 105 | data[row_id]); 106 | int ground_truth = (int)data[row_id][csv_dim->cols - 1]; 107 | 108 | if (log_level > 1) 109 | printf("majority vote: %d | %d ground truth\n", prediction, ground_truth); 110 | 111 | if (prediction == ground_truth) 112 | ++num_correct; 113 | } 114 | return (double)num_correct / (double)ctx->rowsPerFold; 115 | } 116 | 117 | double cross_validate(double **data, 118 | const RandomForestParameters *params, 119 | const struct dim *csv_dim, 120 | const int k_folds) 121 | { 122 | // Sum of all accuracies on every evaluated fold. 123 | double sumAccuracy = 0; 124 | 125 | // Iterate through the fold indeces and fit models on the selections. 
The current 'foldIdx' is the index 126 | // of the fold in the array of all loaded data that is the fold that's currently the test fold, with all of 127 | // the other folds being used for training. 128 | for (size_t foldIdx = 0; foldIdx < k_folds; ++foldIdx) 129 | { 130 | const ModelContext ctx = (ModelContext){ 131 | testingFoldIdx : foldIdx /* Fold to use for evaluation. */, 132 | rowsPerFold : csv_dim->rows / k_folds /* Number of rows per fold. */ 133 | }; 134 | 135 | // Train an instance of the model with every fold of data except of the fold indentified by 136 | // 'foldIdx' used for training the the 'foldIdx' fold withheld from training in order to be 137 | // used for evaluation. 138 | const DecisionTreeNode **random_forest = (const DecisionTreeNode **)train_model( 139 | data, 140 | params, 141 | csv_dim, 142 | &ctx); 143 | 144 | // Evaluate the model that was just trained. We use the fold identified by 'foldIdx' to evaluate 145 | // the model. 146 | const double accuracy = eval_model( 147 | random_forest /* Model to evaluate. */, 148 | data, 149 | params, 150 | csv_dim, 151 | &ctx); 152 | sumAccuracy += accuracy; 153 | 154 | // Free memory that was used to store the model. 155 | free_random_forest(&random_forest, params->n_estimators); 156 | } 157 | 158 | return sumAccuracy / k_folds; 159 | } 160 | -------------------------------------------------------------------------------- /eval/eval.h: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #ifndef eval_h 6 | #define eval_h 7 | 8 | #include "../model/tree.h" 9 | #include "../model/forest.h" 10 | #include "../utils/utils.h" 11 | #include "../utils/data.h" 12 | 13 | /* 14 | Runs a hyperparameter search across a number of pre-defined parameters for the random forest model and 15 | reports the best parameters. Calls 'cross_validate' on each parameter configuration to get the cross validation 16 | accuracy for each set-up. 
Can be adjusted to run across as many parameters as needed. 17 | */ 18 | void hyperparameter_search(double **data, struct dim *csv_dim); 19 | 20 | /* 21 | Runs k-fold cross validation on the 'data' and returns the accuracy. In the process builds up a random 22 | forest model for each iteration and evaluates on a separate test fold. 23 | */ 24 | double cross_validate(double **data, 25 | const RandomForestParameters *params, 26 | const struct dim *csv_dim, 27 | const int k_folds); 28 | 29 | #endif // eval_h 30 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #include 6 | #include 7 | #include 8 | #include "eval/eval.h" 9 | #include "utils/argparse.h" 10 | #include "utils/data.h" 11 | #include "utils/utils.h" 12 | 13 | /* Our argp parser. */ 14 | static struct argp argp = {options, parse_opt, args_doc, doc}; 15 | 16 | int main(int argc, char **argv) 17 | { 18 | struct arguments arguments; 19 | 20 | // Default argument values. 21 | arguments.log_level = 1; 22 | arguments.rows = 0; 23 | arguments.cols = 0; 24 | 25 | /* Parse our arguments; every option seen by parse_opt will 26 | be reflected in arguments. */ 27 | argp_parse(&argp, argc, argv, 0, 0, &arguments); 28 | 29 | // Set the log level to whatever was parsed from the arguments or the default value. 30 | set_log_level(arguments.log_level); 31 | 32 | // Optionally set the random seed if a specific random seed was provided via an argument. 33 | if (arguments.random_seed) 34 | srand(arguments.random_seed); 35 | else 36 | srand(time(NULL)); 37 | 38 | // Read the csv file from args which must be parsed now. 
39 | const char *file_name = arguments.args[0]; 40 | 41 | // If the values for rows and cols were provided as arguments, then use them for the 42 | // 'dim' struct, otherwise call 'parse_csv_dims()' to parse the csv file provided to 43 | // compute the size of the csv file. 44 | struct dim csv_dim; 45 | 46 | if (arguments.rows && arguments.cols) 47 | csv_dim = (struct dim){rows : arguments.rows, cols : arguments.cols}; 48 | else 49 | csv_dim = parse_csv_dims(file_name); 50 | 51 | if (log_level > 0) 52 | printf("using:\n verbose log level: %d\n rows: %ld, cols: %ld\nreading from csv file:\n \"%s\"\n", 53 | log_level, 54 | csv_dim.rows, 55 | csv_dim.cols, 56 | file_name); 57 | 58 | // Allocate memory for the data coming from the .csv and read in the data. 59 | double *data = malloc(sizeof(double) * csv_dim.rows * csv_dim.cols); 60 | parse_csv(file_name, &data, csv_dim); 61 | 62 | // Compute a checksum of the data to verify that loaded correctly. 63 | if (log_level > 1) 64 | printf("data checksum = %f\n", _1d_checksum(data, csv_dim.rows * csv_dim.cols)); 65 | 66 | const int k_folds = 1; 67 | 68 | if (log_level > 0) 69 | printf("using:\n k_folds: %d\n", k_folds); 70 | 71 | // Example configuration for a random forest model. 72 | const RandomForestParameters params = { 73 | n_estimators : 3 /* Number of trees in the random forest model. */, 74 | max_depth : 7 /* Maximum depth of a tree in the model. */, 75 | min_samples_leaf : 3, 76 | max_features : 3 77 | }; 78 | 79 | // Print random forest parameters. 80 | if (log_level > 0) 81 | print_params(¶ms); 82 | 83 | // Pivot the csv file data into a two dimensional array. 84 | double **pivoted_data; 85 | pivot_data(data, csv_dim, &pivoted_data); 86 | 87 | if (log_level > 1) 88 | printf("checksum of pivoted 2d array: %f\n", _2d_checksum(pivoted_data, csv_dim.rows, csv_dim.cols)); 89 | 90 | // Start the clock for timing. 
91 | clock_t begin_clock = clock(); 92 | 93 | double cv_accuracy = cross_validate(pivoted_data, ¶ms, &csv_dim, k_folds); 94 | printf("cross validation accuracy: %f%% (%ld%%)\n", 95 | (cv_accuracy * 100), 96 | (long)(cv_accuracy * 100)); 97 | 98 | // Record and output the time taken to run. 99 | clock_t end_clock = clock(); 100 | printf("(time taken: %fs)\n", (double)(end_clock - begin_clock) / CLOCKS_PER_SEC); 101 | 102 | // Free loaded csv file data. 103 | free(data); 104 | free(pivoted_data); 105 | } 106 | -------------------------------------------------------------------------------- /model/forest.c: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #include "forest.h" 6 | 7 | const DecisionTreeNode *train_model_tree(double **data, 8 | const RandomForestParameters *params, 9 | const struct dim *csv_dim, 10 | long *nodeId /* Ascending node ID generator */, 11 | const ModelContext *ctx) 12 | { 13 | DecisionTreeNode *root = empty_node(nodeId); 14 | DecisionTreeDataSplit data_split = calculate_best_data_split(data, 15 | params->max_features, 16 | csv_dim->rows, 17 | csv_dim->cols, 18 | ctx); 19 | 20 | if (log_level > 1) 21 | printf("calculated best split for the dataset in train_model_tree\n" 22 | "half1: %ld\nhalf2: %ld\nbest gini: %f\nbest value: %f\nbest index: %d\n", 23 | data_split.data[0].length, 24 | data_split.data[1].length, 25 | data_split.gini, 26 | data_split.value, 27 | data_split.index); 28 | 29 | populate_split_data(root, &data_split); 30 | 31 | // Start building the tree recursively. 32 | grow(root, 33 | params->max_depth, 34 | params->min_samples_leaf, 35 | params->max_features, 36 | 1 /* Current depth. */, 37 | csv_dim->rows, 38 | csv_dim->cols, 39 | nodeId, 40 | ctx); 41 | 42 | // Free any temp memory. 
43 | free(data_split.data); 44 | 45 | return root; 46 | } 47 | 48 | const DecisionTreeNode **train_model(double **data, 49 | const RandomForestParameters *params, 50 | const struct dim *csv_dim, 51 | const ModelContext *ctx) 52 | { 53 | // Random forest model which is stored as a contigious list of pointers to DecisionTreeNode structs. 54 | const DecisionTreeNode **random_forest = (const DecisionTreeNode **) 55 | malloc(sizeof(DecisionTreeNode *) * params->n_estimators); 56 | 57 | // Node ID generator. We use this such that every node in the tree gets assigned a strictly 58 | // increasing ID for debugging. 59 | long nodeId = 0; 60 | 61 | // Populate the array with allocated memory for the random forest with pointers to individual decision 62 | // trees. 63 | for (size_t i = 0; i < params->n_estimators; ++i) 64 | { 65 | random_forest[i] = train_model_tree(data, params, csv_dim, &nodeId, ctx); 66 | } 67 | return random_forest; 68 | } 69 | 70 | int predict_model(const DecisionTreeNode ***random_forest, size_t n_estimators, double *row) 71 | { 72 | int zeroes = 0; 73 | int ones = 0; 74 | for (size_t i = 0; i < n_estimators; ++i) 75 | { 76 | int prediction; 77 | make_prediction((*random_forest)[i] /* root of the tree */, 78 | row, 79 | &prediction); 80 | 81 | if (prediction == 0) 82 | zeroes++; 83 | else if (prediction == 1) 84 | ones++; 85 | else 86 | { 87 | printf("Error: currently only support binary classification, i.e. prediction values 0/1, got: %d\n", 88 | prediction); 89 | exit(1); 90 | } 91 | } 92 | if (ones > zeroes) 93 | return 1; 94 | else 95 | return 0; 96 | } 97 | 98 | void free_random_forest(const DecisionTreeNode ***random_forest, const size_t length) 99 | { 100 | long freeCount = 0; 101 | for (size_t idx = 0; idx < length; ++idx) 102 | { 103 | // Recursively free this DecisionTree rooted at the current node. 104 | free_decision_tree_node((*random_forest)[idx], &freeCount); 105 | } 106 | // Free the actual array of pointers to the nodes. 
107 | free(*random_forest); 108 | 109 | if (log_level > 2) 110 | printf("total DecisionTreeNode freed: %ld\n", freeCount); 111 | } 112 | 113 | void print_params(const RandomForestParameters *params) 114 | { 115 | printf("using RandomForestParameters:\n n_estimators: %ld\n max_depth: %ld\n min_samples_leaf: %ld\n max_features: %ld\n", 116 | params->n_estimators, 117 | params->max_depth, 118 | params->min_samples_leaf, 119 | params->max_features); 120 | } 121 | -------------------------------------------------------------------------------- /model/forest.h: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #ifndef forest_h 6 | #define forest_h 7 | 8 | #include 9 | #include "tree.h" 10 | 11 | extern int log_level; 12 | 13 | /* 14 | Parameters for a Random Forest model. 15 | */ 16 | struct RandomForestParameters 17 | { 18 | size_t n_estimators; // Number of trees in a forest. 19 | size_t max_depth; // Maximum depth of a tree. 20 | size_t min_samples_leaf; // Minimum number of data samples at a leaf node. 21 | size_t max_features; // Number of features considered when calculating the best data split. 22 | }; 23 | 24 | typedef struct RandomForestParameters RandomForestParameters; 25 | 26 | /* 27 | Function to print a RandomForestParameters struct for debugging. 28 | */ 29 | void print_params(const RandomForestParameters *params); 30 | 31 | /* 32 | Trains a single decision tree on the provided data and returns a pointer to the root DecisionTreeNode 33 | of the tree stored on the heap. 34 | */ 35 | const DecisionTreeNode * 36 | train_model_tree(double **data, 37 | const RandomForestParameters *params, 38 | const struct dim *csv_dim, 39 | long *nodeId /* Ascending node ID generator */, 40 | const ModelContext *ctx); 41 | 42 | /* 43 | Trains a random forest model that is comprised of individually built decision trees. 
Returns an array 44 | of pointers to DecisionTreeNode's that are the roots of the decision trees in the random forest model. 45 | */ 46 | const DecisionTreeNode **train_model(double **data, 47 | const RandomForestParameters *params, 48 | const struct dim *csv_dim, 49 | const ModelContext *ctx); 50 | 51 | /* 52 | Given a single row, gets predictions from every decision tree in the 'random_forest' model 53 | for the class target that the row should be classified into and returns the class target value 54 | that is the majority vote. 55 | */ 56 | int predict_model(const DecisionTreeNode ***random_forest, size_t n_estimators, double *row); 57 | 58 | /* 59 | Frees memory for a given random forest model (array of pointers to DecisionTreeNode's). 60 | */ 61 | void free_random_forest(const DecisionTreeNode ***random_forest, const size_t length); 62 | 63 | #endif // forest_h 64 | -------------------------------------------------------------------------------- /model/tree.c: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #include "tree.h" 6 | 7 | /* 8 | Allocates memory for an empty DecisionTreeNode and returns a pointer to the node. 9 | */ 10 | DecisionTreeNode *empty_node(long *id) 11 | { 12 | DecisionTreeNode *node = malloc(sizeof(DecisionTreeNode)); 13 | 14 | node->id = (*id); 15 | node->leftChild = NULL; 16 | node->rightChild = NULL; 17 | 18 | node->half1 = NULL; 19 | node->half2 = NULL; 20 | node->size_half1 = 0; 21 | node->size_half2 = 0; 22 | 23 | node->split_index = -1; 24 | node->split_value = -1; 25 | node->split_data_halves = NULL; 26 | 27 | (*id)++; 28 | 29 | if (log_level > 2) 30 | printf("created a DecisionTreeNode with id %ld stored at address %p \n", node->id, node); 31 | 32 | return node; 33 | } 34 | 35 | /* 36 | Populates a given DecisionTreeNode with data from the DecisionTreeDataSplit struct 37 | pointed to by 'data_split'. 
38 | */ 39 | void populate_split_data(DecisionTreeNode *node, DecisionTreeDataSplit *data_split) 40 | { 41 | node->split_index = (*data_split).index; 42 | node->split_value = (*data_split).value; 43 | node->split_data_halves = (*data_split).data; 44 | } 45 | 46 | /* 47 | Given a two dimensional array of data finds and returns a DecisionTreeTargetClasses 48 | struct with unique target classes found in the dataset at column with index 'cols - 1'. 49 | */ 50 | DecisionTreeTargetClasses get_target_class_values(double **data, size_t rows, size_t cols, const ModelContext *ctx) 51 | { 52 | if (log_level > 1) 53 | printf("generating class value set...\n"); 54 | 55 | size_t count = 0; 56 | int *target_class_values = malloc(count * sizeof(int)); 57 | 58 | for (size_t i = 0; i < rows; ++i) 59 | { 60 | // Skip rows that we are withholding from training for evaluation. 61 | if (is_row_part_of_testing_fold(i, ctx)) 62 | { 63 | if (log_level > 1) 64 | printf(" skipping row %ld which is part of testing fold %ld\n", i, ctx->testingFoldIdx); 65 | continue; 66 | } 67 | 68 | int class_target = (int)data[i][cols - 1]; 69 | if (!contains_int(target_class_values, count, class_target)) 70 | { 71 | if (log_level > 1) 72 | printf("adding %d \n", class_target); 73 | count++; 74 | int *temp = realloc(target_class_values, count * sizeof(int)); 75 | if (temp != NULL) 76 | target_class_values = temp; 77 | target_class_values[count - 1] = class_target; 78 | } 79 | } 80 | if (log_level > 1) 81 | printf("-------------------------------\ncount of unique classes: %ld\n", count); 82 | return (DecisionTreeTargetClasses){count, target_class_values}; 83 | } 84 | 85 | /* 86 | Given a two dimensional array of data returns the leaf node class value for the given 87 | data. The leaf node class value is whichever class value that is the class target value for 88 | the majority of the rows in the data. 
89 | */ 90 | int get_leaf_node_class_value(double **data, size_t rows, size_t cols) 91 | { 92 | int zeroes = 0; 93 | int ones = 0; 94 | for (size_t i = 0; i < rows; ++i) 95 | { 96 | int class_label = (int)data[i][cols - 1]; 97 | if (class_label == 0) 98 | zeroes++; 99 | else if (class_label == 1) 100 | ones++; 101 | else 102 | { 103 | printf("Error: currently only support binary classification, i.e. class target values 0/1, got: %d\n", 104 | class_label); 105 | exit(1); 106 | } 107 | } 108 | if (ones >= zeroes) 109 | return 1; 110 | else 111 | return 0; 112 | } 113 | 114 | /* 115 | Given a two dimensional array of data and parameters for a split, splits the data into two halves and 116 | returns a pointer to an array of two DecisionTreeData for the two halves of the split. 117 | */ 118 | DecisionTreeData *split_dataset(int feature_index, 119 | double value, 120 | double **data, 121 | size_t rows, 122 | size_t cols) 123 | { 124 | if (log_level > 1) 125 | printf("splitting dataset into two halves...\n"); 126 | 127 | // Buffers to hold rows of data as we are distributing rows based on the split. 128 | double **left = (double **)malloc(1 * sizeof(double) * cols); 129 | double **right = (double **)malloc(1 * sizeof(double) * cols); 130 | 131 | size_t left_count = 0; 132 | size_t right_count = 0; 133 | 134 | for (size_t i = 0; i < rows; ++i) 135 | { 136 | double *row = data[i]; 137 | if (row[feature_index] < value) 138 | { 139 | // Copy the row into the left half and resize the buffer. 140 | left[left_count++] = row; 141 | double **temp = realloc(left, left_count * sizeof(double) * cols); 142 | if (temp != NULL) 143 | left = temp; 144 | } 145 | else 146 | { 147 | // Copy the row into the right half and resize the buffer. 
148 | right[right_count++] = row; 149 | double **temp = realloc(right, right_count * sizeof(double) * cols); 150 | if (temp != NULL) 151 | right = temp; 152 | } 153 | } 154 | DecisionTreeData *data_split = malloc(sizeof(DecisionTreeData) * 2); 155 | data_split[0] = (DecisionTreeData){left_count, left}; 156 | data_split[1] = (DecisionTreeData){right_count, right}; 157 | 158 | if (log_level > 1) 159 | printf("split dataset into: %ld | %ld\n", left_count, right_count); 160 | 161 | return data_split; 162 | } 163 | 164 | double calculate_gini_index(DecisionTreeData *data_split, 165 | int *class_labels, 166 | size_t class_labels_count, 167 | size_t cols) 168 | { 169 | if (log_level > 1) 170 | printf("calculating gini index based on split...\n"); 171 | 172 | // DecisionTreeData data split should consist of two halves. 173 | int count = 2; 174 | size_t n_instances = data_split[0].length + data_split[1].length; 175 | double gini = 0.0; 176 | for (size_t i = 0; i < count; ++i) 177 | { 178 | DecisionTreeData group = data_split[i]; 179 | 180 | size_t size = group.length; 181 | if (size == 0) 182 | continue; 183 | 184 | double sum = 0.0; 185 | for (size_t j = 0; j < class_labels_count; ++j) 186 | { 187 | int class = class_labels[j]; 188 | int occurences = 0; 189 | for (size_t k = 0; k < size; ++k) 190 | { 191 | int label = (int)group.data[k][cols - 1]; 192 | if (label == class) 193 | occurences += 1; 194 | } 195 | double p_class = (double)occurences / (double)size; 196 | sum += (p_class * p_class); 197 | } 198 | gini += (1.0 - sum) * ((double)size / (double)n_instances); 199 | } 200 | 201 | if (log_level > 1) 202 | { 203 | printf("gini: %f\n", gini); 204 | printf("-----------------------------------------\n"); 205 | } 206 | 207 | return gini; 208 | } 209 | 210 | DecisionTreeDataSplit calculate_best_data_split(double **data, 211 | size_t max_features, 212 | size_t rows, 213 | size_t cols, 214 | const ModelContext *ctx) 215 | { 216 | if (log_level > 1) 217 | { 218 | 
printf("calculating best split for dataset...\n"); 219 | printf("rows: %ld\ncols: %ld\n", rows, cols); 220 | } 221 | 222 | // Target classes available in this dataset. 223 | DecisionTreeTargetClasses classes = get_target_class_values(data, rows, cols, ctx); 224 | 225 | // Keeping track of best data split available along with best parameters associated with 226 | // that data split. 227 | DecisionTreeData *best_data_split = NULL; 228 | double best_value = DBL_MAX; 229 | double best_gini = DBL_MAX; 230 | int best_index = INT_MAX; 231 | 232 | // Create a features array and initialize to avoid non-set memory. 233 | int *features = malloc(max_features * sizeof(int)); 234 | for (size_t i = 0; i < max_features; ++i) 235 | features[i] = -1; 236 | 237 | size_t count = 0; 238 | while (count < max_features) 239 | { 240 | // Maximum index for a feature which should not include the class target column index 241 | // which is 'cols - 1'. 242 | int max = cols - 2; 243 | int min = 0; 244 | int index = rand() % (max + 1 - min) + min; 245 | if (!contains_int(features, max_features /* size of 'features' array */, index)) 246 | { 247 | if (log_level > 1) 248 | printf("adding unique index: %d\n", index); 249 | features[count++] = index; 250 | } 251 | } 252 | if (log_level > 1) 253 | printf("-----------------------------------------\n"); 254 | 255 | for (size_t i = 0; i < max_features; ++i) 256 | { 257 | int feature_index = features[i]; 258 | for (size_t j = 0; j < rows; ++j) 259 | { 260 | DecisionTreeData *data_split = split_dataset(feature_index, 261 | data[j][feature_index], 262 | data, 263 | rows, 264 | cols); 265 | double gini = calculate_gini_index(data_split, classes.labels, classes.count, cols); 266 | 267 | if (gini < best_gini) 268 | { 269 | best_index = feature_index; 270 | best_value = data[j][feature_index]; 271 | best_gini = gini; 272 | 273 | // First free the memory that was previously allocated for the 'data_split' and pointer 274 | // which was assigned to 
'best_data_split' since now we have found a new better split. 275 | if (best_data_split) 276 | free_decision_tree_data(best_data_split); 277 | 278 | best_data_split = data_split; 279 | } 280 | else 281 | { 282 | free_decision_tree_data(data_split); 283 | } 284 | } 285 | } 286 | 287 | // Free any other memory. 288 | free(features); 289 | free(classes.labels); 290 | 291 | return (DecisionTreeDataSplit){best_index, best_value, best_gini, best_data_split}; 292 | } 293 | 294 | void grow(DecisionTreeNode *decision_tree, 295 | size_t max_depth, 296 | size_t min_samples_leaf, 297 | size_t max_features, 298 | int depth, 299 | size_t rows, 300 | size_t cols, 301 | long *nodeId, 302 | const ModelContext *ctx) 303 | { 304 | DecisionTreeData left_half = decision_tree->split_data_halves[0]; 305 | DecisionTreeData right_half = decision_tree->split_data_halves[1]; 306 | 307 | double **left = left_half.data; 308 | double **right = right_half.data; 309 | 310 | decision_tree->split_data_halves = NULL; 311 | 312 | if (left == NULL || right == NULL) 313 | { 314 | // If we are at the leaf node, then combine both left and right side data and compute get the leaf 315 | // node class for the combined data. 
316 | double **combined_data = combine_arrays(left, right, left_half.length, right_half.length, cols); 317 | int leaf = get_leaf_node_class_value(combined_data, rows, cols); 318 | 319 | decision_tree->left_leaf = leaf; 320 | decision_tree->right_leaf = leaf; 321 | 322 | free(left); 323 | free(right); 324 | free(combined_data); 325 | 326 | return; 327 | } 328 | if (depth >= max_depth) 329 | { 330 | decision_tree->left_leaf = get_leaf_node_class_value(left, left_half.length /* rows */, cols); 331 | decision_tree->right_leaf = get_leaf_node_class_value(right, right_half.length /* rows */, cols); 332 | 333 | free(left); 334 | free(right); 335 | 336 | return; 337 | } 338 | if (left_half.length <= min_samples_leaf) 339 | { 340 | decision_tree->left_leaf = get_leaf_node_class_value(left, left_half.length /* rows */, cols); 341 | } 342 | else 343 | { 344 | DecisionTreeDataSplit data_split = calculate_best_data_split(left, 345 | max_features, 346 | left_half.length /* rows */, 347 | cols, 348 | ctx); 349 | 350 | // Create the left child of the current node and populate with data from the data split. 351 | decision_tree->leftChild = empty_node(nodeId); 352 | populate_split_data(decision_tree->leftChild, &data_split); 353 | 354 | grow(decision_tree->leftChild, 355 | max_depth, 356 | min_samples_leaf, 357 | max_features, 358 | depth + 1 /* since we are now at the next 'level' in the tree */, 359 | rows, 360 | cols, 361 | nodeId, 362 | ctx); 363 | 364 | free(data_split.data); 365 | } 366 | if (right_half.length <= min_samples_leaf) 367 | { 368 | decision_tree->right_leaf = get_leaf_node_class_value(right, right_half.length, cols); 369 | } 370 | else 371 | { 372 | DecisionTreeDataSplit data_split = calculate_best_data_split(right, 373 | max_features, 374 | right_half.length /* rows */, 375 | cols, 376 | ctx); 377 | 378 | // Create the right child of the current node and populate with data from the data split. 
379 | decision_tree->rightChild = empty_node(nodeId); 380 | populate_split_data(decision_tree->rightChild, &data_split); 381 | 382 | grow(decision_tree->rightChild, 383 | max_depth, 384 | min_samples_leaf, 385 | max_features, 386 | depth + 1 /* since we are now at the next 'level' in the tree */, 387 | rows, 388 | cols, 389 | nodeId, 390 | ctx); 391 | 392 | free(data_split.data); 393 | } 394 | 395 | free(left); 396 | free(right); 397 | } 398 | 399 | void make_prediction(const DecisionTreeNode *decision_tree, double *row, int *prediction_val) 400 | { 401 | if (row[decision_tree->split_index] < decision_tree->split_value) 402 | { 403 | if (decision_tree->leftChild != NULL) 404 | make_prediction(decision_tree->leftChild, row, prediction_val); 405 | else 406 | (*prediction_val) = decision_tree->left_leaf; 407 | } 408 | else 409 | { 410 | if (decision_tree->rightChild != NULL) 411 | make_prediction(decision_tree->rightChild, row, prediction_val); 412 | else 413 | (*prediction_val) = decision_tree->right_leaf; 414 | } 415 | } 416 | 417 | /* 418 | Frees memory for a given DecisionTreeNode. 419 | */ 420 | void free_decision_tree_node(const DecisionTreeNode *node, long *freeCount) 421 | { 422 | (*freeCount)++; 423 | if (node->leftChild) 424 | free_decision_tree_node(node->leftChild, freeCount); 425 | if (node->rightChild) 426 | free_decision_tree_node(node->rightChild, freeCount); 427 | 428 | if (log_level > 2) 429 | printf("freeing DecisionTreeNode with id=%ld\n", node->id); 430 | 431 | if (node && node->split_data_halves && node->split_data_halves->length) 432 | { 433 | free(node->split_data_halves[0].data); 434 | free(node->split_data_halves[1].data); 435 | free(node->split_data_halves); 436 | } 437 | 438 | free((void *)node); 439 | } 440 | 441 | /* 442 | Frees memory for a given DecisionTreeData. 
443 | */ 444 | void free_decision_tree_data(DecisionTreeData *data_split) 445 | { 446 | free(data_split[0].data); 447 | free(data_split[1].data); 448 | free(data_split); 449 | } 450 | -------------------------------------------------------------------------------- /model/tree.h: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #ifndef tree_h 6 | #define tree_h 7 | 8 | #include 9 | #include 10 | #include "../utils/utils.h" 11 | 12 | typedef struct DecisionTreeData DecisionTreeData; 13 | typedef struct DecisionTreeNode DecisionTreeNode; 14 | typedef struct DecisionTreeDataSplit DecisionTreeDataSplit; 15 | typedef struct DecisionTreeTargetClasses DecisionTreeTargetClasses; 16 | 17 | /* 18 | Represents a single node in a decision tree that comprise a random forest. 19 | */ 20 | struct DecisionTreeNode 21 | { 22 | long id; 23 | struct DecisionTreeNode *leftChild; 24 | struct DecisionTreeNode *rightChild; 25 | 26 | double *half1; 27 | double *half2; 28 | size_t size_half1; 29 | size_t size_half2; 30 | 31 | double split_value; 32 | long split_index; 33 | DecisionTreeData *split_data_halves; 34 | 35 | // if the node is a leaf 36 | int left_leaf; 37 | int right_leaf; 38 | }; 39 | 40 | struct DecisionTreeData 41 | { 42 | size_t length; 43 | double **data; 44 | }; 45 | 46 | struct DecisionTreeDataSplit 47 | { 48 | int index; 49 | double value; 50 | double gini; 51 | DecisionTreeData *data; 52 | }; 53 | 54 | struct DecisionTreeTargetClasses 55 | { 56 | size_t count; 57 | int *labels; 58 | }; 59 | 60 | /* 61 | Functions to free memory allocated for the structs. 62 | */ 63 | void free_decision_tree_data(DecisionTreeData *data_split); 64 | void free_decision_tree_node(const DecisionTreeNode *node, long *freeCount); 65 | 66 | /* 67 | Creates a new empry DecisionTreeNode with id from the strictly increasing 68 | id generator. 
69 | */ 70 | DecisionTreeNode *empty_node(long *id); 71 | 72 | /* 73 | Function to recursively grow a DecisionTreeNode by splitting the dataset and creating 74 | left / right children until fully splitting the rows across all nodes. 75 | */ 76 | void grow(DecisionTreeNode *decision_tree, 77 | size_t max_depth, 78 | size_t min_samples_leaf, 79 | size_t max_features, 80 | int depth, 81 | size_t rows, 82 | size_t cols, 83 | long *nodeId, 84 | const ModelContext *ctx); 85 | 86 | /* 87 | Calculates the best split for the 'data' given a number of randomly selected features from the data 88 | (columns) up to the number of maximum number of features 'max_features'. 89 | */ 90 | DecisionTreeDataSplit calculate_best_data_split(double **data, 91 | size_t max_features, 92 | size_t rows, 93 | size_t cols, 94 | const ModelContext *ctx); 95 | 96 | /* 97 | Populates a given DecisionTreeNode with data from the DecisionTreeDataSplit struct 98 | pointed to by 'data_split'. 99 | */ 100 | void populate_split_data(DecisionTreeNode *node, DecisionTreeDataSplit *data_split); 101 | /* 102 | Given a row of data and a trained decision tree, computes the predicted class target value for the row 103 | and writes the prediction into the variable pointed to 'prediction_val'. 104 | */ 105 | void make_prediction(const DecisionTreeNode *decision_tree, double *row, int *prediction_val); 106 | 107 | #endif // tree_h 108 | -------------------------------------------------------------------------------- /utils/argparse.h: -------------------------------------------------------------------------------- 1 | #ifndef argparse_h 2 | #define argparse_h 3 | 4 | #include 5 | #include 6 | 7 | /* How many arguments we accept. */ 8 | #define COUNT_ARGS 1 9 | 10 | const char *argp_program_version = 11 | "random-forests-c 1.0"; 12 | const char *argp_program_bug_address = 13 | "https://github.com/dobroshynski/random-forests-c"; 14 | 15 | /* Program documentation. 
*/ 16 | static char doc[] = 17 | "random-forests-c -- Basic implementation of random forests and accompanying decision trees in C"; 18 | 19 | /* A description of the arguments we accept. */ 20 | static char args_doc[] = "CSV_FILE"; 21 | 22 | /* The options we understand. */ 23 | static struct argp_option options[] = { 24 | {"num_rows", 'r', "number", 0, "Optional number of rows in the input CSV_FILE, if known", 0}, 25 | {"num_cols", 'c', "number", 0, "Optional number of cols in the input CSV_FILE, if known", 0}, 26 | {"log_level", 'l', "number", 0, "Optional debug logging level [0-3]. Level 0 is no output, 3 is most verbose. Defaults to 1.", 1}, 27 | {"seed", 's', "number", 0, "Optional random number seed.", 2}, 28 | {0}}; 29 | 30 | /* Used by main to communicate with parse_opt. */ 31 | struct arguments 32 | { 33 | char *args[COUNT_ARGS]; /* CSV file argument. */ 34 | 35 | long rows, cols; 36 | int log_level; 37 | int random_seed; 38 | }; 39 | 40 | /* Parse a single option. */ 41 | static error_t 42 | parse_opt(int key, char *arg, struct argp_state *state) 43 | { 44 | /* Get the input argument from argp_parse, which we 45 | know is a pointer to our arguments structure. */ 46 | struct arguments *arguments = state->input; 47 | 48 | switch (key) 49 | { 50 | case 'r': 51 | arguments->rows = atol(arg); 52 | break; 53 | case 'c': 54 | arguments->cols = atol(arg); 55 | break; 56 | case 'l': 57 | arguments->log_level = atoi(arg); 58 | break; 59 | case 's': 60 | arguments->random_seed = atoi(arg); 61 | break; 62 | 63 | case ARGP_KEY_ARG: 64 | if (state->arg_num >= COUNT_ARGS) 65 | /* Too many arguments. */ 66 | argp_usage(state); 67 | 68 | arguments->args[state->arg_num] = arg; 69 | 70 | break; 71 | 72 | case ARGP_KEY_END: 73 | if (state->arg_num < COUNT_ARGS) 74 | /* Not enough arguments. 
*/ 75 | argp_usage(state); 76 | break; 77 | 78 | default: 79 | return ARGP_ERR_UNKNOWN; 80 | } 81 | return 0; 82 | } 83 | 84 | #endif // argparse_h 85 | -------------------------------------------------------------------------------- /utils/data.c: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #include "data.h" 6 | 7 | struct dim parse_csv_dims(const char *file_name) 8 | { 9 | FILE *csv_file; 10 | csv_file = fopen(file_name, "r"); 11 | 12 | if (csv_file == NULL) 13 | { 14 | printf("Error: can't open file: %s\n", file_name); 15 | exit(-1); 16 | } 17 | 18 | const char *delimiter = ","; 19 | 20 | char *buffer = malloc(BUFSIZ); 21 | char *token; 22 | 23 | // Keeping track of how many rows and columns there are. 24 | int rows = 0; 25 | int cols = 0; 26 | 27 | // Reach each line of the file into the buffer. 28 | while (fgets(buffer, BUFSIZ, csv_file) != NULL) 29 | { 30 | ++rows; 31 | 32 | // We use a second counter for columns in order to count the number of columns 33 | // for each row we read and will trigger an assert if there is a mismatch in the 34 | // size of the cols, i.e. not all rows have the same number of columns which would 35 | // cause undefined behavior. 36 | int curr_cols = 0; 37 | 38 | // Get every token and print it. 39 | token = strtok(buffer, delimiter); 40 | while (token != NULL) 41 | { 42 | ++curr_cols; 43 | // printf("%s\n", token); 44 | 45 | // Get the next token. 46 | token = strtok(NULL, delimiter); 47 | } 48 | if (cols == 0) 49 | { 50 | cols = curr_cols; 51 | } 52 | else 53 | { 54 | assert(curr_cols == cols && "Error: every row must have the same amount of columns"); 55 | } 56 | } 57 | // We read one extra row for the csv header, so adjust here. 58 | --rows; 59 | 60 | fclose(csv_file); 61 | free(buffer); 62 | 63 | // Make sure that the dimensions are valid. 
64 | assert(rows > 0 && "# of rows in csv must be > 0"); 65 | assert(cols > 0 && "# of cols in csv must be > 0"); 66 | 67 | return (struct dim){rows : rows, cols : cols}; 68 | } 69 | 70 | void parse_csv(const char *file_name, double **data_p, const struct dim csv_dim) 71 | { 72 | FILE *csv_file; 73 | csv_file = fopen(file_name, "r"); 74 | 75 | if (csv_file == NULL) 76 | { 77 | printf("Error: can't open file: %s\n", file_name); 78 | exit(-1); 79 | } 80 | 81 | const char *delimiter = ","; 82 | 83 | char *buffer = malloc(BUFSIZ); 84 | char *token; 85 | 86 | // Keeping track which row we are on. 87 | int row = 0; 88 | // Keeping track of the index in the data array. 89 | int idx = 0; 90 | 91 | // Reach each line of the file into the buffer. 92 | while (row <= csv_dim.rows && fgets(buffer, BUFSIZ, csv_file) != NULL) 93 | { 94 | if (++row == 1) 95 | continue; 96 | 97 | // Get every token and print it. 98 | token = strtok(buffer, delimiter); 99 | while (token != NULL) 100 | { 101 | // if (1) 102 | // printf("%s\n", token); 103 | 104 | (*data_p)[idx] = atof(token); 105 | // printf("%f\n", (*data)[idx]); 106 | ++idx; 107 | 108 | // Get the next token. 
109 | token = strtok(NULL, delimiter); 110 | } 111 | } 112 | 113 | if (log_level > 1) 114 | printf("read %d rows from file %s\n", row - 1, file_name); 115 | 116 | fclose(csv_file); 117 | free(buffer); 118 | } 119 | 120 | void pivot_data(double *data, const struct dim csv_dim, double ***pivoted_data_p) 121 | { 122 | (*pivoted_data_p) = _2d_calloc(csv_dim.rows, csv_dim.cols); 123 | 124 | for (size_t i = 0; i < csv_dim.rows; ++i) 125 | for (size_t j = 0; j < csv_dim.cols; ++j) 126 | (*pivoted_data_p)[i][j] = data[(i * csv_dim.cols) + j]; 127 | } 128 | -------------------------------------------------------------------------------- /utils/data.h: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #ifndef data_h 6 | #define data_h 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "utils.h" 15 | 16 | /* 17 | Struct for parsed data dimensions. 18 | */ 19 | struct dim 20 | { 21 | size_t rows; 22 | size_t cols; 23 | }; 24 | 25 | /* 26 | Attempts to read a csv file at path given by 'file_name', and if successfull, records the 27 | dimensions of the csv file, asserts that all rows have the same number of columns, and returns 28 | the dimensions of the file in a struct of type dim. 29 | 30 | If the numbers for rows and cols are given to the program as command line arguments, then calling 31 | this function can be skipped, as we already would know how much memory to allocate to fit the csv 32 | file of the given dimensions. 33 | */ 34 | struct dim parse_csv_dims(const char *file_name); 35 | 36 | /* 37 | Attempts to read a csv file at path given by 'file_name' and write the values one-by-one into 'data'. 38 | If an argument for '--num_rows' was provided to the program and is less than the actual number of 39 | rows in the csv file, the function will stop reading at that row. 
This allows to read only the top 40 | 'num_rows' in the input data file if needed. 41 | */ 42 | void parse_csv(const char *file_name, double **data_p, const struct dim csv_dim); 43 | 44 | /* 45 | Pivots and transforms the data in 'data' array into a two-dimensional array of size 46 | 'csv_dim.rows' * 'csv_dim.cols' pointed to by 'pivoted_data_p'. 47 | */ 48 | void pivot_data(double *data, const struct dim csv_dim, double ***pivoted_data_p); 49 | 50 | #endif // data_h 51 | -------------------------------------------------------------------------------- /utils/utils.c: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #include "utils.h" 6 | 7 | int get_log_level() 8 | { 9 | return log_level; 10 | } 11 | 12 | void set_log_level(int selected_log_level) 13 | { 14 | if (selected_log_level < 0 || selected_log_level > 3) 15 | { 16 | printf("Error: log_level must be in range [0, 3] got: %d\n", selected_log_level); 17 | exit(1); 18 | } 19 | log_level = selected_log_level; 20 | } 21 | 22 | int contains_int(int *arr, size_t n, int val) 23 | { 24 | for (size_t i = 0; i < n; ++i) 25 | { 26 | if (arr[i] == val) 27 | return 1; 28 | } 29 | return 0; 30 | } 31 | 32 | int is_row_part_of_testing_fold(int row, const ModelContext *ctx) 33 | { 34 | size_t lower_bound = ctx->testingFoldIdx * ctx->rowsPerFold; 35 | size_t upper_bound = lower_bound + ctx->rowsPerFold; 36 | 37 | if (row >= lower_bound && row <= upper_bound) 38 | return 1; 39 | else 40 | return 0; 41 | } 42 | 43 | double **combine_arrays(double **first, double **second, size_t n1, size_t n2, size_t cols) 44 | { 45 | double **combined = (double **)malloc((n1 + n2) * sizeof(double) * cols); 46 | int row_index = 0; 47 | for (size_t i = 0; i < n1; ++i) 48 | { 49 | double *row = first[i]; 50 | combined[row_index++] = row; 51 | } 52 | for (size_t j = 0; j < n2; ++j) 53 | { 54 | double *row = second[j]; 55 | combined[row_index++] = row; 56 | } 57 | 
return combined; 58 | } 59 | 60 | double **_2d_malloc(const size_t rows, const size_t cols) 61 | { 62 | double **data; 63 | double *ptr; 64 | 65 | int len = sizeof(double *) * rows + sizeof(double) * cols * rows; 66 | data = (double **)malloc(len); 67 | 68 | ptr = (double *)(data + rows); 69 | 70 | for (size_t i = 0; i < rows; ++i) 71 | data[i] = ptr + cols * i; 72 | 73 | return data; 74 | } 75 | 76 | double **_2d_calloc(const size_t rows, const size_t cols) 77 | { 78 | double **data; 79 | double *ptr; 80 | 81 | int len = sizeof(double *) * rows + sizeof(double) * cols * rows; 82 | data = (double **)calloc(len, sizeof(double)); 83 | 84 | ptr = (double *)(data + rows); 85 | 86 | for (size_t i = 0; i < rows; ++i) 87 | data[i] = ptr + cols * i; 88 | 89 | return data; 90 | } 91 | 92 | double _1d_checksum(double *data, size_t size) 93 | { 94 | double sum = 0; 95 | for (size_t i = 0; i < size; ++i) 96 | { 97 | sum += data[i]; 98 | } 99 | return sum; 100 | } 101 | 102 | double _2d_checksum(double **data, size_t rows, size_t cols) 103 | { 104 | double sum = 0; 105 | for (size_t i = 0; i < rows; ++i) 106 | { 107 | for (size_t j = 0; j < cols; ++j) 108 | { 109 | sum += data[i][j]; 110 | } 111 | } 112 | return sum; 113 | } 114 | -------------------------------------------------------------------------------- /utils/utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | @author andrii dobroshynski 3 | */ 4 | 5 | #ifndef utils_h 6 | #define utils_h 7 | 8 | #include 9 | #include "data.h" 10 | 11 | /* 12 | Struct to hold information about a current model training run. 13 | */ 14 | struct ModelContext 15 | { 16 | const size_t testingFoldIdx; 17 | const size_t rowsPerFold; 18 | }; 19 | 20 | typedef struct ModelContext ModelContext; 21 | 22 | /* 23 | The debug log level that can be adjusted via an argument. 
*/
/* NOTE(review): this is a tentative *definition*, not a declaration — every .c
   file including utils.h gets its own 'log_level', which relies on legacy
   "common symbol" linker behavior (an error under C17 / -fno-common).
   forest.h already declares it as 'extern int log_level;'; consider doing the
   same here with a single definition in one .c file — confirm where the
   definition should live. */
int log_level;

/*
    Given a pointer to a buffer array of integers returns whether or not a given integer 'n'
    is present in the array.
*/
int contains_int(int *arr, size_t n, int val);

/*
    Given two two-dimensional like arrays 'first' and 'second', merges them into one and returns a
    new pointer.
*/
double **combine_arrays(double **first, double **second, size_t n1, size_t n2, size_t cols);

/*
    Given a row number and a model context returns whether or not the particular
    row belongs to a fold that is designated as the evaluation / testing fold.
*/
int is_row_part_of_testing_fold(int row, const ModelContext *ctx);

/*
    Sets the 'log_level' to the 'selected_log_level'.
*/
void set_log_level(int selected_log_level);

/*
    Returns whatever the current 'log_level' is.
*/
int get_log_level();

/*
    Allocates memory for a two-dimensional like array of size 'rows' * 'cols'.
*/
double **_2d_malloc(const size_t rows, const size_t cols);

/*
    Allocates memory for a two-dimensional like array of size 'rows' * 'cols' with all elements initialized
    to zero.
*/
double **_2d_calloc(const size_t rows, const size_t cols);

/*
    Computes a checksum of a one-dimensional like array. Used to verify consistency of data.
*/
double _1d_checksum(double *data, size_t size);

/*
    Computes a checksum of a two-dimensional like array. Used to verify consistency of data.
*/
double _2d_checksum(double **data, size_t rows, size_t cols);

#endif // utils_h
--------------------------------------------------------------------------------