├── .gitignore ├── LICENSE.md ├── README.md ├── forest └── index.js ├── index.js ├── package.json ├── tests ├── basic.py ├── index.js └── iris.json ├── tree └── index.js └── utilities └── index.js /.gitignore: -------------------------------------------------------------------------------- 1 | ###OSX### 2 | 3 | .DS_Store 4 | .AppleDouble 5 | .LSOverride 6 | 7 | # Icon must ends with two \r. 8 | Icon 9 | 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear on external disk 15 | .Spotlight-V100 16 | .Trashes 17 | 18 | 19 | ###Node### 20 | 21 | # Logs 22 | logs 23 | *.log 24 | 25 | # Runtime data 26 | pids 27 | *.pid 28 | *.seed 29 | 30 | # Directory for instrumented libs generated by jscoverage/JSCover 31 | lib-cov 32 | 33 | # Coverage directory used by tools like istanbul 34 | coverage 35 | 36 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 37 | .grunt 38 | 39 | # Compiled binary addons (http://nodejs.org/api/addons.html) 40 | build/Release 41 | 42 | # Dependency directory 43 | # Deployed apps should consider commenting this line out: 44 | # see https://npmjs.org/doc/faq.html#Should-I-check-my-node_modules-folder-into-git 45 | node_modules 46 | 47 | 48 | ###Python### 49 | 50 | # Byte-compiled / optimized / DLL files 51 | __pycache__/ 52 | *.py[cod] 53 | 54 | # C extensions 55 | *.so 56 | 57 | # Distribution / packaging 58 | .Python 59 | env/ 60 | bin/ 61 | build/ 62 | develop-eggs/ 63 | dist/ 64 | eggs/ 65 | lib/ 66 | lib64/ 67 | parts/ 68 | sdist/ 69 | var/ 70 | *.egg-info/ 71 | .installed.cfg 72 | *.egg 73 | 74 | # Installer logs 75 | pip-log.txt 76 | pip-delete-this-directory.txt 77 | 78 | # Unit test / coverage reports 79 | htmlcov/ 80 | .tox/ 81 | .coverage 82 | .cache 83 | nosetests.xml 84 | coverage.xml 85 | 86 | # Translations 87 | *.mo 88 | 89 | # Mr Developer 90 | .mr.developer.cfg 91 | .project 92 | .pydevproject 93 | 94 | # Rope 95 | .ropeproject 96 | 97 | # Django stuff: 98 | *.log 99 | *.pot 100 | 
101 | sample-app 102 | 103 | # Sphinx documentation 104 | docs/_build/ -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Jessica Frazelle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Random Forest 2 | 3 | A random forest classifier. A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting. 
4 | 5 | Modeled after [scikit-learn's RandomForestClassifier](http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html). 6 | 7 | ### Installation 8 | ```bash 9 | $ npm install random-forest-classifier 10 | ``` 11 | 12 | ### Example 13 | 14 | ```javascript 15 | var fs = require('fs'), 16 | RandomForestClassifier = require('random-forest-classifier').RandomForestClassifier; 17 | 18 | var data = [ 19 | { 20 | "length":5.1, 21 | "width":3.5, 22 | "petal_length":1.4, 23 | "petal_width":0.2, 24 | "species":"setosa" 25 | }, 26 | { 27 | "length":6.5, 28 | "width":3, 29 | "petal_length":5.2, 30 | "petal_width":2, 31 | "species":"virginica" 32 | }, 33 | { 34 | "length":6.6, 35 | "width":3, 36 | "petal_length":4.4, 37 | "petal_width":1.4, 38 | "species":"versicolor" 39 | }... 40 | ]; 41 | 42 | var testdata = [{ 43 | "length":6.3, 44 | "width":2.5, 45 | "petal_length":5, 46 | "petal_width":1.9, 47 | //"species":"virginica" 48 | }, 49 | { 50 | "length":4.7, 51 | "width":3.2, 52 | "petal_length":1.3, 53 | "petal_width":0.2, 54 | //"species":"setosa" 55 | }... 56 | ]; 57 | 58 | var rf = new RandomForestClassifier({ 59 | n_estimators: 10 60 | }); 61 | 62 | rf.fit(data, null, "species", function(err, trees){ 63 | //console.log(JSON.stringify(trees, null, 4)); 64 | var pred = rf.predict(testdata, trees); 65 | 66 | console.log(pred); 67 | 68 | // pred = ["virginica", "setosa"] 69 | }); 70 | ``` 71 | 72 | ### Usage 73 | 74 | #### Options 75 | 76 | **`n_estimators`:** *integer, optional (default=10)* The number of trees in the forest. 77 | 78 | **example** 79 | 80 | ```javascript 81 | var rf = new RandomForestClassifier({ 82 | n_estimators: 20 83 | }); 84 | ``` 85 | 86 | ##### `rf.fit(data, features, target, function(err, trees){})` 87 | 88 | Build a forest of trees from the training set (data, features, target). 
89 | 90 | **parameters** 91 | 92 | - **`data`**: training data array 93 | - **`features`**: if `null` it defaults to all features except the target, otherwise it only uses the array of features passed 94 | - **`target`**: the target feature 95 | 96 | **example** 97 | ```javascript 98 | var rf = new RandomForestClassifier({ 99 | n_estimators: 20 100 | }); 101 | 102 | rf.fit(data, ["length", "width"], "species", function(err, trees){ 103 | console.log(JSON.stringify(trees, null, 4)); 104 | }); 105 | ``` 106 | 107 | ##### `rf.predict(data, trees)` 108 | 109 | The predicted class of an input sample is computed as the majority prediction of the trees in the forest. 110 | 111 | **parameters** 112 | 113 | - **`data`**: array of input samples 114 | - **`trees`**: the forest of trees passed to the `rf.fit` callback 115 | 116 | **example** 117 | ```javascript 118 | var rf = new RandomForestClassifier({ 119 | n_estimators: 20 120 | }); 121 | 122 | rf.fit(data, ["length", "width"], "species", function(err, trees){ 123 | 124 | var pred = rf.predict(sample_data, trees); 125 | 126 | console.log(pred); 127 | // pred = ["virginica", "setosa"] 128 | }); 129 | ``` 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /forest/index.js: -------------------------------------------------------------------------------- 1 | /* 2 | A random forest classifier. 3 | 4 | A random forest is a meta estimator that fits a number of decision tree 5 | classifiers on various sub-samples of the dataset and uses averaging to 6 | improve the predictive accuracy and control over-fitting. 7 | 8 | Parameters 9 | ---------- 10 | n_estimators : integer, optional (default=10) 11 | The number of trees in the forest. 12 | 13 | criterion : string, optional (default="entropy") 14 | The function to measure the quality of a split. Supported criteria are 15 | "gini" for the Gini impurity and "entropy" for the information gain. 16 | Note: this parameter is tree-specific. 
17 | 18 | max_features : int, float, string or None, optional (default="auto") 19 | The number of features to consider when looking for the best split: 20 | 21 | - If int, then consider `max_features` features at each split. 22 | - If float, then `max_features` is a percentage and 23 | `int(max_features * n_features)` features are considered at each 24 | split. 25 | - If "auto", then `max_features=sqrt(n_features)`. 26 | - If "sqrt", then `max_features=sqrt(n_features)`. 27 | - If "log2", then `max_features=log2(n_features)`. 28 | - If None, then `max_features=n_features`. 29 | 30 | Note: the search for a split does not stop until at least one 31 | valid partition of the node samples is found, even if that requires 32 | effectively inspecting more than ``max_features`` features. 33 | Note: this parameter is tree-specific. 34 | 35 | max_depth : integer or None, optional (default=None) 36 | The maximum depth of the tree. If None, then nodes are expanded until 37 | all leaves are pure or until all leaves contain fewer than 38 | min_samples_split samples. 39 | Ignored if ``max_leaf_nodes`` is not None. 40 | Note: this parameter is tree-specific. 41 | 42 | min_samples_split : integer, optional (default=2) 43 | The minimum number of samples required to split an internal node. 44 | Note: this parameter is tree-specific. 45 | 46 | min_samples_leaf : integer, optional (default=1) 47 | The minimum number of samples in newly created leaves. A split is 48 | discarded if, after the split, one of the leaves would contain fewer than 49 | ``min_samples_leaf`` samples. 50 | Note: this parameter is tree-specific. 51 | 52 | max_leaf_nodes : int or None, optional (default=None) 53 | Grow trees with ``max_leaf_nodes`` in best-first fashion. 54 | Best nodes are defined as relative reduction in impurity. 55 | If None then unlimited number of leaf nodes. 56 | If not None then ``max_depth`` will be ignored. 57 | Note: this parameter is tree-specific. 
/*
    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.
*/

var async = require('async'),
    utils = require('../utilities'),
    DecisionTreeClassifier = require('../tree');

/**
 * Random forest classifier: trains `n_estimators` decision trees and combines
 * their per-sample predictions into one prediction.
 *
 * Only `n_estimators` influences the code in this file. The remaining options
 * are stored on the instance but are not forwarded to the trees, which are
 * constructed with an empty options object in `_parallel_build_tree`.
 *
 * @constructor
 * @param {Object} [params] - Options; may be omitted entirely.
 * @param {number} [params.n_estimators=10] - Number of trees to build.
 * @param {string} [params.criterion="entropy"] - Split quality criterion.
 * @param {number|string} [params.max_features="auto"] - Features per split.
 * @param {number} [params.min_samples_split=2] - Minimum samples to split.
 * @param {number} [params.min_samples_leaf=1] - Minimum samples per leaf.
 * @param {number} [params.verbose=0] - Verbosity level.
 */
var RandomForestClassifier = function(params) {
    // `new RandomForestClassifier()` used to throw on `params.n_estimators`.
    params = params || {};

    this.n_estimators = params.n_estimators || 10;
    this.criterion = params.criterion || "entropy";
    this.max_features = params.max_features || "auto";
    this.min_samples_split = params.min_samples_split || 2;
    this.min_samples_leaf = params.min_samples_leaf || 1;
    // Previously read `this.verbose`, which is always undefined at this
    // point, so the caller's setting was silently discarded.
    this.verbose = params.verbose || 0;
};

/**
 * Builds the iterator handed to `async.times`. Every invocation trains one
 * fresh DecisionTreeClassifier on the same (data, features, y) and reports
 * the fitted classifier through `next`.
 *
 * @param {Array} data - Training rows.
 * @param {?Array} features - Feature names, passed through to the tree.
 * @param {string} y - Name of the target field.
 * @returns {function(number, function)} Iterator `(n, next)`.
 */
var _parallel_build_tree = function(data, features, y) {
    return function (n, next) {
        var CLF;
        try {
            CLF = new DecisionTreeClassifier({});
            // The tree's fit() returns the model; keep it on the classifier.
            CLF.model = CLF.fit(data, features, y);
        } catch (e) {
            // Deliver training failures through the callback instead of
            // letting them escape from RandomForestClassifier#fit.
            return next(e);
        }
        // Called outside the try block so that an exception thrown by the
        // completion callback is never reported as a training failure.
        next(null, CLF);
    };
};

RandomForestClassifier.prototype = {
    // Assigning a fresh prototype object drops the default `constructor`.
    constructor: RandomForestClassifier,

    /**
     * Trains the forest and hands the fitted trees to `cb`.
     *
     * @param {Array} data - Training rows.
     * @param {?Array} features - Feature names, passed through to each tree.
     * @param {string} y - Name of the target field.
     * @param {function(?Error, Array=)} cb - Receives the error or the trees.
     */
    fit: function(data, features, y, cb) {
        var self = this;

        // Each tree is trained independently of the others, so the work is
        // expressed as n_estimators separate tasks.
        async.times(this.n_estimators, _parallel_build_tree(data, features, y), function(err, trees) {
            if (err) {
                console.log(err);
            } else {
                // Remember the forest so predict() can be called without it.
                self.trees = trees;
            }

            cb(err, trees);
        });
    },

    /**
     * Predicts one value per input sample by polling every tree.
     *
     * @param {Array} data - Samples to classify.
     * @param {Array} [trees] - Fitted trees as delivered by `fit`; defaults
     *     to the trees remembered from the last successful `fit`.
     * @returns {Array} One prediction per entry of `data`.
     * @throws {Error} If no fitted trees are available.
     */
    predict: function(data, trees) {
        var forest = trees || this.trees;
        if (!forest || !forest.length) {
            throw new Error("predict() requires fitted trees: call fit() first or pass the trees it returned");
        }
        this.trees = forest;

        var predictions = new Array(data.length);
        for (var i = 0; i < data.length; i++) {
            var dec = [];
            // Poll the trees actually supplied. The configured n_estimators
            // may differ from the size of the forest passed in.
            for (var j = 0; j < forest.length; j++) {
                dec.push(forest[j].predict(data[i]));
            }
            // Loose comparison kept on purpose: the return type of
            // utils.GetType is not visible from this file.
            if (utils.GetType(dec[0]) == "string") {
                predictions[i] = utils.GetDominate(dec);
            } else {
                predictions[i] = utils.Average(dec);
            }
        }
        return predictions;
    }
};

module.exports = RandomForestClassifier;

/* -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var RandomForestClassifier = */
require('./forest'), 2 | DecisionTreeClassifier = require('./tree'); 3 | 4 | module.exports.RandomForestClassifier = RandomForestClassifier; 5 | module.exports.DecisionTreeClassifier = DecisionTreeClassifier; -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "random-forest-classifier", 3 | "version": "0.6.0", 4 | "description": "A random forest classifier. A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting.", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "tests/index.js" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/jessfraz/random-forest-classifier" 12 | }, 13 | "keywords": [ 14 | "random forest", 15 | "machine learning", 16 | "classifier" 17 | ], 18 | "author": "Jessica Frazelle", 19 | "license": "MIT", 20 | "bugs": { 21 | "url": "https://github.com/jessfraz/random-forest-classifier/issues" 22 | }, 23 | "homepage": "https://github.com/jessfraz/random-forest-classifier", 24 | "dependencies": { 25 | "async": "^0.9.0", 26 | "underscore": "^1.6.0" 27 | }, 28 | "devDependencies": { 29 | "ejs": "^2.5.5", 30 | "express": "^4.4.4" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tests/basic.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.ensemble import RandomForestClassifier 3 | import pandas as pd 4 | import numpy as np 5 | 6 | iris = load_iris() 7 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 8 | df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75 9 | df['species'] = pd.Categorical(iris.target, iris.target_names) 10 | 11 | train, test = df[df['is_train'] == True], 
/* df[df['is_train'] == False] 12 | 13 | features = df.columns[:4] 14 | 15 | print test[features].head() 16 | 17 | clf = RandomForestClassifier(n_jobs=2) 18 | y, _ = pd.factorize(train['species']) 19 | clf.fit(train[features], y) 20 | 21 | preds = iris.target_names[clf.predict(test[features])] 22 | print pd.crosstab(test['species'], 23 | preds, rownames=['actual'], colnames=['preds']) 24 | -------------------------------------------------------------------------------- /tests/index.js: -------------------------------------------------------------------------------- */

/*
 * Smoke test: holds out a few random iris rows, trains a forest on the rest
 * and prints how many held-out rows were classified correctly.
 */
var fs = require('fs'),
    _ = require('underscore'), // this comma was missing, which leaked `file` and `RandomForestClassifier` as implicit globals
    file = __dirname + '/iris.json',
    RandomForestClassifier = require('../index.js').RandomForestClassifier;

// Number of rows moved out of the training set and used for evaluation.
var TEST_SIZE = 10;

fs.readFile(file, 'utf8', function (err, data) {
    if (err) {
        console.log('Error reading iris json file: ' + err);
        // Without this early exit, JSON.parse(undefined) threw right after.
        process.exitCode = 1;
        return;
    }

    data = JSON.parse(data);

    // remove ten items for the testdata
    var testdata = [];
    for (var i = 0; i < TEST_SIZE && data.length > 0; i++) {
        // Valid indices are 0 .. data.length - 1. The old `+ 1` could yield
        // data.length and push `undefined` into the test set.
        var ran_num = Math.floor(Math.random() * data.length);
        // splice() removes the chosen row from the training data and returns it.
        testdata.push(data.splice(ran_num, 1)[0]);
    }

    var rf = new RandomForestClassifier({
        n_estimators: 10
    });

    rf.fit(data, null, "species", function(err, trees){
        if (err) {
            console.log('Error fitting forest: ' + err);
            process.exitCode = 1;
            return;
        }

        var expected = _.pluck(testdata, "species");
        var pred = rf.predict(testdata, trees);
        console.log("outcome:", pred);
        console.log("expected: ", expected);

        var correct = 0;
        for (var k = 0; k < pred.length; k++) {
            if (pred[k] === expected[k]) {
                correct++;
            }
        }
        console.log(correct + "/" + pred.length, (correct/pred.length)*100 + "%", "accurate");
    });
});

/* -------------------------------------------------------------------------------- /tests/iris.json: -------------------------------------------------------------------------------- 1 | [ 2 | { */
3 | "length":5.1, 4 | "width":3.5, 5 | "petal_length":1.4, 6 | "petal_width":0.2, 7 | "species":"setosa" 8 | }, 9 | { 10 | "length":4.7, 11 | "width":3.2, 12 | "petal_length":1.3, 13 | "petal_width":0.2, 14 | "species":"setosa" 15 | }, 16 | { 17 | "length":4.6, 18 | "width":3.1, 19 | "petal_length":1.5, 20 | "petal_width":0.2, 21 | "species":"setosa" 22 | }, 23 | { 24 | "length":5, 25 | "width":3.6, 26 | "petal_length":1.4, 27 | "petal_width":0.2, 28 | "species":"setosa" 29 | }, 30 | { 31 | "length":5.4, 32 | "width":3.9, 33 | "petal_length":1.7, 34 | "petal_width":0.4, 35 | "species":"setosa" 36 | }, 37 | { 38 | "length":4.6, 39 | "width":3.4, 40 | "petal_length":1.4, 41 | "petal_width":0.3, 42 | "species":"setosa" 43 | }, 44 | { 45 | "length":5, 46 | "width":3.4, 47 | "petal_length":1.5, 48 | "petal_width":0.2, 49 | "species":"setosa" 50 | }, 51 | { 52 | "length":4.4, 53 | "width":2.9, 54 | "petal_length":1.4, 55 | "petal_width":0.2, 56 | "species":"setosa" 57 | }, 58 | { 59 | "length":4.9, 60 | "width":3.1, 61 | "petal_length":1.5, 62 | "petal_width":0.1, 63 | "species":"setosa" 64 | }, 65 | { 66 | "length":5.4, 67 | "width":3.7, 68 | "petal_length":1.5, 69 | "petal_width":0.2, 70 | "species":"setosa" 71 | }, 72 | { 73 | "length":4.8, 74 | "width":3.4, 75 | "petal_length":1.6, 76 | "petal_width":0.2, 77 | "species":"setosa" 78 | }, 79 | { 80 | "length":4.8, 81 | "width":3, 82 | "petal_length":1.4, 83 | "petal_width":0.1, 84 | "species":"setosa" 85 | }, 86 | { 87 | "length":4.3, 88 | "width":3, 89 | "petal_length":1.1, 90 | "petal_width":0.1, 91 | "species":"setosa" 92 | }, 93 | { 94 | "length":5.8, 95 | "width":4, 96 | "petal_length":1.2, 97 | "petal_width":0.2, 98 | "species":"setosa" 99 | }, 100 | { 101 | "length":5.7, 102 | "width":4.4, 103 | "petal_length":1.5, 104 | "petal_width":0.4, 105 | "species":"setosa" 106 | }, 107 | { 108 | "length":5.4, 109 | "width":3.9, 110 | "petal_length":1.3, 111 | "petal_width":0.4, 112 | "species":"setosa" 113 | }, 114 | { 
115 | "length":5.1, 116 | "width":3.5, 117 | "petal_length":1.4, 118 | "petal_width":0.3, 119 | "species":"setosa" 120 | }, 121 | { 122 | "length":5.7, 123 | "width":3.8, 124 | "petal_length":1.7, 125 | "petal_width":0.3, 126 | "species":"setosa" 127 | }, 128 | { 129 | "length":5.4, 130 | "width":3.4, 131 | "petal_length":1.7, 132 | "petal_width":0.2, 133 | "species":"setosa" 134 | }, 135 | { 136 | "length":5.1, 137 | "width":3.7, 138 | "petal_length":1.5, 139 | "petal_width":0.4, 140 | "species":"setosa" 141 | }, 142 | { 143 | "length":4.6, 144 | "width":3.6, 145 | "petal_length":1, 146 | "petal_width":0.2, 147 | "species":"setosa" 148 | }, 149 | { 150 | "length":5.1, 151 | "width":3.3, 152 | "petal_length":1.7, 153 | "petal_width":0.5, 154 | "species":"setosa" 155 | }, 156 | { 157 | "length":4.8, 158 | "width":3.4, 159 | "petal_length":1.9, 160 | "petal_width":0.2, 161 | "species":"setosa" 162 | }, 163 | { 164 | "length":5, 165 | "width":3, 166 | "petal_length":1.6, 167 | "petal_width":0.2, 168 | "species":"setosa" 169 | }, 170 | { 171 | "length":5, 172 | "width":3.4, 173 | "petal_length":1.6, 174 | "petal_width":0.4, 175 | "species":"setosa" 176 | }, 177 | { 178 | "length":5.2, 179 | "width":3.5, 180 | "petal_length":1.5, 181 | "petal_width":0.2, 182 | "species":"setosa" 183 | }, 184 | { 185 | "length":5.2, 186 | "width":3.4, 187 | "petal_length":1.4, 188 | "petal_width":0.2, 189 | "species":"setosa" 190 | }, 191 | { 192 | "length":4.7, 193 | "width":3.2, 194 | "petal_length":1.6, 195 | "petal_width":0.2, 196 | "species":"setosa" 197 | }, 198 | { 199 | "length":4.8, 200 | "width":3.1, 201 | "petal_length":1.6, 202 | "petal_width":0.2, 203 | "species":"setosa" 204 | }, 205 | { 206 | "length":5.4, 207 | "width":3.4, 208 | "petal_length":1.5, 209 | "petal_width":0.4, 210 | "species":"setosa" 211 | }, 212 | { 213 | "length":5.2, 214 | "width":4.1, 215 | "petal_length":1.5, 216 | "petal_width":0.1, 217 | "species":"setosa" 218 | }, 219 | { 220 | "length":5.5, 221 | 
"width":4.2, 222 | "petal_length":1.4, 223 | "petal_width":0.2, 224 | "species":"setosa" 225 | }, 226 | { 227 | "length":4.9, 228 | "width":3.1, 229 | "petal_length":1.5, 230 | "petal_width":0.2, 231 | "species":"setosa" 232 | }, 233 | { 234 | "length":5, 235 | "width":3.2, 236 | "petal_length":1.2, 237 | "petal_width":0.2, 238 | "species":"setosa" 239 | }, 240 | { 241 | "length":5.5, 242 | "width":3.5, 243 | "petal_length":1.3, 244 | "petal_width":0.2, 245 | "species":"setosa" 246 | }, 247 | { 248 | "length":4.9, 249 | "width":3.6, 250 | "petal_length":1.4, 251 | "petal_width":0.1, 252 | "species":"setosa" 253 | }, 254 | { 255 | "length":4.4, 256 | "width":3, 257 | "petal_length":1.3, 258 | "petal_width":0.2, 259 | "species":"setosa" 260 | }, 261 | { 262 | "length":5.1, 263 | "width":3.4, 264 | "petal_length":1.5, 265 | "petal_width":0.2, 266 | "species":"setosa" 267 | }, 268 | { 269 | "length":5, 270 | "width":3.5, 271 | "petal_length":1.3, 272 | "petal_width":0.3, 273 | "species":"setosa" 274 | }, 275 | { 276 | "length":4.5, 277 | "width":2.3, 278 | "petal_length":1.3, 279 | "petal_width":0.3, 280 | "species":"setosa" 281 | }, 282 | { 283 | "length":4.4, 284 | "width":3.2, 285 | "petal_length":1.3, 286 | "petal_width":0.2, 287 | "species":"setosa" 288 | }, 289 | { 290 | "length":5, 291 | "width":3.5, 292 | "petal_length":1.6, 293 | "petal_width":0.6, 294 | "species":"setosa" 295 | }, 296 | { 297 | "length":5.1, 298 | "width":3.8, 299 | "petal_length":1.9, 300 | "petal_width":0.4, 301 | "species":"setosa" 302 | }, 303 | { 304 | "length":4.8, 305 | "width":3, 306 | "petal_length":1.4, 307 | "petal_width":0.3, 308 | "species":"setosa" 309 | }, 310 | { 311 | "length":5.1, 312 | "width":3.8, 313 | "petal_length":1.6, 314 | "petal_width":0.2, 315 | "species":"setosa" 316 | }, 317 | { 318 | "length":4.6, 319 | "width":3.2, 320 | "petal_length":1.4, 321 | "petal_width":0.2, 322 | "species":"setosa" 323 | }, 324 | { 325 | "length":5.3, 326 | "width":3.7, 327 | 
"petal_length":1.5, 328 | "petal_width":0.2, 329 | "species":"setosa" 330 | }, 331 | { 332 | "length":5, 333 | "width":3.3, 334 | "petal_length":1.4, 335 | "petal_width":0.2, 336 | "species":"setosa" 337 | }, 338 | { 339 | "length":7, 340 | "width":3.2, 341 | "petal_length":4.7, 342 | "petal_width":1.4, 343 | "species":"versicolor" 344 | }, 345 | { 346 | "length":6.4, 347 | "width":3.2, 348 | "petal_length":4.5, 349 | "petal_width":1.5, 350 | "species":"versicolor" 351 | }, 352 | { 353 | "length":6.9, 354 | "width":3.1, 355 | "petal_length":4.9, 356 | "petal_width":1.5, 357 | "species":"versicolor" 358 | }, 359 | { 360 | "length":5.5, 361 | "width":2.3, 362 | "petal_length":4, 363 | "petal_width":1.3, 364 | "species":"versicolor" 365 | }, 366 | { 367 | "length":6.5, 368 | "width":2.8, 369 | "petal_length":4.6, 370 | "petal_width":1.5, 371 | "species":"versicolor" 372 | }, 373 | { 374 | "length":5.7, 375 | "width":2.8, 376 | "petal_length":4.5, 377 | "petal_width":1.3, 378 | "species":"versicolor" 379 | }, 380 | { 381 | "length":6.3, 382 | "width":3.3, 383 | "petal_length":4.7, 384 | "petal_width":1.6, 385 | "species":"versicolor" 386 | }, 387 | { 388 | "length":4.9, 389 | "width":2.4, 390 | "petal_length":3.3, 391 | "petal_width":1, 392 | "species":"versicolor" 393 | }, 394 | { 395 | "length":6.6, 396 | "width":2.9, 397 | "petal_length":4.6, 398 | "petal_width":1.3, 399 | "species":"versicolor" 400 | }, 401 | { 402 | "length":5.2, 403 | "width":2.7, 404 | "petal_length":3.9, 405 | "petal_width":1.4, 406 | "species":"versicolor" 407 | }, 408 | { 409 | "length":5, 410 | "width":2, 411 | "petal_length":3.5, 412 | "petal_width":1, 413 | "species":"versicolor" 414 | }, 415 | { 416 | "length":5.9, 417 | "width":3, 418 | "petal_length":4.2, 419 | "petal_width":1.5, 420 | "species":"versicolor" 421 | }, 422 | { 423 | "length":6, 424 | "width":2.2, 425 | "petal_length":4, 426 | "petal_width":1, 427 | "species":"versicolor" 428 | }, 429 | { 430 | "length":6.1, 431 | 
"width":2.9, 432 | "petal_length":4.7, 433 | "petal_width":1.4, 434 | "species":"versicolor" 435 | }, 436 | { 437 | "length":5.6, 438 | "width":2.9, 439 | "petal_length":3.6, 440 | "petal_width":1.3, 441 | "species":"versicolor" 442 | }, 443 | { 444 | "length":6.7, 445 | "width":3.1, 446 | "petal_length":4.4, 447 | "petal_width":1.4, 448 | "species":"versicolor" 449 | }, 450 | { 451 | "length":5.6, 452 | "width":3, 453 | "petal_length":4.5, 454 | "petal_width":1.5, 455 | "species":"versicolor" 456 | }, 457 | { 458 | "length":5.8, 459 | "width":2.7, 460 | "petal_length":4.1, 461 | "petal_width":1, 462 | "species":"versicolor" 463 | }, 464 | { 465 | "length":6.2, 466 | "width":2.2, 467 | "petal_length":4.5, 468 | "petal_width":1.5, 469 | "species":"versicolor" 470 | }, 471 | { 472 | "length":5.6, 473 | "width":2.5, 474 | "petal_length":3.9, 475 | "petal_width":1.1, 476 | "species":"versicolor" 477 | }, 478 | { 479 | "length":5.9, 480 | "width":3.2, 481 | "petal_length":4.8, 482 | "petal_width":1.8, 483 | "species":"versicolor" 484 | }, 485 | { 486 | "length":6.1, 487 | "width":2.8, 488 | "petal_length":4, 489 | "petal_width":1.3, 490 | "species":"versicolor" 491 | }, 492 | { 493 | "length":6.3, 494 | "width":2.5, 495 | "petal_length":4.9, 496 | "petal_width":1.5, 497 | "species":"versicolor" 498 | }, 499 | { 500 | "length":6.1, 501 | "width":2.8, 502 | "petal_length":4.7, 503 | "petal_width":1.2, 504 | "species":"versicolor" 505 | }, 506 | { 507 | "length":6.4, 508 | "width":2.9, 509 | "petal_length":4.3, 510 | "petal_width":1.3, 511 | "species":"versicolor" 512 | }, 513 | { 514 | "length":6.6, 515 | "width":3, 516 | "petal_length":4.4, 517 | "petal_width":1.4, 518 | "species":"versicolor" 519 | }, 520 | { 521 | "length":6.8, 522 | "width":2.8, 523 | "petal_length":4.8, 524 | "petal_width":1.4, 525 | "species":"versicolor" 526 | }, 527 | { 528 | "length":6.7, 529 | "width":3, 530 | "petal_length":5, 531 | "petal_width":1.7, 532 | "species":"versicolor" 533 | }, 534 | 
{ 535 | "length":6, 536 | "width":2.9, 537 | "petal_length":4.5, 538 | "petal_width":1.5, 539 | "species":"versicolor" 540 | }, 541 | { 542 | "length":5.7, 543 | "width":2.6, 544 | "petal_length":3.5, 545 | "petal_width":1, 546 | "species":"versicolor" 547 | }, 548 | { 549 | "length":5.5, 550 | "width":2.4, 551 | "petal_length":3.8, 552 | "petal_width":1.1, 553 | "species":"versicolor" 554 | }, 555 | { 556 | "length":5.5, 557 | "width":2.4, 558 | "petal_length":3.7, 559 | "petal_width":1, 560 | "species":"versicolor" 561 | }, 562 | { 563 | "length":5.8, 564 | "width":2.7, 565 | "petal_length":3.9, 566 | "petal_width":1.2, 567 | "species":"versicolor" 568 | }, 569 | { 570 | "length":6, 571 | "width":2.7, 572 | "petal_length":5.1, 573 | "petal_width":1.6, 574 | "species":"versicolor" 575 | }, 576 | { 577 | "length":5.4, 578 | "width":3, 579 | "petal_length":4.5, 580 | "petal_width":1.5, 581 | "species":"versicolor" 582 | }, 583 | { 584 | "length":6, 585 | "width":3.4, 586 | "petal_length":4.5, 587 | "petal_width":1.6, 588 | "species":"versicolor" 589 | }, 590 | { 591 | "length":6.7, 592 | "width":3.1, 593 | "petal_length":4.7, 594 | "petal_width":1.5, 595 | "species":"versicolor" 596 | }, 597 | { 598 | "length":6.3, 599 | "width":2.3, 600 | "petal_length":4.4, 601 | "petal_width":1.3, 602 | "species":"versicolor" 603 | }, 604 | { 605 | "length":5.6, 606 | "width":3, 607 | "petal_length":4.1, 608 | "petal_width":1.3, 609 | "species":"versicolor" 610 | }, 611 | { 612 | "length":5.5, 613 | "width":2.5, 614 | "petal_length":4, 615 | "petal_width":1.3, 616 | "species":"versicolor" 617 | }, 618 | { 619 | "length":5.5, 620 | "width":2.6, 621 | "petal_length":4.4, 622 | "petal_width":1.2, 623 | "species":"versicolor" 624 | }, 625 | { 626 | "length":6.1, 627 | "width":3, 628 | "petal_length":4.6, 629 | "petal_width":1.4, 630 | "species":"versicolor" 631 | }, 632 | { 633 | "length":5.8, 634 | "width":2.6, 635 | "petal_length":4, 636 | "petal_width":1.2, 637 | 
"species":"versicolor" 638 | }, 639 | { 640 | "length":5.6, 641 | "width":2.7, 642 | "petal_length":4.2, 643 | "petal_width":1.3, 644 | "species":"versicolor" 645 | }, 646 | { 647 | "length":5.7, 648 | "width":3, 649 | "petal_length":4.2, 650 | "petal_width":1.2, 651 | "species":"versicolor" 652 | }, 653 | { 654 | "length":5.7, 655 | "width":2.9, 656 | "petal_length":4.2, 657 | "petal_width":1.3, 658 | "species":"versicolor" 659 | }, 660 | { 661 | "length":6.2, 662 | "width":2.9, 663 | "petal_length":4.3, 664 | "petal_width":1.3, 665 | "species":"versicolor" 666 | }, 667 | { 668 | "length":5.1, 669 | "width":2.5, 670 | "petal_length":3, 671 | "petal_width":1.1, 672 | "species":"versicolor" 673 | }, 674 | { 675 | "length":5.7, 676 | "width":2.8, 677 | "petal_length":4.1, 678 | "petal_width":1.3, 679 | "species":"versicolor" 680 | }, 681 | { 682 | "length":6.3, 683 | "width":3.3, 684 | "petal_length":6, 685 | "petal_width":2.5, 686 | "species":"virginica" 687 | }, 688 | { 689 | "length":5.8, 690 | "width":2.7, 691 | "petal_length":5.1, 692 | "petal_width":1.9, 693 | "species":"virginica" 694 | }, 695 | { 696 | "length":7.1, 697 | "width":3, 698 | "petal_length":5.9, 699 | "petal_width":2.1, 700 | "species":"virginica" 701 | }, 702 | { 703 | "length":6.3, 704 | "width":2.9, 705 | "petal_length":5.6, 706 | "petal_width":1.8, 707 | "species":"virginica" 708 | }, 709 | { 710 | "length":6.5, 711 | "width":3, 712 | "petal_length":5.8, 713 | "petal_width":2.2, 714 | "species":"virginica" 715 | }, 716 | { 717 | "length":7.6, 718 | "width":3, 719 | "petal_length":6.6, 720 | "petal_width":2.1, 721 | "species":"virginica" 722 | }, 723 | { 724 | "length":4.9, 725 | "width":2.5, 726 | "petal_length":4.5, 727 | "petal_width":1.7, 728 | "species":"virginica" 729 | }, 730 | { 731 | "length":7.3, 732 | "width":2.9, 733 | "petal_length":6.3, 734 | "petal_width":1.8, 735 | "species":"virginica" 736 | }, 737 | { 738 | "length":6.7, 739 | "width":2.5, 740 | "petal_length":5.8, 741 | 
"petal_width":1.8, 742 | "species":"virginica" 743 | }, 744 | { 745 | "length":7.2, 746 | "width":3.6, 747 | "petal_length":6.1, 748 | "petal_width":2.5, 749 | "species":"virginica" 750 | }, 751 | { 752 | "length":6.5, 753 | "width":3.2, 754 | "petal_length":5.1, 755 | "petal_width":2, 756 | "species":"virginica" 757 | }, 758 | { 759 | "length":6.4, 760 | "width":2.7, 761 | "petal_length":5.3, 762 | "petal_width":1.9, 763 | "species":"virginica" 764 | }, 765 | { 766 | "length":6.8, 767 | "width":3, 768 | "petal_length":5.5, 769 | "petal_width":2.1, 770 | "species":"virginica" 771 | }, 772 | { 773 | "length":5.7, 774 | "width":2.5, 775 | "petal_length":5, 776 | "petal_width":2, 777 | "species":"virginica" 778 | }, 779 | { 780 | "length":5.8, 781 | "width":2.8, 782 | "petal_length":5.1, 783 | "petal_width":2.4, 784 | "species":"virginica" 785 | }, 786 | { 787 | "length":6.4, 788 | "width":3.2, 789 | "petal_length":5.3, 790 | "petal_width":2.3, 791 | "species":"virginica" 792 | }, 793 | { 794 | "length":6.5, 795 | "width":3, 796 | "petal_length":5.5, 797 | "petal_width":1.8, 798 | "species":"virginica" 799 | }, 800 | { 801 | "length":7.7, 802 | "width":3.8, 803 | "petal_length":6.7, 804 | "petal_width":2.2, 805 | "species":"virginica" 806 | }, 807 | { 808 | "length":7.7, 809 | "width":2.6, 810 | "petal_length":6.9, 811 | "petal_width":2.3, 812 | "species":"virginica" 813 | }, 814 | { 815 | "length":6, 816 | "width":2.2, 817 | "petal_length":5, 818 | "petal_width":1.5, 819 | "species":"virginica" 820 | }, 821 | { 822 | "length":6.9, 823 | "width":3.2, 824 | "petal_length":5.7, 825 | "petal_width":2.3, 826 | "species":"virginica" 827 | }, 828 | { 829 | "length":5.6, 830 | "width":2.8, 831 | "petal_length":4.9, 832 | "petal_width":2, 833 | "species":"virginica" 834 | }, 835 | { 836 | "length":7.7, 837 | "width":2.8, 838 | "petal_length":6.7, 839 | "petal_width":2, 840 | "species":"virginica" 841 | }, 842 | { 843 | "length":6.3, 844 | "width":2.7, 845 | 
"petal_length":4.9, 846 | "petal_width":1.8, 847 | "species":"virginica" 848 | }, 849 | { 850 | "length":6.7, 851 | "width":3.3, 852 | "petal_length":5.7, 853 | "petal_width":2.1, 854 | "species":"virginica" 855 | }, 856 | { 857 | "length":7.2, 858 | "width":3.2, 859 | "petal_length":6, 860 | "petal_width":1.8, 861 | "species":"virginica" 862 | }, 863 | { 864 | "length":6.2, 865 | "width":2.8, 866 | "petal_length":4.8, 867 | "petal_width":1.8, 868 | "species":"virginica" 869 | }, 870 | { 871 | "length":6.1, 872 | "width":3, 873 | "petal_length":4.9, 874 | "petal_width":1.8, 875 | "species":"virginica" 876 | }, 877 | { 878 | "length":6.4, 879 | "width":2.8, 880 | "petal_length":5.6, 881 | "petal_width":2.1, 882 | "species":"virginica" 883 | }, 884 | { 885 | "length":7.2, 886 | "width":3, 887 | "petal_length":5.8, 888 | "petal_width":1.6, 889 | "species":"virginica" 890 | }, 891 | { 892 | "length":7.4, 893 | "width":2.8, 894 | "petal_length":6.1, 895 | "petal_width":1.9, 896 | "species":"virginica" 897 | }, 898 | { 899 | "length":7.9, 900 | "width":3.8, 901 | "petal_length":6.4, 902 | "petal_width":2, 903 | "species":"virginica" 904 | }, 905 | { 906 | "length":6.4, 907 | "width":2.8, 908 | "petal_length":5.6, 909 | "petal_width":2.2, 910 | "species":"virginica" 911 | }, 912 | { 913 | "length":6.3, 914 | "width":2.8, 915 | "petal_length":5.1, 916 | "petal_width":1.5, 917 | "species":"virginica" 918 | }, 919 | { 920 | "length":6.1, 921 | "width":2.6, 922 | "petal_length":5.6, 923 | "petal_width":1.4, 924 | "species":"virginica" 925 | }, 926 | { 927 | "length":7.7, 928 | "width":3, 929 | "petal_length":6.1, 930 | "petal_width":2.3, 931 | "species":"virginica" 932 | }, 933 | { 934 | "length":6.3, 935 | "width":3.4, 936 | "petal_length":5.6, 937 | "petal_width":2.4, 938 | "species":"virginica" 939 | }, 940 | { 941 | "length":6.4, 942 | "width":3.1, 943 | "petal_length":5.5, 944 | "petal_width":1.8, 945 | "species":"virginica" 946 | }, 947 | { 948 | "length":6, 949 | 
"width":3, 950 | "petal_length":4.8, 951 | "petal_width":1.8, 952 | "species":"virginica" 953 | }, 954 | { 955 | "length":6.9, 956 | "width":3.1, 957 | "petal_length":5.4, 958 | "petal_width":2.1, 959 | "species":"virginica" 960 | }, 961 | { 962 | "length":6.7, 963 | "width":3.1, 964 | "petal_length":5.6, 965 | "petal_width":2.4, 966 | "species":"virginica" 967 | }, 968 | { 969 | "length":5.8, 970 | "width":2.7, 971 | "petal_length":5.1, 972 | "petal_width":1.9, 973 | "species":"virginica" 974 | }, 975 | { 976 | "length":6.8, 977 | "width":3.2, 978 | "petal_length":5.9, 979 | "petal_width":2.3, 980 | "species":"virginica" 981 | }, 982 | { 983 | "length":6.7, 984 | "width":3.3, 985 | "petal_length":5.7, 986 | "petal_width":2.5, 987 | "species":"virginica" 988 | }, 989 | { 990 | "length":6.7, 991 | "width":3, 992 | "petal_length":5.2, 993 | "petal_width":2.3, 994 | "species":"virginica" 995 | }, 996 | { 997 | "length":6.3, 998 | "width":2.5, 999 | "petal_length":5, 1000 | "petal_width":1.9, 1001 | "species":"virginica" 1002 | }, 1003 | { 1004 | "length":6.5, 1005 | "width":3, 1006 | "petal_length":5.2, 1007 | "petal_width":2, 1008 | "species":"virginica" 1009 | }, 1010 | { 1011 | "length":5.9, 1012 | "width":3, 1013 | "petal_length":5.1, 1014 | "petal_width":1.8, 1015 | "species":"virginica" 1016 | }, 1017 | { 1018 | "length":6.9, 1019 | "width":3.1, 1020 | "petal_length":5.1, 1021 | "petal_width":2.3, 1022 | "species":"virginica" 1023 | }, 1024 | { 1025 | "length":5.1, 1026 | "width":3.8, 1027 | "petal_length":1.5, 1028 | "petal_width":0.3, 1029 | "species":"setosa" 1030 | }, 1031 | { 1032 | "length":5, 1033 | "width":2.3, 1034 | "petal_length":3.3, 1035 | "petal_width":1, 1036 | "species":"versicolor" 1037 | }, 1038 | { 1039 | "length":6.2, 1040 | "width":3.4, 1041 | "petal_length":5.4, 1042 | "petal_width":2.3, 1043 | "species":"virginica" 1044 | }, 1045 | { 1046 | "length":4.9, 1047 | "width":3, 1048 | "petal_length":1.4, 1049 | "petal_width":0.2, 1050 | 
"species":"setosa" 1051 | }, 1052 | { 1053 | "length":4.7, 1054 | "width":3.2, 1055 | "petal_length":1.3, 1056 | "petal_width":0.2, 1057 | "species":"setosa" 1058 | } 1059 | ] -------------------------------------------------------------------------------- /tree/index.js: -------------------------------------------------------------------------------- 1 | /*A decision tree classifier. 2 | 3 | Parameters 4 | ---------- 5 | criterion : string, optional (default="entropy") 6 | The function to measure the quality of a split. Supported criteria are 7 | "gini" for the Gini impurity and "entropy" for the information gain. 8 | 9 | splitter : string, optional (default="best") 10 | The strategy used to choose the split at each node. Supported 11 | strategies are "best" to choose the best split and "random" to choose 12 | the best random split. 13 | 14 | max_features : int, float, string or None, optional (default=None) 15 | The number of features to consider when looking for the best split: 16 | - If int, then consider `max_features` features at each split. 17 | - If float, then `max_features` is a percentage and 18 | `int(max_features * n_features)` features are considered at each 19 | split. 20 | - If "auto", then `max_features=sqrt(n_features)`. 21 | - If "sqrt", then `max_features=sqrt(n_features)`. 22 | - If "log2", then `max_features=log2(n_features)`. 23 | - If None, then `max_features=n_features`. 24 | 25 | Note: the search for a split does not stop until at least one 26 | valid partition of the node samples is found, even if it requires to 27 | effectively inspect more than ``max_features`` features. 28 | 29 | max_depth : int or None, optional (default=None) 30 | The maximum depth of the tree. If None, then nodes are expanded until 31 | all leaves are pure or until all leaves contain less than 32 | min_samples_split samples. 33 | Ignored if ``max_samples_leaf`` is not None. 
34 | 35 | min_samples_split : int, optional (default=2) 36 | The minimum number of samples required to split an internal node. 37 | 38 | min_samples_leaf : int, optional (default=1) 39 | The minimum number of samples required to be at a leaf node. 40 | 41 | max_leaf_nodes : int or None, optional (default=None) 42 | Grow a tree with ``max_leaf_nodes`` in best-first fashion. 43 | Best nodes are defined as relative reduction in impurity. 44 | If None then unlimited number of leaf nodes. 45 | If not None then ``max_depth`` will be ignored. 46 | */ 47 | 48 | var _ = require("underscore"), 49 | utils = require("../utilities"); 50 | 51 | var DecisionTreeClassifier = function(params) { 52 | this.criterion = params.criterion || 'entropy'; 53 | this.splitter = params.splitter || 'best'; 54 | this.min_samples_split = params.min_samples_split || 2; 55 | this.min_samples_leaf = params.min_samples_leaf || 1; 56 | //this.max_depth = params.max_depth || 5; 57 | this.num_tries = params.num_tries || 10; 58 | }; 59 | 60 | DecisionTreeClassifier.prototype = { 61 | fit: function(data, features, y) { 62 | var major_label = utils.GetDominate(_.pluck(data, y)); 63 | return utils.C45(data, features, y, major_label, this.num_tries); 64 | }, 65 | predict: function(sample) { 66 | var root = this.model; 67 | 68 | if (typeof root === 'undefined') { 69 | return 'null'; 70 | } 71 | 72 | while (root.type !== "result") { 73 | var attr = root.name; 74 | var child_node; 75 | if (root.type === 'feature_real') { 76 | var sample_value = parseFloat(sample[attr]); 77 | if (sample_value <= root.cut){ 78 | child_node = root.vals[1]; 79 | } else { 80 | child_node = root.vals[0]; 81 | } 82 | } else { 83 | var sample_value = sample[attr]; 84 | child_node = _.detect(root.vals, function(x) { 85 | return x.name == sample_value; 86 | }); 87 | } 88 | if (child_node) { 89 | root = child_node.child; 90 | } 91 | break; 92 | } 93 | 94 | return root.val; 95 | } 96 | }; 97 | 98 | module.exports = DecisionTreeClassifier; 
99 | -------------------------------------------------------------------------------- /utilities/index.js: -------------------------------------------------------------------------------- 1 | var _ = require("underscore"); 2 | 3 | var AllValuesSame = function( arr ){ 4 | if (arr.length > 0) { 5 | for (var i = 1; i < arr.length; i++){ 6 | if (arr[i] !== arr[0]){ 7 | return false; 8 | } 9 | } 10 | } 11 | return true; 12 | } 13 | 14 | var Gain = function(data, feature, y, num_tries){ 15 | var attribute_values = _.pluck(data, feature), 16 | entropy = Entropy(_.pluck(data, y)), 17 | size = data.length, 18 | feature_type = GetType(data[0][feature]); 19 | 20 | if (feature_type == "float" || feature_type == "int"){ 21 | var min = _.min(attribute_values); 22 | var max = _.max(attribute_values); 23 | 24 | var entropies = attribute_values.map(function(n){ 25 | var sub_entropies = []; 26 | 27 | // var cutf = parseFloat(n), 28 | // _gain = entropy - ConditionalEntropy(data, feature, y, cutf); 29 | // sub_entropies.push({ 30 | // feature: feature, 31 | // gain: _gain, 32 | // cut: cutf 33 | // }); 34 | 35 | for (var i=0; i < num_tries; i++) { 36 | var cutf = RandomFloat(min, max), 37 | _gain = entropy - ConditionalEntropy(data, feature, y, cutf); 38 | sub_entropies.push({ 39 | feature: feature, 40 | gain: _gain, 41 | cut: cutf 42 | }); 43 | } 44 | return _.max(sub_entropies, function(e){return e.gain}); 45 | }); 46 | return _.max(entropies, function(e){return e.gain}); 47 | } else { 48 | var entropies = attribute_values.map(function(n){ 49 | var subset = data.filter(function(x){return x[feature] === n}); 50 | return ((subset.length/size) * Entropy(_.pluck(subset, y))); 51 | }); 52 | 53 | var total_entropies = entropies.reduce(function(a, b){ return a+b; }, 0); 54 | return { 55 | feature: feature, 56 | gain: entropy - total_entropies, 57 | cut: 0 58 | }; 59 | } 60 | }; 61 | 62 | var MaxGain = function(data, features, y, num_tries){ 63 | var gains = []; 64 | for (var i=0; i < 
features.length; i++) { 65 | gains.push(Gain(data, features[i], y, num_tries)); 66 | } 67 | 68 | if ( AllValuesSame(_.pluck(gains, 'gain')) ){ 69 | return gains[RandomInt(0, gains.length)]; 70 | } else { 71 | return _.max(gains,function(e){ 72 | return e.gain; 73 | }); 74 | } 75 | }; 76 | 77 | var GetDominate = function(vals){ 78 | return _.sortBy(vals, function(a){ 79 | return Count(a, vals); 80 | }).reverse()[0]; 81 | }; 82 | 83 | var Count = function (a, vals){ 84 | return _.filter(vals, function(b) { return b === a}).length; 85 | }; 86 | 87 | var Entropy = function(vals){ 88 | var unique = _.unique(vals), 89 | probs = unique.map(function(x){ return Probability(x, vals); }), 90 | logs = probs.map(function(p){ return -p*Log2(p); }); 91 | 92 | return logs.reduce(function(a, b){ return a+b; }, 0); 93 | }; 94 | 95 | var ConditionalEntropy = function(_s, feature, y, cut){ 96 | var s_1 = _s.filter(function(x){return x[feature] <= cut}), 97 | s_2 = _s.filter(function(x){return x[feature] > cut}), 98 | size = _s.length; 99 | return s_1.length/size*Entropy(_.pluck(s_1, y)) + s_2.length/size*Entropy(_.pluck(s_1, y)); 100 | }; 101 | 102 | var Log2 = function(n){ 103 | return Math.log(n)/Math.log(2); 104 | }; 105 | 106 | var Probability = function(val, vals){ 107 | var instances = _.filter(vals, function(x) { return x === val; }).length; 108 | return instances/vals.length; 109 | }; 110 | 111 | var RID = function(){ 112 | return "_r" + Math.round(Math.random()*1000000).toString(); 113 | }; 114 | 115 | var GetType = function(input) { 116 | var m = (/^[\d]+(\.[\d]+)?$/).exec(input); 117 | if (m) { 118 | // Check if there is a decimal place 119 | if (m[1]) { 120 | return 'float'; 121 | } else { 122 | return 'int'; 123 | } 124 | } 125 | return 'string'; 126 | }; 127 | 128 | var Average = function(v){ 129 | var sum = v.reduce(function(a, b) { return a + b }); 130 | var avg = sum / v.length; 131 | return avg; 132 | } 133 | 134 | 135 | var C45 = function(data, features, y, 
major_label, num_tries){ 136 | var tree = {}; 137 | var y_values = _.pluck(data, y); 138 | 139 | // last leaf 140 | if (y_values.length == 1) { 141 | return { 142 | type:"result", 143 | val: y_values[0], 144 | name: y_values[0], 145 | alias: y_values[0] + RID() 146 | }; 147 | } 148 | 149 | if (y_values.length == 0){ 150 | return { 151 | type:"result", 152 | val: major_label, 153 | name: major_label, 154 | alias: major_label + RID() 155 | }; 156 | } 157 | 158 | if (features === true){ 159 | // end of branch 160 | // returning the most dominate feature 161 | var dominate_y = GetDominate(y_values); 162 | return { 163 | type:"result", 164 | val: dominate_y, 165 | name: dominate_y, 166 | alias: dominate_y + RID() 167 | }; 168 | } 169 | 170 | if (!features || features.length == 0){ 171 | // get all the features that are not y 172 | features = _.reject(_.keys(data[0]), function(f){ return f == y; }); 173 | } 174 | 175 | var best_feature_data = MaxGain(data, features, y, num_tries), 176 | best_feature = best_feature_data.feature; 177 | var feature_remains = _.without(features, best_feature); 178 | var best_feature_type = GetType(data[0][best_feature]); 179 | 180 | // check if its an int/float 181 | if (best_feature_type == "float" || best_feature_type == "int"){ 182 | tree = { 183 | name: best_feature, 184 | alias: best_feature + RID(), 185 | cut: best_feature_data.cut, 186 | type: "feature_real", 187 | vals: [] 188 | }; 189 | 190 | if (feature_remains.length == 0){ 191 | feature_remains = true; 192 | } 193 | 194 | var rightCutData = data.filter(function(x){ return x[best_feature] > best_feature_data.cut}); 195 | var child_node_r = { 196 | name: tree.cut.toString(), 197 | alias: '>' + tree.cut.toString() + RID(), 198 | type: "feature_value" 199 | }; 200 | child_node_r.child = C45(rightCutData, feature_remains, y, major_label, num_tries); 201 | tree.vals.push(child_node_r); 202 | 203 | var leftCutData = data.filter(function(x){return x[best_feature] <= 
best_feature_data.cut}); 204 | var child_node_l = { 205 | name: tree.cut.toString(), 206 | alias: '<=' + tree.cut.toString() + RID(), 207 | type: "feature_value" 208 | }; 209 | child_node_l.child = C45(leftCutData, feature_remains, y, major_label, num_tries); 210 | tree.vals.push(child_node_l); 211 | } else { 212 | var possibilities = possibilities = _.unique(_.pluck(data, best_feature)); 213 | tree = { 214 | name: best_feature, 215 | alias: best_feature + RID(), 216 | type: "feature", 217 | vals: [] 218 | }; 219 | 220 | tree.vals = _.map(possibilities, function(v){ 221 | var data_modified = data.filter(function(x) { return x[best_feature] == v; }); 222 | 223 | var branch = { 224 | name: v, 225 | alias: v + RID(), 226 | type: "feature_value" 227 | }; 228 | 229 | if (feature_remains.length == 0){ 230 | feature_remains = true; 231 | } 232 | branch.child = C45(data_modified, feature_remains, y, major_label, num_tries); 233 | 234 | return branch; 235 | }); 236 | } 237 | 238 | return tree; 239 | }; 240 | 241 | var ID3 = function(data, features, y){ 242 | var y_values = _.unique(_.pluck(data, y)); 243 | 244 | // last leaf 245 | if (y_values.length == 1){ 246 | return { 247 | type: "result", 248 | val: y_values[0], 249 | name: y_values[0], 250 | alias: y_values[0] + RID() 251 | }; 252 | } 253 | 254 | if (features === true || y_values.length == 0){ 255 | // end of branch 256 | // returning the most dominate feature 257 | var dominate_y = GetDominate(_.pluck(data, y)); 258 | return { 259 | type:"result", 260 | val: dominate_y, 261 | name: dominate_y, 262 | alias: dominate_y + RID() 263 | }; 264 | } 265 | 266 | if (!features || features.length == 0){ 267 | // get all the features that are not y 268 | features = _.reject(_.keys(data[0]), function(f){ return f == y; }); 269 | } 270 | 271 | var best_feature = _.max(features, function(f){return Gain(data, f, y).gain; }); 272 | var feature_remains = _.without(features, best_feature); 273 | var possibilities = 
_.unique(_.pluck(data, best_feature)); 274 | var tree = { 275 | name: best_feature, 276 | alias: best_feature + RID(), 277 | type: "feature" 278 | }; 279 | 280 | // create the branch of the tree 281 | tree.vals = _.map(possibilities, function(v){ 282 | var data_modified = data.filter(function(x) { return x[best_feature] == v; }); 283 | 284 | var branch = { 285 | name: v, 286 | alias: v + RID(), 287 | type: "feature_value" 288 | }; 289 | 290 | if (feature_remains.length == 0){ 291 | feature_remains = true; 292 | } 293 | branch.child = ID3(data_modified, feature_remains, y); 294 | 295 | return branch; 296 | }); 297 | 298 | return tree; 299 | }; 300 | 301 | var recursived3ifyModel = function(model){ 302 | var new_model = {}; 303 | if (model && model.children){ 304 | for (var j=0; j < model.children.length; j++){ 305 | var cleanname = ""; 306 | if (model.children[j].alias.indexOf("<=") === 0){ 307 | cleanname += "<= "; 308 | } else if (model.children[j].alias.indexOf(">") === 0){ 309 | cleanname += "> "; 310 | } 311 | cleanname += model.children[j].name; 312 | if (model.children[j].child && model.children[j].child.vals) { 313 | model.children[j].children = model.children[j].child.vals; 314 | model.children[j] = recursived3ifyModel(model.children[j]); 315 | } else if (model.children[j].child && model.children[j].child.type == "result"){ 316 | cleanname += " " +model.children[j].child.val; 317 | } 318 | model.children[j].name = cleanname; 319 | } 320 | } 321 | 322 | return model; 323 | }; 324 | 325 | var d3ifyModel = function(trees){ 326 | var models = []; 327 | for (var i=0; i< trees.length; i++){ 328 | models[i] = { 329 | name: trees[i].model.name, 330 | children: trees[i].model.vals 331 | } 332 | models[i] = recursived3ifyModel(models[i]); 333 | } 334 | return models 335 | }; 336 | 337 | 338 | var RandomFloat = function (a, b) { 339 | return Math.random()*(b-a)+a; 340 | }; 341 | 342 | 343 | var RandomInt = function (a, b) { 344 | return 
Math.floor(Math.random()*(b-a)+a); 345 | }; 346 | 347 | 348 | module.exports.ID3 = ID3; 349 | module.exports.C45 = C45; 350 | module.exports.GetType = GetType; 351 | module.exports.GetDominate = GetDominate; 352 | module.exports.Average = Average; 353 | module.exports.d3ifyModel = d3ifyModel; 354 | module.exports.AllValuesSame = AllValuesSame; 355 | --------------------------------------------------------------------------------