├── HISTORY.md ├── data ├── sample.json └── object-evaluation.json ├── .gitignore ├── tsconfig.json ├── LICENSE ├── THIRDPARTY.md ├── examples ├── javascript-usage.js ├── typescript-usage.ts ├── random-forest-usage.js ├── random-forest-usage.ts ├── xgboost-usage.js └── xgboost-usage.ts ├── package.json ├── tst ├── reported-bugs.ts ├── decision-tree.ts ├── evaluation.ts ├── edge-cases.ts ├── prediction-edge-cases.ts ├── data-validation.ts ├── random-forest-utils.ts ├── model-persistence.ts ├── id3-algorithm.ts ├── xgboost-loss-functions.ts └── continuous-variables.ts ├── src ├── shared │ ├── utils.ts │ ├── id3-algorithm.ts │ ├── types.ts │ ├── loss-functions.ts │ ├── gradient-boosting.ts │ ├── caching-system.ts │ └── memory-optimization.ts └── decision-tree.ts ├── TYPESCRIPT_MIGRATION.md └── CONTRIBUTING.md /HISTORY.md: -------------------------------------------------------------------------------- 1 | # 0.3.2 / 2019-05-15 2 | 3 | * fix(license): fix the license text but keep it MIT as of the original code. 
4 | * fix(lodash): upgrade lodash to the latest version 5 | * fix(code): make the code working with the latest lodash version 6 | * fork(github): project has been forked from [nodejs-decision-tree-id3](https://github.com/serendipious/nodejs-decision-tree-id3) 7 | * pull request(github): pull request has been made to the original repo -------------------------------------------------------------------------------- /data/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "features": ["color", "shape"], 3 | "data": [ 4 | {"color":"blue", "shape":"square", "liked":false}, 5 | {"color":"red", "shape":"square", "liked":false}, 6 | {"color":"blue", "shape":"circle", "liked":true}, 7 | {"color":"red", "shape":"circle", "liked":true}, 8 | {"color":"blue", "shape":"hexagon", "liked":false}, 9 | {"color":"red", "shape":"hexagon", "liked":false}, 10 | {"color":"yellow", "shape":"hexagon", "liked":true}, 11 | {"color":"yellow", "shape":"circle", "liked":true} 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | npm-debug.log* 4 | yarn-debug.log* 5 | yarn-error.log* 6 | 7 | # Build output 8 | lib/ 9 | dist/ 10 | 11 | # TypeScript 12 | *.tsbuildinfo 13 | 14 | # IDE 15 | .vscode/ 16 | .idea/ 17 | *.swp 18 | *.swo 19 | 20 | # OS 21 | .DS_Store 22 | Thumbs.db 23 | 24 | # Logs 25 | logs 26 | *.log 27 | 28 | # Runtime data 29 | pids 30 | *.pid 31 | *.seed 32 | *.pid.lock 33 | 34 | # Coverage directory used by tools like istanbul 35 | coverage/ 36 | 37 | # nyc test coverage 38 | .nyc_output 39 | 40 | # Dependency directories 41 | jspm_packages/ 42 | 43 | # Optional npm cache directory 44 | .npm 45 | 46 | # Optional REPL history 47 | .node_repl_history 48 | 49 | # Output of 'npm pack' 50 | *.tgz 51 | 52 | # Yarn Integrity file 53 | .yarn-integrity 54 | 
-------------------------------------------------------------------------------- /data/object-evaluation.json: -------------------------------------------------------------------------------- 1 | { 2 | "features": ["foo", "bar","flim"], 3 | "data": [ 4 | {"foo":true, "bar":true, "flim":true, "classification":{"description":"foo bar flim"}}, 5 | {"foo":false, "bar":true, "flim":true, "classification":{"description":"bar flim"}}, 6 | {"foo":true, "bar":false, "flim":true, "classification":{"description":"foo flim"}}, 7 | {"foo":false, "bar":false, "flim":true, "classification":{"description":"flim"}}, 8 | {"foo":true, "bar":true, "flim":false, "classification":{"description":"foo bar"}}, 9 | {"foo":false, "bar":true, "flim":false, "classification":{"description":"bar"}}, 10 | {"foo":true, "bar":false, "flim":false, "classification":{"description":"foo"}}, 11 | {"foo":false, "bar":false, "flim":false, "classification":{"description":"none"}} 12 | ] 13 | } -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "module": "ES2022", 5 | "lib": ["ES2022"], 6 | "outDir": "./lib", 7 | "rootDir": "./src", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "declaration": true, 13 | "declarationMap": true, 14 | "sourceMap": true, 15 | "removeComments": false, 16 | "noImplicitAny": false, 17 | "noImplicitReturns": true, 18 | "noFallthroughCasesInSwitch": true, 19 | "moduleResolution": "bundler", 20 | "allowSyntheticDefaultImports": true, 21 | "experimentalDecorators": true, 22 | "emitDecoratorMetadata": true 23 | }, 24 | "include": [ 25 | "src/**/*" 26 | ], 27 | "exclude": [ 28 | "node_modules", 29 | "lib", 30 | "tst" 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | (The MIT License) 2 | 3 | Copyright (c) 2014 Ankit Kuwadekar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | 'Software'), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /THIRDPARTY.md: -------------------------------------------------------------------------------- 1 | # 3rd Party License Attribution 2 | 3 | ------------------------------------------------------- 4 | 5 | ## lodash@4.17.11 (MIT) 6 | 7 | ``` 8 | Copyright JS Foundation and other contributors 9 | 10 | Based on Underscore.js, copyright Jeremy Ashkenas, 11 | DocumentCloud and Investigative Reporters & Editors 12 | 13 | This software consists of voluntary contributions made by many 14 | individuals. 
For exact contribution history, see the revision history 15 | available at https://github.com/lodash/lodash 16 | 17 | The following license applies to all parts of this software except as 18 | documented below: 19 | 20 | ==== 21 | 22 | Permission is hereby granted, free of charge, to any person obtaining 23 | a copy of this software and associated documentation files (the 24 | "Software"), to deal in the Software without restriction, including 25 | without limitation the rights to use, copy, modify, merge, publish, 26 | distribute, sublicense, and/or sell copies of the Software, and to 27 | permit persons to whom the Software is furnished to do so, subject to 28 | the following conditions: 29 | 30 | The above copyright notice and this permission notice shall be 31 | included in all copies or substantial portions of the Software. 32 | 33 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 34 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 35 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 36 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 37 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 38 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 39 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 40 | 41 | ==== 42 | 43 | Copyright and related rights for sample code are waived via CC0. Sample 44 | code is defined as all source code displayed within the prose of the 45 | documentation. 46 | 47 | CC0: http://creativecommons.org/publicdomain/zero/1.0/ 48 | 49 | ==== 50 | 51 | Files located in the node_modules and vendor directories are externally 52 | maintained libraries used by this software which have their own 53 | licenses; we recommend you read them, as their terms may differ from the 54 | terms above. 
55 | ``` 56 | -------------------------------------------------------------------------------- /examples/javascript-usage.js: -------------------------------------------------------------------------------- 1 | import DecisionTree from '../lib/decision-tree.js'; 2 | 3 | // Sample training data 4 | const trainingData = [ 5 | { color: "blue", shape: "square", size: "small", liked: false }, 6 | { color: "red", shape: "square", size: "small", liked: false }, 7 | { color: "blue", shape: "circle", size: "medium", liked: true }, 8 | { color: "red", shape: "circle", size: "medium", liked: true }, 9 | { color: "blue", shape: "hexagon", size: "large", liked: false }, 10 | { color: "red", shape: "hexagon", size: "large", liked: false }, 11 | { color: "yellow", shape: "hexagon", size: "small", liked: true }, 12 | { color: "yellow", shape: "circle", size: "large", liked: true } 13 | ]; 14 | 15 | // Test data 16 | const testData = [ 17 | { color: "blue", shape: "hexagon", size: "medium", liked: false }, 18 | { color: "yellow", shape: "circle", size: "small", liked: true } 19 | ]; 20 | 21 | // Features to use for classification 22 | const features = ["color", "shape", "size"]; 23 | const target = "liked"; 24 | 25 | // Create and train the decision tree 26 | const dt = new DecisionTree(target, features); 27 | dt.train(trainingData); 28 | 29 | // Make predictions 30 | const sample1 = { color: "blue", shape: "hexagon", size: "medium" }; 31 | const sample2 = { color: "yellow", shape: "circle", size: "small" }; 32 | 33 | const prediction1 = dt.predict(sample1); 34 | const prediction2 = dt.predict(sample2); 35 | 36 | console.log(`Prediction for ${JSON.stringify(sample1)}: ${prediction1}`); 37 | console.log(`Prediction for ${JSON.stringify(sample2)}: ${prediction2}`); 38 | 39 | // Evaluate accuracy 40 | const accuracy = dt.evaluate(testData); 41 | console.log(`Accuracy on test data: ${(accuracy * 100).toFixed(1)}%`); 42 | 43 | // Export the model 44 | const modelJson = dt.toJSON(); 45 | 
console.log('Model exported successfully'); 46 | 47 | // Import the model to a new instance 48 | const newDt = new DecisionTree(modelJson); 49 | const prediction3 = newDt.predict(sample1); 50 | console.log(`Prediction from imported model: ${prediction3}`); 51 | 52 | // Access static properties 53 | console.log('Node types:', DecisionTree.NODE_TYPES); 54 | 55 | // Example with ES6 import syntax (if using bundlers) 56 | // import DecisionTree from 'decision-tree'; 57 | // const dt = new DecisionTree(target, features); 58 | 59 | // For CommonJS environments, you can use dynamic imports: 60 | // const DecisionTree = await import('decision-tree'); 61 | // const dt = new DecisionTree.default(target, features); 62 | -------------------------------------------------------------------------------- /examples/typescript-usage.ts: -------------------------------------------------------------------------------- 1 | import DecisionTree from '../lib/decision-tree.js'; 2 | 3 | // Define interfaces for type safety 4 | interface TrainingData { 5 | color: string; 6 | shape: string; 7 | size: string; 8 | liked: boolean; 9 | } 10 | 11 | interface PredictionData { 12 | color: string; 13 | shape: string; 14 | size: string; 15 | } 16 | 17 | // Sample training data 18 | const trainingData: TrainingData[] = [ 19 | { color: "blue", shape: "square", size: "small", liked: false }, 20 | { color: "red", shape: "square", size: "small", liked: false }, 21 | { color: "blue", shape: "circle", size: "medium", liked: true }, 22 | { color: "red", shape: "circle", size: "medium", liked: true }, 23 | { color: "blue", shape: "hexagon", size: "large", liked: false }, 24 | { color: "red", shape: "hexagon", size: "large", liked: false }, 25 | { color: "yellow", shape: "hexagon", size: "small", liked: true }, 26 | { color: "yellow", shape: "circle", size: "large", liked: true } 27 | ]; 28 | 29 | // Test data 30 | const testData: TrainingData[] = [ 31 | { color: "blue", shape: "hexagon", size: "medium", liked: 
false }, 32 | { color: "yellow", shape: "circle", size: "small", liked: true } 33 | ]; 34 | 35 | // Features to use for classification 36 | const features: (keyof TrainingData)[] = ["color", "shape", "size"]; 37 | const target: keyof TrainingData = "liked"; 38 | 39 | // Create and train the decision tree 40 | const dt = new DecisionTree(target, features); 41 | dt.train(trainingData); 42 | 43 | // Make predictions 44 | const sample1: PredictionData = { color: "blue", shape: "hexagon", size: "medium" }; 45 | const sample2: PredictionData = { color: "yellow", shape: "circle", size: "small" }; 46 | 47 | const prediction1 = dt.predict(sample1); 48 | const prediction2 = dt.predict(sample2); 49 | 50 | console.log(`Prediction for ${JSON.stringify(sample1)}: ${prediction1}`); 51 | console.log(`Prediction for ${JSON.stringify(sample2)}: ${prediction2}`); 52 | 53 | // Evaluate accuracy 54 | const accuracy = dt.evaluate(testData); 55 | console.log(`Accuracy on test data: ${(accuracy * 100).toFixed(1)}%`); 56 | 57 | // Export the model 58 | const modelJson = dt.toJSON(); 59 | console.log('Model exported successfully'); 60 | 61 | // Import the model to a new instance 62 | const newDt = new DecisionTree(modelJson); 63 | const prediction3 = newDt.predict(sample1); 64 | console.log(`Prediction from imported model: ${prediction3}`); 65 | 66 | // Access static properties 67 | console.log('Node types:', DecisionTree.NODE_TYPES); 68 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "decision-tree", 3 | "description": "NodeJS implementation of decision tree, random forest, and XGBoost algorithms with comprehensive performance testing (Node.js 20+)", 4 | "version": "1.1.0", 5 | "author": "Ankit Kuwadekar", 6 | "repository": { 7 | "type": "git", 8 | "url": "git://github.com/serendipious/nodejs-decision-tree.git" 9 | }, 10 | "main": 
"./lib/decision-tree.js", 11 | "types": "./lib/decision-tree.d.ts", 12 | "module": "./lib/decision-tree.js", 13 | "type": "module", 14 | "exports": { 15 | ".": { 16 | "import": "./lib/decision-tree.js", 17 | "require": "./lib/decision-tree.js", 18 | "types": "./lib/decision-tree.d.ts" 19 | }, 20 | "./package.json": "./package.json" 21 | }, 22 | "files": [ 23 | "lib", 24 | "src" 25 | ], 26 | "dependencies": { 27 | "lodash": "^4.17.21" 28 | }, 29 | "devDependencies": { 30 | "@types/lodash": "^4.17.20", 31 | "@types/mocha": "^10.0.10", 32 | "@types/node": "^24.3.0", 33 | "cross-env": "^10.0.0", 34 | "esm": "^3.2.25", 35 | "mocha": "^11.7.1", 36 | "ts-node": "^10.9.2", 37 | "typescript": "^5.9.2" 38 | }, 39 | "scripts": { 40 | "build": "tsc", 41 | "build:watch": "tsc --watch", 42 | "test": "npm run build && cross-env NODE_PATH=. mocha --require ts-node/register -R spec tst/*.ts", 43 | "test:bun": "bun run build && NODE_PATH=. npx mocha --require ts-node/register -R spec tst/*.ts", 44 | "test:watch": "npm run build && NODE_PATH=. 
./node_modules/.bin/mocha --require ts-node/register -R spec --watch tst/*.ts", 45 | "clean": "rm -rf lib", 46 | "prepublishOnly": "npm run clean && npm run build", 47 | "example:js": "node examples/javascript-usage.js", 48 | "example:ts": "ts-node examples/typescript-usage.ts", 49 | "example:rf-js": "node examples/random-forest-usage.js", 50 | "example:rf-ts": "ts-node examples/random-forest-usage.ts", 51 | "example:xgb-js": "node examples/xgboost-usage.js", 52 | "example:xgb-ts": "ts-node examples/xgboost-usage.ts" 53 | }, 54 | "engines": { 55 | "node": ">= 20.x" 56 | }, 57 | "keywords": [ 58 | "decision", 59 | "tree", 60 | "classifier", 61 | "classification", 62 | "machine", 63 | "learning", 64 | "decision tree", 65 | "random forest", 66 | "xgboost", 67 | "gradient boosting", 68 | "ensemble learning", 69 | "machine learning", 70 | "ID3", 71 | "performance testing", 72 | "typescript" 73 | ], 74 | "license": "MIT" 75 | } 76 | -------------------------------------------------------------------------------- /tst/reported-bugs.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import { readFileSync } from 'fs'; 3 | import { fileURLToPath } from 'url'; 4 | import { dirname, join } from 'path'; 5 | import DecisionTree from '../lib/decision-tree.js'; 6 | 7 | // Type definitions for test datasets 8 | interface Dataset { 9 | features: string[]; 10 | data: T[]; 11 | } 12 | 13 | interface ObjectEvaluationData { 14 | foo: boolean; 15 | bar: boolean; 16 | flim: boolean; 17 | classification: { description?: string }; 18 | } 19 | 20 | interface TicTacToeData { 21 | [key: string]: any; 22 | classification: string; 23 | } 24 | 25 | interface VotingData { 26 | [key: string]: any; 27 | classification: string; 28 | } 29 | 30 | // Helper function to load JSON files 31 | const __filename = fileURLToPath(import.meta.url); 32 | const __dirname = dirname(__filename); 33 | 34 | function loadJSON(filename: 
string): T { 35 | const filePath = join(__dirname, '..', 'data', filename); 36 | return JSON.parse(readFileSync(filePath, 'utf8')) as T; 37 | } 38 | 39 | const OBJECT_EVALUATION_DATASET = loadJSON>('object-evaluation.json'); 40 | const TIC_TAC_TOE_DATASET = loadJSON>('tic-tac-toe.json'); 41 | const VOTING_DATASET = loadJSON>('voting.json'); 42 | 43 | /** 44 | * Reported bugs from: https://github.com/serendipious/nodejs-decision-tree/issues 45 | */ 46 | describe('Decision Tree Reported Bugs', function() { 47 | /** 48 | * https://github.com/serendipious/nodejs-decision-tree/issues/21 49 | */ 50 | it('should work with multiple decision tree instance declarations', () => { 51 | const dt1 = new DecisionTree(TIC_TAC_TOE_DATASET.data, 'classification', TIC_TAC_TOE_DATASET.features); 52 | const dt2 = new DecisionTree(VOTING_DATASET.data, 'classification', VOTING_DATASET.features); 53 | const dt3 = new DecisionTree(OBJECT_EVALUATION_DATASET.data, 'classification', OBJECT_EVALUATION_DATASET.features); 54 | 55 | assert.strictEqual(dt1.evaluate(TIC_TAC_TOE_DATASET.data), 1); 56 | assert.strictEqual(dt2.evaluate(VOTING_DATASET.data), 1); 57 | assert.strictEqual(dt3.evaluate(OBJECT_EVALUATION_DATASET.data), 1); 58 | }); 59 | 60 | /** 61 | * https://github.com/serendipious/nodejs-decision-tree/issues/22 62 | */ 63 | it('should be able to export and import a trained model', () => { 64 | const dt1 = new DecisionTree(TIC_TAC_TOE_DATASET.data, 'classification', TIC_TAC_TOE_DATASET.features); 65 | const dt1ExportedModelJSON = dt1.toJSON(); 66 | 67 | dt1.import(dt1ExportedModelJSON); 68 | assert.strictEqual(dt1.evaluate(TIC_TAC_TOE_DATASET.data), 1); 69 | }); 70 | }); 71 | -------------------------------------------------------------------------------- /tst/decision-tree.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import { readFileSync } from 'fs'; 3 | import { fileURLToPath } from 'url'; 4 | 
import { dirname, join } from 'path'; 5 | import DecisionTree from '../lib/decision-tree.js'; 6 | 7 | // Type definitions for test data 8 | interface SampleData { 9 | color: string; 10 | shape: string; 11 | liked: boolean; 12 | } 13 | 14 | interface Dataset { 15 | features: string[]; 16 | data: SampleData[]; 17 | } 18 | 19 | // Helper function to load JSON files 20 | const __filename = fileURLToPath(import.meta.url); 21 | const __dirname = dirname(__filename); 22 | 23 | function loadJSON(filename: string): T { 24 | const filePath = join(__dirname, '..', 'data', filename); 25 | return JSON.parse(readFileSync(filePath, 'utf8')) as T; 26 | } 27 | 28 | const SAMPLE_DATASET = loadJSON('sample.json'); 29 | const SAMPLE_DATASET_CLASS_NAME = 'liked'; 30 | 31 | describe('Decision Tree Basics', function() { 32 | const dt = new DecisionTree(SAMPLE_DATASET_CLASS_NAME, SAMPLE_DATASET.features); 33 | 34 | it('should initialize with valid argument constructor', () => { 35 | assert.ok(new DecisionTree(SAMPLE_DATASET_CLASS_NAME, SAMPLE_DATASET.features)); 36 | assert.ok(new DecisionTree(SAMPLE_DATASET.data, SAMPLE_DATASET_CLASS_NAME, SAMPLE_DATASET.features)); 37 | }); 38 | 39 | it('should initialize & train for the three argument constructor', function() { 40 | assert.ok(dt); 41 | }); 42 | 43 | it('should throw initialization error with invalid constructor arguments', function() { 44 | assert.throws(() => new DecisionTree()); 45 | assert.throws(() => new DecisionTree(1 as any, 2 as any, 3 as any, 4 as any)); 46 | assert.throws(() => new DecisionTree(1 as any, 1 as any)); 47 | assert.throws(() => new DecisionTree("abc", 1 as any)); 48 | assert.throws(() => new DecisionTree(1 as any, 1 as any, 1 as any)); 49 | }); 50 | 51 | it('should train on the dataset', function() { 52 | dt.train(SAMPLE_DATASET.data); 53 | assert.ok(dt.toJSON()); 54 | }); 55 | 56 | it('should predict on a sample instance', function() { 57 | const sample = SAMPLE_DATASET.data[0]; 58 | const predicted_class = 
dt.predict(sample); 59 | const actual_class = sample[SAMPLE_DATASET_CLASS_NAME as keyof SampleData]; 60 | assert.strictEqual(predicted_class, actual_class); 61 | }); 62 | 63 | it('should evaluate perfectly on training dataset', function() { 64 | const accuracy = dt.evaluate(SAMPLE_DATASET.data); 65 | assert.strictEqual(accuracy, 1); 66 | }); 67 | 68 | it('should provide access to the underlying model as JSON', function() { 69 | const dtJson = dt.toJSON(); 70 | const treeModel = dtJson.model; 71 | 72 | assert.strictEqual(treeModel.constructor, Object); 73 | assert.ok(Array.isArray(treeModel.vals)); 74 | assert.strictEqual(treeModel.vals.length, 3); 75 | 76 | assert.ok(Array.isArray(dtJson.features)); 77 | assert.strictEqual(typeof dtJson.target, 'string'); 78 | }); 79 | 80 | it('should provide access to insights on each node (e.g. gain, sample size, etc.)', () => { 81 | const dtJson = dt.toJSON(); 82 | const rootNode = dtJson.model; 83 | 84 | assert.strictEqual(rootNode.gain! >= 0 && rootNode.gain! 
<= 1, true); 85 | assert.strictEqual(typeof rootNode.sampleSize, 'number'); 86 | 87 | const childNodes = rootNode.vals!; 88 | for (let childNode of childNodes) { 89 | assert.strictEqual(typeof childNode.prob, 'number'); 90 | assert.strictEqual(typeof childNode.sampleSize, 'number'); 91 | } 92 | }); 93 | 94 | it('should initialize from existing or previously exported model', function() { 95 | const pretrainedDecTree = new DecisionTree(dt.toJSON()); 96 | const pretrainedDecTreeAccuracy = pretrainedDecTree.evaluate(SAMPLE_DATASET.data); 97 | assert.strictEqual(pretrainedDecTreeAccuracy, 1); 98 | }); 99 | }); 100 | -------------------------------------------------------------------------------- /src/shared/utils.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Shared utility functions for Decision Tree and Random Forest 3 | */ 4 | 5 | import _ from 'lodash'; 6 | 7 | /** 8 | * Generates random UUID 9 | * @private 10 | */ 11 | export function randomUUID(): string { 12 | return "_r" + Math.random().toString(32).slice(2); 13 | } 14 | 15 | /** 16 | * Computes probability of a given value existing in a given list 17 | * @private 18 | */ 19 | export function prob(value: any, list: any[]): number { 20 | let occurrences = _.filter(list, function (element) { 21 | return element === value; 22 | }); 23 | 24 | let numOccurrences = occurrences.length; 25 | let numElements = list.length; 26 | return numOccurrences / numElements; 27 | } 28 | 29 | /** 30 | * Computes Log with base-2 31 | * @private 32 | */ 33 | export function log2(n: number): number { 34 | return Math.log(n) / Math.log(2); 35 | } 36 | 37 | /** 38 | * Finds element with highest occurrence in a list 39 | * @private 40 | */ 41 | export function mostCommon(list: any[]): any { 42 | let elementFrequencyMap: { [key: string]: number } = {}; 43 | let largestFrequency = -1; 44 | let mostCommonElement: any = null; 45 | 46 | list.forEach(function (element) { 47 | let 
elementFrequency = (elementFrequencyMap[element] || 0) + 1; 48 | elementFrequencyMap[element] = elementFrequency; 49 | 50 | if (largestFrequency < elementFrequency) { 51 | mostCommonElement = element; 52 | largestFrequency = elementFrequency; 53 | } 54 | }); 55 | 56 | return mostCommonElement; 57 | } 58 | 59 | /** 60 | * Simple seeded random number generator for reproducible results 61 | * @private 62 | */ 63 | export class SeededRandom { 64 | private seed: number; 65 | 66 | constructor(seed: number) { 67 | this.seed = seed; 68 | } 69 | 70 | next(): number { 71 | this.seed = (this.seed * 9301 + 49297) % 233280; 72 | return this.seed / 233280; 73 | } 74 | 75 | nextInt(max: number): number { 76 | return Math.floor(this.next() * max); 77 | } 78 | } 79 | 80 | /** 81 | * Bootstrap sampling with replacement 82 | * @private 83 | */ 84 | export function bootstrapSample(data: any[], sampleSize: number, random: SeededRandom): any[] { 85 | if (data.length === 0) { 86 | throw new Error('Cannot create bootstrap sample from empty data'); 87 | } 88 | 89 | const sample: any[] = []; 90 | for (let i = 0; i < sampleSize; i++) { 91 | const randomIndex = random.nextInt(data.length); 92 | sample.push(data[randomIndex]); 93 | } 94 | return sample; 95 | } 96 | 97 | /** 98 | * Random feature selection for Random Forest 99 | * @private 100 | */ 101 | export function selectRandomFeatures( 102 | features: string[], 103 | maxFeatures: number | 'sqrt' | 'log2' | 'auto', 104 | random: SeededRandom 105 | ): string[] { 106 | const totalFeatures = features.length; 107 | let numFeatures: number; 108 | 109 | switch (maxFeatures) { 110 | case 'sqrt': 111 | numFeatures = Math.floor(Math.sqrt(totalFeatures)); 112 | break; 113 | case 'log2': 114 | numFeatures = Math.floor(Math.log2(totalFeatures)); 115 | break; 116 | case 'auto': 117 | numFeatures = Math.floor(Math.sqrt(totalFeatures)); 118 | break; 119 | default: 120 | numFeatures = Math.min(maxFeatures, totalFeatures); 121 | } 122 | 123 | // Ensure we 
have at least 1 feature 124 | numFeatures = Math.max(1, numFeatures); 125 | 126 | // Shuffle and select features 127 | const shuffledFeatures = [...features]; 128 | for (let i = shuffledFeatures.length - 1; i > 0; i--) { 129 | const j = random.nextInt(i + 1); 130 | [shuffledFeatures[i], shuffledFeatures[j]] = [shuffledFeatures[j], shuffledFeatures[i]]; 131 | } 132 | 133 | return shuffledFeatures.slice(0, numFeatures); 134 | } 135 | 136 | /** 137 | * Majority voting for ensemble predictions 138 | * @private 139 | */ 140 | export function majorityVote(predictions: any[]): any { 141 | const frequencyMap: { [key: string]: number } = {}; 142 | 143 | predictions.forEach(prediction => { 144 | const key = String(prediction); 145 | frequencyMap[key] = (frequencyMap[key] || 0) + 1; 146 | }); 147 | 148 | let maxCount = 0; 149 | let result: any = null; 150 | 151 | Object.entries(frequencyMap).forEach(([key, count]) => { 152 | if (count > maxCount) { 153 | maxCount = count; 154 | result = key; 155 | } 156 | }); 157 | 158 | // Convert back to original type if possible 159 | const firstPrediction = predictions[0]; 160 | if (typeof firstPrediction === 'boolean') { 161 | return result === 'true'; 162 | } else if (typeof firstPrediction === 'number') { 163 | return Number(result); 164 | } 165 | 166 | return result; 167 | } 168 | -------------------------------------------------------------------------------- /tst/evaluation.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import { readFileSync } from 'fs'; 3 | import { fileURLToPath } from 'url'; 4 | import { dirname, join } from 'path'; 5 | import DecisionTree from '../lib/decision-tree.js'; 6 | 7 | // Type definitions for test datasets 8 | interface Dataset { 9 | features: string[]; 10 | data: T[]; 11 | } 12 | 13 | interface ObjectEvaluationData { 14 | foo: boolean; 15 | bar: boolean; 16 | flim: boolean; 17 | classification: { description?: 
string }; 18 | } 19 | 20 | interface TicTacToeData { 21 | [key: string]: any; 22 | classification: string; 23 | } 24 | 25 | interface VotingData { 26 | [key: string]: any; 27 | classification: string; 28 | } 29 | 30 | // Helper function to load JSON files 31 | const __filename = fileURLToPath(import.meta.url); 32 | const __dirname = dirname(__filename); 33 | 34 | function loadJSON(filename: string): T { 35 | const filePath = join(__dirname, '..', 'data', filename); 36 | return JSON.parse(readFileSync(filePath, 'utf8')) as T; 37 | } 38 | 39 | const OBJECT_EVALUATION_DATASET = loadJSON>('object-evaluation.json'); 40 | const TIC_TAC_TOE_DATASET = loadJSON>('tic-tac-toe.json'); 41 | const VOTING_DATASET = loadJSON>('voting.json'); 42 | 43 | describe('DecisionTree Decision Tree on Sample Datasets', function() { 44 | describe('Tic Tac Toe Dataset', function() { 45 | const dt = new DecisionTree(TIC_TAC_TOE_DATASET.data, 'classification', TIC_TAC_TOE_DATASET.features); 46 | 47 | it('should initialize on training dataset', function() { 48 | assert.ok(dt); 49 | assert.ok(dt.toJSON()); 50 | }); 51 | 52 | it('should evaluate perfectly on training dataset', function() { 53 | const accuracy = dt.evaluate(TIC_TAC_TOE_DATASET.data); 54 | assert.strictEqual(accuracy, 1); 55 | }); 56 | }); 57 | 58 | describe('Voting Dataset', function() { 59 | const dt = new DecisionTree(VOTING_DATASET.data, 'classification', VOTING_DATASET.features); 60 | 61 | it('should initialize on training dataset', function() { 62 | assert.ok(dt); 63 | assert.ok(dt.toJSON()); 64 | }); 65 | 66 | it('should evaluate perfectly on training dataset', function() { 67 | const accuracy = dt.evaluate(VOTING_DATASET.data); 68 | assert.strictEqual(accuracy, 1); 69 | }); 70 | }); 71 | 72 | describe('Object Evaluation Dataset', function() { 73 | const dt = new DecisionTree(OBJECT_EVALUATION_DATASET.data, 'classification', OBJECT_EVALUATION_DATASET.features); 74 | 75 | it('should initialize on training dataset', function() 
{ 76 | assert.ok(dt); 77 | assert.ok(dt.toJSON()); 78 | }); 79 | 80 | it('should evaluate perfectly on training dataset', function() { 81 | const data: ObjectEvaluationData[] = [ 82 | {"foo":true, "bar":true, "flim":true, "classification":{"description":"foo bar flim"}}, 83 | {"foo":false, "bar":true, "flim":true, "classification":{"description":"bar flim"}}, 84 | {"foo":true, "bar":false, "flim":true, "classification":{"description":"foo flim"}}, 85 | {"foo":false, "bar":false, "flim":true, "classification":{"description":"flim"}}, 86 | {"foo":true, "bar":true, "flim":false, "classification":{"description":"foo bar"}}, 87 | {"foo":false, "bar":true, "flim":false, "classification":{"description":"bar"}}, 88 | {"foo":true, "bar":false, "flim":false, "classification":{"description":"foo"}}, 89 | {"foo":false, "bar":false, "flim":false, "classification":{"description":"none"}} 90 | ]; 91 | const accuracy = dt.evaluate(data); 92 | assert.strictEqual(accuracy, 1); 93 | }); 94 | 95 | it('should evaluate 87.5% on training dataset', function() { 96 | const data: ObjectEvaluationData[] = [ 97 | {"foo":true, "bar":true, "flim":true, "classification":{"description":"foo bar flim"}}, 98 | {"foo":false, "bar":true, "flim":true, "classification":{"description":"bar flim"}}, 99 | {"foo":true, "bar":false, "flim":true, "classification":{"description":"foo flim"}}, 100 | {"foo":false, "bar":false, "flim":true, "classification":{"description":"flim"}}, 101 | {"foo":true, "bar":true, "flim":false, "classification":{"description":"foo bar"}}, 102 | {"foo":false, "bar":true, "flim":false, "classification":{"description":"bar"}}, 103 | {"foo":true, "bar":false, "flim":false, "classification":{"description":"foo"}}, 104 | {"foo":false, "bar":false, "flim":false, "classification":{}} 105 | ]; 106 | const accuracy = dt.evaluate(data); 107 | assert.strictEqual(accuracy, 0.875); 108 | }); 109 | }); 110 | }); 111 | 
-------------------------------------------------------------------------------- /examples/random-forest-usage.js: -------------------------------------------------------------------------------- 1 | import RandomForest from '../lib/random-forest.js'; 2 | 3 | // Sample training data 4 | const trainingData = [ 5 | { color: "blue", shape: "square", size: "small", liked: false }, 6 | { color: "red", shape: "square", size: "small", liked: false }, 7 | { color: "blue", shape: "circle", size: "medium", liked: true }, 8 | { color: "red", shape: "circle", size: "medium", liked: true }, 9 | { color: "blue", shape: "hexagon", size: "large", liked: false }, 10 | { color: "red", shape: "hexagon", size: "large", liked: false }, 11 | { color: "yellow", shape: "hexagon", size: "small", liked: true }, 12 | { color: "yellow", shape: "circle", size: "large", liked: true } 13 | ]; 14 | 15 | // Test data 16 | const testData = [ 17 | { color: "blue", shape: "hexagon", size: "medium", liked: false }, 18 | { color: "yellow", shape: "circle", size: "small", liked: true } 19 | ]; 20 | 21 | // Features to use for classification 22 | const features = ["color", "shape", "size"]; 23 | const target = "liked"; 24 | 25 | console.log('=== Random Forest Example ===\n'); 26 | 27 | // Example 1: Basic Random Forest usage 28 | console.log('1. Basic Random Forest:'); 29 | const rf1 = new RandomForest(target, features); 30 | rf1.train(trainingData); 31 | 32 | const sample1 = { color: "blue", shape: "hexagon", size: "medium" }; 33 | const prediction1 = rf1.predict(sample1); 34 | console.log(`Prediction for ${JSON.stringify(sample1)}: ${prediction1}`); 35 | 36 | const accuracy1 = rf1.evaluate(testData); 37 | console.log(`Accuracy on test data: ${(accuracy1 * 100).toFixed(1)}%`); 38 | console.log(`Number of trees: ${rf1.getTreeCount()}\n`); 39 | 40 | // Example 2: Random Forest with custom configuration 41 | console.log('2. 
Random Forest with custom configuration:'); 42 | const config = { 43 | nEstimators: 50, 44 | maxFeatures: 'sqrt', 45 | randomState: 42, 46 | bootstrap: true 47 | }; 48 | 49 | const rf2 = new RandomForest(target, features, config); 50 | rf2.train(trainingData); 51 | 52 | const sample2 = { color: "yellow", shape: "circle", size: "small" }; 53 | const prediction2 = rf2.predict(sample2); 54 | console.log(`Prediction for ${JSON.stringify(sample2)}: ${prediction2}`); 55 | 56 | const accuracy2 = rf2.evaluate(testData); 57 | console.log(`Accuracy on test data: ${(accuracy2 * 100).toFixed(1)}%`); 58 | console.log(`Number of trees: ${rf2.getTreeCount()}`); 59 | console.log(`Configuration:`, rf2.getConfig()); 60 | 61 | // Example 3: Feature importance 62 | console.log('\n3. Feature Importance:'); 63 | const importance = rf2.getFeatureImportance(); 64 | console.log('Feature importance scores:'); 65 | Object.entries(importance).forEach(([feature, score]) => { 66 | console.log(` ${feature}: ${score.toFixed(4)}`); 67 | }); 68 | 69 | // Example 4: Model persistence 70 | console.log('\n4. Model Persistence:'); 71 | const modelJson = rf2.toJSON(); 72 | console.log('Model exported successfully'); 73 | 74 | // Import the model to a new instance 75 | const rf3 = new RandomForest(modelJson); 76 | const prediction3 = rf3.predict(sample1); 77 | console.log(`Prediction from imported model: ${prediction3}`); 78 | 79 | // Example 5: Different feature selection strategies 80 | console.log('\n5. 
Different feature selection strategies:'); 81 | 82 | const strategies = [ 83 | { name: 'sqrt', maxFeatures: 'sqrt' }, 84 | { name: 'log2', maxFeatures: 'log2' }, 85 | { name: 'auto', maxFeatures: 'auto' }, 86 | { name: '2 features', maxFeatures: 2 } 87 | ]; 88 | 89 | strategies.forEach(strategy => { 90 | const rf = new RandomForest(target, features, { 91 | nEstimators: 10, 92 | maxFeatures: strategy.maxFeatures, 93 | randomState: 42 94 | }); 95 | rf.train(trainingData); 96 | const accuracy = rf.evaluate(testData); 97 | console.log(`${strategy.name}: ${(accuracy * 100).toFixed(1)}% accuracy`); 98 | }); 99 | 100 | // Example 6: Bootstrap sampling comparison 101 | console.log('\n6. Bootstrap vs No Bootstrap:'); 102 | 103 | const withBootstrap = new RandomForest(target, features, { 104 | nEstimators: 10, 105 | bootstrap: true, 106 | randomState: 42 107 | }); 108 | withBootstrap.train(trainingData); 109 | const accuracyWithBootstrap = withBootstrap.evaluate(testData); 110 | 111 | const withoutBootstrap = new RandomForest(target, features, { 112 | nEstimators: 10, 113 | bootstrap: false, 114 | randomState: 42 115 | }); 116 | withoutBootstrap.train(trainingData); 117 | const accuracyWithoutBootstrap = withoutBootstrap.evaluate(testData); 118 | 119 | console.log(`With bootstrap: ${(accuracyWithBootstrap * 100).toFixed(1)}% accuracy`); 120 | console.log(`Without bootstrap: ${(accuracyWithoutBootstrap * 100).toFixed(1)}% accuracy`); 121 | 122 | console.log('\n=== Random Forest Example Complete ==='); 123 | -------------------------------------------------------------------------------- /src/shared/id3-algorithm.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * ID3 Algorithm implementation shared between Decision Tree and Random Forest 3 | */ 4 | 5 | import _ from 'lodash'; 6 | import { TreeNode, TrainingData, FeatureGain, NODE_TYPES } from './types.js'; 7 | import { randomUUID, prob, log2, mostCommon } from './utils.js'; 8 | 9 
| /** 10 | * Creates a new tree using ID3 algorithm 11 | * @private 12 | */ 13 | export function createTree( 14 | data: TrainingData[], 15 | target: string, 16 | features: string[], 17 | maxDepth?: number, 18 | minSamplesSplit?: number, 19 | currentDepth: number = 0 20 | ): TreeNode { 21 | let targets = _.uniq(_.map(data, target)); 22 | 23 | // Base case: all samples have same target 24 | if (targets.length == 1) { 25 | return { 26 | type: NODE_TYPES.RESULT, 27 | val: targets[0], 28 | name: targets[0], 29 | alias: targets[0] + randomUUID() 30 | }; 31 | } 32 | 33 | // Base case: no features left 34 | if (features.length == 0) { 35 | let topTarget = mostCommon(targets); 36 | return { 37 | type: NODE_TYPES.RESULT, 38 | val: topTarget, 39 | name: topTarget, 40 | alias: topTarget + randomUUID() 41 | }; 42 | } 43 | 44 | // Base case: max depth reached 45 | if (maxDepth && currentDepth >= maxDepth) { 46 | let topTarget = mostCommon(targets); 47 | return { 48 | type: NODE_TYPES.RESULT, 49 | val: topTarget, 50 | name: topTarget, 51 | alias: topTarget + randomUUID() 52 | }; 53 | } 54 | 55 | // Base case: not enough samples to split 56 | if (minSamplesSplit && data.length < minSamplesSplit) { 57 | let topTarget = mostCommon(targets); 58 | return { 59 | type: NODE_TYPES.RESULT, 60 | val: topTarget, 61 | name: topTarget, 62 | alias: topTarget + randomUUID() 63 | }; 64 | } 65 | 66 | let bestFeature = maxGain(data, target, features); 67 | let bestFeatureName = bestFeature.name; 68 | let bestFeatureGain = bestFeature.gain; 69 | let remainingFeatures = _.without(features, bestFeatureName); 70 | let possibleValues = _.uniq(_.map(data, bestFeatureName)); 71 | 72 | let node: TreeNode = { 73 | name: bestFeatureName, 74 | alias: bestFeatureName + randomUUID(), 75 | gain: bestFeatureGain, 76 | sampleSize: data.length, 77 | type: NODE_TYPES.FEATURE, 78 | vals: _.map(possibleValues, function (featureVal) { 79 | const featureValDataSample = data.filter((dataRow) => dataRow[bestFeatureName] 
== featureVal); 80 | const featureValDataSampleSize = featureValDataSample.length; 81 | 82 | const child_node: TreeNode = { 83 | name: featureVal, 84 | alias: featureVal + randomUUID(), 85 | type: NODE_TYPES.FEATURE_VALUE, 86 | prob: featureValDataSampleSize / data.length, 87 | sampleSize: featureValDataSampleSize 88 | }; 89 | 90 | child_node.child = createTree( 91 | featureValDataSample, 92 | target, 93 | remainingFeatures, 94 | maxDepth, 95 | minSamplesSplit, 96 | currentDepth + 1 97 | ); 98 | return child_node; 99 | }) 100 | }; 101 | 102 | return node; 103 | } 104 | 105 | /** 106 | * Computes entropy of a list 107 | * @private 108 | */ 109 | export function entropy(vals: any[]): number { 110 | let uniqueVals = _.uniq(vals); 111 | let probs = uniqueVals.map(function (x) { 112 | return prob(x, vals); 113 | }); 114 | 115 | let logVals = probs.map(function (p) { 116 | return -p * log2(p); 117 | }); 118 | 119 | return logVals.reduce(function (a, b) { 120 | return a + b; 121 | }, 0); 122 | } 123 | 124 | /** 125 | * Computes information gain 126 | * @private 127 | */ 128 | export function gain(data: TrainingData[], target: string, feature: string): number { 129 | let attrVals = _.uniq(_.map(data, feature)); 130 | let setEntropy = entropy(_.map(data, target)); 131 | let setSize = _.size(data); 132 | 133 | let entropies = attrVals.map(function (n) { 134 | let subset = data.filter(function (x) { 135 | return x[feature] === n; 136 | }); 137 | 138 | return (subset.length / setSize) * entropy(_.map(subset, target)); 139 | }); 140 | 141 | let sumOfEntropies = entropies.reduce(function (a, b) { 142 | return a + b; 143 | }, 0); 144 | 145 | return setEntropy - sumOfEntropies; 146 | } 147 | 148 | /** 149 | * Computes Max gain across features to determine best split 150 | * @private 151 | */ 152 | export function maxGain(data: TrainingData[], target: string, features: string[]): FeatureGain { 153 | let maxGain: number | undefined; 154 | let maxGainFeature: string | undefined; 155 
| 156 | for (let feature of features) { 157 | const featureGain = gain(data, target, feature); 158 | if (!maxGain || maxGain < featureGain) { 159 | maxGain = featureGain; 160 | maxGainFeature = feature; 161 | } 162 | } 163 | 164 | return {gain: maxGain!, name: maxGainFeature!}; 165 | } 166 | -------------------------------------------------------------------------------- /tst/edge-cases.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import DecisionTree from '../lib/decision-tree.js'; 3 | 4 | describe('Edge Cases & Error Handling', () => { 5 | describe('Empty and Invalid Datasets', () => { 6 | it('should handle empty training dataset', () => { 7 | const dt = new DecisionTree('target', ['feature1', 'feature2']); 8 | // Note: Current implementation only validates that data is an array, not that it has elements 9 | // Empty arrays are allowed and will create a tree with no features 10 | dt.train([]); 11 | assert.ok(dt.toJSON()); 12 | }); 13 | 14 | it('should handle single sample training dataset', () => { 15 | const dt = new DecisionTree('target', ['feature1', 'feature2']); 16 | const singleSample = [{ feature1: 'value1', feature2: 'value2', target: 'class1' }]; 17 | 18 | dt.train(singleSample); 19 | const prediction = dt.predict({ feature1: 'value1', feature2: 'value2' }); 20 | assert.strictEqual(prediction, 'class1'); 21 | }); 22 | 23 | it('should handle null/undefined values in training data', () => { 24 | const dt = new DecisionTree('target', ['feature1', 'feature2']); 25 | const dataWithNulls = [ 26 | { feature1: 'value1', feature2: null, target: 'class1' }, 27 | { feature1: 'value1', feature2: 'value2', target: 'class2' }, 28 | { feature1: undefined, feature2: 'value2', target: 'class1' } 29 | ]; 30 | 31 | dt.train(dataWithNulls); 32 | // Should not crash and should make predictions 33 | const prediction = dt.predict({ feature1: 'value1', feature2: 'value2' }); 34 | 
      assert.ok(typeof prediction === 'string');
    });

    it('should handle duplicate feature values', () => {
      // All rows identical: the tree should collapse to a single-class leaf.
      const dt = new DecisionTree('target', ['feature1', 'feature2']);
      const duplicateData = [
        { feature1: 'value1', feature2: 'value2', target: 'class1' },
        { feature1: 'value1', feature2: 'value2', target: 'class1' },
        { feature1: 'value1', feature2: 'value2', target: 'class1' }
      ];

      dt.train(duplicateData);
      const prediction = dt.predict({ feature1: 'value1', feature2: 'value2' });
      assert.strictEqual(prediction, 'class1');
    });
  });

  describe('Missing Features and Data Inconsistencies', () => {
    it('should handle missing features in training data', () => {
      // feature3 is absent from two of three rows.
      const dt = new DecisionTree('target', ['feature1', 'feature2', 'feature3']);
      const incompleteData = [
        { feature1: 'value1', feature2: 'value2', target: 'class1' },
        { feature1: 'value1', feature2: 'value2', feature3: 'value3', target: 'class2' },
        { feature1: 'value1', target: 'class1' }
      ];

      dt.train(incompleteData);
      // Should handle missing features gracefully
      const prediction = dt.predict({ feature1: 'value1', feature2: 'value2' });
      assert.ok(typeof prediction === 'string');
    });

    it('should handle data consistency across samples', () => {
      // One row carries an extra column not listed in the feature set.
      const dt = new DecisionTree('target', ['feature1', 'feature2']);
      const inconsistentData = [
        { feature1: 'value1', feature2: 'value2', target: 'class1' },
        { feature1: 'value1', feature2: 'value2', extraFeature: 'extra', target: 'class2' },
        { feature1: 'value1', feature2: 'value2', target: 'class1' }
      ];

      dt.train(inconsistentData);
      const prediction = dt.predict({ feature1: 'value1', feature2: 'value2' });
      assert.ok(typeof prediction === 'string');
    });
  });

  describe('Constructor Edge Cases', () => {
    it('should handle empty features array', () => {
      const dt = new DecisionTree('target', []);
      const data = [{ target: 'class1' }];

      dt.train(data);
      const prediction = dt.predict({});
      assert.ok(typeof prediction === 'string');
    });

    it('should handle single feature', () => {
      const dt = new DecisionTree('target', ['feature1']);
      const data = [
        { feature1: 'value1', target: 'class1' },
        { feature1: 'value2', target: 'class2' }
      ];

      dt.train(data);
      const prediction = dt.predict({ feature1: 'value1' });
      assert.strictEqual(prediction, 'class1');
    });

    it('should handle very long feature names', () => {
      // 1000-character feature name exercises key handling, not tree logic.
      const longFeatureName = 'a'.repeat(1000);
      const dt = new DecisionTree('target', [longFeatureName]);
      const data = [{ [longFeatureName]: 'value1', target: 'class1' }];

      dt.train(data);
      const prediction = dt.predict({ [longFeatureName]: 'value1' });
      assert.strictEqual(prediction, 'class1');
    });
  });
});
--------------------------------------------------------------------------------
/src/shared/types.ts:
/**
 * Shared type definitions for Decision Tree and Random Forest
 */

// A single node of a trained tree. A node is either a leaf (`val` set), a
// feature split, or a feature-value branch — see NODE_TYPES below in this file.
export interface TreeNode {
  type: string;
  name: string;
  alias: string;
  val?: any;
  gain?: number;
  sampleSize?: number;
  vals?: TreeNode[];
  child?: TreeNode;
  prob?: number;
  // Continuous variable support
  splitThreshold?: number;
  splitOperator?: 'lte' | 'gt' | 'eq';
  statistics?: {
    mean?: number;
    variance?: number;
    sampleCount?: number;
  };
}

// Serialized form of a trained decision tree (model + the inputs that built it).
export interface DecisionTreeData {
  model: TreeNode;
  data: any[];
  target: string;
  features: string[];
  // Continuous variable support
  featureTypes?: { [feature: string]: 'discrete' | 'continuous' };
  algorithm?: 'id3' | 'cart' | 'auto';
  config?:
DecisionTreeConfig;
}

// Tuning options for a single decision tree. All fields are optional;
// defaults are applied by the implementation (not visible here — see
// src/decision-tree.ts).
export interface DecisionTreeConfig {
  algorithm?: 'auto' | 'id3' | 'cart';
  minSamplesSplit?: number;
  minSamplesLeaf?: number;
  maxDepth?: number;
  criterion?: 'gini' | 'entropy' | 'mse' | 'mae';
  continuousSplitting?: 'binary' | 'multiway';
  autoDetectTypes?: boolean;
  discreteThreshold?: number;
  continuousThreshold?: number;
  confidenceThreshold?: number;
  statisticalTests?: boolean;
  handleMissingValues?: boolean;
  numericOnlyContinuous?: boolean;
  cachingEnabled?: boolean;
  memoryOptimization?: boolean;
}

// Result of evaluating one candidate split feature (see id3-algorithm maxGain).
export interface FeatureGain {
  gain: number;
  name: string;
}

// A single training row: arbitrary feature/target columns keyed by name.
export interface TrainingData {
  [key: string]: any;
}

// Tuning options for a Random Forest ensemble; extends the per-tree options
// with ensemble-level knobs (tree count, bagging, feature subsampling).
export interface RandomForestConfig {
  nEstimators?: number; // Number of trees (default: 100)
  maxFeatures?: number | 'sqrt' | 'log2' | 'auto'; // Features per split
  bootstrap?: boolean; // Use bootstrap sampling (default: true)
  randomState?: number; // Random seed for reproducibility
  maxDepth?: number; // Maximum tree depth
  minSamplesSplit?: number; // Minimum samples to split
  // Continuous variable support
  algorithm?: 'auto' | 'id3' | 'cart' | 'hybrid';
  autoDetectTypes?: boolean;
  discreteThreshold?: number;
  continuousThreshold?: number;
  confidenceThreshold?: number;
  statisticalTests?: boolean;
  handleMissingValues?: boolean;
  numericOnlyContinuous?: boolean;
  cachingEnabled?: boolean;
  memoryOptimization?: boolean;
  criterion?: 'gini' | 'entropy' | 'mse' | 'mae';
  continuousSplitting?: 'binary' | 'multiway';
}

// Serialized form of a trained Random Forest.
export interface RandomForestData {
  trees: DecisionTreeData[];
  target: string;
  features: string[];
  config: RandomForestConfig;
  data: any[];
}

// Tuning options for XGBoost-style gradient boosting.
export interface XGBoostConfig {
  nEstimators?: number; // Number of boosting rounds
  learningRate?: number; // Step size shrinkage (eta)
  maxDepth?: number; // Maximum tree depth
  minChildWeight?: number; // Minimum sum of instance weight in leaf
  subsample?: number; // Fraction of samples for each tree
  colsampleByTree?: number; // Fraction of features for each tree
  regAlpha?: number; // L1 regularization (alpha)
  regLambda?: number; // L2 regularization (lambda)
  objective?: 'regression' | 'binary' | 'multiclass'; // Loss function
  earlyStoppingRounds?: number; // Early stopping patience
  randomState?: number; // Random seed
  validationFraction?: number; // Fraction for validation set
  // Continuous variable support
  algorithm?: 'auto' | 'id3' | 'cart' | 'hybrid';
  autoDetectTypes?: boolean;
  discreteThreshold?: number;
  continuousThreshold?: number;
  confidenceThreshold?: number;
  statisticalTests?: boolean;
  handleMissingValues?: boolean;
  numericOnlyContinuous?: boolean;
  cachingEnabled?: boolean;
  memoryOptimization?: boolean;
  criterion?: 'gini' | 'entropy' | 'mse' | 'mae';
  continuousSplitting?: 'binary' | 'multiway';
}

// Serialized form of a trained boosted ensemble.
export interface XGBoostData {
  trees: DecisionTreeData[];
  target: string;
  features: string[];
  config: XGBoostConfig;
  data: any[];
  baseScore: number;
  bestIteration: number;
  boostingHistory: BoostingHistory;
}

// Per-iteration loss curves recorded during boosting.
export interface BoostingHistory {
  trainLoss: number[];
  validationLoss: number[];
  iterations: number[];
}

// Per-sample first and second derivatives of the loss (see loss-functions.ts).
export interface GradientHessian {
  gradient: number[];
  hessian: number[];
}

// Training rows paired with per-row boosting weights and derivatives.
export interface WeightedSample {
  data: any[];
  weights: number[];
  gradients: number[];
  hessians: number[];
}

// Node types constant
export const NODE_TYPES = {
  RESULT: 'result',
  FEATURE: 'feature',
  FEATURE_VALUE:
'feature_value' 155 | } as const; 156 | 157 | export type NodeType = typeof NODE_TYPES[keyof typeof NODE_TYPES]; 158 | -------------------------------------------------------------------------------- /examples/random-forest-usage.ts: -------------------------------------------------------------------------------- 1 | import RandomForest from '../lib/random-forest.js'; 2 | 3 | // Define interfaces for type safety 4 | interface TrainingData { 5 | color: string; 6 | shape: string; 7 | size: string; 8 | liked: boolean; 9 | } 10 | 11 | interface PredictionData { 12 | color: string; 13 | shape: string; 14 | size: string; 15 | } 16 | 17 | // Sample training data 18 | const trainingData: TrainingData[] = [ 19 | { color: "blue", shape: "square", size: "small", liked: false }, 20 | { color: "red", shape: "square", size: "small", liked: false }, 21 | { color: "blue", shape: "circle", size: "medium", liked: true }, 22 | { color: "red", shape: "circle", size: "medium", liked: true }, 23 | { color: "blue", shape: "hexagon", size: "large", liked: false }, 24 | { color: "red", shape: "hexagon", size: "large", liked: false }, 25 | { color: "yellow", shape: "hexagon", size: "small", liked: true }, 26 | { color: "yellow", shape: "circle", size: "large", liked: true } 27 | ]; 28 | 29 | // Test data 30 | const testData: TrainingData[] = [ 31 | { color: "blue", shape: "hexagon", size: "medium", liked: false }, 32 | { color: "yellow", shape: "circle", size: "small", liked: true } 33 | ]; 34 | 35 | // Features to use for classification 36 | const features: (keyof TrainingData)[] = ["color", "shape", "size"]; 37 | const target: keyof TrainingData = "liked"; 38 | 39 | console.log('=== Random Forest TypeScript Example ===\n'); 40 | 41 | // Example 1: Basic Random Forest usage 42 | console.log('1. 
Basic Random Forest:'); 43 | const rf1 = new RandomForest(target, features); 44 | rf1.train(trainingData); 45 | 46 | const sample1: PredictionData = { color: "blue", shape: "hexagon", size: "medium" }; 47 | const prediction1 = rf1.predict(sample1); 48 | console.log(`Prediction for ${JSON.stringify(sample1)}: ${prediction1}`); 49 | 50 | const accuracy1 = rf1.evaluate(testData); 51 | console.log(`Accuracy on test data: ${(accuracy1 * 100).toFixed(1)}%`); 52 | console.log(`Number of trees: ${rf1.getTreeCount()}\n`); 53 | 54 | // Example 2: Random Forest with custom configuration 55 | console.log('2. Random Forest with custom configuration:'); 56 | const config = { 57 | nEstimators: 50, 58 | maxFeatures: 'sqrt' as const, 59 | randomState: 42, 60 | bootstrap: true 61 | }; 62 | 63 | const rf2 = new RandomForest(target, features, config); 64 | rf2.train(trainingData); 65 | 66 | const sample2: PredictionData = { color: "yellow", shape: "circle", size: "small" }; 67 | const prediction2 = rf2.predict(sample2); 68 | console.log(`Prediction for ${JSON.stringify(sample2)}: ${prediction2}`); 69 | 70 | const accuracy2 = rf2.evaluate(testData); 71 | console.log(`Accuracy on test data: ${(accuracy2 * 100).toFixed(1)}%`); 72 | console.log(`Number of trees: ${rf2.getTreeCount()}`); 73 | console.log(`Configuration:`, rf2.getConfig()); 74 | 75 | // Example 3: Feature importance 76 | console.log('\n3. Feature Importance:'); 77 | const importance = rf2.getFeatureImportance(); 78 | console.log('Feature importance scores:'); 79 | Object.entries(importance).forEach(([feature, score]) => { 80 | console.log(` ${feature}: ${score.toFixed(4)}`); 81 | }); 82 | 83 | // Example 4: Model persistence 84 | console.log('\n4. 
Model Persistence:'); 85 | const modelJson = rf2.toJSON(); 86 | console.log('Model exported successfully'); 87 | 88 | // Import the model to a new instance 89 | const rf3 = new RandomForest(modelJson); 90 | const prediction3 = rf3.predict(sample1); 91 | console.log(`Prediction from imported model: ${prediction3}`); 92 | 93 | // Example 5: Different feature selection strategies 94 | console.log('\n5. Different feature selection strategies:'); 95 | 96 | const strategies = [ 97 | { name: 'sqrt', maxFeatures: 'sqrt' as const }, 98 | { name: 'log2', maxFeatures: 'log2' as const }, 99 | { name: 'auto', maxFeatures: 'auto' as const }, 100 | { name: '2 features', maxFeatures: 2 } 101 | ]; 102 | 103 | strategies.forEach(strategy => { 104 | const rf = new RandomForest(target, features, { 105 | nEstimators: 10, 106 | maxFeatures: strategy.maxFeatures, 107 | randomState: 42 108 | }); 109 | rf.train(trainingData); 110 | const accuracy = rf.evaluate(testData); 111 | console.log(`${strategy.name}: ${(accuracy * 100).toFixed(1)}% accuracy`); 112 | }); 113 | 114 | // Example 6: Bootstrap sampling comparison 115 | console.log('\n6. Bootstrap vs No Bootstrap:'); 116 | 117 | const withBootstrap = new RandomForest(target, features, { 118 | nEstimators: 10, 119 | bootstrap: true, 120 | randomState: 42 121 | }); 122 | withBootstrap.train(trainingData); 123 | const accuracyWithBootstrap = withBootstrap.evaluate(testData); 124 | 125 | const withoutBootstrap = new RandomForest(target, features, { 126 | nEstimators: 10, 127 | bootstrap: false, 128 | randomState: 42 129 | }); 130 | withoutBootstrap.train(trainingData); 131 | const accuracyWithoutBootstrap = withoutBootstrap.evaluate(testData); 132 | 133 | console.log(`With bootstrap: ${(accuracyWithBootstrap * 100).toFixed(1)}% accuracy`); 134 | console.log(`Without bootstrap: ${(accuracyWithoutBootstrap * 100).toFixed(1)}% accuracy`); 135 | 136 | // Example 7: Performance comparison with Decision Tree 137 | console.log('\n7. 
Random Forest vs Decision Tree Performance:'); 138 | 139 | import DecisionTree from '../lib/decision-tree.js'; 140 | 141 | const dt = new DecisionTree(target, features); 142 | dt.train(trainingData); 143 | const dtAccuracy = dt.evaluate(testData); 144 | 145 | const rfPerformance = new RandomForest(target, features, { nEstimators: 100, randomState: 42 }); 146 | rfPerformance.train(trainingData); 147 | const rfAccuracy = rfPerformance.evaluate(testData); 148 | 149 | console.log(`Decision Tree accuracy: ${(dtAccuracy * 100).toFixed(1)}%`); 150 | console.log(`Random Forest accuracy: ${(rfAccuracy * 100).toFixed(1)}%`); 151 | console.log(`Improvement: ${((rfAccuracy - dtAccuracy) * 100).toFixed(1)}%`); 152 | 153 | console.log('\n=== Random Forest TypeScript Example Complete ==='); 154 | -------------------------------------------------------------------------------- /TYPESCRIPT_MIGRATION.md: -------------------------------------------------------------------------------- 1 | # TypeScript Migration Guide 2 | 3 | This document explains the TypeScript conversion of the decision-tree module and how to use it. 4 | 5 | ## What Changed 6 | 7 | The module has been converted from JavaScript to TypeScript while maintaining **100% backward compatibility**. Existing JavaScript projects will continue to work without any changes. 8 | 9 | ## New Features 10 | 11 | ### 1. Full TypeScript Support 12 | - Complete type definitions for all methods and properties 13 | - Interface definitions for training data and model structures 14 | - Compile-time type checking for better development experience 15 | 16 | ### 2. Enhanced Development Experience 17 | - Source maps for debugging 18 | - Declaration files (`.d.ts`) for IDE support 19 | - Better IntelliSense and autocomplete 20 | 21 | ### 3. 
Modern Build System
- TypeScript compiler with ES2022 output (consistent with the Browser Compatibility and Performance sections below)
- Watch mode for development
- Clean build process

## File Structure

```
├── src/                     # TypeScript source files
│   └── decision-tree.ts     # Main implementation
├── lib/                     # Compiled JavaScript (generated)
│   ├── decision-tree.js     # Main module
│   ├── decision-tree.d.ts   # Type definitions
│   └── *.map                # Source maps
├── examples/                # Usage examples
│   ├── typescript-usage.ts
│   └── javascript-usage.js
└── tst/                     # TypeScript test files
    ├── decision-tree.ts
    ├── evaluation.ts
    └── reported-bugs.ts
```

## Usage Examples

### TypeScript Usage

```typescript
import DecisionTree from 'decision-tree';

interface TrainingData {
  color: string;
  shape: string;
  liked: boolean;
}

const trainingData: TrainingData[] = [
  { color: "blue", shape: "square", liked: false },
  { color: "red", shape: "circle", liked: true }
];

const dt = new DecisionTree('liked', ['color', 'shape']);
dt.train(trainingData);

const prediction = dt.predict({ color: "blue", shape: "hexagon" });
```

### JavaScript Usage (Backward Compatible)

```javascript
const DecisionTree = require('decision-tree');

const trainingData = [
  { color: "blue", shape: "square", liked: false },
  { color: "red", shape: "circle", liked: true }
];

const dt = new DecisionTree('liked', ['color', 'shape']);
dt.train(trainingData);

const prediction = dt.predict({ color: "blue", shape: "hexagon" });
```

### ES6 Module Usage

```javascript
import DecisionTree from 'decision-tree';

const dt = new DecisionTree('liked', ['color', 'shape']);
// ...
rest of the code 91 | ``` 92 | 93 | ## Development Commands 94 | 95 | ```bash 96 | # Install dependencies 97 | npm install 98 | 99 | # Build the project 100 | npm run build 101 | 102 | # Watch mode for development 103 | npm run build:watch 104 | 105 | # Run tests 106 | npm test 107 | 108 | # Run examples 109 | npm run example:js # JavaScript example 110 | npm run example:ts # TypeScript example 111 | 112 | # Clean build artifacts 113 | npm run clean 114 | ``` 115 | 116 | ## Type Definitions 117 | 118 | The module provides comprehensive TypeScript interfaces: 119 | 120 | ```typescript 121 | interface TreeNode { 122 | type: string; 123 | name: string; 124 | alias: string; 125 | val?: any; 126 | gain?: number; 127 | sampleSize?: number; 128 | vals?: TreeNode[]; 129 | child?: TreeNode; 130 | prob?: number; 131 | } 132 | 133 | interface DecisionTreeData { 134 | model: TreeNode; 135 | data: any[]; 136 | target: string; 137 | features: string[]; 138 | } 139 | ``` 140 | 141 | ## Migration for Existing Projects 142 | 143 | ### No Changes Required 144 | If you're using the module in JavaScript, **no changes are needed**. The compiled JavaScript maintains the exact same API. 145 | 146 | ### Adding TypeScript Support 147 | To add TypeScript support to your project: 148 | 149 | 1. Install the module: `npm install decision-tree` 150 | 2. Import with types: `import DecisionTree from 'decision-tree'` 151 | 3. Define interfaces for your data structures 152 | 4. Enjoy full type safety! 153 | 154 | ## Browser Compatibility 155 | 156 | The compiled JavaScript is ES2022 compatible and works in: 157 | - All modern browsers 158 | - Node.js 20+ 159 | - Modern bundlers (Webpack, Rollup, Vite, etc.) 
160 | - ES module environments 161 | 162 | ## Performance 163 | 164 | - **Zero runtime overhead** - TypeScript types are removed during compilation 165 | - **Same performance** as the original JavaScript version 166 | - **Smaller bundle size** when using modern bundlers (ES modules) 167 | - **Modern ES2022 features** for better performance and smaller code 168 | 169 | ## Contributing 170 | 171 | When contributing to the project: 172 | 173 | 1. Make changes in the `src/` directory 174 | 2. Run `npm run build` to compile 175 | 3. Ensure tests pass with `npm test` 176 | 4. The compiled JavaScript in `lib/` is automatically generated 177 | 178 | ## Troubleshooting 179 | 180 | ### TypeScript Errors 181 | - Ensure you're importing from the correct path 182 | - Check that your data structures match the expected interfaces 183 | - Use type assertions if needed: `data as TrainingData[]` 184 | 185 | ### Build Issues 186 | - Run `npm run clean` to remove old build artifacts 187 | - Ensure TypeScript is installed: `npm install typescript` 188 | - Check `tsconfig.json` for configuration issues 189 | 190 | ### Runtime Errors 191 | - The compiled JavaScript is identical to the original 192 | - Check that you're using the correct import/require syntax 193 | - Verify your data format matches the expected structure 194 | 195 | ## Support 196 | 197 | For issues or questions: 198 | 1. Check the existing test files in `tst/` 199 | 2. Review the examples in `examples/` 200 | 3. Open an issue on GitHub 201 | 4. 
Check the main README.md for usage documentation
--------------------------------------------------------------------------------
/src/shared/loss-functions.ts:
/**
 * Loss functions for XGBoost gradient boosting
 */

import { GradientHessian } from './types.js';

/**
 * Mean Squared Error loss function for regression.
 */
export class MSELoss {
  /** First derivative of 1/2 * (p - y)^2 with respect to the prediction. */
  static calculateGradient(prediction: number, actual: number): number {
    return prediction - actual;
  }

  /** Second derivative of the squared error is constant. */
  static calculateHessian(prediction: number, actual: number): number {
    return 1;
  }

  /**
   * Mean squared error over all samples.
   * Returns 0 for empty input (previously produced NaN via 0/0).
   */
  static calculateLoss(predictions: number[], actuals: number[]): number {
    if (predictions.length === 0) {
      return 0;
    }
    let sum = 0;
    for (let i = 0; i < predictions.length; i++) {
      const diff = predictions[i] - actuals[i];
      sum += diff * diff;
    }
    return sum / predictions.length;
  }

  /** Per-sample gradients and hessians, in input order. */
  static calculateGradientsAndHessians(
    predictions: number[],
    actuals: number[]
  ): GradientHessian {
    const gradients: number[] = [];
    const hessians: number[] = [];

    for (let i = 0; i < predictions.length; i++) {
      gradients.push(this.calculateGradient(predictions[i], actuals[i]));
      hessians.push(this.calculateHessian(predictions[i], actuals[i]));
    }

    return { gradient: gradients, hessian: hessians };
  }
}

/**
 * Logistic loss function for binary classification.
 */
export class LogisticLoss {
  /** Numerically-stable sigmoid; the input is clamped to avoid overflow. */
  static sigmoid(x: number): number {
    // Clamp x to prevent overflow
    const clampedX = Math.max(-500, Math.min(500, x));
    return 1 / (1 + Math.exp(-clampedX));
  }

  /** Gradient of the log-loss w.r.t. the raw score: sigmoid(p) - y. */
  static calculateGradient(prediction: number, actual: number): number {
    const prob = this.sigmoid(prediction);
    return prob - actual;
  }

  /** Hessian of the log-loss: sigmoid(p) * (1 - sigmoid(p)). */
  static calculateHessian(prediction: number, actual: number): number {
    const prob = this.sigmoid(prediction);
    return prob * (1 - prob);
  }

  /**
   * Mean negative log-likelihood.
   * Returns 0 for empty input (previously produced NaN via 0/0).
   */
  static calculateLoss(predictions: number[], actuals: number[]): number {
    if (predictions.length === 0) {
      return 0;
    }
    let sum = 0;
    for (let i = 0; i < predictions.length; i++) {
      const prob = this.sigmoid(predictions[i]);
      const actual = actuals[i];
      // Add small epsilon to prevent log(0)
      const epsilon = 1e-15;
      sum += actual * Math.log(prob + epsilon) + (1 - actual) * Math.log(1 - prob + epsilon);
    }
    return -sum / predictions.length;
  }

  /** Per-sample gradients and hessians, in input order. */
  static calculateGradientsAndHessians(
    predictions: number[],
    actuals: number[]
  ): GradientHessian {
    const gradients: number[] = [];
    const hessians: number[] = [];

    for (let i = 0; i < predictions.length; i++) {
      gradients.push(this.calculateGradient(predictions[i], actuals[i]));
      hessians.push(this.calculateHessian(predictions[i], actuals[i]));
    }

    return { gradient: gradients, hessian: hessians };
  }
}

/**
 * Cross-entropy loss function for multiclass classification.
 */
export class CrossEntropyLoss {
  /** Softmax with the max-subtraction trick for numerical stability. */
  static softmax(x: number[]): number[] {
    const max = Math.max(...x);
    const exp = x.map(val => Math.exp(val - max));
    const sum = exp.reduce((a, b) => a + b, 0);
    return exp.map(val => val / sum);
  }

  /** Gradient per class: one-hot(actual) - softmax(predictions). */
  static calculateGradient(predictions: number[], actual: number): number[] {
    const probs = this.softmax(predictions);
    const gradients = new Array(predictions.length).fill(0);
    gradients[actual] = 1;
    return gradients.map((grad, i) => grad - probs[i]);
  }

  /** Full softmax Hessian matrix: diag(p) - p * p^T. */
  static calculateHessian(predictions: number[], actual: number): number[][] {
    const probs = this.softmax(predictions);
    const hessians: number[][] = [];

    for (let i = 0; i < predictions.length; i++) {
      const row: number[] = [];
      for (let j = 0; j < predictions.length; j++) {
        if (i === j) {
          row.push(probs[i] * (1 - probs[i]));
        } else {
          row.push(-probs[i] * probs[j]);
        }
      }
      hessians.push(row);
    }

    return hessians;
  }

  /**
   * For simplicity, treated as binary classification (log-loss), clamped to
   * be non-negative. Reuses LogisticLoss.sigmoid instead of the previous
   * inline, unclamped `1 / (1 + Math.exp(-x))` (overflow-safe and DRY).
   * Returns 0 for empty input (previously produced NaN via 0/0).
   */
  static calculateLoss(predictions: number[], actuals: number[]): number {
    if (predictions.length === 0) {
      return 0;
    }
    let sum = 0;
    for (let i = 0; i < predictions.length; i++) {
      const actual = actuals[i];
      const prob = LogisticLoss.sigmoid(predictions[i]);
      const epsilon = 1e-15;
      sum += actual * Math.log(prob + epsilon) + (1 - actual) * Math.log(1 - prob + epsilon);
    }
    return Math.max(0, -sum / predictions.length);
  }

  // Interface methods for compatibility
  /** Binary-classification fallback, delegating to LogisticLoss derivatives. */
  static calculateGradientsAndHessians(predictions: number[], actuals: number[]): GradientHessian {
    // For simplicity, treat as binary classification
    return LogisticLoss.calculateGradientsAndHessians(predictions, actuals);
  }
}

/**
 * Loss function interface
 */
export interface LossFunction {
  calculateGradientsAndHessians(predictions: number[], actuals: number[]): GradientHessian;
  calculateLoss(predictions: number[], actuals: number[]): number;
}

/**
 * Loss function factory
 * NOTE(review): the default branch message was truncated in the source text
 * at `throw new Error(\`Unsupported` — completed with the obvious wording;
 * confirm against the repository.
 */
export class LossFunctionFactory {
  static create(objective: 'regression' | 'binary' | 'multiclass'): LossFunction {
    switch (objective) {
      case 'regression':
        return MSELoss as LossFunction;
      case 'binary':
        return LogisticLoss as LossFunction;
      case 'multiclass':
        return CrossEntropyLoss as LossFunction;
      default:
        throw new Error(`Unsupported objective: ${objective}`);
    }
  }
}
objective: ${objective}`); 180 | } 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /examples/xgboost-usage.js: -------------------------------------------------------------------------------- 1 | /** 2 | * XGBoost Usage Examples 3 | * This file demonstrates how to use the XGBoost algorithm for gradient boosting 4 | */ 5 | 6 | import XGBoost from '../lib/xgboost.js'; 7 | 8 | // Sample dataset for demonstration 9 | const sampleData = [ 10 | { color: 'red', shape: 'circle', size: 'small', liked: true }, 11 | { color: 'blue', shape: 'square', size: 'medium', liked: false }, 12 | { color: 'green', shape: 'triangle', size: 'large', liked: true }, 13 | { color: 'red', shape: 'square', size: 'small', liked: false }, 14 | { color: 'blue', shape: 'circle', size: 'medium', liked: true }, 15 | { color: 'green', shape: 'hexagon', size: 'large', liked: false }, 16 | { color: 'yellow', shape: 'circle', size: 'small', liked: true }, 17 | { color: 'purple', shape: 'square', size: 'medium', liked: false } 18 | ]; 19 | 20 | const features = ['color', 'shape', 'size']; 21 | const target = 'liked'; 22 | 23 | console.log('=== XGBoost Basic Usage ==='); 24 | 25 | // Basic XGBoost usage 26 | const xgb = new XGBoost(target, features); 27 | xgb.train(sampleData); 28 | 29 | console.log('Training completed!'); 30 | console.log('Number of trees:', xgb.getTreeCount()); 31 | console.log('Best iteration:', xgb.getBestIteration()); 32 | 33 | // Make predictions 34 | const testSample = { color: 'blue', shape: 'hexagon', size: 'medium' }; 35 | const prediction = xgb.predict(testSample); 36 | console.log('Prediction for test sample:', prediction); 37 | 38 | // Evaluate accuracy 39 | const accuracy = xgb.evaluate(sampleData); 40 | console.log('Training accuracy:', accuracy.toFixed(3)); 41 | 42 | console.log('\n=== XGBoost Configuration ==='); 43 | 44 | // XGBoost with custom configuration 45 | const config = { 46 | nEstimators: 50, 47 | 
learningRate: 0.1, 48 | maxDepth: 4, 49 | minChildWeight: 2, 50 | subsample: 0.8, 51 | colsampleByTree: 0.8, 52 | regAlpha: 0.1, 53 | regLambda: 1.0, 54 | objective: 'binary', 55 | earlyStoppingRounds: 10, 56 | validationFraction: 0.2, 57 | randomState: 42 58 | }; 59 | 60 | const xgbCustom = new XGBoost(target, features, config); 61 | xgbCustom.train(sampleData); 62 | 63 | console.log('Custom configuration training completed!'); 64 | console.log('Number of trees:', xgbCustom.getTreeCount()); 65 | console.log('Best iteration:', xgbCustom.getBestIteration()); 66 | 67 | // Get feature importance 68 | const importance = xgbCustom.getFeatureImportance(); 69 | console.log('Feature importance:', importance); 70 | 71 | // Get boosting history 72 | const history = xgbCustom.getBoostingHistory(); 73 | console.log('Training loss progression:', history.trainLoss.slice(0, 5)); 74 | console.log('Validation loss progression:', history.validationLoss.slice(0, 5)); 75 | 76 | console.log('\n=== XGBoost Model Persistence ==='); 77 | 78 | // Export model 79 | const modelJson = xgbCustom.toJSON(); 80 | console.log('Model exported successfully'); 81 | console.log('Model contains', modelJson.trees.length, 'trees'); 82 | 83 | // Import model 84 | const xgbImported = new XGBoost(modelJson); 85 | console.log('Model imported successfully'); 86 | console.log('Imported model tree count:', xgbImported.getTreeCount()); 87 | 88 | // Verify predictions are the same 89 | const originalPrediction = xgbCustom.predict(testSample); 90 | const importedPrediction = xgbImported.predict(testSample); 91 | console.log('Original prediction:', originalPrediction); 92 | console.log('Imported prediction:', importedPrediction); 93 | console.log('Predictions match:', originalPrediction === importedPrediction); 94 | 95 | console.log('\n=== XGBoost Different Objectives ==='); 96 | 97 | // Regression example 98 | const regressionData = [ 99 | { feature1: 1, feature2: 2, target: 10 }, 100 | { feature1: 2, feature2: 3, 
target: 20 }, 101 | { feature1: 3, feature2: 4, target: 30 }, 102 | { feature1: 4, feature2: 5, target: 40 } 103 | ]; 104 | 105 | const regressionConfig = { 106 | nEstimators: 20, 107 | learningRate: 0.1, 108 | objective: 'regression', 109 | randomState: 42 110 | }; 111 | 112 | const xgbRegression = new XGBoost('target', ['feature1', 'feature2'], regressionConfig); 113 | xgbRegression.train(regressionData); 114 | 115 | const regressionPrediction = xgbRegression.predict({ feature1: 5, feature2: 6 }); 116 | console.log('Regression prediction:', regressionPrediction); 117 | 118 | console.log('\n=== XGBoost Early Stopping ==='); 119 | 120 | // Early stopping example 121 | const earlyStoppingConfig = { 122 | nEstimators: 100, 123 | learningRate: 0.1, 124 | earlyStoppingRounds: 5, 125 | validationFraction: 0.3, 126 | randomState: 42 127 | }; 128 | 129 | const xgbEarlyStop = new XGBoost(target, features, earlyStoppingConfig); 130 | xgbEarlyStop.train(sampleData); 131 | 132 | console.log('Early stopping training completed!'); 133 | console.log('Number of trees:', xgbEarlyStop.getTreeCount()); 134 | console.log('Best iteration:', xgbEarlyStop.getBestIteration()); 135 | 136 | console.log('\n=== XGBoost Performance Comparison ==='); 137 | 138 | // Compare different algorithms 139 | const algorithms = [ 140 | { name: 'XGBoost (50 trees)', config: { nEstimators: 50, randomState: 42 } }, 141 | { name: 'XGBoost (100 trees)', config: { nEstimators: 100, randomState: 42 } }, 142 | { name: 'XGBoost (200 trees)', config: { nEstimators: 200, randomState: 42 } } 143 | ]; 144 | 145 | algorithms.forEach(alg => { 146 | const startTime = Date.now(); 147 | const xgb = new XGBoost(target, features, alg.config); 148 | xgb.train(sampleData); 149 | const endTime = Date.now(); 150 | 151 | const accuracy = xgb.evaluate(sampleData); 152 | console.log(`${alg.name}: ${accuracy.toFixed(3)} accuracy, ${endTime - startTime}ms`); 153 | }); 154 | 155 | console.log('\n=== XGBoost Feature Selection ==='); 
156 | 157 | // Test different feature selection strategies 158 | const featureConfigs = [ 159 | { name: 'All features', colsampleByTree: 1.0 }, 160 | { name: '80% features', colsampleByTree: 0.8 }, 161 | { name: '60% features', colsampleByTree: 0.6 }, 162 | { name: '40% features', colsampleByTree: 0.4 } 163 | ]; 164 | 165 | featureConfigs.forEach(config => { 166 | const xgb = new XGBoost(target, features, { 167 | nEstimators: 30, 168 | colsampleByTree: config.colsampleByTree, 169 | randomState: 42 170 | }); 171 | xgb.train(sampleData); 172 | 173 | const accuracy = xgb.evaluate(sampleData); 174 | console.log(`${config.name}: ${accuracy.toFixed(3)} accuracy`); 175 | }); 176 | 177 | console.log('\nXGBoost examples completed successfully!'); 178 | -------------------------------------------------------------------------------- /tst/prediction-edge-cases.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import DecisionTree from '../lib/decision-tree.js'; 3 | 4 | describe('Prediction Edge Cases', () => { 5 | let dt: DecisionTree; 6 | 7 | beforeEach(() => { 8 | // Setup a basic decision tree for testing 9 | const trainingData = [ 10 | { color: 'red', shape: 'circle', size: 'small', target: 'class1' }, 11 | { color: 'blue', shape: 'square', size: 'medium', target: 'class2' }, 12 | { color: 'green', shape: 'triangle', size: 'large', target: 'class3' }, 13 | { color: 'red', shape: 'square', size: 'medium', target: 'class1' }, 14 | { color: 'blue', shape: 'circle', size: 'small', target: 'class2' } 15 | ]; 16 | 17 | dt = new DecisionTree('target', ['color', 'shape', 'size']); 18 | dt.train(trainingData); 19 | }); 20 | 21 | describe('Missing Features in Prediction', () => { 22 | it('should handle missing features in prediction sample', () => { 23 | const incompleteSample = { color: 'red', shape: 'circle' }; 24 | // Missing 'size' feature 25 | 26 | const prediction = 
dt.predict(incompleteSample);
      assert.ok(typeof prediction === 'string');
      assert.ok(['class1', 'class2', 'class3'].includes(prediction));
    });

    it('should handle completely empty prediction sample', () => {
      const emptySample = {};

      // Even with no features supplied, predict() must return a trained class.
      const prediction = dt.predict(emptySample);
      assert.ok(typeof prediction === 'string');
      assert.ok(['class1', 'class2', 'class3'].includes(prediction));
    });

    it('should handle sample with only some features', () => {
      const partialSample = { color: 'red' };
      // Missing 'shape' and 'size' features

      const prediction = dt.predict(partialSample);
      assert.ok(typeof prediction === 'string');
      assert.ok(['class1', 'class2', 'class3'].includes(prediction));
    });
  });

  // Values never observed during training must not crash prediction; the
  // tree is expected to fall through to some trained class.
  describe('Unknown Feature Values', () => {
    it('should handle unknown feature values not seen during training', () => {
      const unknownSample = {
        color: 'purple', // Not in training data
        shape: 'hexagon', // Not in training data
        size: 'extra-large' // Not in training data
      };

      const prediction = dt.predict(unknownSample);
      assert.ok(typeof prediction === 'string');
      assert.ok(['class1', 'class2', 'class3'].includes(prediction));
    });

    it('should handle mixed known and unknown values', () => {
      const mixedSample = {
        color: 'red', // Known value
        shape: 'hexagon', // Unknown value
        size: 'small' // Known value
      };

      const prediction = dt.predict(mixedSample);
      assert.ok(typeof prediction === 'string');
      assert.ok(['class1', 'class2', 'class3'].includes(prediction));
    });
  });

  // Extra keys in the sample must be ignored; only the trained feature set
  // should influence the prediction.
  describe('Extra Features in Prediction', () => {
    it('should handle extra features not used in training', () => {
      const extraFeatureSample = {
        color: 'red',
        shape: 'circle',
        size: 'small',
        extraFeature: 'extra', // Extra feature
        anotherFeature: 'another' // Another extra feature
      };

      const prediction = dt.predict(extraFeatureSample);
      assert.ok(typeof prediction === 'string');
      assert.ok(['class1', 'class2', 'class3'].includes(prediction));
    });

    it('should ignore extra features and use only training features', () => {
      const extraFeatureSample = {
        color: 'red',
        shape: 'circle',
        size: 'small',
        unusedFeature: 'unused'
      };

      const prediction = dt.predict(extraFeatureSample);
      // Should behave the same as { color: 'red', shape: 'circle', size: 'small' }
      assert.ok(typeof prediction === 'string');
    });
  });

  // The tree was trained on string values; non-string inputs should degrade
  // gracefully rather than throw.
  describe('Data Type Mismatches', () => {
    it('should handle numeric vs string feature values', () => {
      const numericSample = {
        color: 123, // Numeric instead of string
        shape: 'circle',
        size: 'small'
      };

      const prediction = dt.predict(numericSample);
      assert.ok(typeof prediction === 'string');
    });

    it('should handle boolean feature values', () => {
      const booleanSample = {
        color: true, // Boolean instead of string
        shape: 'circle',
        size: 'small'
      };

      const prediction = dt.predict(booleanSample);
      assert.ok(typeof prediction === 'string');
    });

    it('should handle null and undefined values in prediction', () => {
      const nullSample = {
        color: null, // Null value
        shape: 'circle',
        size: 'small'
      };

      const undefinedSample = {
        color: 'red',
        shape: undefined, // Undefined value
        size: 'small'
      };

      const nullPrediction = dt.predict(nullSample);
      const undefinedPrediction = dt.predict(undefinedSample);

      assert.ok(typeof nullPrediction === 'string');
      assert.ok(typeof undefinedPrediction === 'string');
    });
  });

  // Stress inputs: extreme lengths and unusual characters.
  describe('Boundary Conditions', () => {
    it('should handle very long string values', () => {
const longValueSample = {
        color: 'a'.repeat(10000), // Very long string
        shape: 'circle',
        size: 'small'
      };

      const prediction = dt.predict(longValueSample);
      assert.ok(typeof prediction === 'string');
    });

    it('should handle special characters in feature values', () => {
      const specialCharSample = {
        color: 'red!@#$%^&*()',
        shape: 'circle',
        size: 'small'
      };

      const prediction = dt.predict(specialCharSample);
      assert.ok(typeof prediction === 'string');
    });

    it('should handle unicode characters', () => {
      const unicodeSample = {
        color: 'red🚀🎉',
        shape: 'circle',
        size: 'small'
      };

      const prediction = dt.predict(unicodeSample);
      assert.ok(typeof prediction === 'string');
    });
  });
});
--------------------------------------------------------------------------------
/tst/data-validation.ts:
--------------------------------------------------------------------------------
import { strict as assert } from 'assert';
import DecisionTree from '../lib/decision-tree.js';

// These tests document how permissive the current implementation is with
// malformed feature names, targets, and values — they assert the existing
// behavior rather than an ideal validation contract.
describe('Data Validation & Sanitization', () => {
  describe('Feature Name Validation', () => {
    it('should validate feature names are strings', () => {
      // Note: Current implementation only checks if features is an array, not element types
      // These should not throw errors with current validation
      const dt1 = new DecisionTree('target', [123 as any, 'feature2']);
      assert.ok(dt1);

      const dt2 = new DecisionTree('target', [true as any, 'feature2']);
      assert.ok(dt2);

      const dt3 = new DecisionTree('target', [null as any, 'feature2']);
      assert.ok(dt3);

      const dt4 = new DecisionTree('target', [undefined as any, 'feature2']);
      assert.ok(dt4);
    });

    it('should handle empty string feature names', () => {
      const dt = new DecisionTree('target', ['', 'feature2']);
      const data = [
        { '': 'value1', feature2: 'value2', target: 'class1' },
        { '': 'value3', feature2: 'value4', target: 'class2' }
      ];

      dt.train(data);
      const prediction = dt.predict({ '': 'value1', feature2: 'value2' });
      assert.strictEqual(prediction, 'class1');
    });

    it('should handle whitespace-only feature names', () => {
      const dt = new DecisionTree('target', [' ', 'feature2']);
      const data = [
        { ' ': 'value1', feature2: 'value2', target: 'class1' },
        { ' ': 'value3', feature2: 'value4', target: 'class2' }
      ];

      dt.train(data);
      const prediction = dt.predict({ ' ': 'value1', feature2: 'value2' });
      assert.strictEqual(prediction, 'class1');
    });
  });

  describe('Target Column Validation', () => {
    it('should validate target column exists in data', () => {
      const dt = new DecisionTree('nonexistent', ['feature1', 'feature2']);
      const data = [
        { feature1: 'value1', feature2: 'value2' }
        // Missing target column
      ];

      // Note: Current implementation doesn't validate that target column exists in training data
      // This is a design decision - the implementation is very permissive
      // Training will succeed but the resulting tree may not work correctly
      dt.train(data);
      assert.ok(dt.toJSON());
    });

    it('should handle target column with different data types', () => {
      const dt = new DecisionTree('target', ['feature1']);
      const mixedTypeData = [
        { feature1: 'value1', target: 'class1' },
        { feature1: 'value2', target: 123 },
        { feature1: 'value3', target: true },
        { feature1: 'value4', target: null }
      ];

      dt.train(mixedTypeData);
      const prediction = dt.predict({ feature1: 'value1' });
      assert.ok(typeof prediction === 'string' || typeof prediction === 'number' || typeof prediction === 'boolean');
    });
  });

  describe('Data Type Validation', () => {
    it('should handle numeric vs string feature values', () => {
      const dt = new DecisionTree('target', ['feature1', 'feature2']);
      const mixedTypeData = [
        { feature1: 'string1', feature2: 123, target: 'class1' },
        { feature1: 456, feature2: 'string2', target: 'class2' },
        { feature1: 'string3', feature2: 789, target: 'class1' }
      ];

      dt.train(mixedTypeData);
      const prediction = dt.predict({ feature1: 'string1', feature2: 123 });
      assert.strictEqual(prediction, 'class1');
    });

    it('should handle boolean feature values', () => {
      const dt = new DecisionTree('target', ['feature1', 'feature2']);
      const booleanData = [
        { feature1: true, feature2: false, target: 'class1' },
        { feature1: false, feature2: true, target: 'class2' },
        { feature1: true, feature2: true, target: 'class1' }
      ];

      dt.train(booleanData);
      const prediction = dt.predict({ feature1: true, feature2: false });
      assert.strictEqual(prediction, 'class1');
    });

    it('should handle mixed data types in same feature', () => {
      const dt = new DecisionTree('target', ['feature1']);
      const mixedData = [
        { feature1: 'string', target: 'class1' },
        { feature1: 123, target: 'class2' },
        { feature1: true, target: 'class3' },
        { feature1: null, target: 'class1' }
      ];

      dt.train(mixedData);
      const prediction = dt.predict({ feature1: 'string' });
      assert.ok(typeof prediction === 'string');
    });
  });

  describe('Data Consistency Validation', () => {
    it('should handle samples with different feature sets', () => {
      const dt = new DecisionTree('target', ['feature1', 'feature2', 'feature3']);
      const inconsistentData = [
        { feature1: 'value1', feature2: 'value2', target: 'class1' },
        { feature1: 'value3', feature2: 'value4', feature3: 'value5', target: 'class2' },
        { feature1: 'value6', target: 'class3' }
      ];

      dt.train(inconsistentData);
      // Should handle missing features gracefully
      const prediction = dt.predict({ feature1: 'value1', feature2: 'value2' });
      assert.ok(typeof prediction === 'string');
    });

    it('should handle nested object values', () => {
      const dt = new DecisionTree('target', ['feature1']);
      const nestedData = [
        { feature1: { nested: 'value1' }, target: 'class1' },
        { feature1: { nested: 'value2' }, target: 'class2' }
      ];

      dt.train(nestedData);
      const prediction = dt.predict({ feature1: { nested: 'value1' } });
      assert.strictEqual(prediction, 'class1');
    });

    it('should handle array values', () => {
      const dt = new DecisionTree('target', ['feature1']);
      const arrayData = [
        { feature1: ['item1', 'item2'], target: 'class1' },
        { feature1: ['item3', 'item4'], target: 'class2' }
      ];

      dt.train(arrayData);
      const prediction = dt.predict({ feature1: ['item1', 'item2'] });
      assert.strictEqual(prediction, 'class1');
    });
  });

  describe('Input Sanitization', () => {
    it('should handle HTML/script injection attempts', () => {
      const dt = new DecisionTree('target', ['feature1']);
      const maliciousData = [
        // NOTE(review): this literal is empty in the current source — it may
        // have been stripped markup originally; verify against the repo.
        { feature1: '', target: 'class1' },
        { feature1: 'javascript:alert("xss")', target: 'class2' },
        { feature1: 'normal_value', target: 'class3' }
      ];

      dt.train(maliciousData);
      const prediction = dt.predict({ feature1: '' });
      assert.strictEqual(prediction, 'class1');
    });

    it('should handle SQL injection attempts', () => {
      const dt = new DecisionTree('target', ['feature1']);
      const sqlInjectionData = [
        { feature1: "'; DROP TABLE users; --", target: 'class1' },
        { feature1: "'; SELECT * FROM users; --", target: 'class2' },
        { feature1: 'normal_value', target: 'class3' }
      ];

dt.train(sqlInjectionData);
      const prediction = dt.predict({ feature1: "'; DROP TABLE users; --" });
      assert.strictEqual(prediction, 'class1');
    });

    it('should handle very large data values', () => {
      const dt = new DecisionTree('target', ['feature1']);
      const largeData = [
        { feature1: 'a'.repeat(100000), target: 'class1' },
        { feature1: 'normal_value', target: 'class2' }
      ];

      dt.train(largeData);
      const prediction = dt.predict({ feature1: 'a'.repeat(100000) });
      assert.strictEqual(prediction, 'class1');
    });
  });
});
--------------------------------------------------------------------------------
/examples/xgboost-usage.ts:
--------------------------------------------------------------------------------
/**
 * XGBoost TypeScript Usage Examples
 * This file demonstrates how to use the XGBoost algorithm with TypeScript
 */

import XGBoost from '../lib/xgboost.js';

// Type definitions for our data
interface SampleData {
  color: string;
  shape: string;
  size: string;
  liked: boolean;
}

interface RegressionData {
  feature1: number;
  feature2: number;
  target: number;
}

// Sample dataset for demonstration
const sampleData: SampleData[] = [
  { color: 'red', shape: 'circle', size: 'small', liked: true },
  { color: 'blue', shape: 'square', size: 'medium', liked: false },
  { color: 'green', shape: 'triangle', size: 'large', liked: true },
  { color: 'red', shape: 'square', size: 'small', liked: false },
  { color: 'blue', shape: 'circle', size: 'medium', liked: true },
  { color: 'green', shape: 'hexagon', size: 'large', liked: false },
  { color: 'yellow', shape: 'circle', size: 'small', liked: true },
  { color: 'purple', shape: 'square', size: 'medium', liked: false }
];

const features: string[] = ['color', 'shape', 'size'];
const target: string = 'liked';

console.log('=== XGBoost TypeScript Basic Usage ===');

// Train with default hyper-parameters.
const model = new XGBoost(target, features);
model.train(sampleData);

console.log('Training completed!');
console.log('Number of trees:', model.getTreeCount());
console.log('Best iteration:', model.getBestIteration());

// Predict on a single typed sample.
const testSample: SampleData = { color: 'blue', shape: 'hexagon', size: 'medium' };
const basicPrediction: boolean = model.predict(testSample);
console.log('Prediction for test sample:', basicPrediction);

// Evaluate accuracy on the training set.
const trainAccuracy: number = model.evaluate(sampleData);
console.log('Training accuracy:', trainAccuracy.toFixed(3));

console.log('\n=== XGBoost TypeScript Configuration ===');

// Train a second model with explicit hyper-parameters.
const tunedConfig = {
  nEstimators: 50,
  learningRate: 0.1,
  maxDepth: 4,
  minChildWeight: 2,
  subsample: 0.8,
  colsampleByTree: 0.8,
  regAlpha: 0.1,
  regLambda: 1.0,
  objective: 'binary' as const,
  earlyStoppingRounds: 10,
  validationFraction: 0.2,
  randomState: 42
};

const tunedModel = new XGBoost(target, features, tunedConfig);
tunedModel.train(sampleData);

console.log('Custom configuration training completed!');
console.log('Number of trees:', tunedModel.getTreeCount());
console.log('Best iteration:', tunedModel.getBestIteration());

// Which features the boosted trees rely on most.
const featureImportance: { [feature: string]: number } = tunedModel.getFeatureImportance();
console.log('Feature importance:', featureImportance);

// Per-iteration loss curves recorded during boosting.
const boostHistory = tunedModel.getBoostingHistory();
console.log('Training loss progression:', boostHistory.trainLoss.slice(0, 5));
console.log('Validation loss progression:', boostHistory.validationLoss.slice(0, 5));

console.log('\n=== XGBoost TypeScript Model Persistence ===');

// Round-trip the model through its JSON representation.
const serialized = tunedModel.toJSON();
console.log('Model exported successfully');
console.log('Model contains', serialized.trees.length, 'trees');

const restored = new XGBoost(serialized);
console.log('Model imported successfully');
console.log('Imported model tree count:', restored.getTreeCount());

// The restored model must predict exactly like the original.
const predBefore: boolean = tunedModel.predict(testSample);
const predAfter: boolean = restored.predict(testSample);
console.log('Original prediction:', predBefore);
console.log('Imported prediction:', predAfter);
console.log('Predictions match:', predBefore === predAfter);

console.log('\n=== XGBoost TypeScript Different Objectives ===');

// Regression example: numeric target instead of a boolean label.
const regressionSamples: RegressionData[] = [
  { feature1: 1, feature2: 2, target: 10 },
  { feature1: 2, feature2: 3, target: 20 },
  { feature1: 3, feature2: 4, target: 30 },
  { feature1: 4, feature2: 5, target: 40 }
];

const regressionConfig = {
  nEstimators: 20,
  learningRate: 0.1,
  objective: 'regression' as const,
  randomState: 42
};

const regressor = new XGBoost('target', ['feature1', 'feature2'], regressionConfig);
regressor.train(regressionSamples);

const regPrediction: number = regressor.predict({ feature1: 5, feature2: 6 });
console.log('Regression prediction:', regPrediction);

console.log('\n=== XGBoost TypeScript Early Stopping ===');

// Stop boosting once validation loss plateaus.
const earlyStoppingConfig = {
  nEstimators: 100,
  learningRate: 0.1,
  earlyStoppingRounds: 5,
  validationFraction: 0.3,
  randomState: 42
};

const earlyStopModel = new XGBoost(target, features, earlyStoppingConfig);
earlyStopModel.train(sampleData);

console.log('Early stopping training completed!');
console.log('Number of trees:', earlyStopModel.getTreeCount());
console.log('Best iteration:', earlyStopModel.getBestIteration());

console.log('\n=== XGBoost TypeScript Performance Comparison ===');

// Time training runs with increasing ensemble sizes.
interface AlgorithmConfig {
  name: string;
  config: {
    nEstimators: number;
    randomState: number;
  };
}

const algorithms: AlgorithmConfig[] = [
  { name: 'XGBoost (50 trees)', config: { nEstimators: 50, randomState: 42 } },
  { name: 'XGBoost (100 trees)', config: { nEstimators: 100, randomState: 42 } },
  { name: 'XGBoost (200 trees)', config: { nEstimators: 200, randomState: 42 } }
];

for (const alg of algorithms) {
  const startedAt: number = Date.now();
  const candidate = new XGBoost(target, features, alg.config);
  candidate.train(sampleData);
  const elapsed: number = Date.now() - startedAt;

  const acc: number = candidate.evaluate(sampleData);
  console.log(`${alg.name}: ${acc.toFixed(3)} accuracy, ${elapsed}ms`);
}

console.log('\n=== XGBoost TypeScript Feature Selection ===');

// Compare accuracy across per-tree column subsampling rates.
interface FeatureConfig {
  name: string;
  colsampleByTree: number;
}

const featureConfigs: FeatureConfig[] = [
  { name: 'All features', colsampleByTree: 1.0 },
  { name: '80% features', colsampleByTree: 0.8 },
  { name: '60% features', colsampleByTree: 0.6 },
  { name: '40% features', colsampleByTree: 0.4 }
];

for (const fc of featureConfigs) {
  const candidate = new XGBoost(target, features, {
    nEstimators: 30,
    colsampleByTree: fc.colsampleByTree,
    randomState: 42
  });
  candidate.train(sampleData);

  const acc: number = candidate.evaluate(sampleData);
  console.log(`${fc.name}: ${acc.toFixed(3)} accuracy`);
}

console.log('\n=== XGBoost TypeScript Advanced Usage ===');

// A fully-specified configuration combining all of the knobs above.
const advancedConfig = {
  nEstimators: 100,
  learningRate: 0.05,
  maxDepth: 6,
  minChildWeight: 3,
  subsample: 0.9,
  colsampleByTree: 0.9,
  regAlpha: 0.2,
  regLambda: 2.0,
  objective: 'binary' as const,
  earlyStoppingRounds: 15,
  validationFraction: 0.25,
  randomState: 123
};

const advancedModel = new XGBoost(target, features, advancedConfig);
advancedModel.train(sampleData);

console.log('Advanced configuration training completed!');
console.log('Number of trees:', advancedModel.getTreeCount());
console.log('Best iteration:', advancedModel.getBestIteration());

// Per-feature importance breakdown.
const advancedImportance = advancedModel.getFeatureImportance();
console.log('Advanced feature importance:');
for (const [feature, importance] of Object.entries(advancedImportance)) {
  console.log(`  ${feature}: ${importance.toFixed(4)}`);
}

// Final loss values and iteration count from the boosting history.
const advancedHistory = advancedModel.getBoostingHistory();
console.log('Advanced boosting history:');
console.log('  Final training loss:', advancedHistory.trainLoss[advancedHistory.trainLoss.length - 1].toFixed(6));
console.log('  Final validation loss:', advancedHistory.validationLoss[advancedHistory.validationLoss.length - 1].toFixed(6));
console.log('  Total iterations:', advancedHistory.iterations.length);

console.log('\nXGBoost TypeScript examples completed successfully!');
-------------------------------------------------------------------------------- /src/shared/gradient-boosting.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Gradient boosting core algorithm for XGBoost 3 | */ 4 | 5 | import _ from 'lodash'; 6 | import { 7 | TreeNode, 8 | TrainingData, 9 | XGBoostConfig, 10 | GradientHessian, 11 | WeightedSample, 12 | NODE_TYPES 13 | } from './types.js'; 14 | import { LossFunctionFactory } from './loss-functions.js'; 15 | import { SeededRandom, selectRandomFeatures } from './utils.js'; 16 | import { createTree } from './id3-algorithm.js'; 17 | 18 | /** 19 | * Weighted decision tree for gradient boosting 20 | */ 21 | export function createWeightedTree( 22 | data: TrainingData[], 23 | target: string, 24 | features: string[], 25 | weights: number[], 26 | gradients: number[], 27 | hessians: number[], 28 | config: XGBoostConfig 29 | ): TreeNode { 30 | const weightedData = data.map((sample, index) => ({ 31 | ...sample, 32 | _weight: weights[index], 33 | _gradient: gradients[index], 34 | _hessian: hessians[index] 35 | })); 36 | 37 | return createWeightedTreeRecursive( 38 | weightedData, 39 | target, 40 | features, 41 | config, 42 | 0 43 | ); 44 | } 45 | 46 | /** 47 | * Recursive weighted tree creation 48 | */ 49 | function createWeightedTreeRecursive( 50 | data: TrainingData[], 51 | target: string, 52 | features: string[], 53 | config: XGBoostConfig, 54 | currentDepth: number 55 | ): TreeNode { 56 | const maxDepth = config.maxDepth || 6; 57 | const minChildWeight = config.minChildWeight || 1; 58 | 59 | // Check stopping criteria 60 | if (currentDepth >= maxDepth || data.length === 0) { 61 | return createLeafNode(data, config); 62 | } 63 | 64 | // Check minimum child weight 65 | const totalWeight = data.reduce((sum, sample) => sum + (sample as any)._weight, 0); 66 | if (totalWeight < minChildWeight) { 67 | return createLeafNode(data, config); 68 | } 69 | 70 | // Find best split 71 | const 
bestSplit = findBestWeightedSplit(data, target, features, config); 72 | if (!bestSplit) { 73 | return createLeafNode(data, config); 74 | } 75 | 76 | // Create internal node 77 | const node: TreeNode = { 78 | name: bestSplit.feature, 79 | alias: bestSplit.feature + '_' + Math.random().toString(32).slice(2), 80 | gain: bestSplit.gain, 81 | sampleSize: data.length, 82 | type: NODE_TYPES.FEATURE, 83 | vals: [] 84 | }; 85 | 86 | // Split data and create child nodes 87 | const remainingFeatures = features.filter(f => f !== bestSplit.feature); 88 | const possibleValues = _.uniq(data.map(sample => sample[bestSplit.feature])); 89 | 90 | node.vals = possibleValues.map(value => { 91 | const childData = data.filter(sample => sample[bestSplit.feature] === value); 92 | const childNode: TreeNode = { 93 | name: value, 94 | alias: value + '_' + Math.random().toString(32).slice(2), 95 | type: NODE_TYPES.FEATURE_VALUE, 96 | prob: childData.length / data.length, 97 | sampleSize: childData.length 98 | }; 99 | 100 | childNode.child = createWeightedTreeRecursive( 101 | childData, 102 | target, 103 | remainingFeatures, 104 | config, 105 | currentDepth + 1 106 | ); 107 | 108 | return childNode; 109 | }); 110 | 111 | return node; 112 | } 113 | 114 | /** 115 | * Find best weighted split 116 | */ 117 | function findBestWeightedSplit( 118 | data: TrainingData[], 119 | target: string, 120 | features: string[], 121 | config: XGBoostConfig 122 | ): { feature: string; gain: number } | null { 123 | let bestGain = -Infinity; 124 | let bestFeature: string | null = null; 125 | 126 | for (const feature of features) { 127 | const gain = calculateWeightedGain(data, target, feature); 128 | if (gain > bestGain) { 129 | bestGain = gain; 130 | bestFeature = feature; 131 | } 132 | } 133 | 134 | return bestFeature ? 
{ feature: bestFeature, gain: bestGain } : null; 135 | } 136 | 137 | /** 138 | * Calculate weighted information gain 139 | */ 140 | function calculateWeightedGain( 141 | data: TrainingData[], 142 | target: string, 143 | feature: string 144 | ): number { 145 | const uniqueValues = _.uniq(data.map(sample => sample[feature])); 146 | const totalWeight = data.reduce((sum, sample) => sum + (sample as any)._weight, 0); 147 | 148 | if (totalWeight === 0) return 0; 149 | 150 | // Calculate weighted entropy for each value 151 | let weightedEntropy = 0; 152 | for (const value of uniqueValues) { 153 | const subset = data.filter(sample => sample[feature] === value); 154 | const subsetWeight = subset.reduce((sum, sample) => sum + (sample as any)._weight, 0); 155 | 156 | if (subsetWeight > 0) { 157 | const entropy = calculateWeightedEntropy(subset, target); 158 | weightedEntropy += (subsetWeight / totalWeight) * entropy; 159 | } 160 | } 161 | 162 | const totalEntropy = calculateWeightedEntropy(data, target); 163 | return totalEntropy - weightedEntropy; 164 | } 165 | 166 | /** 167 | * Calculate weighted entropy 168 | */ 169 | function calculateWeightedEntropy(data: TrainingData[], target: string): number { 170 | const targetValues = data.map(sample => sample[target]); 171 | const uniqueValues = _.uniq(targetValues); 172 | 173 | let entropy = 0; 174 | for (const value of uniqueValues) { 175 | const subset = data.filter(sample => sample[target] === value); 176 | const subsetWeight = subset.reduce((sum, sample) => sum + (sample as any)._weight, 0); 177 | const totalWeight = data.reduce((sum, sample) => sum + (sample as any)._weight, 0); 178 | 179 | if (totalWeight > 0) { 180 | const probability = subsetWeight / totalWeight; 181 | if (probability > 0) { 182 | entropy -= probability * Math.log2(probability); 183 | } 184 | } 185 | } 186 | 187 | return entropy; 188 | } 189 | 190 | /** 191 | * Create leaf node with weighted prediction 192 | */ 193 | function createLeafNode(data: 
TrainingData[], config: XGBoostConfig): TreeNode { 194 | const regLambda = config.regLambda || 1; 195 | 196 | // Calculate weighted average of gradients/hessians 197 | let gradientSum = 0; 198 | let hessianSum = 0; 199 | let totalWeight = 0; 200 | 201 | for (const sample of data) { 202 | const weight = (sample as any)._weight || 1; 203 | const gradient = (sample as any)._gradient || 0; 204 | const hessian = (sample as any)._hessian || 1; 205 | 206 | gradientSum += weight * gradient; 207 | hessianSum += weight * hessian; 208 | totalWeight += weight; 209 | } 210 | 211 | // Leaf value = -gradient_sum / (hessian_sum + lambda) 212 | const leafValue = hessianSum > 0 ? -gradientSum / (hessianSum + regLambda) : 0; 213 | 214 | return { 215 | type: NODE_TYPES.RESULT, 216 | val: leafValue, 217 | name: leafValue.toString(), 218 | alias: 'leaf_' + Math.random().toString(32).slice(2) 219 | }; 220 | } 221 | 222 | /** 223 | * Create weighted sample for training 224 | */ 225 | export function createWeightedSample( 226 | data: TrainingData[], 227 | config: XGBoostConfig, 228 | random: SeededRandom 229 | ): WeightedSample { 230 | const subsample = config.subsample || 1; 231 | const colsampleByTree = config.colsampleByTree || 1; 232 | 233 | // Subsample data 234 | let sampledData = data; 235 | if (subsample < 1) { 236 | const sampleSize = Math.floor(data.length * subsample); 237 | const indices = Array.from({ length: data.length }, (_, i) => i); 238 | const sampledIndices: number[] = []; 239 | 240 | for (let i = 0; i < sampleSize; i++) { 241 | const randomIndex = random.nextInt(indices.length); 242 | sampledIndices.push(indices[randomIndex]); 243 | indices.splice(randomIndex, 1); 244 | } 245 | 246 | sampledData = sampledIndices.map(i => data[i]); 247 | } 248 | 249 | // Sample features 250 | let allFeatures: string[] = []; 251 | if (sampledData.length > 0) { 252 | allFeatures = Object.keys(sampledData[0]).filter(key => key !== 'target'); 253 | } 254 | const selectedFeatures = 
selectRandomFeatures( 255 | allFeatures, 256 | Math.max(1, Math.floor(allFeatures.length * colsampleByTree)), 257 | random 258 | ); 259 | 260 | return { 261 | data: sampledData, 262 | weights: new Array(sampledData.length).fill(1), 263 | gradients: new Array(sampledData.length).fill(0), 264 | hessians: new Array(sampledData.length).fill(1) 265 | }; 266 | } 267 | 268 | /** 269 | * Calculate base score for initial prediction 270 | */ 271 | export function calculateBaseScore( 272 | data: TrainingData[], 273 | target: string, 274 | objective: 'regression' | 'binary' | 'multiclass' 275 | ): number { 276 | const targetValues = data.map(sample => sample[target]); 277 | 278 | switch (objective) { 279 | case 'regression': 280 | return targetValues.reduce((sum, val) => sum + val, 0) / targetValues.length; 281 | 282 | case 'binary': 283 | const positiveCount = targetValues.filter(val => val === 1 || val === true).length; 284 | const probability = positiveCount / targetValues.length; 285 | return Math.log(probability / (1 - probability + 1e-15)); 286 | 287 | case 'multiclass': 288 | return 0; // Will be handled differently for multiclass 289 | 290 | default: 291 | return 0; 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /tst/random-forest-utils.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import { 3 | SeededRandom, 4 | bootstrapSample, 5 | selectRandomFeatures, 6 | majorityVote 7 | } from '../lib/shared/utils.js'; 8 | 9 | describe('Random Forest Utility Functions', function() { 10 | describe('SeededRandom', function() { 11 | it('should generate consistent random numbers with same seed', () => { 12 | const random1 = new SeededRandom(42); 13 | const random2 = new SeededRandom(42); 14 | 15 | for (let i = 0; i < 10; i++) { 16 | assert.strictEqual(random1.next(), random2.next()); 17 | } 18 | }); 19 | 20 | it('should generate different 
random numbers with different seeds', () => { 21 | const random1 = new SeededRandom(42); 22 | const random2 = new SeededRandom(43); 23 | 24 | const values1 = []; 25 | const values2 = []; 26 | 27 | for (let i = 0; i < 10; i++) { 28 | values1.push(random1.next()); 29 | values2.push(random2.next()); 30 | } 31 | 32 | assert.notDeepStrictEqual(values1, values2); 33 | }); 34 | 35 | it('should generate numbers between 0 and 1', () => { 36 | const random = new SeededRandom(42); 37 | 38 | for (let i = 0; i < 100; i++) { 39 | const value = random.next(); 40 | assert.ok(value >= 0 && value < 1); 41 | } 42 | }); 43 | 44 | it('should generate integers within range', () => { 45 | const random = new SeededRandom(42); 46 | 47 | for (let i = 0; i < 100; i++) { 48 | const value = random.nextInt(10); 49 | assert.ok(value >= 0 && value < 10); 50 | assert.ok(Number.isInteger(value)); 51 | } 52 | }); 53 | }); 54 | 55 | describe('bootstrapSample', function() { 56 | it('should create bootstrap sample of correct size', () => { 57 | const data = [1, 2, 3, 4, 5]; 58 | const random = new SeededRandom(42); 59 | const sample = bootstrapSample(data, 3, random); 60 | 61 | assert.strictEqual(sample.length, 3); 62 | }); 63 | 64 | it('should create bootstrap sample with replacement', () => { 65 | const data = [1, 2, 3]; 66 | const random = new SeededRandom(42); 67 | const sample = bootstrapSample(data, 10, random); 68 | 69 | assert.strictEqual(sample.length, 10); 70 | // With replacement, we can have more samples than original data 71 | sample.forEach(item => { 72 | assert.ok(data.includes(item)); 73 | }); 74 | }); 75 | 76 | it('should be reproducible with same seed', () => { 77 | const data = [1, 2, 3, 4, 5]; 78 | const random1 = new SeededRandom(42); 79 | const random2 = new SeededRandom(42); 80 | 81 | const sample1 = bootstrapSample(data, 5, random1); 82 | const sample2 = bootstrapSample(data, 5, random2); 83 | 84 | assert.deepStrictEqual(sample1, sample2); 85 | }); 86 | 87 | it('should handle 
empty data', () => { 88 | const data: any[] = []; 89 | const random = new SeededRandom(42); 90 | 91 | assert.throws(() => bootstrapSample(data, 1, random)); 92 | }); 93 | 94 | it('should handle zero sample size', () => { 95 | const data = [1, 2, 3]; 96 | const random = new SeededRandom(42); 97 | const sample = bootstrapSample(data, 0, random); 98 | 99 | assert.strictEqual(sample.length, 0); 100 | }); 101 | }); 102 | 103 | describe('selectRandomFeatures', function() { 104 | it('should select correct number of features for sqrt', () => { 105 | const features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']; 106 | const random = new SeededRandom(42); 107 | const selected = selectRandomFeatures(features, 'sqrt', random); 108 | 109 | const expectedCount = Math.floor(Math.sqrt(features.length)); 110 | assert.strictEqual(selected.length, expectedCount); 111 | }); 112 | 113 | it('should select correct number of features for log2', () => { 114 | const features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']; 115 | const random = new SeededRandom(42); 116 | const selected = selectRandomFeatures(features, 'log2', random); 117 | 118 | const expectedCount = Math.floor(Math.log2(features.length)); 119 | assert.strictEqual(selected.length, expectedCount); 120 | }); 121 | 122 | it('should select correct number of features for auto', () => { 123 | const features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']; 124 | const random = new SeededRandom(42); 125 | const selected = selectRandomFeatures(features, 'auto', random); 126 | 127 | const expectedCount = Math.floor(Math.sqrt(features.length)); 128 | assert.strictEqual(selected.length, expectedCount); 129 | }); 130 | 131 | it('should select correct number of features for numeric', () => { 132 | const features = ['a', 'b', 'c', 'd', 'e']; 133 | const random = new SeededRandom(42); 134 | const selected = selectRandomFeatures(features, 3, random); 135 | 136 | assert.strictEqual(selected.length, 3); 137 | }); 138 | 139 | it('should select at 
least 1 feature even with very small maxFeatures', () => { 140 | const features = ['a', 'b', 'c', 'd', 'e']; 141 | const random = new SeededRandom(42); 142 | const selected = selectRandomFeatures(features, 0, random); 143 | 144 | assert.strictEqual(selected.length, 1); 145 | }); 146 | 147 | it('should not select more features than available', () => { 148 | const features = ['a', 'b', 'c']; 149 | const random = new SeededRandom(42); 150 | const selected = selectRandomFeatures(features, 10, random); 151 | 152 | assert.strictEqual(selected.length, 3); 153 | }); 154 | 155 | it('should select different features on different calls', () => { 156 | const features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']; 157 | const random1 = new SeededRandom(42); 158 | const random2 = new SeededRandom(43); 159 | 160 | const selected1 = selectRandomFeatures(features, 4, random1); 161 | const selected2 = selectRandomFeatures(features, 4, random2); 162 | 163 | assert.notDeepStrictEqual(selected1, selected2); 164 | }); 165 | 166 | it('should be reproducible with same seed', () => { 167 | const features = ['a', 'b', 'c', 'd', 'e']; 168 | const random1 = new SeededRandom(42); 169 | const random2 = new SeededRandom(42); 170 | 171 | const selected1 = selectRandomFeatures(features, 3, random1); 172 | const selected2 = selectRandomFeatures(features, 3, random2); 173 | 174 | assert.deepStrictEqual(selected1, selected2); 175 | }); 176 | 177 | it('should only select from original features', () => { 178 | const features = ['a', 'b', 'c', 'd', 'e']; 179 | const random = new SeededRandom(42); 180 | const selected = selectRandomFeatures(features, 3, random); 181 | 182 | selected.forEach(feature => { 183 | assert.ok(features.includes(feature)); 184 | }); 185 | }); 186 | }); 187 | 188 | describe('majorityVote', function() { 189 | it('should return majority vote for clear majority', () => { 190 | const predictions = [true, true, true, false, false]; 191 | const result = majorityVote(predictions); 192 | 
assert.strictEqual(result, true); 193 | }); 194 | 195 | it('should return majority vote for clear majority with different types', () => { 196 | const predictions = ['a', 'a', 'a', 'b', 'b']; 197 | const result = majorityVote(predictions); 198 | assert.strictEqual(result, 'a'); 199 | }); 200 | 201 | it('should handle tie-breaking', () => { 202 | const predictions = [true, true, false, false]; 203 | const result = majorityVote(predictions); 204 | assert.ok(typeof result === 'boolean'); 205 | }); 206 | 207 | it('should handle single prediction', () => { 208 | const predictions = [true]; 209 | const result = majorityVote(predictions); 210 | assert.strictEqual(result, true); 211 | }); 212 | 213 | it('should handle empty predictions', () => { 214 | const predictions: any[] = []; 215 | const result = majorityVote(predictions); 216 | assert.strictEqual(result, null); 217 | }); 218 | 219 | it('should handle all same predictions', () => { 220 | const predictions = [false, false, false, false]; 221 | const result = majorityVote(predictions); 222 | assert.strictEqual(result, false); 223 | }); 224 | 225 | it('should handle numeric predictions', () => { 226 | const predictions = [1, 1, 2, 2, 1]; 227 | const result = majorityVote(predictions); 228 | assert.strictEqual(result, 1); 229 | }); 230 | 231 | it('should handle mixed type predictions', () => { 232 | const predictions = ['1', '1', '2', '2', '1']; 233 | const result = majorityVote(predictions); 234 | assert.strictEqual(result, '1'); 235 | }); 236 | 237 | it('should handle boolean string predictions', () => { 238 | const predictions = ['true', 'true', 'false', 'false', 'true']; 239 | const result = majorityVote(predictions); 240 | assert.strictEqual(result, 'true'); 241 | }); 242 | 243 | it('should handle complex object predictions', () => { 244 | const predictions = [ 245 | { value: 1 }, 246 | { value: 1 }, 247 | { value: 2 }, 248 | { value: 2 }, 249 | { value: 1 } 250 | ]; 251 | const result = majorityVote(predictions); 
252 | assert.strictEqual(result, '[object Object]'); 253 | }); 254 | }); 255 | }); 256 | -------------------------------------------------------------------------------- /tst/model-persistence.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import DecisionTree from '../lib/decision-tree.js'; 3 | 4 | describe('Model Persistence & Import/Export', () => { 5 | let originalDt: DecisionTree; 6 | let trainingData: any[]; 7 | 8 | beforeEach(() => { 9 | trainingData = [ 10 | { color: 'red', shape: 'circle', size: 'small', target: 'class1' }, 11 | { color: 'blue', shape: 'square', size: 'medium', target: 'class2' }, 12 | { color: 'green', shape: 'triangle', size: 'large', target: 'class3' }, 13 | { color: 'red', shape: 'square', size: 'medium', target: 'class1' }, 14 | { color: 'blue', shape: 'circle', size: 'small', target: 'class2' } 15 | ]; 16 | 17 | originalDt = new DecisionTree('target', ['color', 'shape', 'size']); 18 | originalDt.train(trainingData); 19 | }); 20 | 21 | describe('Export Functionality', () => { 22 | it('should export model with correct structure', () => { 23 | const exported = originalDt.toJSON(); 24 | 25 | // Verify structure 26 | assert.ok(exported.model); 27 | assert.ok(Array.isArray(exported.data)); 28 | assert.strictEqual(exported.target, 'target'); 29 | assert.ok(Array.isArray(exported.features)); 30 | assert.strictEqual(exported.features.length, 3); 31 | 32 | // Verify model properties 33 | assert.ok(typeof exported.model.type === 'string'); 34 | assert.ok(typeof exported.model.name === 'string'); 35 | assert.ok(typeof exported.model.alias === 'string'); 36 | }); 37 | 38 | it('should export model with all required properties', () => { 39 | const exported = originalDt.toJSON(); 40 | 41 | // Check that all expected properties exist 42 | const requiredProps = ['model', 'data', 'target', 'features']; 43 | for (const prop of requiredProps) { 44 | 
assert.ok(exported.hasOwnProperty(prop), `Missing property: ${prop}`); 45 | } 46 | }); 47 | 48 | it('should export model data integrity', () => { 49 | const exported = originalDt.toJSON(); 50 | 51 | // Note: Current implementation doesn't store training data in this.data 52 | // Only imported models have data stored 53 | assert.strictEqual(exported.data.length, 0); 54 | assert.strictEqual(exported.target, 'target'); 55 | assert.deepStrictEqual(exported.features, ['color', 'shape', 'size']); 56 | 57 | // Data is not preserved in current implementation 58 | // This is a design limitation 59 | }); 60 | 61 | it('should export model with tree structure', () => { 62 | const exported = originalDt.toJSON(); 63 | 64 | // Verify tree structure 65 | assert.ok(exported.model.type === 'feature' || exported.model.type === 'result'); 66 | 67 | if (exported.model.type === 'feature') { 68 | assert.ok(exported.model.vals); 69 | assert.ok(Array.isArray(exported.model.vals)); 70 | assert.ok(exported.model.vals.length > 0); 71 | 72 | // Check child nodes 73 | for (const child of exported.model.vals) { 74 | assert.ok(typeof child.name === 'string'); 75 | assert.ok(typeof child.alias === 'string'); 76 | assert.ok(typeof child.type === 'string'); 77 | assert.ok(typeof child.prob === 'number'); 78 | assert.ok(typeof child.sampleSize === 'number'); 79 | } 80 | } 81 | }); 82 | }); 83 | 84 | describe('Import Functionality', () => { 85 | it('should import model and maintain functionality', () => { 86 | const exported = originalDt.toJSON(); 87 | const importedDt = new DecisionTree(exported); 88 | 89 | // Test predictions 90 | const prediction1 = importedDt.predict({ color: 'red', shape: 'circle', size: 'small' }); 91 | const prediction2 = importedDt.predict({ color: 'blue', shape: 'square', size: 'medium' }); 92 | 93 | assert.strictEqual(prediction1, 'class1'); 94 | assert.strictEqual(prediction2, 'class2'); 95 | }); 96 | 97 | it('should handle import on existing instance', () => { 98 | const 
exported = originalDt.toJSON(); 99 | const newDt = new DecisionTree('target', ['color', 'shape', 'size']); 100 | 101 | newDt.import(exported); 102 | 103 | // Test predictions 104 | const prediction = newDt.predict({ color: 'red', shape: 'circle', size: 'small' }); 105 | assert.strictEqual(prediction, 'class1'); 106 | }); 107 | 108 | it('should maintain evaluation accuracy after import', () => { 109 | const exported = originalDt.toJSON(); 110 | const importedDt = new DecisionTree(exported); 111 | 112 | const originalAccuracy = originalDt.evaluate(trainingData); 113 | const importedAccuracy = importedDt.evaluate(trainingData); 114 | 115 | assert.strictEqual(importedAccuracy, originalAccuracy); 116 | }); 117 | 118 | it('should handle round-trip export/import', () => { 119 | const exported1 = originalDt.toJSON(); 120 | const importedDt = new DecisionTree(exported1); 121 | const exported2 = importedDt.toJSON(); 122 | 123 | // Verify structure is maintained 124 | assert.deepStrictEqual(exported1.target, exported2.target); 125 | assert.deepStrictEqual(exported1.features, exported2.features); 126 | assert.deepStrictEqual(exported1.data, exported2.data); 127 | 128 | // Verify predictions are identical 129 | const sample = { color: 'red', shape: 'circle', size: 'small' }; 130 | const prediction1 = originalDt.predict(sample); 131 | const prediction2 = importedDt.predict(sample); 132 | assert.strictEqual(prediction1, prediction2); 133 | }); 134 | }); 135 | 136 | describe('Error Handling in Import/Export', () => { 137 | it('should handle corrupted JSON import', () => { 138 | const corruptedData = { 139 | model: null, 140 | data: [], 141 | target: 'target', 142 | features: ['color', 'shape', 'size'] 143 | }; 144 | 145 | // Note: Current implementation doesn't validate imported data structure 146 | // This will fail during prediction when trying to access model properties 147 | const dt = new DecisionTree(corruptedData); 148 | assert.throws(() => { 149 | dt.predict({ color: 
'red', shape: 'circle', size: 'small' }); 150 | }, /Cannot read properties of null/); 151 | }); 152 | 153 | it('should handle missing model properties', () => { 154 | const incompleteData = { 155 | data: trainingData, 156 | target: 'target', 157 | features: ['color', 'shape', 'size'] 158 | // Missing 'model' property 159 | }; 160 | 161 | // Note: Current implementation doesn't validate imported data structure 162 | // This will fail during prediction when trying to access model properties 163 | const dt = new DecisionTree(incompleteData as any); 164 | assert.throws(() => { 165 | dt.predict({ color: 'red', shape: 'circle', size: 'small' }); 166 | }, /Cannot read properties of undefined/); 167 | }); 168 | 169 | it('should handle missing data property', () => { 170 | const incompleteData = { 171 | model: { type: 'result', val: 'class1', name: 'class1', alias: 'class1' }, 172 | target: 'target', 173 | features: ['color', 'shape', 'size'] 174 | // Missing 'data' property 175 | }; 176 | 177 | const dt = new DecisionTree(incompleteData as any); 178 | // Should not crash, but may have limited functionality 179 | assert.ok(dt); 180 | }); 181 | 182 | it('should handle missing target property', () => { 183 | const incompleteData = { 184 | model: { type: 'result', val: 'class1', name: 'class1', alias: 'class1' }, 185 | data: trainingData, 186 | features: ['color', 'shape', 'size'] 187 | // Missing 'target' property 188 | }; 189 | 190 | const dt = new DecisionTree(incompleteData as any); 191 | // Should not crash, but may have limited functionality 192 | assert.ok(dt); 193 | }); 194 | 195 | it('should handle missing features property', () => { 196 | const incompleteData = { 197 | model: { type: 'result', val: 'class1', name: 'class1', alias: 'class1' }, 198 | data: trainingData, 199 | target: 'target' 200 | // Missing 'features' property 201 | }; 202 | 203 | const dt = new DecisionTree(incompleteData as any); 204 | // Should not crash, but may have limited functionality 205 | 
assert.ok(dt); 206 | }); 207 | }); 208 | 209 | describe('Model Validation', () => { 210 | it('should validate imported model structure', () => { 211 | const exported = originalDt.toJSON(); 212 | const importedDt = new DecisionTree(exported); 213 | 214 | // Verify the imported model has the expected structure 215 | const importedModel = importedDt.toJSON(); 216 | assert.ok(importedModel.model); 217 | assert.ok(Array.isArray(importedModel.data)); 218 | assert.strictEqual(importedModel.target, 'target'); 219 | assert.ok(Array.isArray(importedModel.features)); 220 | }); 221 | 222 | it('should handle models with different feature sets', () => { 223 | const exported = originalDt.toJSON(); 224 | 225 | // Modify features in exported model 226 | exported.features = ['color', 'shape']; // Remove 'size' 227 | 228 | const importedDt = new DecisionTree(exported); 229 | 230 | // Should still work with available features 231 | const prediction = importedDt.predict({ color: 'red', shape: 'circle' }); 232 | assert.ok(typeof prediction === 'string'); 233 | }); 234 | 235 | it('should handle models with different target names', () => { 236 | const exported = originalDt.toJSON(); 237 | 238 | // Modify target in exported model 239 | exported.target = 'newTarget'; 240 | 241 | const importedDt = new DecisionTree(exported); 242 | 243 | // Should still work with new target name 244 | const prediction = importedDt.predict({ color: 'red', shape: 'circle', size: 'small' }); 245 | assert.ok(typeof prediction === 'string'); 246 | }); 247 | }); 248 | 249 | }); 250 | -------------------------------------------------------------------------------- /tst/id3-algorithm.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import DecisionTree from '../lib/decision-tree.js'; 3 | 4 | describe('ID3 Algorithm Tests', () => { 5 | describe('Entropy and Information Gain', () => { 6 | it('should handle zero entropy datasets (all 
same target)', () => { 7 | const dt = new DecisionTree('target', ['feature1']); 8 | const zeroEntropyData = [ 9 | { feature1: 'value1', target: 'class1' }, 10 | { feature1: 'value2', target: 'class1' }, 11 | { feature1: 'value3', target: 'class1' }, 12 | { feature1: 'value4', target: 'class1' } 13 | ]; 14 | 15 | dt.train(zeroEntropyData); 16 | // With zero entropy, any prediction should be 'class1' 17 | const prediction1 = dt.predict({ feature1: 'value1' }); 18 | const prediction2 = dt.predict({ feature1: 'unknown' }); 19 | assert.strictEqual(prediction1, 'class1'); 20 | assert.strictEqual(prediction2, 'class1'); 21 | }); 22 | 23 | it('should handle maximum entropy datasets (perfect distribution)', () => { 24 | const dt = new DecisionTree('target', ['feature1']); 25 | const maxEntropyData = [ 26 | { feature1: 'value1', target: 'class1' }, 27 | { feature1: 'value2', target: 'class2' }, 28 | { feature1: 'value3', target: 'class3' }, 29 | { feature1: 'value4', target: 'class4' } 30 | ]; 31 | 32 | dt.train(maxEntropyData); 33 | // Should still make predictions despite high entropy 34 | const prediction = dt.predict({ feature1: 'value1' }); 35 | assert.ok(['class1', 'class2', 'class3', 'class4'].includes(prediction)); 36 | }); 37 | 38 | it('should handle balanced vs imbalanced datasets', () => { 39 | const dt = new DecisionTree('target', ['feature1']); 40 | 41 | // Balanced dataset 42 | const balancedData = [ 43 | { feature1: 'value1', target: 'class1' }, 44 | { feature1: 'value2', target: 'class2' } 45 | ]; 46 | 47 | dt.train(balancedData); 48 | const balancedPrediction = dt.predict({ feature1: 'value1' }); 49 | assert.strictEqual(balancedPrediction, 'class1'); 50 | 51 | // Imbalanced dataset (90/10) 52 | const imbalancedData = [ 53 | { feature1: 'value1', target: 'class1' }, 54 | { feature1: 'value1', target: 'class1' }, 55 | { feature1: 'value1', target: 'class1' }, 56 | { feature1: 'value1', target: 'class1' }, 57 | { feature1: 'value1', target: 'class1' }, 58 | { 
feature1: 'value1', target: 'class1' }, 59 | { feature1: 'value1', target: 'class1' }, 60 | { feature1: 'value1', target: 'class1' }, 61 | { feature1: 'value1', target: 'class1' }, 62 | { feature1: 'value2', target: 'class2' } 63 | ]; 64 | 65 | dt.train(imbalancedData); 66 | const imbalancedPrediction = dt.predict({ feature1: 'value1' }); 67 | assert.strictEqual(imbalancedPrediction, 'class1'); 68 | }); 69 | }); 70 | 71 | describe('Feature Selection and Splitting', () => { 72 | it('should handle single feature datasets', () => { 73 | const dt = new DecisionTree('target', ['feature1']); 74 | const singleFeatureData = [ 75 | { feature1: 'value1', target: 'class1' }, 76 | { feature1: 'value2', target: 'class2' } 77 | ]; 78 | 79 | dt.train(singleFeatureData); 80 | const prediction1 = dt.predict({ feature1: 'value1' }); 81 | const prediction2 = dt.predict({ feature1: 'value2' }); 82 | assert.strictEqual(prediction1, 'class1'); 83 | assert.strictEqual(prediction2, 'class2'); 84 | }); 85 | 86 | it('should handle categorical vs numerical features', () => { 87 | const dt = new DecisionTree('target', ['category', 'number']); 88 | const mixedData = [ 89 | { category: 'A', number: 1, target: 'class1' }, 90 | { category: 'A', number: 2, target: 'class1' }, 91 | { category: 'B', number: 1, target: 'class2' }, 92 | { category: 'B', number: 2, target: 'class2' } 93 | ]; 94 | 95 | dt.train(mixedData); 96 | const prediction1 = dt.predict({ category: 'A', number: 1 }); 97 | const prediction2 = dt.predict({ category: 'B', number: 2 }); 98 | assert.strictEqual(prediction1, 'class1'); 99 | assert.strictEqual(prediction2, 'class2'); 100 | }); 101 | 102 | it('should handle features with no predictive power', () => { 103 | const dt = new DecisionTree('target', ['useful', 'useless']); 104 | const data = [ 105 | { useful: 'A', useless: 'X', target: 'class1' }, 106 | { useful: 'A', useless: 'Y', target: 'class1' }, 107 | { useful: 'B', useless: 'X', target: 'class2' }, 108 | { useful: 'B', 
useless: 'Y', target: 'class2' } 109 | ]; 110 | 111 | dt.train(data); 112 | // The tree should prioritize the 'useful' feature 113 | const prediction1 = dt.predict({ useful: 'A', useless: 'anything' }); 114 | const prediction2 = dt.predict({ useful: 'B', useless: 'anything' }); 115 | assert.strictEqual(prediction1, 'class1'); 116 | assert.strictEqual(prediction2, 'class2'); 117 | }); 118 | }); 119 | 120 | describe('Tree Structure and Depth', () => { 121 | it('should handle very deep tree structures', () => { 122 | const dt = new DecisionTree('target', ['level1', 'level2', 'level3', 'level4', 'level5']); 123 | const deepData = [ 124 | { level1: 'A', level2: 'A', level3: 'A', level4: 'A', level5: 'A', target: 'class1' }, 125 | { level1: 'A', level2: 'A', level3: 'A', level4: 'A', level5: 'B', target: 'class2' }, 126 | { level1: 'A', level2: 'A', level3: 'A', level4: 'B', level5: 'A', target: 'class3' }, 127 | { level1: 'A', level2: 'A', level3: 'A', level4: 'B', level5: 'B', target: 'class4' }, 128 | { level1: 'A', level2: 'A', level3: 'B', level4: 'A', level5: 'A', target: 'class5' }, 129 | { level1: 'A', level2: 'A', level3: 'B', level4: 'A', level5: 'B', target: 'class6' }, 130 | { level1: 'A', level2: 'A', level3: 'B', level4: 'B', level5: 'A', target: 'class7' }, 131 | { level1: 'A', level2: 'A', level3: 'B', level4: 'B', level5: 'B', target: 'class8' } 132 | ]; 133 | 134 | dt.train(deepData); 135 | const prediction = dt.predict({ level1: 'A', level2: 'A', level3: 'A', level4: 'A', level5: 'A' }); 136 | assert.strictEqual(prediction, 'class1'); 137 | }); 138 | 139 | it('should handle wide tree structures (many feature values)', () => { 140 | const dt = new DecisionTree('target', ['feature']); 141 | const wideData = []; 142 | 143 | // Create data with 20 different feature values 144 | for (let i = 0; i < 20; i++) { 145 | wideData.push({ feature: `value${i}`, target: `class${i % 3 + 1}` }); 146 | } 147 | 148 | dt.train(wideData); 149 | const prediction = 
dt.predict({ feature: 'value5' }); 150 | assert.ok(['class1', 'class2', 'class3'].includes(prediction)); 151 | }); 152 | 153 | it('should handle balanced vs unbalanced tree splits', () => { 154 | const dt = new DecisionTree('target', ['feature']); 155 | 156 | // Balanced split 157 | const balancedData = [ 158 | { feature: 'A', target: 'class1' }, 159 | { feature: 'A', target: 'class1' }, 160 | { feature: 'B', target: 'class2' }, 161 | { feature: 'B', target: 'class2' } 162 | ]; 163 | 164 | dt.train(balancedData); 165 | const balancedPrediction = dt.predict({ feature: 'A' }); 166 | assert.strictEqual(balancedPrediction, 'class1'); 167 | 168 | // Unbalanced split (3 vs 1) 169 | const unbalancedData = [ 170 | { feature: 'A', target: 'class1' }, 171 | { feature: 'A', target: 'class1' }, 172 | { feature: 'A', target: 'class1' }, 173 | { feature: 'B', target: 'class2' } 174 | ]; 175 | 176 | dt.train(unbalancedData); 177 | const unbalancedPrediction = dt.predict({ feature: 'A' }); 178 | assert.strictEqual(unbalancedPrediction, 'class1'); 179 | }); 180 | }); 181 | 182 | describe('Algorithm Correctness', () => { 183 | it('should verify information gain calculations', () => { 184 | const dt = new DecisionTree('target', ['feature']); 185 | const data = [ 186 | { feature: 'A', target: 'class1' }, 187 | { feature: 'A', target: 'class1' }, 188 | { feature: 'B', target: 'class2' }, 189 | { feature: 'B', target: 'class2' } 190 | ]; 191 | 192 | dt.train(data); 193 | const model = dt.toJSON(); 194 | 195 | // Verify the tree structure makes sense 196 | assert.ok(model.model.type === 'feature' || model.model.type === 'result'); 197 | if (model.model.type === 'feature') { 198 | assert.ok(model.model.vals); 199 | assert.ok(Array.isArray(model.model.vals)); 200 | assert.ok(model.model.vals.length === 2); // Two feature values: A and B 201 | } 202 | }); 203 | 204 | it('should handle tie-breaking in feature selection', () => { 205 | const dt = new DecisionTree('target', ['feature1', 
'feature2']); 206 | const tieData = [ 207 | { feature1: 'A', feature2: 'X', target: 'class1' }, 208 | { feature1: 'A', feature2: 'Y', target: 'class2' }, 209 | { feature1: 'B', feature2: 'X', target: 'class1' }, 210 | { feature1: 'B', feature2: 'Y', target: 'class2' } 211 | ]; 212 | 213 | dt.train(tieData); 214 | // Both features should have equal information gain 215 | // The algorithm should handle this gracefully 216 | const prediction = dt.predict({ feature1: 'A', feature2: 'X' }); 217 | assert.ok(['class1', 'class2'].includes(prediction)); 218 | }); 219 | 220 | it('should handle datasets with perfect correlation', () => { 221 | const dt = new DecisionTree('target', ['feature1', 'feature2']); 222 | const correlatedData = [ 223 | { feature1: 'A', feature2: 'X', target: 'class1' }, 224 | { feature1: 'A', feature2: 'X', target: 'class1' }, 225 | { feature1: 'B', feature2: 'Y', target: 'class2' }, 226 | { feature1: 'B', feature2: 'Y', target: 'class2' } 227 | ]; 228 | 229 | dt.train(correlatedData); 230 | // The tree should use one feature and ignore the other 231 | const prediction = dt.predict({ feature1: 'A', feature2: 'X' }); 232 | assert.strictEqual(prediction, 'class1'); 233 | }); 234 | }); 235 | }); 236 | -------------------------------------------------------------------------------- /tst/xgboost-loss-functions.ts: -------------------------------------------------------------------------------- 1 | import { strict as assert } from 'assert'; 2 | import { MSELoss, LogisticLoss, CrossEntropyLoss, LossFunctionFactory } from '../lib/shared/loss-functions.js'; 3 | 4 | describe('XGBoost Loss Functions - MSE Loss', function() { 5 | it('should calculate gradient correctly', () => { 6 | const prediction = 0.8; 7 | const actual = 1.0; 8 | const gradient = MSELoss.calculateGradient(prediction, actual); 9 | assert.ok(Math.abs(gradient - (-0.2)) < 1e-10); 10 | }); 11 | 12 | it('should calculate hessian correctly', () => { 13 | const prediction = 0.8; 14 | const actual = 
1.0;
    const hessian = MSELoss.calculateHessian(prediction, actual);
    // MSE hessian is the constant second derivative of (p - y)^2 / 2, i.e. 1.
    assert.strictEqual(hessian, 1);
  });

  it('should calculate loss correctly', () => {
    const predictions = [0.8, 1.2, 0.9];
    const actuals = [1.0, 1.0, 1.0];
    const loss = MSELoss.calculateLoss(predictions, actuals);
    const expected = (0.04 + 0.04 + 0.01) / 3; // (0.2^2 + 0.2^2 + 0.1^2) / 3
    assert.ok(Math.abs(loss - expected) < 1e-10);
  });

  it('should calculate gradients and hessians together', () => {
    const predictions = [0.8, 1.2, 0.9];
    const actuals = [1.0, 1.0, 1.0];
    const result = MSELoss.calculateGradientsAndHessians(predictions, actuals);

    assert.ok(Array.isArray(result.gradient));
    assert.ok(Array.isArray(result.hessian));
    assert.strictEqual(result.gradient.length, 3);
    assert.strictEqual(result.hessian.length, 3);

    // Gradients are (prediction - actual); hessians are the constant 1.
    assert.ok(Math.abs(result.gradient[0] - (-0.2)) < 1e-10);
    assert.ok(Math.abs(result.gradient[1] - 0.2) < 1e-10);
    assert.ok(Math.abs(result.gradient[2] - (-0.1)) < 1e-10);

    assert.strictEqual(result.hessian[0], 1);
    assert.strictEqual(result.hessian[1], 1);
    assert.strictEqual(result.hessian[2], 1);
  });

  it('should handle perfect predictions', () => {
    const predictions = [1.0, 2.0, 3.0];
    const actuals = [1.0, 2.0, 3.0];
    const loss = MSELoss.calculateLoss(predictions, actuals);
    assert.strictEqual(loss, 0);

    const result = MSELoss.calculateGradientsAndHessians(predictions, actuals);
    assert.deepStrictEqual(result.gradient, [0, 0, 0]);
    assert.deepStrictEqual(result.hessian, [1, 1, 1]);
  });
});

describe('XGBoost Loss Functions - Logistic Loss', function() {
  it('should calculate sigmoid correctly', () => {
    // Reference values: sigmoid(0) = 0.5, sigmoid(±1) from 1/(1+e^∓1).
    assert.strictEqual(LogisticLoss.sigmoid(0), 0.5);
    assert.ok(Math.abs(LogisticLoss.sigmoid(1) - 0.7310585786300049) < 1e-10);
    assert.ok(Math.abs(LogisticLoss.sigmoid(-1) - 0.2689414213699951) < 1e-10);
  });

  it('should handle extreme values in sigmoid', () => {
    // Large magnitudes must saturate toward 1/0 without overflowing.
    assert.ok(LogisticLoss.sigmoid(500) > 0.999);
    assert.ok(LogisticLoss.sigmoid(-500) < 0.001);
  });

  it('should calculate gradient correctly', () => {
    const prediction = 1.0; // log-odds
    const actual = 1;
    const gradient = LogisticLoss.calculateGradient(prediction, actual);
    const expected = LogisticLoss.sigmoid(prediction) - actual;
    assert.ok(Math.abs(gradient - expected) < 1e-10);
  });

  it('should calculate hessian correctly', () => {
    const prediction = 1.0;
    const actual = 1;
    const hessian = LogisticLoss.calculateHessian(prediction, actual);
    // Logistic hessian is p * (1 - p) with p = sigmoid(prediction).
    const prob = LogisticLoss.sigmoid(prediction);
    const expected = prob * (1 - prob);
    assert.ok(Math.abs(hessian - expected) < 1e-10);
  });

  it('should calculate loss correctly', () => {
    const predictions = [1.0, -1.0, 0.0];
    const actuals = [1, 0, 1];
    const loss = LogisticLoss.calculateLoss(predictions, actuals);
    assert.ok(loss > 0);
    assert.ok(typeof loss === 'number');
  });

  it('should calculate gradients and hessians together', () => {
    const predictions = [1.0, -1.0, 0.0];
    const actuals = [1, 0, 1];
    const result = LogisticLoss.calculateGradientsAndHessians(predictions, actuals);

    assert.ok(Array.isArray(result.gradient));
    assert.ok(Array.isArray(result.hessian));
    assert.strictEqual(result.gradient.length, 3);
    assert.strictEqual(result.hessian.length, 3);

    // Check that gradients and hessians are calculated
    result.gradient.forEach(grad => assert.ok(typeof grad === 'number'));
    result.hessian.forEach(hess => assert.ok(typeof hess === 'number' && hess >= 0));
  });

  it('should handle perfect predictions', () => {
    const predictions = [10.0, -10.0, 5.0];
    const actuals = [1, 0, 1];
    const loss = LogisticLoss.calculateLoss(predictions, actuals);
    assert.ok(loss >= 0); // Should be non-negative
  });
});

describe('XGBoost Loss Functions - Cross Entropy Loss', function() {
  it('should calculate softmax correctly', () => {
    const x = [1.0, 2.0, 3.0];
    const softmax = CrossEntropyLoss.softmax(x);

    assert.ok(Array.isArray(softmax));
    assert.strictEqual(softmax.length, 3);

    // Sum should be 1
    const sum = softmax.reduce((a, b) => a + b, 0);
    assert.ok(Math.abs(sum - 1) < 1e-10);

    // All values should be positive
    softmax.forEach(val => assert.ok(val > 0));
  });

  it('should handle extreme values in softmax', () => {
    const x = [1000, 1001, 1002];
    const softmax = CrossEntropyLoss.softmax(x);

    // Should not overflow
    assert.ok(Array.isArray(softmax));
    assert.ok(softmax.every(val => val >= 0 && val <= 1));

    const sum = softmax.reduce((a, b) => a + b, 0);
    assert.ok(Math.abs(sum - 1) < 1e-10);
  });

  it('should calculate gradient correctly', () => {
    const predictions = [1.0, 2.0, 3.0];
    const actual = 1; // Index of correct class
    const gradient = CrossEntropyLoss.calculateGradient(predictions, actual);

    assert.ok(Array.isArray(gradient));
    assert.strictEqual(gradient.length, 3);

    // Sum should be 0
    const sum = gradient.reduce((a, b) => a + b, 0);
    assert.ok(Math.abs(sum) < 1e-10);
  });

  it('should calculate hessian correctly', () => {
    const predictions = [1.0, 2.0, 3.0];
    const actual = 1;
    const hessian = CrossEntropyLoss.calculateHessian(predictions, actual);

    // Per-sample hessian is a K x K matrix for K classes.
    assert.ok(Array.isArray(hessian));
    assert.strictEqual(hessian.length, 3);
    assert.ok(Array.isArray(hessian[0]));
    assert.strictEqual(hessian[0].length, 3);
  });

  it('should calculate loss correctly', () => {
    const predictions = [1.0, 2.0, 3.0];
    const actuals = [1, 0, 2];
    const loss = CrossEntropyLoss.calculateLoss(predictions, actuals);

    assert.ok(loss >= 0);
    assert.ok(typeof loss === 'number');
  });

  it('should calculate gradients and hessians together', () => {
    const predictions = [1.0, 2.0, 3.0];
    const actuals = [1, 0, 2];
    const result = CrossEntropyLoss.calculateGradientsAndHessians(predictions, actuals);

    assert.ok(Array.isArray(result.gradient));
    assert.ok(Array.isArray(result.hessian));
    assert.strictEqual(result.gradient.length, 3);
    assert.strictEqual(result.hessian.length, 3);

    // Check that gradients and hessians are calculated
    result.gradient.forEach(grad => assert.ok(typeof grad === 'number'));
    result.hessian.forEach(hess => assert.ok(typeof hess === 'number' && hess >= 0));
  });

  it('should handle perfect predictions', () => {
    const predictions = [10.0, 0.0, 0.0];
    const actuals = [0, 1, 2];
    const loss = CrossEntropyLoss.calculateLoss(predictions, actuals);
    assert.ok(loss >= 0); // Should be non-negative
  });
});

describe('XGBoost Loss Functions - Factory', function() {
  it('should create MSE loss for regression', () => {
    const lossFunction = LossFunctionFactory.create('regression');
    assert.strictEqual(lossFunction, MSELoss);
  });

  it('should create logistic loss for binary classification', () => {
    const lossFunction = LossFunctionFactory.create('binary');
    assert.strictEqual(lossFunction, LogisticLoss);
  });

  it('should create cross entropy loss for multiclass classification', () => {
    const lossFunction = LossFunctionFactory.create('multiclass');
    assert.strictEqual(lossFunction, CrossEntropyLoss);
  });

  it('should throw error for unsupported objective', () => {
    assert.throws(() => LossFunctionFactory.create('unsupported' as any));
  });

  it('should have consistent interface across all loss functions', () => {
    const objectives = ['regression', 'binary', 'multiclass'] as const;

    objectives.forEach(objective => {
      const lossFunction = LossFunctionFactory.create(objective);

      // Test that all required methods exist
      assert.ok(typeof lossFunction.calculateGradientsAndHessians === 'function');
      assert.ok(typeof lossFunction.calculateLoss === 'function');

      // Test with sample data
      const predictions = [0.5, 1.0, 1.5];
      const actuals = [0, 1, 1];

      const result = lossFunction.calculateGradientsAndHessians(predictions, actuals);
      assert.ok(Array.isArray(result.gradient));
      assert.ok(Array.isArray(result.hessian));
      assert.strictEqual(result.gradient.length, 3);
      assert.strictEqual(result.hessian.length, 3);

      const loss = lossFunction.calculateLoss(predictions, actuals);
      assert.ok(typeof loss === 'number');
    });
  });
});

describe('XGBoost Loss Functions - Edge Cases', function() {
  it('should handle empty arrays', () => {
    const predictions: number[] = [];
    const actuals: number[] = [];

    // Empty input may legitimately yield NaN (0/0 mean) or 0 depending on impl.
    const mseLoss = MSELoss.calculateLoss(predictions, actuals);
    assert.ok(isNaN(mseLoss) || mseLoss === 0);

    const logisticLoss = LogisticLoss.calculateLoss(predictions, actuals);
    assert.ok(isNaN(logisticLoss) || logisticLoss === 0);

    const crossEntropyLoss = CrossEntropyLoss.calculateLoss(predictions, actuals);
    assert.ok(isNaN(crossEntropyLoss) || crossEntropyLoss === 0);
  });

  it('should handle single element arrays', () => {
    const predictions = [0.5];
    const actuals = [1];

    const mseResult = MSELoss.calculateGradientsAndHessians(predictions, actuals);
    assert.strictEqual(mseResult.gradient.length, 1);
    assert.strictEqual(mseResult.hessian.length, 1);

    const logisticResult = LogisticLoss.calculateGradientsAndHessians(predictions, actuals);
    assert.strictEqual(logisticResult.gradient.length, 1);
    assert.strictEqual(logisticResult.hessian.length, 1);

    const crossEntropyResult = CrossEntropyLoss.calculateGradientsAndHessians(predictions, actuals);
    assert.strictEqual(crossEntropyResult.gradient.length, 1);
    assert.strictEqual(crossEntropyResult.hessian.length, 1);
  });

  it('should handle identical predictions and actuals', () => {
    const predictions = [1.0, 2.0, 3.0];
    const actuals = [1.0, 2.0, 3.0];

    const mseLoss = MSELoss.calculateLoss(predictions, actuals);
    assert.strictEqual(mseLoss, 0);

    const mseResult = MSELoss.calculateGradientsAndHessians(predictions, actuals);
    assert.deepStrictEqual(mseResult.gradient, [0, 0, 0]);
    assert.deepStrictEqual(mseResult.hessian, [1, 1, 1]);
  });

  it('should handle very large numbers', () => {
    const predictions = [1e10, -1e10, 0];
    const actuals = [1, 0, 1];

    // Should not throw or return NaN
    assert.doesNotThrow(() => {
      MSELoss.calculateLoss(predictions, actuals);
      LogisticLoss.calculateLoss(predictions, actuals);
      CrossEntropyLoss.calculateLoss(predictions, actuals);
    });
  });

  it('should handle very small numbers', () => {
    const predictions = [1e-10, -1e-10, 0];
    const actuals = [1, 0, 1];

    // Should not throw or return NaN
    assert.doesNotThrow(() => {
      MSELoss.calculateLoss(predictions, actuals);
      LogisticLoss.calculateLoss(predictions, actuals);
      CrossEntropyLoss.calculateLoss(predictions, actuals);
    });
  });
});
--------------------------------------------------------------------------------
/src/shared/caching-system.ts:
-------------------------------------------------------------------------------- 1 | /** 2 | * Intelligent Caching System for Performance Optimization 3 | * Provides caching for statistics, split points, and computations 4 | */ 5 | 6 | import { TrainingData } from './types.js'; 7 | 8 | export interface CachedStatistics { 9 | mean: number; 10 | variance: number; 11 | std: number; 12 | min: number; 13 | max: number; 14 | quartiles: number[]; 15 | uniqueValues: Set; 16 | sortedValues: number[]; 17 | lastUpdated: number; 18 | sampleCount: number; 19 | } 20 | 21 | export interface CachedSplitPoint { 22 | threshold: number; 23 | gain: number; 24 | leftCount: number; 25 | rightCount: number; 26 | lastUpdated: number; 27 | } 28 | 29 | export interface CachedNodeData { 30 | entropy: number; 31 | gini: number; 32 | sampleCount: number; 33 | targetDistribution: Map; 34 | lastUpdated: number; 35 | } 36 | 37 | export interface CacheConfig { 38 | maxSize: number; 39 | ttl: number; // Time to live in milliseconds 40 | enableStatisticsCache: boolean; 41 | enableSplitPointCache: boolean; 42 | enableNodeCache: boolean; 43 | enablePredictionCache: boolean; 44 | } 45 | 46 | export class PerformanceCache { 47 | private statisticsCache = new Map(); 48 | private splitPointCache = new Map(); 49 | private nodeCache = new Map(); 50 | private predictionCache = new Map(); 51 | private config: CacheConfig; 52 | private accessCounts = new Map(); 53 | private lastAccess = new Map(); 54 | 55 | constructor(config: Partial = {}) { 56 | this.config = { 57 | maxSize: 1000, 58 | ttl: 300000, // 5 minutes 59 | enableStatisticsCache: true, 60 | enableSplitPointCache: true, 61 | enableNodeCache: true, 62 | enablePredictionCache: true, 63 | ...config 64 | }; 65 | } 66 | 67 | /** 68 | * Gets cached statistics for a feature 69 | */ 70 | getStatistics(feature: string, data: TrainingData[]): CachedStatistics | null { 71 | if (!this.config.enableStatisticsCache) return null; 72 | 73 | const key = 
this.generateStatisticsKey(feature, data); 74 | const cached = this.statisticsCache.get(key); 75 | 76 | if (cached && this.isValid(cached.lastUpdated)) { 77 | this.updateAccess(key); 78 | return cached; 79 | } 80 | 81 | if (cached) { 82 | this.statisticsCache.delete(key); 83 | } 84 | 85 | return null; 86 | } 87 | 88 | /** 89 | * Sets cached statistics for a feature 90 | */ 91 | setStatistics(feature: string, data: TrainingData[], statistics: CachedStatistics): void { 92 | if (!this.config.enableStatisticsCache) return; 93 | 94 | const key = this.generateStatisticsKey(feature, data); 95 | this.setCacheEntry(key, statistics, this.statisticsCache); 96 | } 97 | 98 | /** 99 | * Gets cached split point for a feature 100 | */ 101 | getSplitPoint(feature: string, data: TrainingData[]): CachedSplitPoint | null { 102 | if (!this.config.enableSplitPointCache) return null; 103 | 104 | const key = this.generateSplitPointKey(feature, data); 105 | const cached = this.splitPointCache.get(key); 106 | 107 | if (cached && this.isValid(cached.lastUpdated)) { 108 | this.updateAccess(key); 109 | return cached; 110 | } 111 | 112 | if (cached) { 113 | this.splitPointCache.delete(key); 114 | } 115 | 116 | return null; 117 | } 118 | 119 | /** 120 | * Sets cached split point for a feature 121 | */ 122 | setSplitPoint(feature: string, data: TrainingData[], splitPoint: CachedSplitPoint): void { 123 | if (!this.config.enableSplitPointCache) return; 124 | 125 | const key = this.generateSplitPointKey(feature, data); 126 | this.setCacheEntry(key, splitPoint, this.splitPointCache); 127 | } 128 | 129 | /** 130 | * Gets cached node data 131 | */ 132 | getNodeData(nodeId: string, data: TrainingData[]): CachedNodeData | null { 133 | if (!this.config.enableNodeCache) return null; 134 | 135 | const key = this.generateNodeKey(nodeId, data); 136 | const cached = this.nodeCache.get(key); 137 | 138 | if (cached && this.isValid(cached.lastUpdated)) { 139 | this.updateAccess(key); 140 | return cached; 141 | } 
142 | 143 | if (cached) { 144 | this.nodeCache.delete(key); 145 | } 146 | 147 | return null; 148 | } 149 | 150 | /** 151 | * Sets cached node data 152 | */ 153 | setNodeData(nodeId: string, data: TrainingData[], nodeData: CachedNodeData): void { 154 | if (!this.config.enableNodeCache) return; 155 | 156 | const key = this.generateNodeKey(nodeId, data); 157 | this.setCacheEntry(key, nodeData, this.nodeCache); 158 | } 159 | 160 | /** 161 | * Gets cached prediction 162 | */ 163 | getPrediction(sample: TrainingData, modelId: string): any | null { 164 | if (!this.config.enablePredictionCache) return null; 165 | 166 | const key = this.generatePredictionKey(sample, modelId); 167 | const cached = this.predictionCache.get(key); 168 | 169 | if (cached && this.isValid(cached.lastUpdated)) { 170 | this.updateAccess(key); 171 | return cached; 172 | } 173 | 174 | if (cached) { 175 | this.predictionCache.delete(key); 176 | } 177 | 178 | return null; 179 | } 180 | 181 | /** 182 | * Sets cached prediction 183 | */ 184 | setPrediction(sample: TrainingData, modelId: string, prediction: any): void { 185 | if (!this.config.enablePredictionCache) return; 186 | 187 | const key = this.generatePredictionKey(sample, modelId); 188 | this.setCacheEntry(key, prediction, this.predictionCache); 189 | } 190 | 191 | /** 192 | * Clears all caches 193 | */ 194 | clear(): void { 195 | this.statisticsCache.clear(); 196 | this.splitPointCache.clear(); 197 | this.nodeCache.clear(); 198 | this.predictionCache.clear(); 199 | this.accessCounts.clear(); 200 | this.lastAccess.clear(); 201 | } 202 | 203 | /** 204 | * Clears expired entries 205 | */ 206 | clearExpired(): void { 207 | const now = Date.now(); 208 | 209 | this.clearExpiredFromCache(this.statisticsCache, now); 210 | this.clearExpiredFromCache(this.splitPointCache, now); 211 | this.clearExpiredFromCache(this.nodeCache, now); 212 | this.clearExpiredFromCache(this.predictionCache, now); 213 | } 214 | 215 | /** 216 | * Gets cache statistics 217 | */ 
218 | getCacheStats(): { 219 | statisticsCache: { size: number; hitRate: number }; 220 | splitPointCache: { size: number; hitRate: number }; 221 | nodeCache: { size: number; hitRate: number }; 222 | predictionCache: { size: number; hitRate: number }; 223 | totalSize: number; 224 | memoryUsage: number; 225 | } { 226 | const stats = { 227 | statisticsCache: this.getCacheStatsForMap(this.statisticsCache), 228 | splitPointCache: this.getCacheStatsForMap(this.splitPointCache), 229 | nodeCache: this.getCacheStatsForMap(this.nodeCache), 230 | predictionCache: this.getCacheStatsForMap(this.predictionCache), 231 | totalSize: 0, 232 | memoryUsage: 0 233 | }; 234 | 235 | stats.totalSize = stats.statisticsCache.size + stats.splitPointCache.size + 236 | stats.nodeCache.size + stats.predictionCache.size; 237 | 238 | // Estimate memory usage (rough calculation) 239 | stats.memoryUsage = this.estimateMemoryUsage(); 240 | 241 | return stats; 242 | } 243 | 244 | private generateStatisticsKey(feature: string, data: TrainingData[]): string { 245 | const dataHash = this.hashData(data); 246 | return `stats_${feature}_${dataHash}`; 247 | } 248 | 249 | private generateSplitPointKey(feature: string, data: TrainingData[]): string { 250 | const dataHash = this.hashData(data); 251 | return `split_${feature}_${dataHash}`; 252 | } 253 | 254 | private generateNodeKey(nodeId: string, data: TrainingData[]): string { 255 | const dataHash = this.hashData(data); 256 | return `node_${nodeId}_${dataHash}`; 257 | } 258 | 259 | private generatePredictionKey(sample: TrainingData, modelId: string): string { 260 | const sampleHash = this.hashObject(sample); 261 | return `pred_${modelId}_${sampleHash}`; 262 | } 263 | 264 | private hashData(data: TrainingData[]): string { 265 | // Simple hash based on data length and first few values 266 | if (data.length === 0) return 'empty'; 267 | 268 | const sample = data[0]; 269 | const keys = Object.keys(sample).sort(); 270 | const hash = keys.map(key => 
`${key}:${sample[key]}`).join('|'); 271 | return `${data.length}_${hash}`; 272 | } 273 | 274 | private hashObject(obj: any): string { 275 | return JSON.stringify(obj, Object.keys(obj).sort()); 276 | } 277 | 278 | private isValid(timestamp: number): boolean { 279 | return Date.now() - timestamp < this.config.ttl; 280 | } 281 | 282 | private updateAccess(key: string): void { 283 | this.accessCounts.set(key, (this.accessCounts.get(key) || 0) + 1); 284 | this.lastAccess.set(key, Date.now()); 285 | } 286 | 287 | private setCacheEntry(key: string, value: T, cache: Map): void { 288 | // Check if we need to evict entries 289 | if (cache.size >= this.config.maxSize) { 290 | this.evictLeastRecentlyUsed(cache); 291 | } 292 | 293 | cache.set(key, value); 294 | this.lastAccess.set(key, Date.now()); 295 | } 296 | 297 | private evictLeastRecentlyUsed(cache: Map): void { 298 | let oldestKey = ''; 299 | let oldestTime = Date.now(); 300 | 301 | for (const [key, value] of cache.entries()) { 302 | const lastAccessTime = this.lastAccess.get(key) || 0; 303 | if (lastAccessTime < oldestTime) { 304 | oldestTime = lastAccessTime; 305 | oldestKey = key; 306 | } 307 | } 308 | 309 | if (oldestKey) { 310 | cache.delete(oldestKey); 311 | this.lastAccess.delete(oldestKey); 312 | this.accessCounts.delete(oldestKey); 313 | } 314 | } 315 | 316 | private clearExpiredFromCache( 317 | cache: Map, 318 | now: number 319 | ): void { 320 | for (const [key, value] of cache.entries()) { 321 | if (now - value.lastUpdated >= this.config.ttl) { 322 | cache.delete(key); 323 | this.lastAccess.delete(key); 324 | this.accessCounts.delete(key); 325 | } 326 | } 327 | } 328 | 329 | private getCacheStatsForMap( 330 | cache: Map 331 | ): { size: number; hitRate: number } { 332 | const size = cache.size; 333 | const totalAccesses = Array.from(this.accessCounts.values()).reduce((a, b) => a + b, 0); 334 | const hitRate = totalAccesses > 0 ? 
335 | Array.from(this.accessCounts.values()).reduce((a, b) => a + b, 0) / totalAccesses : 0; 336 | 337 | return { size, hitRate }; 338 | } 339 | 340 | private estimateMemoryUsage(): number { 341 | // Rough estimation of memory usage in bytes 342 | let totalBytes = 0; 343 | 344 | // Statistics cache 345 | for (const [key, stats] of this.statisticsCache.entries()) { 346 | totalBytes += key.length * 2; // String length * 2 bytes per char 347 | totalBytes += 8 * 8; // 8 numbers * 8 bytes each 348 | totalBytes += stats.uniqueValues.size * 8; // Set entries 349 | totalBytes += stats.sortedValues.length * 8; // Array entries 350 | } 351 | 352 | // Split point cache 353 | for (const [key, split] of this.splitPointCache.entries()) { 354 | totalBytes += key.length * 2; 355 | totalBytes += 4 * 8; // 4 numbers * 8 bytes each 356 | } 357 | 358 | // Node cache 359 | for (const [key, node] of this.nodeCache.entries()) { 360 | totalBytes += key.length * 2; 361 | totalBytes += 3 * 8; // 3 numbers * 8 bytes each 362 | totalBytes += node.targetDistribution.size * 16; // Map entries 363 | } 364 | 365 | // Prediction cache 366 | for (const [key, pred] of this.predictionCache.entries()) { 367 | totalBytes += key.length * 2; 368 | totalBytes += JSON.stringify(pred).length * 2; 369 | } 370 | 371 | return totalBytes; 372 | } 373 | } 374 | 375 | /** 376 | * Global cache instance 377 | */ 378 | export const globalCache = new PerformanceCache(); 379 | 380 | /** 381 | * Convenience functions for global cache 382 | */ 383 | export function getCachedStatistics(feature: string, data: TrainingData[]): CachedStatistics | null { 384 | return globalCache.getStatistics(feature, data); 385 | } 386 | 387 | export function setCachedStatistics(feature: string, data: TrainingData[], statistics: CachedStatistics): void { 388 | globalCache.setStatistics(feature, data, statistics); 389 | } 390 | 391 | export function getCachedSplitPoint(feature: string, data: TrainingData[]): CachedSplitPoint | null { 392 | 
return globalCache.getSplitPoint(feature, data); 393 | } 394 | 395 | export function setCachedSplitPoint(feature: string, data: TrainingData[], splitPoint: CachedSplitPoint): void { 396 | globalCache.setSplitPoint(feature, data, splitPoint); 397 | } 398 | 399 | export function getCachedPrediction(sample: TrainingData, modelId: string): any | null { 400 | return globalCache.getPrediction(sample, modelId); 401 | } 402 | 403 | export function setCachedPrediction(sample: TrainingData, modelId: string, prediction: any): void { 404 | globalCache.setPrediction(sample, modelId, prediction); 405 | } 406 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Decision Tree Library 2 | 3 | Thank you for your interest in contributing to this machine learning library! This guide will help you contribute high-quality code that maintains the project's standards. 4 | 5 | ## Table of Contents 6 | 7 | - [Getting Started](#getting-started) 8 | - [Development Workflow](#development-workflow) 9 | - [Code Quality Standards](#code-quality-standards) 10 | - [Testing Requirements](#testing-requirements) 11 | - [Pull Request Process](#pull-request-process) 12 | - [Code Style Guidelines](#code-style-guidelines) 13 | - [Project Structure](#project-structure) 14 | - [Common Issues](#common-issues) 15 | 16 | ## Getting Started 17 | 18 | ### Prerequisites 19 | 20 | - Node.js 20+ or Bun 1.0+ 21 | - npm, yarn, or bun package manager 22 | - Git 23 | - Basic understanding of TypeScript 24 | - Familiarity with machine learning concepts 25 | 26 | ### Why Use Bun? 
27 | 28 | Bun is fully supported and offers several advantages for development: 29 | 30 | - **Faster Installation**: Package installation is significantly faster than npm 31 | - **Built-in TypeScript**: No need for ts-node or additional TypeScript tooling 32 | - **Faster Test Execution**: Bun's test runner is optimized for speed 33 | - **Better Performance**: Generally faster execution for JavaScript/TypeScript code 34 | - **Compatible**: Works with all existing npm packages and scripts 35 | 36 | ### Setting Up Development Environment 37 | 38 | 1. **Fork the repository** on GitHub 39 | 2. **Clone your fork** locally: 40 | ```bash 41 | git clone https://github.com/your-username/nodejs-decision-tree.git 42 | cd nodejs-decision-tree 43 | ``` 44 | 45 | 3. **Install dependencies**: 46 | ```bash 47 | # Using npm 48 | npm install 49 | 50 | # Using Bun (recommended for faster installation) 51 | bun install 52 | ``` 53 | 54 | 4. **Build the project**: 55 | ```bash 56 | # Using npm 57 | npm run build 58 | 59 | # Using Bun 60 | bun run build 61 | ``` 62 | 63 | 5. **Run tests** to ensure everything works: 64 | ```bash 65 | # Using npm 66 | npm test 67 | 68 | # Using Bun (faster test execution) 69 | bun test 70 | ``` 71 | 72 | ## Development Workflow 73 | 74 | ### 1. Create a Feature Branch 75 | 76 | ```bash 77 | git checkout -b feature/your-feature-name 78 | # or 79 | git checkout -b fix/issue-description 80 | ``` 81 | 82 | Use descriptive branch names: 83 | - `feature/add-new-algorithm` 84 | - `fix/performance-issue` 85 | - `docs/update-readme` 86 | - `test/add-edge-cases` 87 | 88 | ### 2. Make Your Changes 89 | 90 | - **Source Code**: Make changes in the `src/` directory 91 | - **Tests**: Add/update tests in the `tst/` directory 92 | - **Documentation**: Update README.md and other docs as needed 93 | 94 | ### 3. 
Build and Test 95 | 96 | #### Using npm 97 | ```bash 98 | # Build the project 99 | npm run build 100 | 101 | # Run all tests 102 | npm test 103 | 104 | # Run specific test categories 105 | npm test -- --grep "Decision Tree" 106 | npm test -- --grep "Performance Tests" 107 | 108 | # Run tests in watch mode (for development) 109 | npm run test:watch 110 | ``` 111 | 112 | #### Using Bun (Recommended) 113 | ```bash 114 | # Build the project 115 | bun run build 116 | 117 | # Run all tests (Bun has built-in TypeScript support) 118 | bun test 119 | 120 | # Run specific test categories 121 | bun test --grep "Decision Tree" 122 | bun test --grep "Performance Tests" 123 | 124 | # Run tests in watch mode (for development) 125 | bun test --watch 126 | 127 | # Note: Bun can run TypeScript files directly without compilation 128 | # For development, you can also run: bun test tst/*.ts 129 | ``` 130 | 131 | ### 4. Commit Your Changes 132 | 133 | Use conventional commit messages: 134 | 135 | ```bash 136 | git add . 137 | git commit -m "feat: add new feature description" 138 | git commit -m "fix: resolve issue description" 139 | git commit -m "test: add tests for new functionality" 140 | git commit -m "docs: update documentation" 141 | ``` 142 | 143 | ### 5. Push and Create Pull Request 144 | 145 | ```bash 146 | git push origin feature/your-feature-name 147 | ``` 148 | 149 | Then create a pull request on GitHub. 
- **100% Pass Rate**: The entire test suite (421 tests at the time of writing) must pass before merging
Tree functionality | New Decision Tree features | 211 | | `random-forest.ts` | Random Forest functionality | New Random Forest features | 212 | | `xgboost.ts` | XGBoost functionality | New XGBoost features | 213 | | `performance-tests.ts` | Basic performance benchmarks | Performance-related changes | 214 | | `edge-cases.ts` | Edge cases and error handling | Error handling improvements | 215 | | `data-validation.ts` | Input validation | Input validation changes | 216 | | `type-safety.ts` | TypeScript type safety | Type system changes | 217 | 218 | ### Test Structure 219 | 220 | ```typescript 221 | describe('Feature Name', function() { 222 | it('should handle normal case', () => { 223 | // Arrange 224 | const input = createTestData(); 225 | 226 | // Act 227 | const result = functionUnderTest(input); 228 | 229 | // Assert 230 | assert.strictEqual(result, expectedValue); 231 | }); 232 | 233 | it('should handle edge case', () => { 234 | // Test edge cases 235 | }); 236 | 237 | it('should throw error for invalid input', () => { 238 | // Test error conditions 239 | }); 240 | }); 241 | ``` 242 | 243 | ### Performance Testing 244 | 245 | For performance-related changes, add basic benchmarks (avoid extensive performance tests): 246 | 247 | ```typescript 248 | it('should perform within reasonable time limits', () => { 249 | const startTime = Date.now(); 250 | // Perform operation 251 | const endTime = Date.now(); 252 | 253 | const executionTime = endTime - startTime; 254 | assert.ok(executionTime < MAX_ALLOWED_TIME, 255 | `Operation took ${executionTime}ms, expected < ${MAX_ALLOWED_TIME}ms`); 256 | }); 257 | ``` 258 | 259 | **Note**: Extensive performance and memory usage tests have been removed to focus on core functionality. Only add basic performance tests when necessary. 
- [ ] The full test suite passes (`npm test` or `bun test`)
**Approval**: Once approved, changes will be merged 309 | 310 | ## Code Style Guidelines 311 | 312 | ### General Guidelines 313 | 314 | - **Consistent Formatting**: Follow existing code style 315 | - **Clear Naming**: Use descriptive variable and function names 316 | - **Comments**: Add comments for complex logic 317 | - **ES Modules**: Use ES module syntax (no CommonJS) 318 | 319 | ### TypeScript Specific 320 | 321 | ```typescript 322 | // ✅ Good - Clear interfaces 323 | interface AlgorithmConfig { 324 | nEstimators: number; 325 | maxDepth?: number; 326 | randomState?: number; 327 | } 328 | 329 | // ✅ Good - Proper typing 330 | function trainModel(data: TrainingData[], config: AlgorithmConfig): void { 331 | // Implementation 332 | } 333 | 334 | // ❌ Bad - Unclear types 335 | function train(data: any, options: any): any { 336 | // Implementation 337 | } 338 | ``` 339 | 340 | ### File Organization 341 | 342 | - **One class per file**: Keep classes in separate files 343 | - **Shared utilities**: Put shared code in `src/shared/` 344 | - **Clear exports**: Use named exports where appropriate 345 | - **Import organization**: Group imports logically 346 | 347 | ### Error Messages 348 | 349 | ```typescript 350 | // ✅ Good - Clear and actionable 351 | throw new Error('Training data must be an array of objects with at least one sample'); 352 | 353 | // ❌ Bad - Vague 354 | throw new Error('Invalid data'); 355 | ``` 356 | 357 | ## Project Structure 358 | 359 | ``` 360 | src/ 361 | ├── decision-tree.ts # Main Decision Tree class 362 | ├── random-forest.ts # Random Forest implementation 363 | ├── xgboost.ts # XGBoost implementation 364 | └── shared/ # Shared utilities 365 | ├── types.ts # TypeScript type definitions 366 | ├── loss-functions.ts # Loss function implementations 367 | └── gradient-boosting.ts # Gradient boosting utilities 368 | 369 | tst/ # Test files 370 | ├── decision-tree.ts # Decision Tree tests 371 | ├── random-forest.ts # Random Forest tests 372 | ├── 
# Using Bun
bun test
# Fix failing tests or update expectations
Your efforts help make this library better for everyone.
--------------------------------------------------------------------------------
/src/shared/memory-optimization.ts:
--------------------------------------------------------------------------------
/**
 * Memory-Efficient Data Structures and Optimizations
 * Provides optimized data storage and processing for large datasets
 */

import { TrainingData } from './types.js';

// NOTE(review): the generic type parameters below were lost in extraction
// (the dump shows bare `Map`/`Map>`); they are reconstructed from usage in
// this module — verify against the repository source.

/**
 * Columnar, memory-optimized view of a training set. Continuous features are
 * stored as Float32Array columns; discrete features as plain arrays,
 * optionally accompanied by a value<->id mapping for compression.
 */
export interface OptimizedDataset {
  /** Continuous feature columns, keyed by feature name. */
  continuousFeatures: Map<string, Float32Array>;
  /** Discrete feature columns, keyed by feature name. */
  discreteFeatures: Map<string, any[]>;
  /** Target value per sample, aligned with the feature columns. */
  targetValues: any[];
  /** Per feature: value -> sample indices holding that value. */
  indices: Map<string, Map<any, number[]>>;
  /** Per feature: raw value -> compact integer id (compression). */
  valueMapping: Map<string, Map<any, number>>;
  /** Per feature: compact integer id -> raw value. */
  reverseMapping: Map<string, Map<number, any>>;
  /** Declared type per feature ('continuous' or 'discrete'). */
  featureTypes: Map<string, string>;
  sampleCount: number;
  featureCount: number;
}

export interface MemoryOptimizedDataProcessor {
  processData(data: TrainingData[], features: string[], target: string, featureTypes: Map<string, string>): OptimizedDataset;
  getFeatureValues(dataset: OptimizedDataset, feature: string): Float32Array | any[];
  getTargetValues(dataset: OptimizedDataset): any[];
  getSample(dataset: OptimizedDataset, index: number): TrainingData;
  getSubset(dataset: OptimizedDataset, indices: number[]): OptimizedDataset;
  clear(dataset: OptimizedDataset): void;
}

export class MemoryOptimizedProcessor implements MemoryOptimizedDataProcessor {
  private compressionEnabled: boolean;
  private maxMemoryUsage: number;

  constructor(config: { compressionEnabled?: boolean; maxMemoryUsage?: number } = {}) {
    this.compressionEnabled = config.compressionEnabled ?? true;
    // 100MB default budget; NOTE(review): stored but not enforced anywhere in
    // this module — confirm whether callers rely on it.
    this.maxMemoryUsage = config.maxMemoryUsage ?? 100 * 1024 * 1024; // 100MB
  }

  /**
   * Processes raw row-oriented data into the columnar, memory-optimized format.
   *
   * @param data - Raw training rows.
   * @param features - Feature names to extract as columns.
   * @param target - Target attribute name.
   * @param featureTypes - Declared type per feature; missing entries default to 'discrete'.
   * @returns The optimized columnar dataset (shares `featureTypes` by reference).
   */
  processData(
    data: TrainingData[],
    features: string[],
    target: string,
    featureTypes: Map<string, string>
  ): OptimizedDataset {
    const sampleCount = data.length;
    const featureCount = features.length;

    const continuousFeatures = new Map<string, Float32Array>();
    const discreteFeatures = new Map<string, any[]>();
    const targetValues: any[] = new Array(sampleCount);
    const indices = new Map<string, Map<any, number[]>>();
    const valueMapping = new Map<string, Map<any, number>>();
    const reverseMapping = new Map<string, Map<number, any>>();

    // Initialize indices and mappings for each feature
    for (const feature of features) {
      indices.set(feature, new Map());
      valueMapping.set(feature, new Map());
      reverseMapping.set(feature, new Map());
    }

    // Process each feature column
    for (const feature of features) {
      const featureType = featureTypes.get(feature) || 'discrete';

      if (featureType === 'continuous') {
        const values = new Float32Array(sampleCount);
        for (let i = 0; i < sampleCount; i++) {
          const value = Number(data[i][feature]);
          // Non-numeric entries are coerced to 0 rather than propagating NaN.
          // (Number.isNaN instead of the coercing global isNaN.)
          values[i] = Number.isNaN(value) ? 0 : value;
        }
        continuousFeatures.set(feature, values);
      } else {
        const values: any[] = new Array(sampleCount);
        const uniqueValues = new Set<any>();

        for (let i = 0; i < sampleCount; i++) {
          const value = data[i][feature];
          values[i] = value;
          uniqueValues.add(value);
        }

        // Build a value<->id mapping when cardinality is low enough to be
        // worth compressing (< 50% unique). Note: only the mapping is built
        // here; the stored column keeps the raw values.
        if (this.compressionEnabled && uniqueValues.size < sampleCount * 0.5) {
          const mapping = new Map<any, number>();
          const reverse = new Map<number, any>();
          let nextId = 0;

          for (const value of uniqueValues) {
            mapping.set(value, nextId);
            reverse.set(nextId, value);
            nextId++;
          }

          valueMapping.set(feature, mapping);
          reverseMapping.set(feature, reverse);
        }

        discreteFeatures.set(feature, values);
      }
    }

    // Copy target values in sample order
    for (let i = 0; i < sampleCount; i++) {
      targetValues[i] = data[i][target];
    }

    // Build value -> sample-index lookups for fast lookups
    this.buildIndices(data, features, indices);

    return {
      continuousFeatures,
      discreteFeatures,
      targetValues,
      indices,
      valueMapping,
      reverseMapping,
      featureTypes,
      sampleCount,
      featureCount
    };
  }

  /**
   * Gets the raw column for a feature: a Float32Array for continuous
   * features, a plain array for discrete ones.
   *
   * @throws Error when the feature exists in neither column map.
   */
  getFeatureValues(dataset: OptimizedDataset, feature: string): Float32Array | any[] {
    if (dataset.continuousFeatures.has(feature)) {
      return dataset.continuousFeatures.get(feature)!;
    } else if (dataset.discreteFeatures.has(feature)) {
      return dataset.discreteFeatures.get(feature)!;
    } else {
      throw new Error(`Feature ${feature} not found in dataset`);
    }
  }

  /**
   * Gets the target column (by reference, not a copy).
   */
  getTargetValues(dataset: OptimizedDataset): any[] {
    return dataset.targetValues;
  }

  /**
   * Gets a single sample from the dataset
   */
getSample(dataset: OptimizedDataset, index: number): TrainingData {
  if (index < 0 || index >= dataset.sampleCount) {
    throw new Error(`Index ${index} out of range`);
  }

  const sample: TrainingData = {};

  // Add continuous features
  for (const [feature, values] of dataset.continuousFeatures.entries()) {
    sample[feature] = values[index];
  }

  // Add discrete features
  for (const [feature, values] of dataset.discreteFeatures.entries()) {
    sample[feature] = values[index];
  }

  // NOTE(review): only feature columns are reconstructed; the target value is
  // not included — callers needing it must read targetValues[index].
  return sample;
}

/**
 * Gets a subset of the dataset for the given sample indices.
 * The returned subset copies the feature/target columns but shares
 * valueMapping, reverseMapping and featureTypes with the parent by reference,
 * and ships an empty `indices` map.
 */
getSubset(dataset: OptimizedDataset, indices: number[]): OptimizedDataset {
  const subsetCount = indices.length;
  const continuousFeatures = new Map();
  const discreteFeatures = new Map();
  const targetValues: any[] = new Array(subsetCount);

  // Copy continuous columns row-by-row in the requested order
  for (const [feature, values] of dataset.continuousFeatures.entries()) {
    const subsetValues = new Float32Array(subsetCount);
    for (let i = 0; i < subsetCount; i++) {
      subsetValues[i] = values[indices[i]];
    }
    continuousFeatures.set(feature, subsetValues);
  }

  // Copy discrete columns the same way
  for (const [feature, values] of dataset.discreteFeatures.entries()) {
    const subsetValues: any[] = new Array(subsetCount);
    for (let i = 0; i < subsetCount; i++) {
      subsetValues[i] = values[indices[i]];
    }
    discreteFeatures.set(feature, subsetValues);
  }

  // Copy target values aligned with the subset order
  for (let i = 0; i < subsetCount; i++) {
    targetValues[i] = dataset.targetValues[indices[i]];
  }

  return {
    continuousFeatures,
    discreteFeatures,
    targetValues,
    indices: new Map(), // Rebuild indices if needed
    valueMapping: dataset.valueMapping,
    reverseMapping: dataset.reverseMapping,
    featureTypes: dataset.featureTypes,
    sampleCount: subsetCount,
    featureCount: dataset.featureCount
  };
}

/**
 * Clears memory used by the dataset.
 * NOTE(review): this also clears `featureTypes`, which processData stores by
 * reference from the caller — clearing it mutates caller-owned state; confirm
 * this is intended.
 */
clear(dataset: OptimizedDataset): void {
  dataset.continuousFeatures.clear();
  dataset.discreteFeatures.clear();
  dataset.targetValues.length = 0;
  dataset.indices.clear();
  dataset.valueMapping.clear();
  dataset.reverseMapping.clear();
  dataset.featureTypes.clear();
}

/**
 * Estimates memory usage of the dataset in bytes (rough heuristic; discrete
 * and target entries are counted at a flat 8 bytes each).
 */
estimateMemoryUsage(dataset: OptimizedDataset): number {
  let totalBytes = 0;

  // Continuous features (Float32Array)
  for (const values of dataset.continuousFeatures.values()) {
    totalBytes += values.length * 4; // 4 bytes per float32
  }

  // Discrete features (array of any)
  for (const values of dataset.discreteFeatures.values()) {
    totalBytes += values.length * 8; // Rough estimate
  }

  // Target values
  totalBytes += dataset.targetValues.length * 8;

  // Indices
  for (const featureIndices of dataset.indices.values()) {
    for (const indexArray of featureIndices.values()) {
      totalBytes += indexArray.length * 4; // 4 bytes per int32
    }
  }

  return totalBytes;
}

/**
 * Compresses discrete features using value mapping.
 * Builds a value<->id mapping for a low-cardinality column; skips columns
 * with >= 50% unique values. Note: the column itself is left unchanged here —
 * only the mappings are (re)built and stored on the dataset.
 */
compressDiscreteFeature(dataset: OptimizedDataset, feature: string): void {
  const values = dataset.discreteFeatures.get(feature);
  if (!values) return;

  const uniqueValues = new Set(values);
  if (uniqueValues.size >= values.length * 0.5) return; // Not worth compressing

  const mapping = new Map();
  const reverse = new Map();
  let nextId = 0;

  // Assign compact sequential ids in Set iteration (insertion) order
  for (const value of uniqueValues) {
    mapping.set(value, nextId);
    reverse.set(nextId, value);
    nextId++;
  }

dataset.valueMapping.set(feature, mapping); 277 | dataset.reverseMapping.set(feature, reverse); 278 | } 279 | 280 | /** 281 | * Decompresses discrete features 282 | */ 283 | decompressDiscreteFeature(dataset: OptimizedDataset, feature: string): any[] { 284 | const values = dataset.discreteFeatures.get(feature); 285 | const mapping = dataset.reverseMapping.get(feature); 286 | 287 | if (!values || !mapping) return values || []; 288 | 289 | return values.map(id => mapping.get(id) || id); 290 | } 291 | 292 | private buildIndices( 293 | data: TrainingData[], 294 | features: string[], 295 | indices: Map> 296 | ): void { 297 | for (const feature of features) { 298 | const featureIndices = indices.get(feature)!; 299 | 300 | for (let i = 0; i < data.length; i++) { 301 | const value = data[i][feature]; 302 | if (!featureIndices.has(value)) { 303 | featureIndices.set(value, []); 304 | } 305 | featureIndices.get(value)!.push(i); 306 | } 307 | } 308 | } 309 | } 310 | 311 | /** 312 | * Memory-efficient statistics calculator 313 | */ 314 | export class MemoryEfficientStatistics { 315 | private processor: MemoryOptimizedProcessor; 316 | 317 | constructor(processor: MemoryOptimizedProcessor) { 318 | this.processor = processor; 319 | } 320 | 321 | /** 322 | * Calculates statistics for a continuous feature efficiently 323 | */ 324 | calculateContinuousStatistics(dataset: OptimizedDataset, feature: string): { 325 | mean: number; 326 | variance: number; 327 | std: number; 328 | min: number; 329 | max: number; 330 | quartiles: number[]; 331 | } { 332 | const values = dataset.continuousFeatures.get(feature); 333 | if (!values) throw new Error(`Feature ${feature} not found or not continuous`); 334 | 335 | const sorted = new Float32Array(values).sort(); 336 | const n = sorted.length; 337 | 338 | // Calculate basic statistics 339 | let sum = 0; 340 | let min = sorted[0]; 341 | let max = sorted[n - 1]; 342 | 343 | for (let i = 0; i < n; i++) { 344 | sum += sorted[i]; 345 | } 346 | 347 | 
const mean = sum / n; 348 | 349 | // Calculate variance 350 | let variance = 0; 351 | for (let i = 0; i < n; i++) { 352 | const diff = sorted[i] - mean; 353 | variance += diff * diff; 354 | } 355 | variance /= n; 356 | 357 | const std = Math.sqrt(variance); 358 | 359 | // Calculate quartiles 360 | const q1 = this.percentile(sorted, 25); 361 | const q2 = this.percentile(sorted, 50); 362 | const q3 = this.percentile(sorted, 75); 363 | 364 | return { 365 | mean, 366 | variance, 367 | std, 368 | min, 369 | max, 370 | quartiles: [q1, q2, q3] 371 | }; 372 | } 373 | 374 | /** 375 | * Calculates statistics for a discrete feature efficiently 376 | */ 377 | calculateDiscreteStatistics(dataset: OptimizedDataset, feature: string): { 378 | uniqueValues: any[]; 379 | valueCounts: Map; 380 | cardinality: number; 381 | } { 382 | const values = dataset.discreteFeatures.get(feature); 383 | if (!values) throw new Error(`Feature ${feature} not found or not discrete`); 384 | 385 | const valueCounts = new Map(); 386 | const uniqueValues = new Set(); 387 | 388 | for (const value of values) { 389 | uniqueValues.add(value); 390 | valueCounts.set(value, (valueCounts.get(value) || 0) + 1); 391 | } 392 | 393 | return { 394 | uniqueValues: Array.from(uniqueValues), 395 | valueCounts, 396 | cardinality: uniqueValues.size 397 | }; 398 | } 399 | 400 | private percentile(sortedValues: Float32Array, p: number): number { 401 | const index = (p / 100) * (sortedValues.length - 1); 402 | const lower = Math.floor(index); 403 | const upper = Math.ceil(index); 404 | const weight = index - lower; 405 | 406 | if (upper >= sortedValues.length) return sortedValues[sortedValues.length - 1]; 407 | if (lower === upper) return sortedValues[lower]; 408 | 409 | return sortedValues[lower] * (1 - weight) + sortedValues[upper] * weight; 410 | } 411 | } 412 | 413 | /** 414 | * Global memory-optimized processor instance 415 | */ 416 | export const globalMemoryProcessor = new MemoryOptimizedProcessor(); 417 | 418 | /** 
 * Convenience functions
 */

// Thin wrapper: delegates to the module-level globalMemoryProcessor.
export function processDataOptimized(
  data: TrainingData[],
  features: string[],
  target: string,
  featureTypes: Map
): OptimizedDataset {
  return globalMemoryProcessor.processData(data, features, target, featureTypes);
}

// Thin wrapper around globalMemoryProcessor.estimateMemoryUsage (bytes).
export function estimateMemoryUsage(dataset: OptimizedDataset): number {
  return globalMemoryProcessor.estimateMemoryUsage(dataset);
}
--------------------------------------------------------------------------------
/tst/continuous-variables.ts:
--------------------------------------------------------------------------------
/**
 * Tests for Continuous Variable Support
 * Tests data type detection, CART algorithm, and hybrid functionality
 */

import { strict as assert } from 'assert';
import DecisionTree from '../lib/decision-tree.js';
import { DataTypeDetector, detectDataTypes, recommendAlgorithm } from '../lib/shared/data-type-detection.js';
import { createCARTTree } from '../lib/shared/cart-algorithm.js';
import { globalCache } from '../lib/shared/caching-system.js';

// Test data generators

// Builds `sampleCount` rows with three numeric features and a boolean target.
// Uses Math.random (unseeded), so generated data is not reproducible.
function generateContinuousData(sampleCount: number): any[] {
  const data: any[] = [];
  for (let i = 0; i < sampleCount; i++) {
    data.push({
      age: Math.floor(Math.random() * 80) + 18,
      income: Math.random() * 100000 + 20000,
      score: Math.random() * 100,
      target: Math.random() > 0.5
    });
  }
  return data;
}

// Builds rows with two categorical features and a boolean target.
function generateDiscreteData(sampleCount: number): any[] {
  const colors = ['red', 'blue', 'green', 'yellow'];
  const shapes = ['circle', 'square', 'triangle'];
  const data: any[] = [];

  for (let i = 0; i < sampleCount; i++) {
    data.push({
      color: colors[Math.floor(Math.random() * colors.length)],
      shape: shapes[Math.floor(Math.random() * shapes.length)],
      target: Math.random() > 0.5
    });
  }
return data; 39 | } 40 | 41 | function generateMixedData(sampleCount: number): any[] { 42 | const colors = ['red', 'blue', 'green']; 43 | const data: any[] = []; 44 | 45 | for (let i = 0; i < sampleCount; i++) { 46 | data.push({ 47 | age: Math.floor(Math.random() * 80) + 18, 48 | income: Math.random() * 100000 + 20000, 49 | color: colors[Math.floor(Math.random() * colors.length)], 50 | target: Math.random() > 0.5 51 | }); 52 | } 53 | return data; 54 | } 55 | 56 | function generateRegressionData(sampleCount: number): any[] { 57 | const data: any[] = []; 58 | for (let i = 0; i < sampleCount; i++) { 59 | const x1 = Math.random() * 10; 60 | const x2 = Math.random() * 5; 61 | const noise = (Math.random() - 0.5) * 0.5; 62 | const y = 2 * x1 + 3 * x2 + noise; 63 | 64 | data.push({ 65 | x1, 66 | x2, 67 | target: y 68 | }); 69 | } 70 | return data; 71 | } 72 | 73 | describe('Data Type Detection', function() { 74 | describe('Continuous Data Detection', function() { 75 | it('should detect continuous features correctly', function() { 76 | const data = generateContinuousData(100); 77 | const features = ['age', 'income', 'score']; 78 | 79 | const analysis = detectDataTypes(data, features); 80 | 81 | assert.strictEqual(analysis.age.type, 'continuous'); 82 | assert.strictEqual(analysis.income.type, 'continuous'); 83 | assert.strictEqual(analysis.score.type, 'continuous'); 84 | assert(analysis.age.confidence > 0.7); 85 | assert(analysis.income.confidence > 0.7); 86 | assert(analysis.score.confidence > 0.7); 87 | }); 88 | 89 | it('should detect discrete features correctly', function() { 90 | const data = generateDiscreteData(100); 91 | const features = ['color', 'shape']; 92 | 93 | const analysis = detectDataTypes(data, features); 94 | 95 | assert.strictEqual(analysis.color.type, 'discrete'); 96 | assert.strictEqual(analysis.shape.type, 'discrete'); 97 | assert(analysis.color.confidence > 0.7); 98 | assert(analysis.shape.confidence > 0.7); 99 | }); 100 | 101 | it('should detect 
mixed data types correctly', function() { 102 | const data = generateMixedData(100); 103 | const features = ['age', 'income', 'color']; 104 | 105 | const analysis = detectDataTypes(data, features); 106 | 107 | assert.strictEqual(analysis.age.type, 'continuous'); 108 | assert.strictEqual(analysis.income.type, 'continuous'); 109 | assert.strictEqual(analysis.color.type, 'discrete'); 110 | }); 111 | 112 | it('should recommend appropriate algorithms', function() { 113 | const continuousData = generateContinuousData(100); 114 | const discreteData = generateDiscreteData(100); 115 | const mixedData = generateMixedData(100); 116 | 117 | const continuousRec = recommendAlgorithm(continuousData, ['age', 'income'], 'target'); 118 | const discreteRec = recommendAlgorithm(discreteData, ['color', 'shape'], 'target'); 119 | const mixedRec = recommendAlgorithm(mixedData, ['age', 'income', 'color'], 'target'); 120 | 121 | assert.strictEqual(continuousRec.algorithm, 'cart'); 122 | assert.strictEqual(discreteRec.algorithm, 'id3'); 123 | assert.strictEqual(mixedRec.algorithm, 'hybrid'); 124 | }); 125 | }); 126 | 127 | describe('Edge Cases', function() { 128 | it('should handle empty datasets', function() { 129 | const analysis = detectDataTypes([], ['feature1']); 130 | assert.strictEqual(analysis.feature1.type, 'discrete'); 131 | assert.strictEqual(analysis.feature1.confidence, 0); 132 | }); 133 | 134 | it('should handle single value datasets', function() { 135 | const data = [{ feature1: 5, target: true }]; 136 | const analysis = detectDataTypes(data, ['feature1']); 137 | assert.strictEqual(analysis.feature1.type, 'discrete'); 138 | }); 139 | 140 | it('should handle missing values', function() { 141 | const data = [ 142 | { feature1: 1, target: true }, 143 | { feature1: null, target: false }, 144 | { feature1: 3, target: true } 145 | ]; 146 | const analysis = detectDataTypes(data, ['feature1']); 147 | assert(analysis.feature1.type === 'discrete' || analysis.feature1.type === 
'continuous');
    });
  });
});

// Exercises CART training, prediction and regression on purely continuous
// features, plus the hybrid (auto) path on mixed discrete/continuous data.
describe('Decision Tree with Continuous Variables', function() {
  describe('CART Algorithm', function() {
    it('should train on continuous data using CART', function() {
      const dataset = generateContinuousData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'score'], {
        algorithm: 'cart',
        autoDetectTypes: true
      });

      tree.train(dataset);

      // CART must be the reported algorithm and every feature detected as continuous.
      assert.strictEqual(tree.getAlgorithm(), 'cart');
      const detectedTypes = tree.getFeatureTypes();
      assert.strictEqual(detectedTypes.age, 'continuous');
      assert.strictEqual(detectedTypes.income, 'continuous');
      assert.strictEqual(detectedTypes.score, 'continuous');
    });

    it('should make predictions on continuous data', function() {
      const dataset = generateContinuousData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'score'], {
        algorithm: 'cart',
        autoDetectTypes: true
      });

      tree.train(dataset);

      const sample = { age: 30, income: 50000, score: 75 };
      const result = tree.predict(sample);

      assert(typeof result === 'boolean');
    });

    it('should handle regression tasks', function() {
      const dataset = generateRegressionData(100);
      const tree = new DecisionTree('target', ['x1', 'x2'], {
        algorithm: 'cart',
        criterion: 'mse',
        autoDetectTypes: true
      });

      tree.train(dataset);

      const sample = { x1: 5, x2: 2 };
      const result = tree.predict(sample);

      // The MSE criterion implies a regression tree, hence numeric output.
      assert(typeof result === 'number');
    });
  });

  describe('Hybrid Algorithm', function() {
    it('should automatically select hybrid approach for mixed data', function() {
      const dataset = generateMixedData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'color'], {
        algorithm: 'auto',
        autoDetectTypes: true
      });

      tree.train(dataset);

      const detectedTypes = tree.getFeatureTypes();
      assert.strictEqual(detectedTypes.age, 'continuous');
      assert.strictEqual(detectedTypes.income, 'continuous');
      assert.strictEqual(detectedTypes.color, 'discrete');
    });

    it('should make predictions on mixed data', function() {
      const dataset = generateMixedData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'color'], {
        algorithm: 'auto',
        autoDetectTypes: true
      });

      tree.train(dataset);

      const sample = { age: 30, income: 50000, color: 'red' };
      const result = tree.predict(sample);

      assert(typeof result === 'boolean');
    });
  });

});

describe('Caching System', function() {
  // Start every test from a cold cache so results cannot leak between cases.
  beforeEach(function() {
    globalCache.clear();
  });

  describe('Prediction Caching', function() {
    it('should cache predictions for repeated samples', function() {
      const dataset = generateContinuousData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'score'], {
        algorithm: 'cart',
        autoDetectTypes: true,
        cachingEnabled: true
      });

      tree.train(dataset);

      const sample = { age: 30, income: 50000, score: 75 };

      // First prediction (cold cache)
      const coldStart = Date.now();
      const coldResult = tree.predict(sample);
      const coldDuration = Date.now() - coldStart;

      // Second prediction (warm cache)
      const warmStart = Date.now();
      const warmResult = tree.predict(sample);
      const warmDuration = Date.now() - warmStart;

      assert.strictEqual(coldResult, warmResult);
      // Cache should work (predictions should be identical)
      // Note: Timing assertions are flaky, so we just verify cache functionality
    });

    it('should provide cache statistics', function() {
      const dataset = generateContinuousData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'score'], {
        algorithm: 'cart',
        autoDetectTypes: true,
        cachingEnabled: true
      });

      tree.train(dataset);

      // Make some predictions so the cache has entries to report on.
      const probes = generateContinuousData(10);
      probes.forEach(sample => tree.predict(sample));

      const cacheStats = tree.getCacheStats();
      assert(cacheStats !== null);
      assert(typeof cacheStats.predictionCache.size === 'number');
    });
  });
});


describe('Model Persistence', function() {
  describe('Continuous Variable Support', function() {
    it('should save and load models with continuous variables', function() {
      const dataset = generateContinuousData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'score'], {
        algorithm: 'cart',
        autoDetectTypes: true
      });

      tree.train(dataset);

      // The serialized form must carry everything needed to rebuild the tree.
      const serialized = tree.toJSON();
      assert(serialized.featureTypes !== undefined);
      assert(serialized.algorithm !== undefined);
      assert(serialized.config !== undefined);

      const restored = new DecisionTree(serialized);
      assert.strictEqual(restored.getAlgorithm(), 'cart');

      const detectedTypes = restored.getFeatureTypes();
      assert.strictEqual(detectedTypes.age, 'continuous');
      assert.strictEqual(detectedTypes.income, 'continuous');
      assert.strictEqual(detectedTypes.score, 'continuous');
    });

    it('should maintain prediction consistency after loading', function() {
      const dataset = generateContinuousData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'score'], {
        algorithm: 'cart',
        autoDetectTypes: true
      });

      tree.train(dataset);

      const sample = { age: 30, income: 50000, score: 75 };
      const predictionBefore = tree.predict(sample);

      const serialized = tree.toJSON();
      const restored = new DecisionTree(serialized);
      const predictionAfter = restored.predict(sample);

      assert.strictEqual(predictionBefore, predictionAfter);
    });
  });
});

describe('Edge Cases and Error Handling', function() {
  describe('Invalid Data Handling', function() {
    it('should handle non-numeric continuous values gracefully', function() {
      const dataset = [
        { age: 25, income: 'invalid', target: true },
        { age: 30, income: 50000, target: false },
        { age: 35, income: 75000, target: true }
      ];

      const tree = new DecisionTree('target', ['age', 'income'], {
        algorithm: 'cart',
        autoDetectTypes: true
      });

      // Should not throw an error
      assert.doesNotThrow(() => tree.train(dataset));
    });

    it('should handle missing features in prediction', function() {
      const dataset = generateContinuousData(100);
      const tree = new DecisionTree('target', ['age', 'income', 'score'], {
        algorithm: 'cart',
        autoDetectTypes: true
      });

      tree.train(dataset);

      const sample = { age: 30 }; // Missing income and score

      // Should not throw an error, should use fallback
      assert.doesNotThrow(() => tree.predict(sample));
    });
  });

  describe('Configuration Validation', function() {
    it('should validate algorithm configuration', function() {
      const dataset = generateContinuousData(100);

      assert.throws(() => {
        new DecisionTree('target', ['age', 'income'], {
          algorithm: 'invalid' as any
        });
      });
    });

    it('should validate criterion configuration', function() {
      const dataset = generateContinuousData(100);

      assert.throws(() => {
        new DecisionTree('target', ['age', 'income'], {
          algorithm: 'cart',
          criterion: 'invalid' as any
        });
      });
    });
  });
});

--------------------------------------------------------------------------------
/src/decision-tree.ts: --------------------------------------------------------------------------------
import _ from 'lodash';
import {
  TreeNode,
  DecisionTreeData,
  TrainingData,
  NODE_TYPES,
  DecisionTreeConfig
} from './shared/types.js';
import { randomUUID, prob, log2, mostCommon } from './shared/utils.js';
import { createTree, entropy, gain, maxGain } from './shared/id3-algorithm.js';
import { createCARTTree, CARTConfig } from './shared/cart-algorithm.js';
import { DataTypeDetector, detectDataTypes, recommendAlgorithm } from './shared/data-type-detection.js';
import { globalCache, getCachedPrediction, setCachedPrediction } from './shared/caching-system.js';
import { processDataOptimized, OptimizedDataset } from './shared/memory-optimization.js';

/**
 * Decision Tree Algorithm
 * @module DecisionTree
 */

/**
 * Decision Tree class implementing ID3 and CART algorithms with continuous
 * variable support.
 *
 * Supported constructor signatures (all kept for backward compatibility):
 *   new DecisionTree(modelJson)                      — import a saved model
 *   new DecisionTree(target, features)
 *   new DecisionTree(target, features, config)
 *   new DecisionTree(data, target, features)         — constructs and trains
 *   new DecisionTree(data, target, features, config) — constructs and trains
 */
class DecisionTree {
  public static readonly NODE_TYPES = NODE_TYPES;

  private model!: TreeNode;
  private data: any[] = [];
  private target!: string;
  private features!: string[];
  private config: DecisionTreeConfig;
  // Feature name -> detected type, filled by detectDataTypes().
  private featureTypes: Map<string, 'discrete' | 'continuous'> = new Map();
  private algorithm: 'id3' | 'cart' | 'auto' = 'auto';
  private optimizedDataset?: OptimizedDataset;
  private dataTypeDetector: DataTypeDetector;

  constructor(...args: any[]) {
    const numArgs = args.length;

    // Default configuration; any user-supplied config is merged over these.
    this.config = {
      algorithm: 'auto',
      minSamplesSplit: 2,
      minSamplesLeaf: 1,
      maxDepth: undefined,
      criterion: 'gini',
      continuousSplitting: 'binary',
      autoDetectTypes: true,
      discreteThreshold: 20,
      continuousThreshold: 20,
      confidenceThreshold: 0.7,
      statisticalTests: true,
      handleMissingValues: true,
      numericOnlyContinuous: true,
      cachingEnabled: true,
      memoryOptimization: true
    };

    // NOTE(review): the detector is built from the DEFAULT config, before any
    // user config is merged below — detection thresholds in a user-supplied
    // config therefore do not reach this detector instance. Confirm whether
    // that is intended (selectAlgorithm() re-reads this.config separately).
    this.dataTypeDetector = new DataTypeDetector({
      discreteThreshold: this.config.discreteThreshold,
      continuousThreshold: this.config.continuousThreshold,
      confidenceThreshold: this.config.confidenceThreshold,
      statisticalTests: this.config.statisticalTests,
      handleMissingValues: this.config.handleMissingValues,
      numericOnlyContinuous: this.config.numericOnlyContinuous
    });

    // Configuration validation will be called after merging user input

    if (numArgs === 1) {
      // new DecisionTree(modelJson) — restore a previously exported model.
      this.import(args[0]);
    }
    else if (numArgs === 2) {
      const [target, features] = args;

      if (!target || typeof target !== 'string') {
        throw new Error('`target` argument is expected to be a String. Check documentation on usage');
      }
      if (!features || !Array.isArray(features)) {
        throw new Error('`features` argument is expected to be an Array. Check documentation on usage');
      }

      this.target = target;
      this.features = features;
    }
    else if (numArgs === 3) {
      // Check if third argument is an array (data) or object (config)
      if (Array.isArray(args[2])) {
        // [data, target, features] pattern
        const [data, target, features] = args;
        const instance = new DecisionTree(target, features);
        instance.train(data);
        // Returning an object from a constructor replaces `this`.
        return instance;
      } else {
        // [target, features, config] pattern
        const [target, features, config] = args;

        if (!target || typeof target !== 'string') {
          throw new Error('`target` argument is expected to be a String. Check documentation on usage');
        }
        if (!features || !Array.isArray(features)) {
          throw new Error('`features` argument is expected to be an Array. Check documentation on usage');
        }
        if (config && typeof config === 'object') {
          this.config = { ...this.config, ...config };
        }

        this.target = target;
        this.features = features;

        // Validate configuration after merging user input
        this.validateConfig();
      }
    }
    else if (numArgs === 4) {
      const [data, target, features, config] = args;
      const instance = new DecisionTree(target, features, config);
      instance.train(data);
      return instance;
    }

    // Validate configuration for cases where it wasn't validated yet
    if (numArgs === 2) {
      this.validateConfig();
    }
    else if (numArgs !== 1 && numArgs !== 3 && numArgs !== 4) {
      // 0 arguments, or 5 and more, is a usage error.
      throw new Error('Invalid arguments passed to constructor. Check documentation on usage');
    }
  }

  /**
   * Trains the decision tree with provided data
   * @param data - Array of training data objects
   * @throws Error if `data` is not an Array
   */
  train(data: TrainingData[]): void {
    if (!data || !Array.isArray(data)) {
      throw new Error('`data` argument is expected to be an Array. Check documentation on usage');
    }

    this.data = data;

    // Detect data types if auto-detection is enabled
    if (this.config.autoDetectTypes) {
      this.detectDataTypes(data);
    }

    // Determine algorithm if auto mode
    if (this.config.algorithm === 'auto') {
      this.selectAlgorithm(data);
    } else {
      this.algorithm = this.config.algorithm as 'id3' | 'cart';
    }

    // Create optimized dataset if memory optimization is enabled
    if (this.config.memoryOptimization) {
      this.optimizedDataset = processDataOptimized(data, this.features, this.target, this.featureTypes);
    }

    // Train the model using the selected algorithm
    this.model = this.createTree(data);
  }

  /**
   * Validates the configuration.
   * @throws Error when algorithm, criterion or continuousSplitting is not a
   *         recognised value. Undefined values are allowed (defaults apply).
   */
  private validateConfig(): void {
    const validAlgorithms = ['auto', 'id3', 'cart'];
    if (this.config.algorithm && !validAlgorithms.includes(this.config.algorithm)) {
      throw new Error(`Invalid algorithm: ${this.config.algorithm}. Must be one of: ${validAlgorithms.join(', ')}`);
    }

    const validCriteria = ['gini', 'entropy', 'mse', 'mae'];
    if (this.config.criterion && !validCriteria.includes(this.config.criterion)) {
      throw new Error(`Invalid criterion: ${this.config.criterion}. Must be one of: ${validCriteria.join(', ')}`);
    }

    const validSplitting = ['binary', 'multiway'];
    if (this.config.continuousSplitting && !validSplitting.includes(this.config.continuousSplitting)) {
      throw new Error(`Invalid continuousSplitting: ${this.config.continuousSplitting}. Must be one of: ${validSplitting.join(', ')}`);
    }
  }

  /**
   * Detects data types for all features and records them in featureTypes.
   */
  private detectDataTypes(data: TrainingData[]): void {
    const featureAnalysis = this.dataTypeDetector.analyzeFeatures(data, this.features);

    for (const [feature, analysis] of Object.entries(featureAnalysis)) {
      this.featureTypes.set(feature, analysis.type);
    }
  }

  /**
   * Selects the best algorithm based on data characteristics.
   * A 'hybrid' recommendation is mapped to CART, which handles both
   * discrete and continuous splits.
   */
  private selectAlgorithm(data: TrainingData[]): void {
    const recommendation = recommendAlgorithm(data, this.features, this.target, {
      discreteThreshold: this.config.discreteThreshold,
      continuousThreshold: this.config.continuousThreshold,
      confidenceThreshold: this.config.confidenceThreshold,
      statisticalTests: this.config.statisticalTests,
      handleMissingValues: this.config.handleMissingValues,
      numericOnlyContinuous: this.config.numericOnlyContinuous
    });

    this.algorithm = recommendation.algorithm === 'hybrid' ? 'cart' : recommendation.algorithm;
  }

  /**
   * Creates the tree using the selected algorithm.
   * @returns Root node of the trained tree
   */
  private createTree(data: TrainingData[]): TreeNode {
    if (this.algorithm === 'cart') {
      const cartConfig: CARTConfig = {
        minSamplesSplit: this.config.minSamplesSplit || 2,
        minSamplesLeaf: this.config.minSamplesLeaf || 1,
        maxDepth: this.config.maxDepth,
        criterion: this.config.criterion || 'gini',
        continuousSplitting: this.config.continuousSplitting || 'binary'
      };

      return createCARTTree(data, this.target, this.features, Object.fromEntries(this.featureTypes), cartConfig);
    } else {
      // Use ID3 algorithm
      return createTree(
        data,
        this.target,
        this.features,
        this.config.maxDepth,
        this.config.minSamplesSplit
      );
    }
  }

  /**
   * Predicts class for a given sample
   * @param sample - Sample data to predict
   * @returns Predicted class value (leaf `val`)
   */
  predict(sample: TrainingData): any {
    // Check cache first if enabled
    if (this.config.cachingEnabled) {
      const modelId = this.getModelId();
      const cachedPrediction = getCachedPrediction(sample, modelId);
      if (cachedPrediction !== null) {
        return cachedPrediction;
      }
    }

    let root = this.model;
    while (root.type !== NODE_TYPES.RESULT) {
      let attr = root.name;
      let sampleVal = sample[attr];
      let childNode: TreeNode | undefined;

      // Handle continuous variables with threshold-based splitting
      if (root.splitThreshold !== undefined && root.splitOperator) {
        const numericVal = Number(sampleVal);
        if (!isNaN(numericVal)) {
          // NOTE(review): a node carries a single splitOperator, so a value on
          // the non-matching side of the threshold (e.g. operator 'lte' with
          // numericVal > threshold) selects no child here and relies on the
          // vals[0] fallback below — verify against how createCARTTree encodes
          // binary splits in shared/cart-algorithm.ts.
          if (root.splitOperator === 'lte' && numericVal <= root.splitThreshold) {
            childNode = root.vals![0]; // Left child (<= threshold)
          } else if (root.splitOperator === 'gt' && numericVal > root.splitThreshold) {
            childNode = root.vals![1]; // Right child (> threshold)
          }
        }
      } else {
        // Handle discrete variables with exact matching.
        // `==` is intentional: tolerates number/string representation drift.
        childNode = _.find(root.vals, function (node) {
          return node.name == sampleVal;
        });
      }

      // For CART trees, we need to traverse through intermediate nodes
      if (childNode && childNode.child) {
        root = childNode.child;
      } else if (childNode) {
        root = childNode;
      } else {
        // Fallback to first child if no match found (e.g. missing feature)
        if (root.vals && root.vals.length > 0) {
          root = root.vals[0].child || root.vals[0];
        } else {
          break;
        }
      }
    }

    const prediction = root.val;

    // Cache prediction if enabled
    if (this.config.cachingEnabled) {
      const modelId = this.getModelId();
      setCachedPrediction(sample, modelId, prediction);
    }

    return prediction;
  }

  /**
   * Gets a unique model identifier for caching.
   * NOTE(review): the id reflects only algorithm/target/features, not the
   * training data — retraining on different data can serve stale cached
   * predictions unless clearCache() is called. Confirm the intended contract.
   */
  private getModelId(): string {
    return `${this.algorithm}_${this.target}_${this.features.join('_')}`;
  }

  /**
   * Evaluates prediction accuracy on samples
   * @param samples - Array of test samples containing the target attribute
   * @returns Accuracy ratio (correct predictions / total predictions);
   *          0 for an empty sample set
   */
  evaluate(samples: TrainingData[]): number {
    // Fix: an empty array previously yielded 0 / 0 === NaN.
    if (!samples || samples.length === 0) {
      return 0;
    }

    let correct = 0;
    for (const s of samples) {
      // _.isEqual also handles structured (non-primitive) target values.
      if (_.isEqual(this.predict(s), s[this.target])) {
        correct++;
      }
    }

    return correct / samples.length;
  }

  /**
   * Imports a previously saved model with the toJSON() method
   * @param json - JSON representation of the model
   */
  import(json: DecisionTreeData): void {
    const {model, data, target, features, featureTypes, algorithm, config} = json;

    this.model = model;
    this.data = data;
    this.target = target;
    this.features = features;

    // Restore continuous variable support
    if (featureTypes) {
      this.featureTypes = new Map(Object.entries(featureTypes));
    }

    if (algorithm) {
      this.algorithm = algorithm;
    }

    if (config) {
      this.config = { ...this.config, ...config };
    }
  }

  /**
   * Returns JSON representation of trained model
   * @returns JSON object containing model data
   */
  toJSON(): DecisionTreeData {
    const {target, features} = this;
    const model = this.model;
    const featureTypes = Object.fromEntries(this.featureTypes);

    return {
      model,
      data: [], // Don't store training data in exported model
      target,
      features,
      featureTypes,
      algorithm: this.algorithm,
      config: this.config
    };
  }

  /**
   * Gets the algorithm used by this tree
   * @returns Algorithm name
   */
  getAlgorithm(): 'id3' | 'cart' | 'auto' {
    return this.algorithm;
  }

  /**
   * Gets the feature types detected for this tree
   * @returns Map of feature names to their types
   */
  getFeatureTypes(): { [feature: string]: 'discrete' | 'continuous' } {
    return Object.fromEntries(this.featureTypes);
  }

  /**
   * Gets the configuration used by this tree
   * @returns Shallow copy of the configuration object
   */
  getConfig(): DecisionTreeConfig {
    return { ...this.config };
  }

  /**
   * Gets cache statistics if caching is enabled
   * @returns Cache statistics or null if caching is disabled
   */
  getCacheStats(): any | null {
    if (!this.config.cachingEnabled) return null;
    return globalCache.getCacheStats();
  }

  /**
   * Clears the prediction cache.
   * Note: globalCache is shared, so this clears cached predictions for
   * every DecisionTree instance, not just this one.
   */
  clearCache(): void {
    if (this.config.cachingEnabled) {
      globalCache.clear();
    }
  }
}


// Export the DecisionTree class
export default DecisionTree;
--------------------------------------------------------------------------------