├── README.md ├── csv2js.py ├── data_util.js ├── decision-tree-demo.html ├── golf.csv ├── helpers.js ├── leaf.csv ├── learningjs.js ├── package.json ├── test_lr.js ├── test_tree.js ├── wpbc_real_test.csv └── wpbc_real_train.csv /README.md: -------------------------------------------------------------------------------- 1 | LearningJS: A Javascript Implementation of Logistic Regression and C4.5 Decision Tree Algorithms 2 | ========== 3 | Author: Yandong Liu. Email: yandongl @ cs.cmu.edu 4 | 5 | # Update 6 | I've made some update on the data loading logic so now it reads in csv-format file. Previous version is still accessible but it's no longer supported. 7 | 8 | # Introduction 9 | Javascript implementation of several machine learning algorithms including Decision Tree and Logistic Regression this far. More to come. 10 | 11 | # Online Demo 12 | Here's a online [demo](http://www.cs.cmu.edu/~yandongl/learningjs/decision-tree-demo.html) with visualization and a few datasets. 13 | 14 | # Data format 15 | Input files need to be in CSV-format with 1st line being feature names. One of the features has to be called 'label'. E.g. 16 |
17 | outlook, temp, humidity, wind, label 18 | text, real, text, text, feature_type 19 | 'Sunny',80,'High', 'Weak', 'No' 20 | 'Sunny',82,'High', 'Strong', 'No' 21 | 'Overcast',73,'High', 'Weak', 'Yes' 22 |23 | There's also an optional 2nd line for feature types and the 'label' column for 2nd line has to be called 'feature_type'. This is useful if feature types are mixed. For Logistic Regression, all features should be real numbers. E.g. 24 |
25 | label,a,b,c,d,e,f,g,h,i,j,k,l,m 26 | 1,1,0.72694,1.4742,0.32396,0.98535,1,0.83592,0.0046566,0.0039465,0.04779,0.12795,0.016108,0.0052323 27 | 2,2,0.74173,1.5257,0.36116,0.98152,0.99825,0.79867,0.0052423,0.0050016,0.02416,0.090476,0.0081195,0.002708 28 | 3,3,0.76722,1.5725,0.38998,0.97755,1,0.80812,0.0074573,0.010121,0.011897,0.057445,0.0032891,0.00092068 29 | 1,4,0.73797,1.4597,0.35376,0.97566,1,0.81697,0.0068768,0.0086068,0.01595,0.065491,0.0042707,0.0011544 30 |31 | 32 | 33 | # Usage 34 | Data loading: data_util.js provides three methods: 35 | 36 | * `loadTextFile`: the csv-format file will be loaded from disk and columns are parsed as strings unless 2nd line specifies feature types. 37 | * `loadRealFile`: the csv-format file will be loaded from disk and columns are parsed as real numbers. 38 | * `loadString`: a big string will be chopped into lines and columns are parsed as strings unless 2nd line specifies feature types. 39 | 40 | In the loading callback function you will obtain a data object D on which you can apply the learning methods. Note that only Decision Tree supports both real and categorical features. Logistic Regression works on real features only. 41 | 42 | 43 | ```javascript 44 | 45 | 46 | 47 | loadString(content, function(D) { 48 | var tree = new learningjs.tree(); 49 | tree.train(D, function(model, err){ 50 | if(err) { 51 | console.log(err); 52 | } else { 53 | model.calcAccuracy(D.data, D.targets, function(acc, correct, total){ 54 | console.log( 'training: got '+correct +' correct out of '+total+' examples. 
accuracy:'+(acc*100.0).toFixed(2)+'%'); 55 | }); 56 | } 57 | }); 58 | }); 59 | ``` 60 | 61 | # Use in Nodejs 62 | Similarly you need to import the lib and do the same: 63 | 64 | ```javascript 65 | var learningjs = require('learningjs.js'); 66 | var data_util = require('data_util.js'); 67 | var tree = new learningjs.tree(); 68 | data_util.loadRealFile(fn_csv, function(D) { 69 | 70 | //normalize data 71 | data_util.normalize(D.data, D.nfeatures); 72 | 73 | //logistic regression. following params are optional 74 | D.optimizer = 'sgd'; //default choice. other choice is 'gd' 75 | D.learning_rate = 0.005; 76 | D.l2_weight = 0.0; 77 | D.iterations = 1000; //increase number of iterations for better performance 78 | 79 | new learningjs.logistic().train(D, function(model, err){ 80 | if(err) { 81 | console.log(err); 82 | } else { 83 | model.calcAccuracy(D.data, D.targets, function(acc, correct, total){ 84 | console.log('training: got '+correct +' correct out of '+total+' examples. accuracy:'+(acc*100.0).toFixed(2)+'%'); 85 | }); 86 | data_util.loadRealFile(fn_test, function(T) { 87 | model.calcAccuracy(T.data, T.targets, function(acc, correct, total){ 88 | console.log(' test: got '+correct +' correct out of '+total+' examples. 
accuracy:'+(acc*100.0).toFixed(2)+'%'); 89 | }); 90 | }); 91 | } 92 | }); 93 | }); 94 | ``` 95 | 96 | # License 97 | MIT 98 | -------------------------------------------------------------------------------- /csv2js.py: -------------------------------------------------------------------------------- 1 | ''' 2 | convert csv file to json objects 3 | 4 | usage: cat csv_file | python csv2js.py > json_file 5 | 6 | note: 7 | 1) 1st line must be header 8 | 2) also remember to remove label from 'features' so it's not used for training 9 | 3) then drag and drop to the drop zone of learningjs page 10 | ''' 11 | import sys 12 | 13 | header=True 14 | h=[] 15 | data=[] 16 | print 'var trainData=[' 17 | lastfs=None 18 | for l in sys.stdin: 19 | if(header): 20 | fs = l.strip().split(',') 21 | h = fs 22 | header=False 23 | else: 24 | fs = l.strip().split(',') 25 | if lastfs!=None: 26 | print '{', 27 | print ','.join([h[idx]+':\''+a+'\'' for idx,a in enumerate(lastfs)]), 28 | print '},' 29 | lastfs = fs 30 | print '{', 31 | print ','.join([h[idx]+':\''+a+'\'' for idx,a in enumerate(lastfs)]), 32 | print '}' 33 | print ']' 34 | 35 | print 'var features=[' 36 | print ','.join(['\''+a+'\'' for a in h]) 37 | print ']' 38 | 39 | -------------------------------------------------------------------------------- /data_util.js: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////////////////////// 2 | // utilities for 3 | // loading a real number file 4 | // loading a text file 5 | // normalize matrix of real numbers 6 | // 7 | // author: yandong liu 8 | // email: yandongl _at_ cs.cmu.edu 9 | //////////////////////////////////////////////////////////////////////////////////////////////// 10 | 'use strict'; 11 | if(typeof require === 'function') { 12 | var fs = require('fs'); 13 | var lazy = require("lazy"); 14 | } 15 | 16 | var util = { 17 | 18 | normalize: function(data, nfeatures) 
{ 19 | for(var i=0;i
Author: Yandong Liu. Email: yandongl @ cs.cmu.edu. Date: 2013.5
14 |15 | Update: I've updated the data loading logic so that it now reads CSV-format files. The previous version is still accessible but is no longer supported. 16 |
17 |24 | Drop training data file here 25 | | 26 |27 | Drop test data file here 28 | | 29 | |
38 | Introduction: 39 | JavaScript implementation of several machine learning algorithms, including Decision Tree and Logistic Regression thus far. More to come. 40 |
41 | 42 |43 | Data format: Input files need to be in CSV format, with the first line containing the feature names. One of the features has to be called 'label'. E.g. 44 |
45 | outlook, temp, humidity, wind, label 46 | text, real, text, text, feature_type 47 | 'Sunny',80,'High', 'Weak', 'No' 48 | 'Sunny',82,'High', 'Strong', 'No' 49 | 'Overcast',73,'High', 'Weak', 'Yes' 50 |51 | There's also an optional 2nd line that specifies the feature types; in that line, the 'label' column must contain 'feature_type'. This is useful when feature types are mixed (e.g. both text and real columns, as above). 52 | 53 | Usage: 54 |
55 |
64 |
66 | loadString(content, function(D) { 67 | var tree = new learningjs.tree(); 68 | tree.train(D, function(model, err){ 69 | if(err) { 70 | console.log(err); 71 | } else { 72 | model.calcAccuracy(D.data, D.targets, function(acc, correct, total){ 73 | console.log( 'training: got '+correct +' correct out of '+total+' examples. accuracy:'+(acc*100.0).toFixed(2)+'%'); 74 | }); 75 | } 76 | }); 77 | }); 78 |79 | Check the source code of this page and see how it works on the dropped files. 80 | 81 | 82 |
84 | data_util.loadRealFile(fn_csv, function(D) { 85 | 86 | //normalize data 87 | data_util.normalize(D.data, D.nfeatures); 88 | 89 | //logistic regression. following params are optional 90 | D.optimizer = 'sgd'; //default choice. other choice is 'gd' 91 | D.learning_rate = 0.005; 92 | D.l2_weight = 0.000001; 93 | D.iterations = 1000; //increase number of iterations for better performance 94 | 95 | new learningjs.logistic().train(D, function(model, err){ 96 | if(err) { 97 | console.log(err); 98 | } else { 99 | model.calcAccuracy(D.data, D.targets, function(acc, correct, total){ 100 | console.log('training: got '+correct +' correct out of '+total+' examples. accuracy:'+(acc*100.0).toFixed(2)+'%'); 101 | }); 102 | data_util.loadRealFile(fn_test, function(T) { 103 | model.calcAccuracy(T.data, T.targets, function(acc, correct, total){ 104 | console.log(' test: got '+correct +' correct out of '+total+' examples. accuracy:'+(acc*100.0).toFixed(2)+'%'); 105 | }); 106 | }); 107 | } 108 | }); 109 | }); 110 |111 | Here's a sample code file for tree and logistic regression for its application in nodejs. 112 | 113 |
115 | License: MIT 116 |
117 | 118 | Also see the source code 119 | 120 | 121 | 122 | 123 |