├── .gitignore
├── random_forest
│   ├── loadData.sql
│   ├── createTable.sql
│   └── random_forest.py
├── server
│   ├── server.js
│   ├── config
│   │   ├── helpers.js
│   │   └── middleware.js
│   ├── package.json
│   └── neuralNet
│       ├── neuralNetRouter.js
│       └── neuralNetLogic.js
├── stripSolutionCode.sh
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
node_modules
npm-debug.log
*.log
*.csv
--------------------------------------------------------------------------------
/random_forest/loadData.sql:
--------------------------------------------------------------------------------
LOAD DATA INFILE '/Users/preston/Desktop/train.csv'
INTO TABLE random_forest
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
--------------------------------------------------------------------------------
/server/server.js:
--------------------------------------------------------------------------------
var express = require('express');

var app = express();
var port = process.env.PORT || 5000;

// configure server with middleware and routing
require('./config/middleware.js')(app, express);

app.listen(port);
console.log('Server now listening on port ' + port);

// export app for testing and flexibility
module.exports = app;
--------------------------------------------------------------------------------
/random_forest/createTable.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS random_forest (
  passengerId int,
  survived int,
  pClass int,
  passenger varchar(65),
  sex varchar(7),
  age int,
  siblings int,
  parents int,
  ticket varchar(18),
  fare float,
  cabin varchar(15),
  embarked varchar(1),
  PRIMARY KEY (passengerId)
);
--------------------------------------------------------------------------------
/server/config/helpers.js:
--------------------------------------------------------------------------------
'use strict';

module.exports = {
  errorLogger: function(error, req, res, next) {
    // log the error, then pass it to the next error-handling middleware
    // registered in middleware.js
    console.error(error.stack);
    next(error);
  },
  errorHandler: function(error, req, res, next) {
    // send the error message to the client, for graceful error handling in the app
    res.status(500).send({error: error.message});
  }
};
--------------------------------------------------------------------------------
/stripSolutionCode.sh:
--------------------------------------------------------------------------------
#!/bin/bash

git checkout master
git merge solution
# all .js files, excluding the node_modules folder
# (-print keeps the pruned node_modules directory itself out of the results)
FILES=`find . -path ./server/node_modules -prune -o -name "*.js" -print`

# loop over each file
for f in $FILES
do
  # remove (in place, with the -i '' flag that only works on OS X) everything
  # between the solution tags, inclusive of the solution tag lines themselves
  sed -i '' '/START SOLUTION CODE/,/END SOLUTION CODE/d' "$f"
done
--------------------------------------------------------------------------------
/server/package.json:
--------------------------------------------------------------------------------
{
  "name": "learningMachines",
  "version": "0.5.0",
  "description": "A fun playground for learning how to implement machine learning!",
  "main": "server.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node server.js"
  },
  "author": "Preston Parry",
  "license": "ISC",
  "dependencies": {
    "babyparse": "^0.4.3",
    "body-parser": "^1.12.2",
    "brain": "^0.7.0",
    "cookie-parser": "^1.3.4",
    "express": "^4.12.3",
    "morgan": "^1.5.2",
    "mysql": "^2.6.0",
    "paralleljs": "^0.2.1"
  }
}
--------------------------------------------------------------------------------
/server/config/middleware.js:
--------------------------------------------------------------------------------
'use strict';

var path = require('path');
var bodyParser = require('body-parser');
var cookieParser = require('cookie-parser');
var helpers = require('./helpers.js'); // our custom middleware
var morgan = require('morgan'); // used for logging incoming requests
var neuralNetRouter = require('../neuralNet/neuralNetRouter.js');

module.exports = function(app, express) {
  app.use(morgan('dev'));
  app.use(cookieParser());
  app.use(bodyParser.urlencoded({extended: true}));
  app.use(bodyParser.json());
  // app.use(express.static(path.join(__dirname, '../../dist')));
  // the neuralNet router is where we're keeping all of our api endpoints for the neural net.
  // This lets us extend the server to include other machine learning algorithms under their own api endpoints.
  app.use('/neuralNet', neuralNetRouter);
  // error-handling middleware has to be registered after the routes it covers;
  // otherwise Express will never reach it when a route passes an error to next()
  app.use(helpers.errorLogger);
  app.use(helpers.errorHandler);
};
--------------------------------------------------------------------------------
/server/neuralNet/neuralNetRouter.js:
--------------------------------------------------------------------------------
'use strict';
var express = require('express');
var neuralNetRouter = express.Router();
var neuralNetLogic = require('./neuralNetLogic.js');

// this could certainly be done more tersely, but I prefer laying each route out individually, as we do here
// it lets us see very directly which methods we expect to be publicly available,
// and also lets us add in some logging for the user to see

neuralNetRouter.get('/formatData', function(req, res) {
  console.log('heard a request to formatData');
  neuralNetLogic.formatData(req, res);
});

neuralNetRouter.get('/startNet', function(req, res) {
  console.log('heard a request to startNet');
  neuralNetLogic.startNet(req, res);
});

neuralNetRouter.get('/loadAndTestBrain', function(req, res) {
  console.log('heard a request to loadAndTestBrain');
  neuralNetLogic.loadAndTestBrain(req, res);
});

neuralNetRouter.get('/kagglePredict', function(req, res) {
  console.log('heard a request to kagglePredict');
  neuralNetLogic.kagglePredict(req, res);
});

neuralNetRouter.get('/kaggleTrain', function(req, res) {
  console.log('heard a request to kaggleTrain');
  neuralNetLogic.kaggleTrain(req, res);
});

module.exports = neuralNetRouter;
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A Conjurer's Introduction to Machine Learning in JavaScript
> Getting machine learning up and running is similar to picking up any other external library. Take on these projects and you'll have solved problems using neural networks, random forests, and Support Vector Machines!

## PreCourse Steps:

* Fork and clone this repo (and I'm never opposed to some appreciation in the form of a star!)
* Download the kaggle data and move it to the right folder
  * https://www.kaggle.com/c/GiveMeSomeCredit/data
  * Move the data file to `learningMachines/server/neuralNet/train.csv`, and make sure it is named `train.csv`
* Use npm to install all dependencies
  * `npm install` (run from inside the `server` folder)
* Start the server!
  * `nodemon server.js`

You can now make api calls to this server, either through your browser (`http://localhost:5000/neuralNet/startNet`), or through curl on your command line (`curl localhost:5000/neuralNet/startNet`).

### The key files in our node server are in the neuralNet folder.
`neuralNetLogic.js` is where we have all the actual JS logic built out.

### Your turn!
Here are the things I expect you to do:
1. Create a new net
2. Train that net
3. Get predicted outcomes from that net in testing
4. Add in new data to train the net. Rewrite what's currently in `formatData` with new data points, or 'features' as they're called in data science, that are combinations of the raw data we already have. Examples would include exact ratios that the net currently can't access, because we've already transformed each raw value into a number between 0 and 1. One such engineered feature is sketched below.
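To make step 4 concrete, here is a minimal sketch of one engineered feature, written against the `formatData` function in `neuralNetLogic.js`. The raw field names come from the Kaggle rows documented in that file; the feature name `totalTimesLate` and the `294` scaling constant (three columns, each assumed to max out at the 98 cap used for the thirty-days-late column) are illustrative assumptions you should tune against the real data:

```js
// inside formatData's row loop: collapse the three raw lateness counts into one
// engineered feature, rescaled into the 0-to-1 range the net expects
var totalTimesLate = rawRow['NumberOfTime30-59DaysPastDueNotWorse'] +
  rawRow['NumberOfTime60-89DaysPastDueNotWorse'] +
  rawRow['NumberOfTimes90DaysLate'];
formattedRow.input.totalTimesLate = totalTimesLate / 294; // 294 = 3 * 98, an assumed upper bound
```
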
#### Extra Credit
* Handle cases that have missing data ("NA") differently than cases that have full data
* Perform any other feature engineering you can think of

#### Fantasy Mode
- Parallelize the training of multiple nets at the same time. Training each net is synchronous, so parallelizing won't help you train a single net any faster. But you could try creating multiple versions that have different parameters (number of nodes, hidden layers, learning rate, etc.) and train those in parallel with each other.

- Build out grid search to try different combinations of number of hidden layers and number of nodes. Trying different combinations of hyperparameters (the parameters that determine the shape or conditions of the algorithm, such as the number of nodes or the number of hidden layers) to find the optimal set is called grid search. scikit-learn has a [good module explaining and implementing grid search](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html). We can't use their implementation directly, but it's a good explanation of the high-level concept.
--------------------------------------------------------------------------------
/random_forest/random_forest.py:
--------------------------------------------------------------------------------
# python uses this "import" syntax to load in external modules, much like node.js uses "require" to load in modules.
# as with node.js, these modules can be core modules (like fs, or path), or external libraries (underscore, brainjs)
import sys
import os.path as path
import csv

# "from ... import" is simply a way of specifying a more specific path to the module to import
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer

classifier = RandomForestClassifier(n_jobs=-1)
vectorizer = DictVectorizer(sparse=False)

# X is a matrix with the features for our training data (what information we know about each row, without the answers)
X = []
# y is the answer for each row
y = []

with open('titanic.csv', 'rU') as openInputFile:
    # csv.DictReader will take a csv file and read it in as an array of python dictionaries
    # (similar to JavaScript objects, or hashes)
    # note that DictReader consumes the header row itself, so every row it yields is a data row
    inputRows = csv.DictReader(openInputFile)
    for row in inputRows:
        # if possible, read in the data as floats (numbers with decimal points) rather than strings
        # for key in row:
        #     try:
        #         row[key] = float(row[key])
        #     except ValueError:
        #         pass
        X.append(row)

# TODO: binarize the data in the categorical fields
# information like 'embarked' represents a category (which city did you embark in?) rather than a number (how much money did you pay for your fare?)
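# (scikit-learn's models expect purely numeric feature arrays, so category strings like these have to be encoded as numbers before training)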
# a common way to format categorical data for machine learning is to turn it into binary values
# so if we have three categories ('C', 'Q', and 'S'), we would turn that into separate columns for 'embarkedC', 'embarkedQ', and 'embarkedS', each with a value of either 0 or 1
# define your own binarize function here that takes in categorical data and turns it into a binary representation of which category is present for this row
def binarize(columnName, columnValue, passengerObj):
    try:
        columnValue = str(columnValue)
    except:
        pass
    keyName = columnName + columnValue
    passengerObj[keyName] = 1
    return passengerObj

cleanedX = []
# SOLUTION CODE BELOW
for row in X:
    # each row looks like:
    # {'Fare': '7.75', 'Name': 'Dooley, Mr. Patrick', 'Embarked': 'Q', 'Age': '32', 'Parch': '0', 'Pclass': '3', 'Sex': 'male', 'Survived': '0', 'SibSp': '0', 'PassengerId': '891', 'Ticket': '370376', 'Cabin': ''}

    # split out our y values (these are the answers we're looking for; in our case, whether this person survived or not)
    y.append(row['Survived'])
    del row['Survived']

    row['totalConnections'] = int(row['Parch']) + int(row['SibSp'])
    row = binarize('Pclass', row['Pclass'], row)
    row = binarize('SibSp', row['SibSp'], row)
    row = binarize('Parch', row['Parch'], row)
    row['Fare'] = float(row['Fare'])
    try:
        row['Age'] = float(row['Age'])
        row['ageMissing'] = 0
    except ValueError:
        row['ageMissing'] = 1

    row['SibSp'] = float(row['SibSp'])
    row['Parch'] = float(row['Parch'])

    # if the cabin is known, grab the first letter from it, which might represent something useful like which deck the passenger was on
    try:
        row['cabinDeck'] = row['Cabin'][0]
        row['hasAssignedCabin'] = 1
    except IndexError:
        row['hasAssignedCabin'] = 0
    row['TicketFirstChar'] = str(row['Ticket'])[0]
    # TODO: figure out why we keep getting key errors when deleting Survived from a row

    cleanedX.append(row)
# END SOLUTION CODE


vectorizedX = vectorizer.fit_transform(cleanedX)

X_train, X_test, y_train, y_test = train_test_split(vectorizedX, y, test_size=.2)
classifier.fit(X_train, y_train)

# NOTE: if your score is really high (say, over 0.85), check to make sure you removed the answer from the information you gave the random forest to train on
# otherwise it's simply going to learn that the 'Survived' column you fed it is HIGHLY correlated with the answer
print 'Your random forest\'s score on the test data is:'
print classifier.score(X_test, y_test)
--------------------------------------------------------------------------------
/server/neuralNet/neuralNetLogic.js:
--------------------------------------------------------------------------------
var path = require('path');
var fs = require('fs');
var Baby = require('babyparse');
// this line places the "brain" object that brain.js gives us into the usable scope for this file.
// We have not yet done anything with that object; that's your task!
var brain = require('brain');

// TODO: your code here to create a new neural net instance

module.exports = {
  // this is our main entry point
  startNet: function(req, res) {

    fs.readFile(path.join(__dirname, 'train.csv'), 'utf8', function(err, fileData) {
      if(err) {
        console.error('error reading in the train.csv file. Please make sure it is saved in the same directory as neuralNetLogic.js, and is named train.csv');
        console.error(err);
      } else {
        // csv files can be saved in a number of different ways. PapaParse (and its node.js port, BabyParse) takes care of all the messiness for us, and reliably returns data in a consistent format for us to work with.
        var rows = Baby.parse(fileData, {
          header: true,
          dynamicTyping: true
        }).data;

        // format the data. see the modular formatData function below
        var formattedData = module.exports.formatData(rows);

        // split the data into a test set (roughly 20% of the data) and a training set (roughly 80%)
        var training = [];
        var testing = [];
        for(var i = 0; i < formattedData.length; i++) {
          if(Math.random() > .8) {
            testing.push(formattedData[i]);
          } else {
            training.push(formattedData[i]);
          }
        }

        // pass this formatted data into trainBrain
        module.exports.trainBrain(training, testing);
      }
    });

  },

  trainBrain: function(trainingData, testingData) {
    console.time('trainBrain');
    console.log('Training your very own Brain');

    // TODO: your code here to train the neural net

    console.timeEnd('trainBrain');

    // now test the results and see how our machine did!
    module.exports.getPredictions(testingData);
  },

  // get predictions from the neural net on expected default likelihood for data we haven't tested it on
  getPredictions: function(testData) {
    // TODO: your code here to get the predicted values for each item in our testData
    // Here's what an object in the testData array should look like after you've gotten the predicted result from the net:
    /*
    { input:
       { utilizationRate: 0.21939333333333333,
         age: 0.3486238532110092,
         thirtyDaysLate: 0.01020408163265306,
         monthlyIncome: 0.031789238577839024,
         openCreditLines: 0.1767766952966369,
         ninetyDaysLate: 0.1,
         realEstateLines: 0,
         sixtyDaysLate: 0,
         numDependents: 0 },
      output: { defaulted: 0 },
      netPrediction: { defaulted: 0.34634397489904356 } }
    */
    // note that the predicted results are stored in a property called netPrediction
    // remember, this is a deliberate engineering practice: the rest of this code expects these objects to adhere to a certain API, following the example above.
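    // a minimal sketch of one way to satisfy that API, assuming you've already
    // created and trained a brain.js net instance named `net` at the top of this file:
    //
    // for(var i = 0; i < testData.length; i++) {
    //   testData[i].netPrediction = net.run(testData[i].input);
    // }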
    module.exports.testBrain(testData);
  },

  // Test our brain with a given set of testData
  // Logs the observed default rate at each predicted risk level
  testBrain: function(testData) {

    // everything below is formatting the output
    // first we create a results obj with keys labeled 0 to 100 in increments of 5
    // each position in results is an object itself
    // each position aggregates the count of loans the neural net has predicted have this level of risk,
    // and the number of observed defaults at that level of risk
    var results = {};
    for(var j = 0; j <= 100; j += 5) {
      results[j] = {
        count: 0,
        defaulted: 0
      };
    }

    for(var i = 0; i < testData.length; i++) {
      // we format the net's prediction to be an int between 0 and 100
      var prediction = Math.round(testData[i].netPrediction.defaulted * 100);
      // then, we group up into buckets of 5
      var predictionKey = Math.floor(prediction / 5) * 5;
      // we then increment the total number of cases that the net predicts exist at this level of risk
      // (i.e., if the net's prediction for a given input is .38745, we would add one more to the 35 bucket, since we now have one more observation that the net predicts has a 39% chance of defaulting)
      results[predictionKey].count++;
      // and record whether this input resulted in a default or not
      results[predictionKey].defaulted += testData[i].output.defaulted;
    }

    // we don't normally like to assume an object's keys are ordered, but it's a time-saving shortcut here, and the consequences are very low if the output isn't perfectly ordered
    for(var key in results) {
      console.log(key + '- count: ' + results[key].count + ' defaulted: ' + results[key].defaulted + ' Default Rate: ' + Math.round(results[key].defaulted / results[key].count * 100) + '%');
    }

  },

  // neural nets expect to get data that is only between 0 and 1 (or -1 and 1).
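  // (our raw fields, like an age of 64 or a monthly income of 8158 in the sample row documented below, are far outside that range)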
  // the easiest way to do that is what's called min-max normalizing:
  // the highest number in the dataset becomes 1, and the lowest number becomes 0, with everything else scaled in between
  // we use a slightly modified version of that here that is designed to minimize the effects of outliers
  // you can ignore this until the extra credit
  formatData: function(data) {

    /*
    each item in our incoming data is going to be an object that looks like this:
    {
      SeriousDlqin2yrs: '0',
      ID: '150000',
      RevolvingUtilizationOfUnsecuredLines: '0.850282951',
      age: '64',
      'NumberOfTime30-59DaysPastDueNotWorse': '0',
      DebtRatio: '0.249908077',
      MonthlyIncome: '8158',
      NumberOfOpenCreditLinesAndLoans: '8',
      NumberOfTimes90DaysLate: '0',
      NumberRealEstateLoansOrLines: '2',
      'NumberOfTime60-89DaysPastDueNotWorse': '0',
      NumberOfDependents: '0'
    }
    */

    console.log('formatting Data');
    var formattedResults = [];

    for(var i = 0; i < data.length; i++) {
      var rawRow = data[i];

      var formattedRow = {};
      formattedRow.id = rawRow.ID;
      // brain.js expects each row object to have an input property (all the information we know about that row), and an output property (what we are trying to predict)
      formattedRow.input = {};
      formattedRow.output = {
        defaulted: rawRow.SeriousDlqin2yrs
      };

      // we are using a VERY rough approximation of min-max normalization here
      // for example, the largest age in the dataset is 109, so we're just dividing age by 109
      formattedRow.input.age = rawRow.age / 109;
      formattedRow.input.thirtyDaysLate = rawRow['NumberOfTime30-59DaysPastDueNotWorse'] / 98;
      formattedRow.input.monthlyIncome = Math.sqrt(rawRow.MonthlyIncome) / 1735;
      formattedRow.input.openCreditLines = Math.sqrt(rawRow.NumberOfOpenCreditLinesAndLoans) / 8;
      formattedRow.input.ninetyDaysLate = Math.sqrt(rawRow['NumberOfTimes90DaysLate']) / 10;
      formattedRow.input.realEstateLines = rawRow.NumberRealEstateLoansOrLines / 54;
      formattedRow.input.sixtyDaysLate = Math.sqrt(rawRow['NumberOfTime60-89DaysPastDueNotWorse']) / 10;
      formattedRow.input.numDependents = Math.sqrt(rawRow.NumberOfDependents) / 5;
      // if the utilization rate is below 1, we divide it by 3 to make it smaller (taking the cube root would make it larger)
      if(rawRow.RevolvingUtilizationOfUnsecuredLines < 1) {
        formattedRow.input.utilizationRate = rawRow.RevolvingUtilizationOfUnsecuredLines / 3;
      } else {
        // otherwise we take the cube root of it, and then divide by 37 (which is the max number we would have after cube rooting)
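        // (37 cubed is roughly 50,653, so this implicitly assumes the largest raw utilization value in the dataset is around fifty thousand)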
        formattedRow.input.utilizationRate = Math.pow(rawRow.RevolvingUtilizationOfUnsecuredLines, 1/3) / 37;
      }

      // TODO: perform some feature engineering
      // for example, try adding up the total number of times a person has been late on their previous loans

      formattedResults.push(formattedRow);
    }
    return formattedResults;

  },

  // this still needs to be refactored to read in a file, rather than from the database
  // NOTE: as written, this assumes a mysql connection named `db` and a trained net named `net` are in scope
  kagglePredict: function(req, res) {
    db.query('SELECT * FROM submission', function(err, response) {
      if(err) {
        console.error(err);
      } else {
        var formattedData = module.exports.formatData(response);
        var netName = 'hiddenLayers9,40,50,80learningRate0.31428981244655';
        fs.readFile(netName, 'utf8', function(err, data) {
          if(err) {
            console.error(err);
          } else {
            net.fromJSON(JSON.parse(data));
            res.send('Loaded the brain! Testing it now.');
            var results = [];
            results.push('id');
            results.push('prediction');
            results.push('\n');
            for(var i = 0; i < formattedData.length; i++) {
              results.push(formattedData[i].id);
              results.push(net.run(formattedData[i].input).defaulted);
              results.push('\n');
            }
            var predictionFileName = 'kagglePredictions' + netName + '.csv';
            fs.writeFile(predictionFileName, results.join(','), function(err) {
              if(err) {
                console.log('did not write to file successfully');
              } else {
                console.log('wrote predictions to file successfully!');
              }
            });
            // console.log(results.join(','));
            // module.exports.testBrain(formattedData);
          }
        });

      }
    });
  }

};
--------------------------------------------------------------------------------