├── pySetup
│   ├── parameterMakers
│   │   ├── __init__.py
│   │   ├── clMultinomialNB.py
│   │   ├── clVowpalWabbit.py
│   │   ├── clPerceptron.py
│   │   ├── clSGDClassifier.py
│   │   ├── clKnn.py
│   │   ├── svcShrinking.py
│   │   ├── svcFirstParameterMaker.py
│   │   ├── clLogisticRegression.py
│   │   ├── clnnSknn3Layer.py
│   │   ├── clAdaLossAll.py
│   │   ├── clnnSknn.py
│   │   ├── clnnNoLearn.py
│   │   ├── clAdaBoost.py
│   │   ├── clRfBootstrapBoth.py
│   │   ├── rfGiniParamMaker.py
│   │   ├── clExtraTrees.py
│   │   ├── clnnSklearnMLP.py
│   │   ├── paramMakers.py
│   │   ├── clXGBoost.py
│   │   └── rfEntropyParamMaker.py
│   ├── extendedTrainingList.py
│   ├── sendMessages.py
│   ├── randomizedSearchList.py
│   ├── stepsToAddNewClassifier.txt
│   ├── makeBigClassifiers.py
│   ├── utilsPyShell.js
│   ├── makeClassifiers.py
│   ├── classifierList.js
│   ├── utils.js
│   ├── controllerPython.js
│   ├── splitDatasets.py
│   ├── training.py
│   ├── makePredictions.py
│   └── testingFileNames.js
├── requirements.txt
├── .gitignore
├── test
│   └── regression
│       ├── test.js
│       ├── trainAlgorithms.js
│       ├── deleteRemnantsAndRunNewTest.js
│       ├── splitDataset.js
│       └── makePredictions.js
├── machineJS.js
├── shutDown.js
├── advancedAPI.md
├── package.json
├── README.md
└── processArgs.js

--------------------------------------------------------------------------------
/pySetup/parameterMakers/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
joblib
numpy
pandas
scipy
cython
xgboost
python-dateutil

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clMultinomialNB.py:
--------------------------------------------------------------------------------
import scipy
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    parameters_to_try = {
        'alpha': np.random.uniform(0,1,1000)
    }

    return parameters_to_try

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules
npm-debug.log
kaggleTrainingData.csv
formattingData*.txt
formattedData*.txt
bestNet*.txt
kagglePredictions*.txt
*.csv
*.p
*.pkl
randomForest/bestRF/*
*.zip
*.npy
*.pyc
predictions/*
*.xls
*.xlsx
data/*
*.npz
*.txt
src/*
.npmignore
pySetup/testingFileNames

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clVowpalWabbit.py:
--------------------------------------------------------------------------------

# some comments on good parameters:
# https://www.reddit.com/r/MachineLearning/comments/1mq8fb/why_i_love_scikitlearn/

# a sklearn wrapper for vw:
# https://github.com/josephreisinger/vowpal_porpoise/blob/master/examples/example_sklearn.py

# presentation with some good explanations of vw params:
# http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clPerceptron.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # I am not yet confident in eta0
    parameters_to_try = {
        "penalty": [None,'l2','l1','elasticnet'],
        "alpha": scipy.stats.expon(.00001,.001),
        "shuffle": [True,False],
        # "eta0": scipy.stats.expon(.0001,1),
    }

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clSGDClassifier.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # I am not yet confident in eta0
    parameters_to_try = {
        "loss": ['hinge','log','modified_huber','squared_hinge','squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'],
        "penalty": ['none','l2','l1','elasticnet'],
        "alpha": scipy.stats.expon(.00001,.001),
        "shuffle": [True,False],
        "epsilon": scipy.stats.expon(.001,1)
        # "eta0": scipy.stats.expon(.0001,1),
    }

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/extendedTrainingList.py:
--------------------------------------------------------------------------------

def getAll():
    return {
        'clRfEntropy':True,
        'clRfGini':True,
        'clRfBootstrapTrue':True,
        'clSVCFirst':False,
        'clSVCShrinking':False,
        'clKnn':False,
        'clLogisticRegression':False,
        'clnnSknn3Layer':True,
        'clnnSknn':True,
        'clAdaBoost':False,
        'clAdaLossLinear':False,
        'clAdaLossSquare':False,
        'clAdaLossExponential':False,
        'clXGBoost':False,
        'clMultinomialNB':False,
        'clPerceptron':False,
        'clSGDClassifier':True,
        'clExtraTrees':True,
        'clnnSklearnMLP':True
    }

--------------------------------------------------------------------------------
/pySetup/sendMessages.py:
--------------------------------------------------------------------------------
import json

def printParent(text):
    messageObj = {
        'text': text,
        'type': 'console.log'
    }
    print json.dumps(messageObj)


def messageParent(messageText, type):
    messageObj = {
        'text': messageText,
        'type': type
    }
    print json.dumps(messageObj)


def obviousPrint(label, obj):
    printParent('#######################################################################################################################')
    printParent('#######################################################################################################################')
    printParent(label)
    printParent(obj)
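# A minimal sketch (illustrative, not part of the original file) of how a child
# training script is meant to use these helpers. utilsPyShell.js on the Node side
# logs any 'console.log' message as "snake says: ...", and treats the types in its
# expectedMessages list, such as 'trainingResults', as structured output:
#
#     from sendMessages import printParent, messageParent
#
#     printParent('starting the hyperparameter search')   # logged by the Node parent
#     messageParent(bestParams, 'trainingResults')        # consumed as a structured message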

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clKnn.py:
--------------------------------------------------------------------------------
def makeParams(X, y, globalArgs, dev, problemType):

    # TODO: knn breaks with sparse matrices. it consumes huge amounts of memory.
    # https://github.com/ClimbsRocks/machineJS/issues/74

    # leaf size only applies to ball or kd tree, so I'm not sure if we can include it in grid search or not
    parameters_to_try = {
        # 'algorithm': ['ball_tree','kd_tree','brute'],
        # 'weights': ['uniform','distance'],
        # 'leaf_size': [15,30,60,120],
        'n_neighbors': [2,5,10,25,100]
    }

    if dev:
        parameters_to_try.pop('n_neighbors', None)
        # parameters_to_try.pop('max_features', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/svcShrinking.py:
--------------------------------------------------------------------------------
import math

def makeParams(X, y, globalArgs, dev, problemType):

    # at some point in the future, when we have figured out whether we have a probability or a classification problem, we can set 'probability' equal to True only when we have a probability problem. that property just enables us to invoke predict_proba, but it slows down training time noticeably
    # an easy way to split this out would be to have one svm that is shrinking, and one that is not
    # shrinking is actually set for us in makeClassifiers.py, but we are keeping it here just to make it obvious
    parameters_to_try = {
        # 'shrinking': True,
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    }

    if dev:
        parameters_to_try.pop('C', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/svcFirstParameterMaker.py:
--------------------------------------------------------------------------------
import math

def makeParams(X, y, globalArgs, dev, problemType):

    # at some point in the future, when we have figured out whether we have a probability or a classification problem, we can set 'probability' equal to True only when we have a probability problem. that property just enables us to invoke predict_proba, but it slows down training time noticeably
    # an easy way to split this out would be to have one svm that is shrinking, and one that is not
    # shrinking is actually set for us in makeClassifiers.py, but we are keeping it here just to make it obvious
    parameters_to_try = {
        # 'shrinking': False,
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    }

    if dev:
        parameters_to_try.pop('C', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clLogisticRegression.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # Other parameters to try: tol, class_weight, penalty
    # I do not want to spend long optimizing logistic regressions, as we have other classifiers that are generally considered more effective across many different problem types.
    # TODO: break all of these out into their own classifiers- newton-cg, lbfgs, and liblinear
    # we are spending all the time training them as their own separate instances anyways (that's what gridsearch does), we might as well make use of that output for our creative ensembling
    parameters_to_try = {
        'C': scipy.stats.expon(.001,1),
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    }

    if dev:
        parameters_to_try.pop('C', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clnnSknn3Layer.py:
--------------------------------------------------------------------------------
def makeParams(X, y, globalArgs, dev, problemType):

    # TODO: figure out more interesting parameters to try
    # follow a similar pattern to what we did for brainjs, basing the number of nodes on the size of the input
    # test number of hidden layers
    # TODO: break out each type into its own classifier
    try:
        # if dense
        numFeatures = len(X[0])
    except:
        # if sparse
        numFeatures = X.shape[1]
    parameters_to_try = {
        'learning_rate': [0.001, 0.01],
        'hidden0__units': [ numFeatures / 2, numFeatures ],
        'hidden1__units': [ numFeatures / 2, numFeatures ],
        'hidden2__units': [ numFeatures / 2, numFeatures ]
    }

    if dev:
        parameters_to_try.pop('learning_rate', None)
        # parameters_to_try.pop('hidden0__units', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/randomizedSearchList.py:
--------------------------------------------------------------------------------
# this file simply holds a list of all the classifiers we have enabled RandomizedSearchCV for.
# if you would like to have more control over the process and use GridSearchCV, please modify this file to say False for the algorithm you want to run GridSearchCV on.

def rsList():
    return {
        'clnnSknn': False,
        'clnnNoLearn': False,
        'clKnn': False,
        'clSVCFirst': False,
        'clSVCShrinking': False,
        'clnnSknn3Layer': False,
        'clRfEntropy': True,
        'clLogisticRegression': True,
        'clXGBoost': True,
        'clRfGini': True,
        'clRfBootstrapTrue': True,
        'clAdaBoost': True,
        'clAdaLossLinear': True,
        'clAdaLossSquare': True,
        'clAdaLossExponential': True,
        'clMultinomialNB': True,
        'clPerceptron': True,
        'clSGDClassifier': True,
        'clExtraTrees': True,
        'clnnSklearnMLP': True
    }

--------------------------------------------------------------------------------
/test/regression/test.js:
--------------------------------------------------------------------------------
global.rTest = {};
var expect = require('chai').expect;
var mocha = require('mocha');
var path = require('path');
var fs = require('fs');

var makePredictions = require('./makePredictions');
var splitDataset = require('./splitDataset');
var trainAlgorithms = require('./trainAlgorithms');
var deleteRemnantsAndRunNewTest = require('./deleteRemnantsAndRunNewTest');


// this block will contain all the tests for the entire machineJS package
describe('regression problems', function() {
  // this timeout should be long enough to handle tests on a variety of machines. If you are getting a timeout error, consider bumping this up even more.
  this.timeout(600000);

  rTest.startTime = Date.now();

  before(deleteRemnantsAndRunNewTest);


  // TODO: run this separately for each type of problem we're solving (regression, category, then eventually multi-label, etc.)

  // setDefaultArgs();

  trainAlgorithms();

  makePredictions();

  splitDataset();

});

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clAdaLossAll.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # GridSearchCV parameters:
    # parameters_to_try = {
    #     'n_estimators': [5,50,150],
    #     'learning_rate': [.1, .3],
    #     'algorithm':['SAMME','SAMME.R']
    # }

    # RandomizedSearchCV parameters:
    # ideally, I think this would be a gamma distribution most likely.


    parameters_to_try = {
        "n_estimators": scipy.stats.randint(25,500),
        "learning_rate": scipy.stats.expon(.001, 2),
        # "loss": ['linear','square','exponential'],
        "algorithm": ['SAMME','SAMME.R']
    }

    if problemType not in ['category', 'multi-category']:
        try:
            parameters_to_try.pop('algorithm', None)
        except:
            pass

    if dev:
        parameters_to_try.pop('learning_rate', None)
        try:
            parameters_to_try.pop('algorithm', None)
        except:
            pass

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clnnSknn.py:
--------------------------------------------------------------------------------
def makeParams(X, y, globalArgs, dev, problemType):

    # guidance on params:
    # http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1

    # TODO: figure out more interesting parameters to try
    # follow a similar pattern to what we did for brainjs, basing the number of nodes on the size of the input
    # test number of hidden layers
    # TODO: break out each type into its own classifier
    try:
        # if dense
        numFeatures = len(X[0])
    except:
        # if sparse
        numFeatures = X.shape[1]

    parameters_to_try = {
        'learning_rate': [0.001, 0.01],
        'hidden0__units': [ numFeatures / 2, numFeatures ]
        # 'hidden1__units': [ numFeatures / 2, numFeatures, numFeatures * 3 ],
        # 'hidden2__units': [ numFeatures / 2, numFeatures, numFeatures * 3 ]
    }

    if dev:
        # parameters_to_try.pop('learning_rate', None)
        parameters_to_try['learning_rate'] = [.001,.01]
        parameters_to_try.pop('hidden0__units', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clnnNoLearn.py:
--------------------------------------------------------------------------------
def makeParams(X, y, globalArgs, dev, problemType):

    # guidance on params:
    # http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1

    # TODO: figure out more interesting parameters to try
    # follow a similar pattern to what we did for brainjs, basing the number of nodes on the size of the input
    # test number of hidden layers
    # TODO: break out each type into its own classifier
    try:
        # if dense
        numFeatures = len(X[0])
    except:
        # if sparse
        numFeatures = X.shape[1]

    parameters_to_try = {
        'learning_rate': [0.001, 0.01],
        'hidden0__units': [ numFeatures / 2, numFeatures ]
        # 'hidden1__units': [ numFeatures / 2, numFeatures, numFeatures * 3 ],
        # 'hidden2__units': [ numFeatures / 2, numFeatures, numFeatures * 3 ]
    }

    if dev:
        # parameters_to_try.pop('learning_rate', None)
        parameters_to_try['learning_rate'] = [.001,.01]
        parameters_to_try.pop('hidden0__units', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clAdaBoost.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # GridSearchCV parameters:
    # parameters_to_try = {
    #     'n_estimators': [5,50,150],
    #     'learning_rate': [.1, .3],
    #     'algorithm':['SAMME','SAMME.R']
    # }

    # RandomizedSearchCV parameters:
    # ideally, I think this would be a gamma distribution most likely.


    parameters_to_try = {
        "n_estimators": scipy.stats.randint(25,500),
        "learning_rate": scipy.stats.expon(.001, 2),
        "loss": ['linear','square','exponential'],
        "algorithm": ['SAMME','SAMME.R']
    }

    if problemType not in ['category', 'multi-category']:
        try:
            parameters_to_try.pop('algorithm', None)
        except:
            pass
    else:
        parameters_to_try.pop('loss', None)


    if dev:
        parameters_to_try.pop('learning_rate', None)
        try:
            parameters_to_try.pop('algorithm', None)
        except:
            pass

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clRfBootstrapBoth.py:
--------------------------------------------------------------------------------
import math
import scipy.stats
import numpy as np

from sendMessages import printParent

def makeParams(X, y, globalArgs, dev, problemType):

    try:
        # if dense
        numColumns = len(X[0])
    except:
        # if sparse
        numColumns = X.shape[1]

    sqrtNum = int(math.sqrt(numColumns))
    # GridSearchCV parameters:

    # max_features_to_try = [sqrtNum + x for x in (-2,0,2)]
    # max_features_to_try.append('log2')
    # max_features_to_try.append(None)

    # parameters_to_try = {
    #     'max_features': max_features_to_try,
    #     'min_samples_leaf':[1,2,5,25,50,100,150]
    # }


    maxFeaturesList = np.random.lognormal(sqrtNum, 2, 10)
    # if using lognormal, check out this link:
    # http://stackoverflow.com/questions/12937824/lognormal-random-numbers-centered-around-a-high-value
    # 'max_features': scipy.stats.lognorm([sqrtNum/5], int(sqrtNum)),

    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'max_features': scipy.stats.randint(1,numColumns),
        'min_samples_leaf': scipy.stats.randint(1,200),
        'min_samples_split': scipy.stats.randint(2,20),
    }

    if dev:
        parameters_to_try.pop('min_samples_leaf', None)
        parameters_to_try.pop('max_features', None)
        parameters_to_try['max_features'] = [sqrtNum, 'log2']

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/rfGiniParamMaker.py:
--------------------------------------------------------------------------------
import math
import scipy.stats
import numpy as np

from sendMessages import printParent

def makeParams(X, y, globalArgs, dev, problemType):

    try:
        # if dense
        numColumns = len(X[0])
    except:
        # if sparse
        numColumns = X.shape[1]

    sqrtNum = int(math.sqrt(numColumns))
    # GridSearchCV parameters:

    # max_features_to_try = [sqrtNum + x for x in (-2,0,2)]
    # max_features_to_try.append('log2')
    # max_features_to_try.append(None)

    # parameters_to_try = {
    #     'max_features': max_features_to_try,
    #     'min_samples_leaf':[1,2,5,25,50,100,150]
    # }


    maxFeaturesList = np.random.lognormal(sqrtNum, 2, 10)
    # if using lognormal, check out this link:
    # http://stackoverflow.com/questions/12937824/lognormal-random-numbers-centered-around-a-high-value
    # 'max_features': scipy.stats.lognorm([sqrtNum/5], int(sqrtNum)),

    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'max_features': scipy.stats.randint(1,numColumns),
        'min_samples_leaf': scipy.stats.randint(1,200),
        'min_samples_split': scipy.stats.randint(2,20),
        'bootstrap': [True,False]
    }

    if dev:
        parameters_to_try.pop('min_samples_leaf', None)
        parameters_to_try.pop('max_features', None)
        parameters_to_try['max_features'] = [sqrtNum, 'log2']

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/stepsToAddNewClassifier.txt:
--------------------------------------------------------------------------------
Before each step: copy the exact classifier name to your clipboard so we can be consistent.

1. classifierList.js- just add the name, in either universal, classifierOnly, or regressionOnly
2. parameterMakers/classifierName: create the parameters, and make them available through a function called makeParams that returns a dict that can be passed directly into the classifier.
3. paramMakers.py: add the new file as a module to be imported and as a part of the returned dict. make sure to add a comma in between properties :)
4. makeClassifiers.py: add the instantiated classifier here with the parameters it should have. make sure to add a comma in between properties :)
5. parameterMakers folder: if this is just a new split of the same algorithm (rf with gini, and rf with entropy), go back to parameterMakers/classifierName and make sure the new one is mutually exclusive with the old one
6. randomizedSearchList.py: add in this classifier, and whether it supports RandomizedSearchCV (it should- training goes much faster that way!)
7. extendedTrainingList.py: state whether there is an extended training version of this classifier available. This step is somewhat redundant, but makes it clear that a classifier not being in makeBigClassifiers.py is intentional
8. makeBigClassifiers.py: add in a 'larger' version of the classifier- typically by bumping up n_estimators dramatically
9. Update test suites to support this new algorithm
    A. For each problemType that is supported:
        a) add it in as an expected file
        b) run the test dataset against this algorithm, to find its expected error rates
        c) create a new test expecting the error rate to be roughly that
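As a concrete illustration of step 2, a parameter maker for a hypothetical new classifier (the name clGradientBoosting and the exact ranges below are invented for this example) would follow the same shape as the existing files in parameterMakers/:

    import scipy.stats

    def makeParams(X, y, globalArgs, dev, problemType):

        # distributions rather than fixed lists, so RandomizedSearchCV can sample from them
        parameters_to_try = {
            'n_estimators': scipy.stats.randint(25,500),
            'learning_rate': scipy.stats.expon(.001, 2)
        }

        if dev:
            parameters_to_try.pop('n_estimators', None)

        return parameters_to_try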

--------------------------------------------------------------------------------
/pySetup/makeBigClassifiers.py:
--------------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor


def makeAll(globalArgs, dev, problemType):
    estimator_count=200
    if dev:
        estimator_count=120

    iterationCount=20
    if dev:
        iterationCount=2

    if problemType == 'category' or problemType == 'multi-category':

        return {
            'clRfGini': RandomForestClassifier(n_estimators=estimator_count, n_jobs=-1, criterion='gini'),
            'clRfBootstrapTrue': RandomForestClassifier(n_estimators=estimator_count, n_jobs=-1, bootstrap=True),
            'clRfEntropy': RandomForestClassifier(n_estimators=estimator_count, n_jobs=-1, criterion='entropy'),
            'clSGDClassifier': SGDClassifier(n_iter=iterationCount),
            'clExtraTrees': ExtraTreesClassifier(n_estimators=estimator_count, n_jobs=-1),
            'clnnSklearnMLP': MLPClassifier(max_iter=iterationCount*20)
        }

    else:

        return {
            'clRfGini': RandomForestRegressor(n_estimators=estimator_count, n_jobs=-1),
            'clRfBootstrapTrue': RandomForestRegressor(n_estimators=estimator_count, n_jobs=-1, bootstrap=True),
            'clRfEntropy': RandomForestRegressor(n_estimators=estimator_count, n_jobs=-1),
            'clExtraTrees': ExtraTreesRegressor(n_estimators=estimator_count, n_jobs=-1),
        }

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clExtraTrees.py:
--------------------------------------------------------------------------------
import math
import scipy.stats
import numpy as np

from sendMessages import printParent

def makeParams(X, y, globalArgs, dev, problemType):

    try:
        # if dense
        numColumns = len(X[0])
    except:
        # if sparse
        numColumns = X.shape[1]

    sqrtNum = int(math.sqrt(numColumns))
    # GridSearchCV parameters:

    # max_features_to_try = [sqrtNum + x for x in (-2,0,2)]
    # max_features_to_try.append('log2')
    # max_features_to_try.append(None)

    # parameters_to_try = {
    #     'max_features': max_features_to_try,
    #     'min_samples_leaf':[1,2,5,25,50,100,150]
    # }


    maxFeaturesList = np.random.lognormal(sqrtNum, 2, 10)
    # if using lognormal, check out this link:
    # http://stackoverflow.com/questions/12937824/lognormal-random-numbers-centered-around-a-high-value
    # 'max_features': scipy.stats.lognorm([sqrtNum/5], int(sqrtNum)),

    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'criterion': ['gini','entropy'],
        'max_features': scipy.stats.randint(1,numColumns),
        'min_samples_split': scipy.stats.randint(2,20),
        'min_samples_leaf': scipy.stats.randint(1,100),
        'bootstrap': [True,False]
    }

    if problemType not in ['category', 'multi-category']:
        parameters_to_try.pop('criterion', None)

    # if dev:
    #     parameters_to_try.pop('min_samples_leaf', None)
    #     parameters_to_try.pop('max_features', None)
    #     parameters_to_try['max_features'] = [sqrtNum, 'log2']

    return parameters_to_try

--------------------------------------------------------------------------------
/machineJS.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node

// (function() {
module.exports = function(argsObj) {
  if(argsObj !== undefined) {
    for(var key in argsObj) {
      global.argv[key] = argsObj[key];
    }
  }

  // if( global.argv.validationRound ) {
  //   console.log('global.argv before processArgs in machineJS validationRound');
  //   console.log(global.argv);
  // }

  if( argv.validationRound !== true ) {
    processArgs();
  } else {
    argv.ensemblerArgs.validationRound = false;
  }

  if (argv.devEnsemble || argv.ensemble) {
    ensembler.createEnsemble( argv.ensemblerArgs );
  } else if( argv.makePredictions ) {
    controllerPython.makeAllPredictions( argv.makePredictions );
  } else {
    controllerPython.startTraining(argv);
  }

  shutDown(controllerPython);

};

var path = require('path');
global.rootDir = path.dirname(__filename);
global.argv = {};

var controllerPython = require('./pySetup/controllerPython.js');
var shutDown = require('./shutDown.js');
var processArgs = require('./processArgs.js');

var ensembler = require('ensembler');

console.log('thanks for inviting us along on your machine learning journey!\n');


// allow the module to be invoked from the command line
// since this is all wrapped in an IIFE, this if statement will execute and check if machineJS was invoked from another module, or without a parent (from the command line)
if( !module.parent ) {
  var userArgs = require('minimist')(process.argv.slice(1));
  for( var key in userArgs ) {
    global.argv[key] = userArgs[key];
  }

  module.exports();
}

// })();

--------------------------------------------------------------------------------
/shutDown.js:
--------------------------------------------------------------------------------
var exec = require('child_process').execSync;

module.exports = function(controllerPython) {


  // kills off all the child processes if the parent process faces an uncaught exception and crashes.
  // this prevents you from having zombie child processes running indefinitely.
  // lifted directly from: https://www.exratione.com/2013/05/die-child-process-die/
  // This is a somewhat ugly approach, but it has the advantage of working
  // in conjunction with most of what third parties might choose to do with
  // uncaughtException listeners, while preserving whatever the exception is.
  process.once("uncaughtException", function (error) {
    // If this was the last of the listeners, then shut down the child and rethrow.
    // Our assumption here is that any other code listening for an uncaught
    // exception is going to do the sensible thing and call process.exit().
    if (process.listeners("uncaughtException").length === 0) {
      console.log('we heard an unexpected shutdown event that is causing everything to close');
      controllerPython.killAll();
      throw error;
    }
  });

  if (process.platform === "win32") {
    var rl = require("readline").createInterface({
      input: process.stdin,
      output: process.stdout
    });

    rl.on("SIGINT", function () {
      process.emit("SIGINT");
    });
  }

  process.on("SIGINT", function () {
    // graceful shutdown
    console.log('heard sigint in machineJS');
    controllerPython.killAll();

    // if we hear a Ctrl + c, we can safely assume the user wants to exit.
    // exec('pkill -9 node');
    process.exit();
  });

  process.on("killAll", function() {
    controllerPython.killAll();
    process.exit();

  });

};

--------------------------------------------------------------------------------
/test/regression/trainAlgorithms.js:
--------------------------------------------------------------------------------
var expect = require('chai').expect;
var mocha = require('mocha');
var fs = require('fs');
var path = require('path');
var rTest = global.rTest;

module.exports = function() {

  describe('training and tuning algorithms', function() {

    it('should successfully train one instance of all algorithms in classifierList, for this problemType', function() {

      var classifierList = require(path.join(rTest.mjsLocation, 'pySetup', 'classifierList'));
      classifierList = Object.keys(classifierList.longDataSet);
      var trainedAlgos = fs.readdirSync(rTest.bestClassifiersTestLocation);

      function verifyAllClassifiersTrained() {
        var foundClassifiers = [];
        // for each classifier we expected to train:
        for( var i = 0; i < classifierList.length; i++ ) {

          // compare it against the ones that finished:
          for( var j = 0; j < trainedAlgos.length; j++ ){
            if( trainedAlgos[j].indexOf( classifierList[i] ) !== -1 ) {
              foundClassifiers.push(classifierList[i]);
            }
          }
        }

        // return the list of classifiers we found; if training succeeded, it will match classifierList exactly
        return foundClassifiers;
      }

      expect( verifyAllClassifiersTrained() ).to.deep.equal(classifierList);
    });

    // it('should write each algorithm to a file in the correct directory', function() {

    // });

    // it('should name the trained algorithm file after the algorithm\'s name', function() {

    // });

    // it('should have acceptably low error rates for each algorithm', function() {
    //   // TODO: probably break this out into separate tests, one for each algo. that way it will be easier to find which one failed.
    // });


  });

}

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clnnSklearnMLP.py:
--------------------------------------------------------------------------------
import math
import numpy as np
import scipy.stats

def makeParams(X, y, globalArgs, dev, problemType):

    # guidance on params:
    # http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1



    try:
        # if dense
        numFeatures = len(X[0])
        XLength = len(X)
    except:
        # if sparse
        numFeatures = X.shape[1]
        XLength = X.shape[0]

    if XLength < 10000:
        # according to the docs, l-bfgs performs best for small datasets (thousands of items)
        algorithmType = 'l-bfgs'
    else:
        # adam is optimized stochastic gradient descent which performs well for large datasets
        algorithmType = 'adam'

    # we want to test up to 10 layers, with anywhere from .5 * numFeatures to 10*numFeatures per layer
    hiddenLayers = []

    # for each hidden layer, we will have these numbers * numFeatures number of nodes
    nodeMultipliers = [.5,1,2,3,5,10]

    # we will create options for each number of hidden layers listed below
    hiddenLayerMultipliers = [1,2,3,5,10]

    for hlNum in hiddenLayerMultipliers:
        for nodeNum in nodeMultipliers:
            # int() keeps layer sizes whole when the multiplier is fractional
            hiddenLayers.append( [ int(nodeNum * numFeatures) for x in range(hlNum) ])


    # parameters we are not searching currently:
    # tol
    # shuffle
    # batch_size

    parameters_to_try = {
        'hidden_layer_sizes': hiddenLayers,
        'activation': ['logistic','tanh','relu'],
        'algorithm': [algorithmType],
        'alpha': scipy.stats.expon(.00001,.001),
        'learning_rate': ['constant','invscaling','adaptive'],
        'learning_rate_init': scipy.stats.expon(.01,.0001),
        'early_stopping': [True],
        'validation_fraction': np.random.uniform(0.8,1,1000),
        'epsilon': scipy.stats.expon( math.pow(10,-7), math.pow(10,-9))
    }

    return parameters_to_try

--------------------------------------------------------------------------------
/test/regression/deleteRemnantsAndRunNewTest.js:
--------------------------------------------------------------------------------
var path = require('path');
var execSync = require('child_process').execSync;
var rimraf = require('rimraf');
var fs = require('fs');

// set default values
// since node.js executes this code as soon as the file is loaded in, these lines will run every time, regardless of whether we invoke the module.exports function or not
global.rTest.testFileLocation = path.dirname(__filename);
global.rTest.mjsLocation = path.join(rTest.testFileLocation, '..','..');
global.rTest.dataLocation = path.join(rTest.mjsLocation,'node_modules','data-for-tests','rossman');

rTest.dfTestResultsLocation = path.join(rTest.testFileLocation, 'dfTestResults');
rTest.rTestPredictionsLocation = path.join(rTest.testFileLocation, 'rTestPredictions');
rTest.bestClassifiersTestLocation = path.join(rTest.testFileLocation, 'bestClassifiersTest');
rTest.validationIndicesLocation = path.join(rTest.dataLocation, 'dfValidationIndicesrossmantest.pkl');


module.exports = function() {
  try {
    // remove any folders we might have created when running the test suite previously
    // rimraf is `rm -rf` for node
    rimraf.sync(rTest.dfTestResultsLocation);
    rimraf.sync(rTest.rTestPredictionsLocation);
    rimraf.sync(rTest.bestClassifiersTestLocation);
    fs.unlinkSync(rTest.validationIndicesLocation);
  } catch(err) {
    // do nothing! There is nothing to delete
  }

  // to see detailed output while running the tests, use node-inspector.
  // npm install -g node-inspector
  // change "node" below to be "node-debug"
  execSync('node machineJS.js '
    + path.join(rTest.dataLocation,'tinyTrain.csv')
    + ' --predict ' + path.join(rTest.dataLocation,'test.csv')
    + ' --join ' + path.join(rTest.dataLocation, 'store.csv')
    + ' --dfOutputFolder ' + path.join(rTest.testFileLocation, 'dfTestResults')
    + ' --predictionsFolder ' + path.join(rTest.testFileLocation, 'rTestPredictions')
    + ' --ensemblerOutputFolder ' + rTest.testFileLocation
    + ' --bestClassifiersFolder ' + path.join(rTest.testFileLocation, 'bestClassifiersTest')
  );
};

--------------------------------------------------------------------------------
/test/regression/splitDataset.js:
--------------------------------------------------------------------------------
var expect = require('chai').expect;
var mocha = require('mocha');
var fs = require('fs');
var path = require('path');
var execSync = require('child_process').execSync;
var csv = require('csv');

module.exports = function() {

  describe('splitting the formatted dataset', function() {

    it('should create a new validation split when we do not have one already', function() {
      var fileStats = fs.statSync(path.join(rTest.dataLocation, 'dfValidationIndicesrossmantest.pkl'));
      expect( new Date(fileStats.ctime) ).to.be.above(rTest.startTime);
    });


    it('should copy the validation dataset to the validation folder in predictions', function(done) {
      fs.readFile(path.join(rTest.rTestPredictionsLocation, 'validation','validationData.npz'), function(err, data) {
        console.log('err',err);
        expect(err).to.be.null;
        done();
      });
    });


    it('should copy the validation IDs and Y to the validation folder in predictions', function(done) {
      var validationFilePath = path.join(rTest.rTestPredictionsLocation, 'validation','validationIDsAndY.csv');
      fs.readFile(validationFilePath, function(err, data) {

        expect(err).to.be.null;

        data = csv.parse(data.toString('utf8'), function(err, output) {

          expect(output.length).to.be.within(51000 - 200, 51000 + 200);
          done();

        });
      });
    });


    it('should use the existing validation split if it already exists', function() {
      var secondStartTime = Date.now();
      // TODO: try running the test again, and
      execSync('node machineJS.js '
        + path.join(rTest.dataLocation,'tinyTrain.csv')
        + ' --predict ' + path.join(rTest.dataLocation,'test.csv')
        + ' --join ' + path.join(rTest.dataLocation, 'store.csv')
        + ' --dfOutputFolder ' + path.join(rTest.testFileLocation, 'dfTestResults')
        + ' --predictionsFolder ' + path.join(rTest.testFileLocation, 'rTestPredictions')
        + ' --ensemblerOutputFolder ' + rTest.testFileLocation
        + ' --bestClassifiersFolder ' + path.join(rTest.testFileLocation, 'bestClassifiersTest')
        + ' --splitDataTest true'
      );

      var fileStats = fs.statSync(path.join(rTest.dataLocation, 'dfValidationIndicesrossmantest.pkl'));
      expect( new Date(fileStats.ctime) ).to.be.below(secondStartTime);

    });

  });

}

--------------------------------------------------------------------------------
/pySetup/utilsPyShell.js:
--------------------------------------------------------------------------------
var py = global.pythonNamespace;

var path = require('path');
var PythonShell = require('python-shell');

var pySetupLocation = py.pySetupLocation;

module.exports = {

  // these are messages we expect to get from our python shell.
  // anything not in this list is likely an error.
  expectedMessages: {
    dictVectMapping: true,
    fileNames: true,
    trainingResults: true,
    splitFileNames: true
  },

  attachLogListener: function(referenceToShell) {
    referenceToShell.on('message', function(message) {
      if(message.type === 'console.log') {
        console.log('snake says:',message.text);
      }
      else if ( !module.exports.expectedMessages[ message.type ] ){
        console.log('heard a message:',message);
      }
    });
  },

  generatePythonOptions: function(fileNameFromRoot, otherArgs) {
    // the first argument for all python shells is going to be a path to a file, relative to the root of machineJS
    var fullPathToFile = path.join(global.rootDir, fileNameFromRoot);
    var args = [];
    args = args.concat(fullPathToFile, otherArgs);

    var pySetupLocation = path.join(argv.machineJSLocation, 'pySetup');

    return {
      scriptPath: pySetupLocation,
      args: args,
      mode:'json'
    };
  },

  startPythonShell: function(scriptName, callback, pythonOptions) {
    var pyShell = PythonShell.run(scriptName, pythonOptions, function (err, results) {
      if (err) {
        // TODO: add in logging of the error message if verbosity is set to a higher level
        // right now we get error messages for a bunch of things the user should not concern themselves with, including:
        // deprecation warnings (we're optionally using a pre-release version of sklearn; we'll refactor to take care of those deprecation warnings once they're merged into an officially released version)
        // searches that fail to converge
        // to avoid distracting the user, we're only logging error messages with an exit code that is not 0, meaning that the process failed to finish executing
        if( err.exitCode !== 0 ) {
          console.error(err);
        } else {
          callback();
        }
      } else {
        console.log('successfully finished running',scriptName + '!');
        callback();

      }
    });

    module.exports.attachLogListener(pyShell);
    py.referencesToChildren.push(pyShell);

    return pyShell;
  }

}
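// A minimal sketch (illustrative, not part of the original module) of how a caller
// such as controllerPython.js presumably drives these helpers; argv.dataFile is a
// hypothetical argument name used only for this example:
//
//   var utils = require('./utilsPyShell.js');
//   var pyOptions = utils.generatePythonOptions(argv.dataFile, [JSON.stringify(argv)]);
//   utils.startPythonShell('training.py', function() {
//     console.log('training round finished');
//   }, pyOptions);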

--------------------------------------------------------------------------------
/pySetup/parameterMakers/paramMakers.py:
--------------------------------------------------------------------------------
# ok, unfortunately, this is how it probably has to work:
# 1. we manually (hard code the names in here) import all the individual parameterMaker files here
# 2. we have a master calculateParams function

# 3. into that function we will pass X and y
# 4. that function will then go off and invoke all the individual parameterMaker functions, saving their results into a dictionary with keys that mirror 'clRandomForest'
# 5. that function will then return the dictionary
# 6. then back in training.py we can look up the classifierName within that dictionary to get the parameters

import rfGiniParamMaker
import rfEntropyParamMaker
import svcFirstParameterMaker
import svcShrinking
import clnnSknn
import clnnSknn3Layer
import clKnn
import clLogisticRegression
import clAdaBoost
import clXGBoost
import clRfBootstrapBoth
import clAdaLossAll
import clMultinomialNB
import clPerceptron
import clSGDClassifier
import clExtraTrees
import clnnSklearnMLP
from sendMessages import printParent

def makeAll(X,y,globalArgs, dev, problemType):
    returnDict = {
        'clRfGini':rfGiniParamMaker.makeParams(X,y,globalArgs, dev, problemType),
        'clRfEntropy':rfEntropyParamMaker.makeParams(X,y,globalArgs, dev, problemType),
        'clSVCFirst':svcFirstParameterMaker.makeParams(X,y,globalArgs, dev, problemType),
        'clSVCShrinking':svcShrinking.makeParams(X,y,globalArgs, dev, problemType),
        'clKnn':clKnn.makeParams(X,y,globalArgs, dev, problemType),
        'clLogisticRegression':clLogisticRegression.makeParams(X,y,globalArgs, dev, problemType),
        'clnnSknn3Layer':clnnSknn3Layer.makeParams(X,y,globalArgs, dev, problemType),
        'clnnSknn':clnnSknn.makeParams(X,y,globalArgs, dev, problemType),
        'clAdaBoost':clAdaBoost.makeParams(X,y,globalArgs, dev, problemType),
        'clAdaLossLinear':clAdaLossAll.makeParams(X,y,globalArgs, dev, problemType),
        'clAdaLossSquare':clAdaLossAll.makeParams(X,y,globalArgs, dev, problemType),
        'clAdaLossExponential':clAdaLossAll.makeParams(X,y,globalArgs, dev, problemType),
        'clXGBoost':clXGBoost.makeParams(X,y,globalArgs, dev, problemType),
        'clRfBootstrapTrue': clRfBootstrapBoth.makeParams(X,y,globalArgs, dev, problemType),
        'clMultinomialNB': clMultinomialNB.makeParams(X,y,globalArgs,dev,problemType),
        'clPerceptron': clPerceptron.makeParams(X,y,globalArgs,dev,problemType),
        'clSGDClassifier': clSGDClassifier.makeParams(X,y,globalArgs,dev,problemType),
        'clExtraTrees': clExtraTrees.makeParams(X,y,globalArgs,dev,problemType),
        'clnnSklearnMLP': clnnSklearnMLP.makeParams(X,y,globalArgs,dev,problemType)
    }
    return returnDict
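# Illustrative sketch (not part of the original file) of the lookup described in
# the numbered comments above, as training.py presumably performs it:
#
#     import paramMakers
#
#     allParams = paramMakers.makeAll(X, y, globalArgs, dev, problemType)
#     parameters_to_try = allParams[classifierName]   # e.g. allParams['clRfGini']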

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clXGBoost.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):



    # great overall walkthrough of xgb. probably the best starting point.
    # http://www.slideshare.net/ShangxuanZhang/kaggle-winning-solution-xgboost-algorithm-let-us-learn-from-its-author


    # list of params that can be tuned:
    # https://www.kaggle.com/forums/f/15/kaggle-forum/t/17120/how-to-tuning-xgboost-in-an-efficient-way


    # discussion by xgb library itself:
    # https://github.com/dmlc/xgboost/blob/master/doc/param_tuning.md

    # other parameters to investigate that might only exist in the sklearn implementation:
    # learning_rate
    # n_estimators (i have a feeling this is num_boost_round)
    # subsample
    # max_features

    # Other params to investigate:
    # Split on:
    # booster [default=gbtree]
    # which booster to use, can be gbtree or gblinear. gbtree uses a tree based model while gblinear uses a linear function.

    # param_space = {'max_depth': [2,4,6,8,10], 'n_estimators': [200,300,400,500,600,700,800], 'learning_rate' : uniform(loc=0.001,scale=0.2), 'subsample': uniform(loc=0.6,scale=0.39), 'colsample_bytree':uniform(loc=0.6,scale=0.39), }

    # param_dist = {
    #     'max_depth': randint(2, 8),
    #     'gamma': uniform(0.2, 0.6),
    #     'subsample': beta(10, 1),
    # }
    # and then do a randomized grid search like this

    # clf = xgb.XGBClassifier(n_estimators = 20)
    # n_iter_search = 100
    # random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, scoring='roc_auc', verbose=10)
    # random_search.fit(X_train, y_train)



    # {'max_depth': [2,4,6],
    # 'n_estimators': [50,100,200]}

    # official docs:
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

    # samuel reuther had a good reply:
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/13910/xgboost-parameter-tuning

    # forum that talks about specific numbers:
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/12947/achieve-0-50776-on-the-leaderboard-in-a-minute-with-xgboost/76028

    # slide 12 has exact param recommendations:
    # http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1





    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'max_depth': scipy.stats.randint(1,150),
        'subsample': np.random.uniform(.80,1,1000),
        'colsample_bytree': np.random.uniform(.80,1,1000)
    }

    # TODO: create two separate XGBoosts, one for gbtree and one for gblinear
    # 'booster': ['gbtree','gblinear']

    if dev:
        parameters_to_try.pop('subsample', None)
        parameters_to_try.pop('colsample_bytree', None)
        # parameters_to_try.pop('num_round', None)
        # parameters_to_try.pop('eta', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/makeClassifiers.py:
--------------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

import xgboost
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor

# other splits for SVC kernel:
# linear, poly, rbf, sigmoid, precomputed

def makeClassifiers(globalArgs, dev, problemType):

    n_iter=10
    n_estimators=20
    if(dev):
        n_iter=2
        n_estimators=5

    if problemType == 'category' or problemType == 'multi-category':
        return {
            'clRfGini': RandomForestClassifier(n_estimators=n_estimators, n_jobs=1, criterion='gini'),
            'clRfBootstrapTrue': RandomForestClassifier(n_estimators=n_estimators, n_jobs=1, bootstrap=True),
            'clRfEntropy': RandomForestClassifier(n_estimators=n_estimators, n_jobs=1, criterion='entropy'),
            'clSVCFirst': SVC(probability=True, shrinking=False),
            'clSVCShrinking': SVC(probability=True, shrinking=True),
            'clKnn': KNeighborsClassifier(),
            'clLogisticRegression': LogisticRegression(penalty='l2', dual=False, max_iter=100, warm_start=True),
            'clAdaBoost': AdaBoostClassifier(),
            'clXGBoost': xgboost.XGBClassifier(),
            'clMultinomialNB': MultinomialNB(),
            'clPerceptron': Perceptron(),
            'clSGDClassifier': SGDClassifier(n_iter=n_iter),
            'clExtraTrees': ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=1),
            'clnnSklearnMLP': MLPClassifier(),
        }

    # Regression models
    else:
        return {
            'clRfGini': RandomForestRegressor(n_estimators=n_estimators, n_jobs=1),
            'clRfBootstrapTrue': RandomForestRegressor(n_estimators=n_estimators, n_jobs=1, bootstrap=True),
            # 'clRfEntropy': RandomForestRegressor(n_estimators=n_estimators, n_jobs=1, criterion='entropy'),
            'clSVCFirst': SVR(shrinking=False),
            'clSVCShrinking': SVR(shrinking=True),
            'clKnn': KNeighborsRegressor(),
            'clLogisticRegression': LinearRegression(),
            'clAdaBoost': AdaBoostRegressor(),
            'clAdaLossLinear': AdaBoostRegressor(loss='linear'),
            'clAdaLossSquare': AdaBoostRegressor(loss='square'),
            'clAdaLossExponential': AdaBoostRegressor(loss='exponential'),
            'clXGBoost': xgboost.XGBRegressor(),
            'clExtraTrees': ExtraTreesRegressor(n_estimators=n_estimators, n_jobs=1),
        }
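# Illustrative sketch only: roughly how a classifier from this dict and its matching
# parameter dict from paramMakers.py presumably get wired together during search.
# (RandomizedSearchCV lived in sklearn.grid_search in the sklearn versions of this era;
# the n_iter value below is an assumption, not taken from the repo.)
#
#     from sklearn.grid_search import RandomizedSearchCV
#
#     classifiers = makeClassifiers(globalArgs, dev, problemType)
#     searchCV = RandomizedSearchCV(classifiers['clRfGini'], allParams['clRfGini'], n_iter=10, n_jobs=-1)
#     searchCV.fit(X, y)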

--------------------------------------------------------------------------------
/pySetup/parameterMakers/rfEntropyParamMaker.py:
--------------------------------------------------------------------------------
import math
import scipy.stats
import numpy as np

from sendMessages import printParent

def makeParams(X, y, globalArgs, dev, problemType):

    try:
        # if dense
        numColumns = len(X[0])
    except:
        # if sparse
        numColumns = X.shape[1]

    sqrtNum = int(math.sqrt(numColumns))
    # GridSearchCV parameters:

    # max_features_to_try = [sqrtNum + x for x in (-2,0,2)]
    # max_features_to_try.append('log2')
    # max_features_to_try.append(None)

    # parameters_to_try = {
    #     'max_features': max_features_to_try,
    #     'min_samples_leaf':[1,2,5,25,50,100,150]
    # }


    maxFeaturesList = np.random.lognormal(sqrtNum, 2, 10)
    # if using lognormal, check out this link:
    # http://stackoverflow.com/questions/12937824/lognormal-random-numbers-centered-around-a-high-value
    # 'max_features': scipy.stats.lognorm([sqrtNum/5], int(sqrtNum)),

    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'max_features': scipy.stats.randint(1,numColumns),
        'min_samples_leaf': scipy.stats.randint(1,200),
        'min_samples_split': scipy.stats.randint(2,20),
        'bootstrap': [True,False]
    }

    if dev:
        parameters_to_try.pop('min_samples_leaf', None)
        parameters_to_try.pop('max_features', None)
        parameters_to_try['max_features'] = [sqrtNum, 'log2']

    return parameters_to_try

'''
determine which parameters we want to mess with
https://www.kaggle.com/forums/f/15/kaggle-forum/t/4092/how-to-tune-rf-parameters-in-practice
A. M-Try (number of features it tries at each decision point in a tree). Starts at square root of features available, but tweak it up and down by a few (probably no more than 3 in each direction; it seems even 1 or 2 is enough)
B. Number of folds for cross-validation: 10 is what most people use, but more gives you better accuracy (likely at the cost of compute time). again, returns are pretty rapidly diminishing.
C. platt scaling of the results to increase overall accuracy at the cost of outliers (which sounds perfect for an ensemble)
D. preprocessing the data might help- FUTURE
E. Principal Component Analysis to decrease dependence between features
F. Number of trees
G. Possibly ensemble different random forests together. this is where the creative ensembling comes into play!
H. Splitting criteria
I. AdaBoost
J. Can bump up nodesize as much as possible to decrease training time (split)
    consider doing this first, finding what node size we finally start decreasing accuracy on, then use that node size for the rest of the testing we do, then possibly bumping it down a bit again at the end.
    https://www.kaggle.com/c/the-analytics-edge-mit-15-071x/forums/t/7890/node-size-in-random-forest
K. min_samples_leaf- smaller leaf makes you more prone to capturing noise from the training data. Try for at least 50??
    http://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
L. random_state: adds reliability. Would be a good one to split on if ensembling different RFs together.
M. oob_score: something about intelligent cross-validation.
N. allusions to regularization, or what I think they mean- feature selection.

'''

--------------------------------------------------------------------------------
/advancedAPI.md:
--------------------------------------------------------------------------------
# Advanced Options:

If you've done ML before, `machineJS` is incredibly useful in that it already puts in place most of the structure for all the parts of the process you have to repeat each time.

As much as possible, we've tried to avoid hardcoding in values, instead allowing the user to pass in values, or setting default values if the user doesn't pass in any arguments.

To get the best idea of all the options available to you, please check out `processArgs.js`, where we set many of the [default values](https://github.com/ClimbsRocks/machineJS/blob/master/processArgs.js) that you'd want to modify. In the meantime, here are some of the more widely used options.

- `--alreadyFormatted`: A boolean value 'true' or 'false', noting if your data has already been formatted. Useful if you're just tweaking parameters and don't want to repeat the oftentimes time-expensive data formatting process again. If you pass in this flag, make sure your files are included in the `pySetup/testingFileNames.js` json list. I've included a couple of examples. You can get the fileNames from `pySetup/utils.js`, inside of the formatData function. Just copy-paste the fileNames obj data-formatter gives to the callback into the `testingFileNames.js` file, giving it a property of whatever the 'outputFileName' property is. You should be able to pick up the pattern pretty easily :)
- `--join`: a path to a data file that will be joined in with your training and testing data, in the same way you'd join SQL tables.
- `--predict`: see above (Format of Prediction File).
- `--dev`: This flag indicates that you are doing engineering work on machineJS itself. It does things like:
    a) set the number of rounds to a third of what it normally is
    b) assume we already have data formatted
    c) if no data is passed in, automatically use the kaggleGiveCredit.csv dataset
- `--devKaggle`: Does all the same things as `--dev`, but also runs `--predict` on the default dataset kaggleGiveCreditTest.csv
- `--devEnsemble`: Assumes that we already have predictions made for us by the rest of the module and present in predictions/*.csv. Allows you to focus on assembling your ensemble without having to retrain the models each time :)
- `--dfOutputFolder`: if, for some reason, you want the results of `data-formatter` written to a different directory. We use this for the test suite, but it probably isn't useful for much other than that.
- `--ensemblerOutputFolder`: much like the `dfOutputFolder` option above, you can choose to overwrite the default location for the output results. Used in our test suite, but probably not useful for many other cases.
- `--bestClassifiersFolder`: much like the `dfOutputFolder` option above, you can choose to overwrite the default location for the bestClassifier. Used in our test suite, but probably not useful for many other cases.
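As an example, several of these flags compose naturally on the command line; the same pattern appears throughout the `package.json` scripts (the file paths here are placeholders):

    node machineJS.js data/train.csv --predict data/test.csv --join data/store.csv --alreadyFormatted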
It does things like:
13 | a) set the number of rounds to a third of what it normally is
14 | b) assume we already have data formatted
15 | c) if no data is passed in, automatically use the kaggleGiveCredit.csv dataset
16 | - `--devKaggle`: Does all the same things as `--dev`, but also runs `--predict` on the default dataset kaggleGiveCreditTest.csv
17 | - `--devEnsemble`: Assumes that we already have predictions made for us by the rest of the module and present in predictions/*.csv. Allows you to focus on assembling your ensemble without having to retrain the models each time :)
18 | - `--dfOutputFolder`: if, for some reason, you want the results of `data-formatter` written to a different directory. We use this for the test suite, but it probably isn't useful for much other than that.
19 | - `--ensemblerOutputFolder`: much like the `dfOutputFolder` option above, you can choose to override the default location for the output results. Used in our test suite, but probably not useful for many other cases.
20 | - `--bestClassifiersFolder`: much like the `dfOutputFolder` option above, you can choose to override the default location for the bestClassifier. Used in our test suite, but probably not useful for many other cases.
21 | 
22 | ### Validation Splits
23 | The `ensembler` module, which uses machine learning to aggregate together all the results of each trained algorithm, will always benefit from more information, and thus, more trained algorithms.
24 | 
25 | To support this, we use a consistent validation data split for a given test.csv dataset. This means you can change your training.csv data (new feature engineering, new ways of normalizing the data, etc.), but still use the predictions from previous training data sets. The only stipulation is that the rows must be in the same order. What you put into each row is entirely up to you!
26 | 
27 | If you ever want to ask machineJS to create a new validation split for you, simply delete the `*validationData.npz` files from the data-formatterResults directory.
28 | 
29 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "machinejs",
3 |   "version": "0.9.5",
4 |   "description": "Automated machine learning structure. Ensembles random forests, XGBoost, neural networks, AdaBoost, KNN, SVM together, handling data formatting, ensembling, and running predictions for you.
Feel free to tweak the settings if you want a lot of control, or just start it up and let it run if you're looking for a push-button MVP solution.", 5 | "main": "machineJS.js", 6 | "scripts": { 7 | "test": "npm run test:regression", 8 | "test:regression": "mocha test/regression/test.js", 9 | "test:categorical": "mocha test/categorical/test.js", 10 | "dev": "node machineJS.js data/rossman/short/rossShortTrainDev.csv --predict data/rossman/short/test.csv --join data/rossman/short/store.csv --alreadyFormatted --dev", 11 | "devEnsemble": "node machineJS.js data/rossman/rossShortTrainDev.csv --predict data/rossman/test.csv --join data/rossman/store.csv --alreadyFormatted --devEnsemble", 12 | "train:rossman": "node machineJS.js data/rossman/tran_filled_gap.csv --predict data/rossman/test.csv --join data/rossman/store.csv --alreadyFormatted", 13 | "train:rossShort": "node machineJS.js data/rossman/short/rossShortTrainDev.csv --predict data/rossman/short/test.csv --join store.csv", 14 | "train:numeraiDev": "node machineJS.js data/numerai/numerai_training_data.csv --predict data/numerai/numerai_test_data.csv --binaryOutput --alreadyFormatted", 15 | "train:numerai": "node machineJS.js data/numerai/numerai_training_data_tournament.csv --predict data/numerai/numerai_tournament_data.csv --alreadyFormatted", 16 | "ensemble:rossShort": "node machineJS.js data/rossman/short/rossShortTrainDev.csv --predict data/rossman/short/test.csv --join data/rossman/store.csv --alreadyFormatted --devEnsemble", 17 | "ensemble:numerai": "node machineJS.js data/numerai/numerai_training_data_tournament.csv --predict data/numerai/numerai_tournament_data.csv --alreadyFormatted --devEnsemble", 18 | "train:giveCredit": "node machineJS.js data/giveCredit/train.csv --predict data/giveCredit/test.csv", 19 | "train:homesite": "node machineJS.js data/homesite/train.csv --predict data/homesite/test.csv", 20 | "train:homesiteShort": "node machineJS.js data/homesite/shortTrain.csv --predict data/homesite/shortTest.csv", 21 | "train:telstra": "node machineJS.js data/telstra/train.csv --predict data/telstra/test.csv" 22 | }, 23 | "repository": { 24 | "type": "git", 25 | "url": "http://github.com/ClimbsRocks/machinejs.git" 26 | }, 27 | "keywords": [ 28 | "neuralNet", 29 | "neural network", 30 | "machine learning", 31 | "ml", 32 | "algorithms", 33 | "random forest", 34 | "svm", 35 | "naive bayes", 36 | "bagging", 37 | "optimization", 38 | "data science", 39 | "brainjs", 40 | "date night", 41 | "scikit-learn", 42 | "sklearn", 43 | "ensemble", 44 | "data formatting", 45 | "javascript", 46 | "js", 47 | "XGBoost", 48 | "scikit-neuralnetwork", 49 | "KNN", 50 | "K nearest neighbors", 51 | "GridSearch", 52 | "GridSearchCV", 53 | "grid search", 54 | "python", 55 | "RandomizedSearchCV", 56 | "preprocessing", 57 | "data-formatter", 58 | "SVM", 59 | "kaggle", 60 | "kaggle competition" 61 | ], 62 | "author": "Preston Parry", 63 | "license": "MIT", 64 | "bin": { 65 | "machineJS": "machineJS.js" 66 | }, 67 | "bugs": { 68 | "url": "https://github.com/ClimbsRocks/machineJS/issues" 69 | }, 70 | "homepage": "https://github.com/ClimbsRocks/machineJS", 71 | "dependencies": { 72 | "babyparse": "^0.4.3", 73 | "data-formatter": "latest", 74 | "ensembler": "latest", 75 | "fast-csv": "^0.6.0", 76 | "longjohn": "^0.2.9", 77 | "minimist": "^1.1.2", 78 | "mkdirp": "^0.5.1", 79 | "python-shell": "^0.2.0" 80 | }, 81 | "devDependencies": { 82 | "chai": "^3.4.1", 83 | "csv": "^0.4.6", 84 | "data-for-tests": "0.0.3", 85 | "mocha": "^2.3.3", 86 | "rimraf": "^2.4.3" 87 | } 
88 | }
89 | 
--------------------------------------------------------------------------------
/pySetup/classifierList.js:
--------------------------------------------------------------------------------
1 | var argv = global.argv;
2 | 
3 | /*
4 | classifier summary descriptions
5 | 'clRfEntropy', randomForest using the entropy criterion
6 | 'clRfGini', randomForest using the Gini criterion
7 | 'clSVCFirst', first SVC. SVC models train in quadratic time, and should only be used on datasets with fewer than a few tens of thousands of rows
8 | 'clSVCShrinking' sets the shrinking parameter equal to true. SVC models train in quadratic time, and should only be used on datasets with fewer than a few tens of thousands of rows
9 | 'clnnSknn' scikit-neuralnetwork's wrapper around pyLearn2's neural network. this is designed to be compatible with scikit-learn. It had very active development through mid 2015, but does not appear to be supported since then.
10 | 'clKnn': k-nearest-neighbors. a relatively different way of approaching the problem
11 | 'clLogisticRegression': standard logistic regression. right now though it is trying to cast from float(64) to S(32), and choking on that, so it is commented out for future development at some later point in time.
12 | 'clAdaBoost': trains multiple classifiers, where each additional classifier focuses on the difficult test cases for the previous one.
13 | */
14 | 
15 | module.exports = function(problemType, dataLength) {
16 |   // these algorithms work for all problemTypes and dataLengths we have encountered so far
17 |   var universalAlgorithms = {
18 |     clRfGini: 'clRfGini',
19 |     clXGBoost: 'clXGBoost',
20 |     clRfBootstrapTrue: 'clRfBootstrapTrue',
21 |     clAdaBoost: 'clAdaBoost',
22 |     clExtraTrees: 'clExtraTrees'
23 |   };
24 | 
25 |   // these algorithms only work on classification problems, due to being instantiated with classification-specific parameters
26 |   var classifierOnlyAlgorithms = {
27 |     clLogisticRegression: 'clLogisticRegression',
28 |     clMultinomialNB: 'clMultinomialNB',
29 |     clRfEntropy: 'clRfEntropy',
30 |     clPerceptron: 'clPerceptron',
31 |     clnnSklearnMLP: 'clnnSklearnMLP',
32 |     clSGDClassifier: 'clSGDClassifier'
33 |   }
34 | 
35 |   var regressionOnlyAlgorithms = {
36 |     clAdaLossLinear: 'clAdaLossLinear',
37 |     clAdaLossSquare: 'clAdaLossSquare',
38 |     clAdaLossExponential: 'clAdaLossExponential'
39 |   }
40 | 
41 |   // these algorithms have a time complexity that is prohibitive for long data sets
42 |   var delForLongDatasets = {
43 |     clSVCFirst: 'clSVCFirst',
44 |     clSVCShrinking: 'clSVCShrinking'
45 |   };
46 | 
47 |   // these algorithms just aren't working right now for one reason or another
48 |   var brokenRegressionAlgorithms = {
49 |     clKnn: 'clKnn'
50 |   };
51 | 
52 |   // the clnn algos may not be broken, but we're probably going to deprecate them pretty shortly since sklearn launched their own MLP
53 |   var brokenClassifierAlgorithms = {
54 |     clKnn: 'clKnn'
55 |   };
56 | 
57 |   // these are algorithms we are in the process of implementing now or shortly
58 |   var notImplementedYetAlgorithms = {
59 |     clnnNoLearn: 'clnnNoLearn',
60 |     clLinearRegression: 'clLinearRegression',
61 |     clLinearSVC: 'clLinearSVC'
62 |   }
63 | 
64 |   // this entire next section is dedicated to extending the universalAlgorithms object, which we will eventually return
65 | 
66 |   // we use the 'all' flag inside processArgs to set initial placeholder values for all possible classifiers we may end up training
67 |   // then, once data-formatter has run, we will know the problemType and only return those classifiers
68 | 
if( problemType === 'category' || problemType === 'multi-category' || problemType === 'all') { 69 | for(var key in classifierOnlyAlgorithms) { 70 | universalAlgorithms[key] = classifierOnlyAlgorithms[key]; 71 | } 72 | } 73 | 74 | if( problemType === 'multi-category' ) { 75 | delete universalAlgorithms['clnnSklearnMLP']; 76 | } 77 | 78 | if(problemType === 'regression' || problemType === 'all') { 79 | for(var key in regressionOnlyAlgorithms) { 80 | universalAlgorithms[key] = regressionOnlyAlgorithms[key]; 81 | } 82 | } 83 | 84 | if( dataLength === 'longDataSet' ) { 85 | for( var key in delForLongDatasets ) { 86 | delete universalAlgorithms[key]; 87 | } 88 | } 89 | 90 | if( argv.devEnsemble ) { 91 | delete universalAlgorithms['clXGBoost'] 92 | delete universalAlgorithms['clRfBootstrapTrue'] 93 | } 94 | 95 | // scikit-learn's MLP is only available in v^0.18.0 96 | // if the user has not installed that version, we want to make sure to remove that from our classifierList 97 | // try{ 98 | 99 | // } 100 | 101 | return universalAlgorithms; 102 | 103 | } 104 | 105 | // module.exports = { 106 | // dev: { 107 | // clRfGini: 'clRfGini', 108 | // clXGBoost: 'clXGBoost', 109 | // clRfBootstrapTrue: 'clRfBootstrapTrue' 110 | // }, 111 | // shortDataSet: { 112 | // clXGBoost: 'clXGBoost', 113 | // clRfEntropy: 'clRfEntropy', 114 | // clAdaBoost: 'clAdaBoost', 115 | // clRfGini: 'clRfGini', 116 | // // clLogisticRegression: 'clLogisticRegression' 117 | // }, 118 | // longDataSet: { 119 | // // clSVCFirst: 'clSVCFirst', 120 | // // clSVCShrinking: 'clSVCShrinking', 121 | // // clnnNoLearn: 'clnnNoLearn', 122 | // // clnnSknn3Layer: 'clnnSknn3Layer', 123 | // // clnnSknn: 'clnnSknn', 124 | // // clKnn: 'clKnn', 125 | // // clRfEntropy: 'clRfEntropy', 126 | // clLogisticRegression: 'clLogisticRegression', 127 | // clAdaBoost: 'clAdaBoost', 128 | // // clAdaLossLinear: 'clAdaLossLinear', 129 | // // clAdaLossSquare: 'clAdaLossSquare', 130 | // // clAdaLossExponential: 'clAdaLossExponential', 131 | // clRfGini: 'clRfGini', 132 | // clXGBoost: 'clXGBoost', 133 | // clRfBootstrapTrue: 'clRfBootstrapTrue' 134 | // } 135 | // }; 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # machineJS 2 | > a fully-featured default process for machine learning- all the parts are here and have functional default values in place. Modify to your heart's delight so you can focus on the important parts for your dataset, or run it all the way through with the default values to have fully automated machine learning! 3 | 4 | # [`auto_ml`](https://github.com/ClimbsRocks/auto_ml) - machineJS, but better! 5 | I just built out [v2 of this project](https://github.com/ClimbsRocks/auto_ml) that now gives you analytics info from your models, and is production-ready. machineJS is an amazing research project that clearly proved there's a hunger for automated machine learning. 6 | 7 | [auto_ml](https://github.com/ClimbsRocks/auto_ml) tackles this exact same goal, but with more features, cleaner code, and the ability to be copy/pasted into production. 8 | 9 | Check it out! 10 | https://github.com/ClimbsRocks/auto_ml 11 | 12 | ## What is machineJS? 13 | `machineJS` provides a fully automated framework for applying machine learning to a dataset. 
14 | 
15 | All you have to do is give it a .csv file, with some basic information about each column in the first row, and it will go off and do all the machine learning for you!
16 | 
17 | If you've already done this kind of thing before, it's useful as an outline, putting in place a working structure for you to make modifications within, rather than having to build from scratch again every time.
18 | 
19 | machineJS will tell you:
20 | 
21 | - Which algorithms are going to be most effective for this problem
22 | - Which features are most useful
23 | - Whether this problem is solvable by machine learning at all (useful if you're not sure you've collected enough data yet)
24 | - How effective machine learning can be with this problem, to compare against other potential solutions (like just taking a grouped average)
25 | 
26 | If you haven't done much (or any) machine learning before- it does some fairly advanced stuff for you!
27 | 
28 | ## Installation:
29 | 
30 | ### As a standalone directory (recommended)
31 | If you want to install this in its own standalone repo, and work on the source code directly, then from the command line, type the following:
32 | 
33 | 1. `git clone https://github.com/ClimbsRocks/machineJS.git`
34 | 2. `cd machineJS`
35 | 3. `npm install`
36 | 4. `pip install -r requirements.txt`
37 | 5. `git clone https://github.com/scikit-learn/scikit-learn.git`
38 | 6. `cd scikit-learn`
39 | 7. `python setup.py build`
40 | 8. `sudo python setup.py install`
41 | 
42 | 
43 | 
50 | 
53 | ### From the command line
54 | `node machineJS.js path/to/trainData.csv --predict path/to/testData.csv`
55 | 
56 | 
65 | ## Format of Data Files:
66 | We use the `data-formatter` module to automatically format your data, and even perform some basic feature engineering on it.
67 | Please refer to `data-formatter`'s [docs](https://github.com/ClimbsRocks/data-formatter) for information on how to label each column to be ready for `machineJS`.
68 | 
69 | ## How to customize/dive in deeper:
70 | machineJS is designed to be super easy to use without diving into any of the internals. Be a conjurer- just give it data and let it run!
71 | That said, it's super powerful once you start customizing it.
72 | 
73 | It's designed to be relatively easy to modify, and well-documented. The [obvious place to start](https://github.com/ClimbsRocks/machineJS/blob/master/processArgs.js) is inside `processArgs.js`. Here we set nearly all the parameters that are used throughout the project.
74 | 
75 | The other obvious area many people will be interested in is adding in new models, and different hyperparameter search spaces. This can be found in the `pySetup` folder. The [exact steps](https://github.com/ClimbsRocks/machineJS/blob/master/pySetup/stepsToAddNewClassifier.txt) are listed in `stepsToAddNewClassifier.txt`.
76 | 
77 | ## What types of problems does this library work on?
78 | `machineJS` works on both regression and categorical problems, as long as there is a single output column in the training data. This includes multi-category (frequently called multi-class) problems, where the category you are predicting is one of many possible categories.
79 | There are no immediate plans to support multiple output columns in the training data. If you have three output columns you're interested in predicting, and they cannot be combined into a single column in the training data, you could run `machineJS` once for each of those three columns.
80 | 
81 | This library is well-tested on Macs.
I've designed it to work on PCs as well, but I haven't tested that at all yet. If you're a PC user, I'd love some issues or Pull Requests to make this work for PCs!
82 | 
83 | 
84 | #### Note: This library is designed to run across all but one core on the host machine. What this means for you:
85 | 1. Please plug in.
86 | 2. Close all programs and restart right before invoking (this will clear out as much RAM as possible).
87 | 3. Expect some noise from your fan- you're finally putting your computer to use!
88 | 4. Don't expect to be able to do anything intense while this is running. Internet browsing or code editing is fine, but watching a movie may get challenging.
89 | 5. Please don't run any other Python scripts while this is running.
90 | 
91 | Thanks for inviting us along on your machine learning journey!
92 | 
93 | 
94 | 
95 | 
--------------------------------------------------------------------------------
/test/regression/makePredictions.js:
--------------------------------------------------------------------------------
1 | var expect = require('chai').expect;
2 | var mocha = require('mocha');
3 | var fs = require('fs');
4 | var path = require('path');
5 | var rTest = global.rTest;
6 | var csv = require('csv');
7 | 
8 | 
9 | module.exports = function() {
10 | 
11 |   describe('the predictions for each classifier', function() {
12 | 
13 |     // If you have added a new classifier, and it works for regressions, add it here!
14 |     // ******************************************************************************
15 |     var expectedMinimumTrainingScores = {
16 |       // clRfGini: 0.955,
17 |       // clXGBoost: 0.87,
18 |       clAdaBoost: 0.535
19 |     };
20 | 
21 |     var expectedMinimumValidationScores = {
22 |       clRfGini: 0.845,
23 |       clXGBoost: 0.74,
24 |       clAdaBoost: 0.525
25 |     };
26 |     // ******************************************************************************
27 | 
28 | 
29 |     // run the tests for each classifier we expect to have trained
30 |     for( var clName in expectedMinimumTrainingScores ) {
31 |       (function testSingleAlgo(clName) {
32 | 
33 |         describe('predictions for ' + clName, function() {
34 | 
35 |           var validationFileName;
36 |           var predictionFileName;
37 |           var validationData;
38 |           var predictionsData;
39 | 
40 |           before(function(done) {
41 |             var validationFiles = fs.readdirSync(path.join(rTest.rTestPredictionsLocation, 'validation'));
42 |             var predictionsFiles = fs.readdirSync(rTest.rTestPredictionsLocation);
43 | 
44 | 
45 |             for(var i = 0; i < validationFiles.length; i++) {
46 |               if( validationFiles[i].indexOf(clName) !== -1 ) {
47 |                 validationFileName = validationFiles[i];
48 |               }
49 |             }
50 | 
51 |             for(var i = 0; i < predictionsFiles.length; i++) {
52 |               if( predictionsFiles[i].indexOf(clName) !== -1 ) {
53 |                 predictionFileName = predictionsFiles[i];
54 |               }
55 |             }
56 | 
57 |             // read in both our predictions data and our validation data
58 |             fs.readFile(path.join(rTest.rTestPredictionsLocation, 'validation', validationFileName), function(err, data) {
59 |               if(err) {
60 |                 console.error(err);
61 |                 done();
62 |               }
63 |               csv.parse(data, function(err, output) {
64 |                 if(err) {
65 |                   console.error(err);
66 |                 }
67 |                 validationData = output;
68 | 
69 | 
70 |                 fs.readFile(path.join(rTest.rTestPredictionsLocation, predictionFileName), function(err, data) {
71 |                   if(err) {
72 |                     console.error(err);
73 |                     done();
74 |                   }
75 |                   csv.parse(data, function(err, output) {
76 |                     if(err) {
77 |                       console.error(err);
78 |                     }
79 |                     predictionsData = output;
80 |                     done();
81 |                   });
82 |                 });
83 | 
84 | 
85 |               });
86 |             });
87 | 
88 |           });
89 | 
90 |           var errorRow;
91 | 
92 | 
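          // to orient the assertions below, the validation csv read in above looks roughly
          // like this (values are illustrative, not real output):
          //   0.85,0.95     <- first row: validation error, then training error
          //   Id,Sales      <- second row: the pretty column names for this dataset
          //   1,4512.7      <- remaining rows: one id and one prediction each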
it('should have validation error and training error in the first row of the validationData', function() { 93 | errorRow = validationData.shift(); 94 | // console.log(errorRow[0]); 95 | // console.log(errorRow[1]); 96 | expect(parseFloat(errorRow[0], 10)).to.be.a('number'); 97 | expect(parseFloat(errorRow[1], 10)).to.be.a('number'); 98 | }); 99 | 100 | it('should have done at least as well as it has in the past', function() { 101 | console.log('\n'); 102 | console.log('this classifier\'s expected Validation Error:', expectedMinimumValidationScores[clName], 'this classifier\'s observed Validation Error:', Math.round(errorRow[0] * 1000) / 1000); 103 | console.log('this classifier\'s expected Training Error:', expectedMinimumTrainingScores[clName], 'this classifier\'s observed Training Error:', Math.round(errorRow[1] * 1000) / 1000); 104 | console.log('\n'); 105 | expect(errorRow[0]).to.be.above(expectedMinimumValidationScores[clName]); 106 | expect(errorRow[1]).to.be.above(expectedMinimumTrainingScores[clName]); 107 | }); 108 | 109 | it('should have the pretty names for this dataset in the second row of the validationData', function() { 110 | var headerRow = validationData.shift(); 111 | expect(headerRow[0].toLowerCase()).to.equal('id'); 112 | expect(headerRow[1].toLowerCase()).to.equal('sales'); 113 | }); 114 | 115 | it('should make predictions against the validation data set', function() { 116 | expect(validationData.length).to.be.within(51000 - 300, 51000 + 300); 117 | 118 | var shortestRowLength = Infinity; 119 | for(var i = 0; i < validationData.length; i++) { 120 | if( validationData[i].length < shortestRowLength ) { 121 | shortestRowLength = validationData[i].length; 122 | } 123 | } 124 | 125 | expect(shortestRowLength).to.equal(2); 126 | 127 | }); 128 | 129 | it('should have the pretty names for this dataset in the first row of the predictionsData', function() { 130 | var headerRow = predictionsData.shift(); 131 | expect(headerRow[0].toLowerCase()).to.equal('id'); 132 | expect(headerRow[1].toLowerCase()).to.equal('sales'); 133 | }); 134 | 135 | it('should make predictions against the test data set', function() { 136 | expect(predictionsData.length).to.equal(41088) 137 | 138 | var shortestRowLength = Infinity; 139 | for(var i = 0; i < predictionsData.length; i++) { 140 | if( predictionsData[i].length < shortestRowLength ) { 141 | shortestRowLength = predictionsData[i].length; 142 | } 143 | } 144 | 145 | expect(shortestRowLength).to.equal(2); 146 | 147 | }); 148 | 149 | 150 | }); 151 | 152 | after(function() { 153 | predictionsData = null; 154 | validationData = null; 155 | }) 156 | 157 | })(clName); 158 | 159 | } 160 | 161 | }); 162 | 163 | 164 | } 165 | 166 | -------------------------------------------------------------------------------- /pySetup/utils.js: -------------------------------------------------------------------------------- 1 | var py = global.pythonNamespace; 2 | var argv = global.argv; 3 | var path = require('path'); 4 | var utilsPyShell = require('./utilsPyShell.js'); 5 | var df = require('data-formatter'); 6 | 7 | module.exports = { 8 | dictVectMapping: { 9 | // this will be given to us by DictVectorizer, a python module that takes dictionaries and turns them into arrays. Obviously since dictionaries are not ordered, we need to keep track of which fields end up in which indices. 10 | }, 11 | 12 | fileNames: { 13 | // this will be given to us by dataFormatting.py once it has created the files with the formatted data. 
14 |     // ID
15 |     // X_train
16 |     // y_train
17 |     // X_test
18 |     // y_test
19 |     // X_train_nn- used by neural networks. we will use the same ID and y_train files as the rest of the dataset. It is only the input features that have to be normalized, not the output features.
20 |     // X_test_nn- used by neural networks. we will use the same ID and y_train files as the rest of the dataset. It is only the input features that have to be normalized, not the output features.
21 |     // trainingDataLength- technically not a file name, but fits much more logically here than reading in that file again in node.js
22 |     // problemType: 'regression' or 'category' or 'multi-category'
23 |   },
24 | 
25 |   splitData: function(callback) {
26 |     var dfArgs = {
27 |       fileNames: module.exports.fileNames,
28 |       searchPercent: argv.searchPercent,
29 |       validationPercent: argv.validationPercent,
30 |     };
31 | 
32 |     // generatePythonOptions assumes the first input is the name of a data file that training.py or makePredictions.py will be run on. Pass in ignoreMe.csv for now until we refactor that.
33 |     var pythonOptions = utilsPyShell.generatePythonOptions('ignoreMe.csv', [JSON.stringify(argv), JSON.stringify(module.exports.fileNames)
34 |     ] );
35 | 
36 |     if( argv.splitDataTest ) {
37 |       // if this is being run from within our test suite, pass in a blank callback to halt execution after splitDatasets
38 |       callback = function() {};
39 |     }
40 | 
41 |     var pyShell = utilsPyShell.startPythonShell('splitDatasets.py', callback, pythonOptions);
42 |     pyShell.on('message', function(message) {
43 |       if(message.type === 'splitFileNames') {
44 |         for( var key in message.text) {
45 |           module.exports.fileNames[key] = message.text[key];
46 |         }
47 |         global.argv.fileNames = module.exports.fileNames;
48 |       }
49 |     });
50 | 
51 |   },
52 | 
53 |   formatData: function( callback ) {
54 |     // the callback function will be invoked with an object that holds the fileNames needed by module.exports.fileNames
55 | 
56 |     var dataFormatterArgs = {
57 |       trainingData: argv.dataFile,
58 |       testingData: argv.predict,
59 |       trainingPrettyName: argv.outputFileName,
60 |       testingPrettyName: argv.testOutputFileName,
61 |       joinFileName: argv.join,
62 |       on: argv.on,
63 |       allFeatureCombinations: argv.allFeatureCombinations,
64 |       keepAllFeatures: argv.keepAllFeatures
65 |     };
66 | 
67 |     if( argv.dfOutputFolder ) {
68 |       dataFormatterArgs.outputFolder = argv.dfOutputFolder;
69 |     }
70 | 
71 |     df(dataFormatterArgs, function(fileNames) {
72 |       console.log('Here are the fileNames from data-formatter.
If you want to skip the data-formatter part next time you want to play with this dataset, copy and paste this object into machineJS/pySetup/testingFileNames.js, following the instructions included in that file.'); 73 | console.log(fileNames); 74 | // df takes in a callback function that will be invoked with the fileNames object, holding the names and locations of the files it saved the data into 75 | module.exports.fileNames = fileNames; 76 | callback(); 77 | }); 78 | 79 | }, 80 | 81 | kickOffTraining: function( callback, classifierName) { 82 | var pythonOptions = utilsPyShell.generatePythonOptions(argv.dataFile, [JSON.stringify(argv), JSON.stringify(module.exports.fileNames), classifierName, module.exports.fileNames.problemType, global.bestSearchScore]); 83 | 84 | var emitFinishedTrainingCallback = function() { 85 | global.finishedAlgos++; 86 | process.emit('algoFinishedTraining'); 87 | callback(); 88 | }; 89 | 90 | 91 | var pyShell = utilsPyShell.startPythonShell('training.py', emitFinishedTrainingCallback, pythonOptions); 92 | pyShell.on('message', function(message) { 93 | 94 | // once we get a message back with the trained results, 95 | if(message.type === 'trainingResults') { 96 | var classifierName = message.text.algoName; 97 | 98 | // save it into our allResults array 99 | global.allTrainingResults.push(message.text); 100 | global.trainedAlgoCounts[classifierName]++; 101 | 102 | // see if this is the best searchScore we've encountered so far 103 | if( message.text.searchScore > global.bestSearchScore ) { 104 | global.bestSearchScore = message.text.searchScore; 105 | } 106 | 107 | // see if this is the best search result for that algorithm so far 108 | var prevBestResult = global.trainingResultsSummary[classifierName]; 109 | if( message.text.searchScore > prevBestResult || prevBestResult === undefined ) { 110 | global.trainingResultsSummary[classifierName] = message.text.searchScore; 111 | } 112 | // global.trainedAlgos[classifierName] = message.text; 113 | } 114 | }); 115 | }, 116 | 117 | makePredictions: function( callback, classifierName) { 118 | console.log('kicking off the process of making predictions on the predicting data set for:', classifierName); 119 | 120 | var startPredictionsScript = function() { 121 | if( global.copyValidationData && classifierName.slice(0,4) !== 'clnn' ) { 122 | var copyValidationData = true; 123 | global.copyValidationData = false; 124 | } else { 125 | var copyValidationData = false; 126 | } 127 | 128 | var classifierTrainingObj = global.allTrainingResults[global.allTrainingResults.length -1]; 129 | var classifierTrainingScore = classifierTrainingObj.longTrainScore; 130 | var classifierSearchScore = classifierTrainingObj.searchScore 131 | 132 | var pythonOptions = utilsPyShell.generatePythonOptions(argv.predict, [module.exports.dictVectMapping, JSON.stringify(argv), JSON.stringify(module.exports.fileNames), classifierName, module.exports.fileNames.problemType, classifierTrainingScore, copyValidationData, classifierSearchScore ]); 133 | 134 | // if this hyperparameter search did not yield an algorithm that was close enough to our best that it was worth investing in a longTraining, we did not train it and gave it a score of 0. 
135 |       // therefore, we only want to make predictions using this classifier if we actually trained an algorithm successfully (classifierTrainingScore > 0)
136 |       // if( classifierTrainingScore > 0 ) {
137 |         utilsPyShell.startPythonShell('makePredictions.py', callback, pythonOptions);
138 |       // } else {
139 |       //   // ensembler needs to know to not listen for predictions results from this algorithm
140 |       //   process.emit('algoSkippedTraining');
141 |       // }
142 | 
143 |     };
144 | 
145 |     startPredictionsScript();
146 | 
147 |   }
148 | 
149 | }
150 | 
--------------------------------------------------------------------------------
/pySetup/controllerPython.js:
--------------------------------------------------------------------------------
1 | var py = global.pythonNamespace = {};
2 | var exec = require('child_process').exec;
3 | var ensembler = require('ensembler');
4 | 
5 | var path = require('path');
6 | var fs = require('fs');
7 | var pySetupLocation = path.dirname(__filename);
8 | py.pySetupLocation= pySetupLocation;
9 | py.referencesToChildren= [];
10 | var utils = require('./utils.js');
11 | var classifierOptions = require('./classifierList.js');
12 | 
13 | argv = global.argv;
14 | 
15 | var startOneClassifier = function(classifierList) {
16 | 
17 |   if( classifierList.length > 0 ) {
18 |     // for our last classifier, tell it to run on all cores on the machine
19 |     // this way, when the second-to-last classifier finishes, and those half the machine cores are empty, we can put them to use!
20 |     if( classifierList.length === 1 ) {
21 |       argv.numCPUs = argv.computerTotalCPUs;
22 |     }
23 | 
24 |     var classifierName = classifierList.shift();
25 | 
26 |     var algosBestScore = global.trainingResultsSummary[classifierName];
27 | 
28 |     // if we have trained more than two of this algorithm, and its best score is not within X percent of the best we've found so far, don't bother training another one.
29 |     // during the ensemble round, we are intentionally skipping over any algorithms that did not perform well during the earlier round of training. this should save significant amounts of time and make sure we only have high quality results at the end.
30 |     // if you want to train all classifiers during the ensemble round, simply add in an expression to the boolean phrase below
31 |     if( global.trainedAlgoCounts[classifierName] < 2 || algosBestScore > global.bestSearchScore * argv.continueToTrainThreshold ) {
32 |       // kick off training, and then, once that is done, invoke the callback, which starts the process of making predictions
33 |       utils.kickOffTraining( function() {
34 |         module.exports.makePredictions(classifierName);
35 |       }, classifierName);
36 | 
37 |     } else {
38 |       // since we said at the start to expect a certain number of algorithms to be trained, we must still emit an event to notify ensembler that we are skipping over an algorithm
39 |       process.emit('algoSkippedTraining');
40 |     }
41 | 
42 | 
43 |   }
44 | };
45 | 
46 | 
47 | module.exports = {
48 |   killAll: function() {
49 |     // kill all child processes
50 |     for (var i = 0; i < py.referencesToChildren.length; i++) {
51 |       py.referencesToChildren[i].childProcess.kill();
52 |     }
53 | 
54 |     // calling the .kill() routine on each child frequently does not kill all the child processes of that child process. so if our python shell is running 8 other python scripts to spread the training out around all the cores, those 8 other python scripts are continuing to run after the above.
55 |     // the following command will be executed on the command line and will kill all Python processes.
56 |     // the unfortunate side effect is that any unrelated Python processes running on this machine will also be killed. But since this library takes up all the cores on the machine anyways, the user would likely have a very hard time running other Python scripts simultaneously regardless.
57 |     exec('pkill -9 Python');
58 |   },
59 | 
60 |   startClassifiers: function(classifierList) {
61 |     var classifiersByRound = module.exports.makeClassifierList();
62 | 
63 |     startOneClassifier(classifiersByRound);
64 |     startOneClassifier(classifiersByRound);
65 | 
66 | 
67 |     // whenever one estimator finishes training (or has not performed well enough in training so far to justify training another instance of it), we want to start training another!
68 |     process.on('algoFinishedTraining', function() {
69 |       startOneClassifier(classifiersByRound);
70 |     });
71 | 
72 |     process.on('algoSkippedTraining', function() {
73 |       startOneClassifier(classifiersByRound);
74 |     });
75 | 
76 |   },
77 | 
78 |   makeClassifierList: function() {
79 |     var classifierList = classifierOptions(utils.fileNames.problemType, utils.fileNames.trainingDataLength);
80 | 
81 |     classifierList = Object.keys( classifierList );
82 |     var classifiersByRound = [];
83 | 
84 |     // we are going to get many trained classifiers from this!
85 |     // let's talk through an example:
86 |     // say we want to run 100 iterations of RandomizedSearchCV
87 |     // we could run a single round of rsCV with 100 iterations, and get a single trained classifier out of it at the end
88 |     // or, we could run 10 rounds, with 10 iterations each, and have 10 trained classifiers at the end!
89 |     // ensembler works best when it has more predictions to work with, so this second option is immediately appealing
90 |     // the second option is also appealing in that we will have a bunch of midway results
91 |     // say running 100 iterations takes 100 hours
92 |     // and we end up only having 90 hours
93 |     // if we split this up into multiple rounds, we will now have 8 or 9 algorithms trained by this point, one of which is likely the best one
94 |     // whereas if it were an all-or-nothing game of having to get to all 100, we would have nothing.
95 |     // another thing that's appealing about running multiple rounds is it lets us test more algorithms against the validation data set. it's somewhat difficult to predict how each algorithm is going to generalize, so having a chance to actually test them against the validation data set gives us more options
96 |     // the drawback is that it will take more time (training a "bigger" version of the selected algorithm 10 times is not trivial, nor is running 10 rounds of predictions against the validation and test data sets)
97 | 
98 |     if(argv.devEnsemble) {
99 |       argv.numRounds = 1;
100 |     }
101 | 
102 |     for( var i = 0; i < argv.numRounds; i++) {
103 |       for( var j = 0; j < classifierList.length; j++) {
104 |         classifiersByRound.push(classifierList[j]);
105 |       }
106 |     }
107 | 
108 |     numberOfClassifiers = classifiersByRound.length;
109 | 
110 |     // tell ensembler how many algos to wait for before ensembler takes over
111 |     ensembler.startListeners( numberOfClassifiers, argv.ensemblerArgs);
112 | 
113 |     return classifiersByRound;
114 |   },
115 | 
116 |   startTraining: function() {
117 | 
118 |     if( argv.validationRound ) {
119 |       module.exports.startClassifiers();
120 | 
121 |     } else if( argv.alreadyFormatted ) {
122 |       // if we have already formatted the data, skip over repeating that step.
This allows us to train more classifiers rapidly without repeating the oftentimes lengthy data formatting process. 123 | utils.splitData(function() { 124 | module.exports.startClassifiers(); 125 | }); 126 | } else { 127 | // here is where we invoke data-formatter to handle all our data formatting needs 128 | // for more information, please check out that repo! 129 | // https://github.com/ClimbsRocks/data-formatter 130 | utils.formatData( function() { 131 | utils.splitData(function() { 132 | module.exports.startClassifiers(); 133 | }); 134 | }); 135 | } 136 | 137 | }, 138 | 139 | 140 | makePredictions: function(classifierName) { 141 | 142 | utils.makePredictions( function() { 143 | process.emit('algoFinishedPredicting'); 144 | }, classifierName); 145 | } 146 | 147 | }; 148 | -------------------------------------------------------------------------------- /pySetup/splitDatasets.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import random 4 | from os import path 5 | import ntpath 6 | import cPickle as pickle 7 | 8 | import numpy as np 9 | from scipy.sparse import csr_matrix, csc_matrix 10 | 11 | from sendMessages import printParent 12 | from sendMessages import messageParent 13 | from sendMessages import obviousPrint 14 | 15 | args = json.loads(sys.argv[2]) 16 | fileNames = json.loads(sys.argv[3]) 17 | XFileName = fileNames['X_train'] 18 | XnnFileName = fileNames['X_train_nn'] 19 | ynnFileName = fileNames['y_train_nn'] 20 | idFileName = fileNames['id_train'] 21 | yTrainFileName = fileNames['y_train'] 22 | validationSplitColumnFileName = fileNames['validation_split_column'] 23 | hasCustomValidationSplit = fileNames['hasCustomValidationSplit'] 24 | 25 | outputDirectory = path.dirname(XFileName) 26 | 27 | # what percent of our dataset to not train on, but to set aside for validation and stacking/blending? 28 | validationPercent = args['validationPercent'] 29 | 30 | 31 | # we are not supporting dense matrices at the moment. 32 | def load_sparse_csr(filename): 33 | loader = np.load(filename) 34 | return csr_matrix(( loader['data'], loader['indices'], loader['indptr']), shape=loader['shape']) 35 | 36 | X = load_sparse_csr(XFileName) 37 | 38 | 39 | numRows = X.shape[0] 40 | 41 | includeOrNot = [random.random() for x in range(0,numRows)] 42 | 43 | # we want to save the validation indices with the test data. 
that way we can have multiple different versions of the same training data set scattered throughout a computer, but still use these same validationIndices for all of them
44 | validationIndexFolder = path.dirname(args['predict'])
45 | validationIndexFileName = 'dfValidationIndices' + args['testOutputFileName'] + '.pkl'
46 | validationIndicesFile = path.join( validationIndexFolder, validationIndexFileName )
47 | 
48 | 
49 | writeToFile = True
50 | createNewSplit = False
51 | 
52 | if hasCustomValidationSplit:
53 |     # load the validation split column
54 |     validationSplitColumn = load_sparse_csr(validationSplitColumnFileName)
55 |     # create both training and validation indices
56 |     # validationIndices are rows we will hold out as the validation data set
57 |     # trainingIndices are rows we will include in the training data set
58 |     validationIndices = []
59 |     trainingIndices = []
60 |     for idx, item in enumerate(validationSplitColumn.todense().tolist()[0]):
61 |         if item == 1:
62 |             validationIndices.append(idx)
63 |         else:
64 |             trainingIndices.append(idx)
65 | 
66 | else:
67 |     # try to load in existing validationIndices
68 |     try:
69 |         with open(validationIndicesFile, 'rb') as openFile:
70 |             validationIndices = pickle.load(openFile)
71 | 
72 |         # check to make sure that the validation length is less than the length of our X dataset
73 |         if len(validationIndices) > numRows * ( validationPercent + .02):
74 |             printParent('validationIndices too long')
75 |             # if it isn't, create a new validationIndices for this dataset, but do not write it to file
76 |             # this lets us keep our larger validationIndices split (for the full training data set), while still having something to work with for this smaller dataset we're currently testing on.
77 |             writeToFile = False
78 |             raise IndexError("this dataset is shorter than the one we built the validation split on previously")
79 | 
80 |         # check to make sure that the validation length is within a few percentage points of our validationPercent number (in other words, if X is 10,000 rows long, and the length of the validationIndices is only 1,200, then we know validationIndices was built on a smaller test dataset earlier.)
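        # a worked example of the two length checks in this try block (numbers are illustrative):
        # with numRows = 10,000 and validationPercent = .3, a loaded split is 'too long' above
        # 10,000 * (.3 + .02) = 3,200 rows, and 'too short' below 10,000 * .3 * .98 = 2,940 rows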
81 | elif len(validationIndices) < numRows * validationPercent * .98: 82 | printParent('validationIndices too short') 83 | # If it is not, create a new validationIndices and write that to file 84 | raise IndexError("this dataset is longer than the one we built the validation split on previously") 85 | 86 | # In both cases, fall into the except state below 87 | # but create a variable that lays out whether to write that new validationIndices to file or not in the try block, and then use that in the except block below 88 | 89 | # if we found existing validationIndices that meet the criteria above, we still want to split our incoming dataset on those indices 90 | # this allows us to change our feature engineering on a training dataset, and pass those features through to machineJS 91 | trainingIndices = [] 92 | validationIndicesCopy = validationIndices[:] 93 | # it should already be sorted, but we're being safe here in case of future changes 94 | validationIndicesCopy.sort() 95 | validationIndicesCounter = 0 96 | 97 | # linear comparison of two lists to only put indices into trainingIndices if they are not in validationIndices 98 | for x in range(0,numRows): 99 | if x == validationIndicesCopy[validationIndicesCounter]: 100 | validationIndicesCounter += 1 101 | else: 102 | trainingIndices.append(x) 103 | del validationIndicesCopy 104 | 105 | 106 | # in the case that we were not able to load in validationIndices successfully, we want to write our validationIndices to file for all future runs to use 107 | except: 108 | createNewSplit = True 109 | validationIndices = [] 110 | trainingIndices = [] 111 | for idx, randomNum in enumerate(includeOrNot): 112 | if randomNum < validationPercent: 113 | validationIndices.append(idx) 114 | else: 115 | trainingIndices.append(idx) 116 | 117 | if writeToFile: 118 | with open(validationIndicesFile, 'wb') as writeFile: 119 | # now save that file as a .pkl next to where our test data sits. 120 | pickle.dump(validationIndices, writeFile) 121 | 122 | 123 | # continued callout to the person originally responsible for this function: 124 | # http://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format 125 | def save_sparse_csr(filename,array): 126 | np.savez(filename,data=array.data ,indices=array.indices, indptr=array.indptr, shape=array.shape ) 127 | 128 | 129 | # we want to write the splits of the training data every time 130 | # but only create a new validationIndices in certain circumstances 131 | def splitDataset(data, name, fileCategory): 132 | 133 | # uses slicing, one of the most useful and least-well-known features of scipy sparse matrices 134 | # you pass in a list of row indices you want to keep, and it will create a sliced copy that includes only those rows 135 | # slicing also works on column indices 136 | # callout to the person who first opened my eyes to them: 137 | # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best 138 | 139 | # if this "sparse" matrix only has a single value for each row, we have to treat it as a column matrix, and slice it accordingly 140 | # this is the case for our idColumn, and frequently our y values as well. 
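    # a tiny standalone illustration of that slicing (shapes are made up, not from this dataset):
    #   import numpy as np
    #   from scipy.sparse import csr_matrix
    #   m = csr_matrix(np.arange(12).reshape(4, 3))
    #   m[[0, 2], :]    # row slicing: keeps rows 0 and 2, shape (2, 3)
    #   m[:, [1, 2]]    # column slicing: keeps columns 1 and 2, shape (4, 2)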
141 | if data.shape[0] == 1: 142 | validation = data[:,validationIndices] 143 | trainingData = data[:,trainingIndices] 144 | 145 | else: 146 | validation = data[validationIndices,:] 147 | trainingData = data[trainingIndices,:] 148 | 149 | # ntpath theoretically works really well across systems 150 | name = ntpath.basename(name) 151 | # remove the file extension 152 | name = name[0:-4] 153 | 154 | validationFile = path.join(outputDirectory, name + 'validationData.npz') 155 | trainingDataFile = path.join(outputDirectory, name + 'trainingData.npz') 156 | 157 | save_sparse_csr(trainingDataFile, trainingData) 158 | save_sparse_csr(validationFile, validation) 159 | 160 | # send the file names back to the parent process, where we aggregate and save them 161 | fileNameDict = { 162 | fileCategory + 'trainingData': trainingDataFile, 163 | fileCategory + 'validationData': validationFile 164 | } 165 | messageParent(fileNameDict, 'splitFileNames') 166 | 167 | 168 | # we are going to repeat this process several times: 169 | # idColumn 170 | # X_train 171 | # y_train 172 | # X_train_nn 173 | # they are just slightly different enough that i don't want to loop through them. The code below is super readable 174 | 175 | splitDataset(X, XFileName, 'X_train') 176 | del X 177 | 178 | idColumn = load_sparse_csr(idFileName) 179 | splitDataset(idColumn, idFileName, 'id_train') 180 | del idColumn 181 | 182 | yColumn = load_sparse_csr(yTrainFileName) 183 | splitDataset(yColumn, yTrainFileName, 'y_train') 184 | del yColumn 185 | 186 | Xnn = load_sparse_csr(XnnFileName) 187 | splitDataset(Xnn, XnnFileName, 'X_train_nn') 188 | del Xnn 189 | 190 | ynn = load_sparse_csr(ynnFileName) 191 | splitDataset(ynn, ynnFileName, 'y_train_nn') 192 | del ynn 193 | 194 | -------------------------------------------------------------------------------- /processArgs.js: -------------------------------------------------------------------------------- 1 | var path = require('path'); 2 | var mkdirp = require('mkdirp'); 3 | // we will soon save path.dirname(__filename) into argv.machineJSLocation, but to get all this started by loading our require statements, we'll type it in directly here 4 | var utils = require(path.join(path.dirname(__filename), 'pySetup','utils.js')); 5 | var classifierListOptions = require(path.join(path.dirname(__filename), 'pySetup', 'classifierList.js')); 6 | 7 | module.exports = function() { 8 | if(argv.dev || argv.devKaggle || argv.devEnsemble) { 9 | argv.dev = true; 10 | } else { 11 | argv.dev = false; 12 | } 13 | 14 | 15 | var dataFile = global.argv.dataFile || process.argv[2]; 16 | argv.computerTotalCPUs = require('os').cpus().length; 17 | argv.machineJSLocation = path.dirname(__filename); 18 | 19 | // setting defaults if using the --dev or --devKaggle flags (speeds up development time when doing engineering work on the machineJS library itself) 20 | if( argv.dev ) { 21 | require('longjohn'); 22 | if (dataFile === undefined) { 23 | dataFile = 'rossShortTrainDev.csv'; 24 | } 25 | if ( (argv.devKaggle && !argv.predict) || argv.devEnsemble) { 26 | argv.predict = argv.predict || 'rossmantest.csv'; 27 | } 28 | } 29 | 30 | argv.dataFile = dataFile; 31 | argv.dataFileName = path.basename( argv.dataFile ); 32 | argv.dataFilePretty = argv.dataFileName.slice(0,-4); 33 | argv.binaryOutput = argv.binaryOutput || false; //python doesn't like undefined, so explicitly set this to false if it does not exist 34 | argv.outputFileName = argv.dataFilePretty; 35 | if( argv.outputFileName === 'train' ) { 36 | dataFileFolder = 
path.parse(argv.dataFile).dir.split(path.sep).pop(); 37 | argv.outputFileName = dataFileFolder + argv.dataFilePretty; 38 | } 39 | 40 | // python throws a keyError if you try to look up a key that doesn't exist, so we are explicitly giving it a blank value to ensure the key will exist when we need it later 41 | argv.join = argv.join || ''; 42 | argv.on = argv.on || ''; 43 | argv.allFeatureCombinations = argv.allFeatureCombinations || ''; 44 | argv.keepAllFeatures = argv.keepAllFeatures || ''; 45 | argv.dfOutputFolder = argv.dfOutputFolder || path.join(argv.machineJSLocation,'pySetup','data-formatterResults'); 46 | argv.matrixOutput = argv.matrixOutput || ''; 47 | 48 | 49 | argv.testFileName = path.basename( argv.predict ); 50 | argv.testFilePretty = argv.testFileName.slice(0,-4); 51 | argv.testOutputFileName = argv.testFilePretty; 52 | 53 | if( argv.testOutputFileName === 'test' ) { 54 | dataFileFolder = path.parse(argv.dataFile).dir.split(path.sep).pop(); 55 | argv.testOutputFileName = dataFileFolder + argv.testFilePretty; 56 | } 57 | 58 | /* 59 | in splitDatasets.py, we are going to break our data out into three groups: 60 | 1. The group we run the hyperparameter search over (GridSearchCV or RandomizedSearchCV). 61 | Since the best hyperparameters for a random subset of the data are going to be the same as the entire dataset, 62 | we run the search on only a subset of the data to drastically speed up search time 63 | 2. The training data we will train our (now-optimized) algorithm on. 64 | Now that we have our best hyperparameters, create an algorithm with those parameters, and train it on a larger portion of our overall dataset. 65 | 3. The validation set. This is a holdout set we do not include in the training set. 66 | We use this to test how well our algorithm generalizes to data it hasn't seen yet. 67 | We also use this, later down the road, for ensembler to create stacked/blended ensembles with. 68 | For a given test.csv dataset, we will determine the validation dataset once, and then use that each time. 69 | This means that we can include all the algorithms you've trained on this dataset in our ensembling. 70 | This lets you change how you format the data (normalization, scaling, new feature engineering, etc.), and still use all these algorithms in the final ensemble. 71 | You can easily start over with a new validation set by simply deleting the validation.pkl file saved next to your test.csv file. 72 | */ 73 | if( argv.dev ) { 74 | argv.searchPercent = argv.searchPercent || .1; 75 | argv.validationPercent = argv.validationPercent || .85; 76 | } else { 77 | argv.searchPercent = argv.searchPercent || .3; 78 | argv.validationPercent = argv.validationPercent || .3; 79 | } 80 | 81 | /* 82 | set out how many combinations of parameters we want to try. 83 | numRounds is how many different times we will run RandomizedSearchCV for that algorithm. 84 | so if we have numRounds = 20, we will search for optimal hyperparameters for each algorithm 20 times 85 | numIterationsPerRound is how many different combinations of hyperparameters we will attempt for each of those rounds 86 | so numIterationsPerRound = 10 means we will try 10 different combinations of hyperparameters each round. 87 | for competitions, more numRounds and lower numIterationsPerRound is ideal. In that case, we have more material to feed into ensembler, since we will have more algos trained at the end. 
For production environments, fewer numRounds and much higher numIterationsPerRound means that each of the algos we train will be higher quality. We will probably miss out on accuracy to a tiny degree, but we will need far fewer algos to accomplish this, which will be much more efficient in a production environment. 88 | bumping up these values will increase accuracy at the cost of compute time 89 | */ 90 | 91 | if( argv.dev ) { 92 | argv.numRounds = argv.numRounds || 2; 93 | argv.numIterationsPerRound = argv.numIterationsPerRound || 5; 94 | 95 | } else { 96 | argv.numRounds = argv.numRounds || 3; 97 | argv.numIterationsPerRound = argv.numIterationsPerRound || 8; 98 | 99 | } 100 | 101 | 102 | // keep track of where we will be saving data during all of the intermediate stages 103 | argv.predictionsFolder = argv.predictionsFolder || path.join(argv.machineJSLocation, 'predictions', argv.testOutputFileName); 104 | argv.validationFolder = path.join(argv.predictionsFolder, 'validation'); 105 | argv.bestClassifiersFolder = argv.bestClassifiersFolder || path.join(argv.machineJSLocation, 'pySetup','bestClassifiers',argv.outputFileName); 106 | // create these folders if they do not already exist 107 | mkdirp(argv.predictionsFolder); 108 | mkdirp(argv.validationFolder); 109 | mkdirp(argv.bestClassifiersFolder); 110 | 111 | // allow the user to specify a different location for the output 112 | argv.ensemblerOutputFolder = argv.ensemblerOutputFolder || argv.machineJSLocation; 113 | 114 | /* 115 | the first time we run machineJS, it will just make predictions for a ton of different algos 116 | then ensembler will add all the predictions of these algo to the validation data. 117 | in other words, for each row of data, we will now have the original input data (height = 5'2", gender = female, etc.), as well as the predictions from all the stage 0 predictors (randomForest says .99 probability, MLP says .997 probability, perceptron says .97, etc.). 118 | then ensembler asks machineJS to try to train a new algo that takes these stage 0 predictions into account 119 | */ 120 | // keep track of whether this is the validation round or the original stage 0 round 121 | if( argv.validationRound !== true ) { 122 | argv.validationRound = false; 123 | } 124 | var nextValidationRound = !argv.validationRound; 125 | 126 | // these are the arguments we will pass to ensembler 127 | argv.ensemblerArgs = { 128 | inputFolder: argv.predictionsFolder, 129 | outputFolder: argv.ensemblerOutputFolder, 130 | validationFolder: argv.validationFolder, 131 | fileNameIdentifier: argv.outputFileName, 132 | validationRound: nextValidationRound 133 | }; 134 | 135 | // sometimes we want the probability (.97), sometimes we just want a binary yes or no (1) 136 | if( argv.binaryOutput ) { 137 | argv.kaggleBinaryOutputFolder = path.join(argv.predictionsFolder, 'kaggleBinaryOutput'); 138 | mkdirp(argv.kaggleBinaryOutputFolder); 139 | } 140 | 141 | // sometimes we want matrix output. This is useful when we are trying to, say, categorize a shopper into one of 12 different categories. With matrix output, each of those 12 categories will come out as their own column, with a 0 or 1, as opposed to a single column with values from 1 - 12. 142 | // this hasn't been tested in a while. 
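  // as an illustration with made-up numbers: for a category column with possible values {1, 2, 3},
  // a prediction of category 2 comes out as the row [0, 1, 0] in matrix output, rather than as the single value 2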
143 | if( argv.matrixOutput ) { 144 | argv.matrixOutputFolder = path.join(argv.predictionsFolder, 'matrixOutput'); 145 | mkdirp(argv.matrixOutputFolder); 146 | } 147 | 148 | // store information on all the algos we've trained so far 149 | global.allTrainingResults = []; 150 | global.trainingResultsSummary = {}; 151 | global.trainedAlgoCounts = {}; 152 | global.bestSearchScore = 0; 153 | global.finishedAlgos = 0; 154 | global.copyValidationData = true; 155 | 156 | // each classifier is only allowed to take up half the CPUs on the machine. 157 | // we will be training two in parallel 158 | // this way, if a single classifier takes so long to train that it effectively fails, we can still train classifiers on the other cores 159 | argv.numCPUs = argv.numCPUs || Math.round( argv.computerTotalCPUs / 2 ) + 1; 160 | 161 | // we have several different objects in our classifierListOptions, depending on the length of dataset we're training against. 162 | // rather than trying to build in the logic of figuring out which ones we want before we have formatted and understood our data, just add in all the possible options as keys. 163 | classifierListOptions = classifierListOptions('all'); 164 | for( var algo in classifierListOptions ) { 165 | global.trainedAlgoCounts[algo] = 0; 166 | } 167 | 168 | 169 | // we are setting the minimum threshold an algorithm must hit in order to justify us training that algorithm for an extended period of time. 170 | // this comes into play for algorithms that have a considerably longer longTraining time than testing time, such as our random forests with 1200 trees. 171 | // it takes only ~3 minutes to do the hyperparameter search, but ~40 to do the long training. we obviously don't want to undertake that long training unless that algo is "good enough". 172 | // in this case, good enough is defined as being within 3% of our best algo at that stage in the process. 173 | argv.longTrainThreshold = argv.longTrainThreshold || .97; 174 | argv.continueToTrainThreshold = argv.continueToTrainThreshold || argv.longTrainThreshold; 175 | 176 | // formatting our data can take a long time. Unless you're performing additional feature engineering, the results are basically the same every time we run data-formatter. So, we can save ourselves a lot of time by just using the previously calculated results from data-formatter. 177 | // the entire process the user follows when using previously formatted data is exactly the same as formatting the data anew. You must pass in the exact same arguments to machineJS as you would to run data-formatter from scratch- we depend on that information you're passing in. 178 | // take in a flag to tell machineJS that we want to use previously formatted data. This is always the case when the dev flags have been passed in. 179 | if( argv.alreadyFormatted === undefined ) { 180 | if( argv.dev || argv.ensemble ) { 181 | argv.alreadyFormatted = true; 182 | } else { 183 | argv.alreadyFormatted = false; 184 | } 185 | } 186 | 187 | // if we are using previously formatted data, load in the names of the right files from machineJS/pySetup/testingFileNames.js. Follow instructions in that file for more information on the exact format expected. 
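  // a hypothetical sketch of the shape testingFileNames.js is expected to export -- one property
  // per outputFileName, whose value is the fileNames object data-formatter printed for that dataset
  // (the key and values below are placeholders, not real output):
  //   module.exports = {
  //     rossShortTrainDev: {
  //       X_train: '<path to the X_train .npz file>',
  //       y_train: '<path to the y_train .npz file>',
  //       problemType: 'regression',
  //       trainingDataLength: 50000
  //     }
  //   };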
188 | if( argv.alreadyFormatted ) { 189 | 190 | if( argv.fileNames !== undefined ) { 191 | utils.fileNames = argv.fileNames; 192 | } else { 193 | var fileNamesOptions = require(path.join(argv.machineJSLocation,'pySetup','testingFileNames.js')); 194 | utils.fileNames = fileNamesOptions[argv.outputFileName]; 195 | argv.fileNames = utils.fileNames; 196 | } 197 | 198 | try{ 199 | utils.fileNames = JSON.parse(utils.fileNames); 200 | } catch(err) { 201 | 202 | } 203 | } 204 | 205 | }; 206 | -------------------------------------------------------------------------------- /pySetup/training.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import os 4 | import os.path as path 5 | import json 6 | import joblib 7 | import logging 8 | import time 9 | 10 | import numpy as np 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 13 | from scipy.sparse import csr_matrix, vstack 14 | 15 | from sendMessages import printParent 16 | from sendMessages import messageParent 17 | from sendMessages import obviousPrint 18 | 19 | logging.basicConfig() 20 | 21 | import warnings 22 | startTime = time.time() 23 | 24 | from randomizedSearchList import rsList 25 | randomizedSearchCVList = rsList() 26 | 27 | # these lines will give us an object with keys for each classifier name, and values that will return classifiers to us. 28 | from makeClassifiers import makeClassifiers 29 | globalArgs = json.loads(sys.argv[2]) 30 | fileNames = json.loads(sys.argv[3]) 31 | 32 | classifierName = sys.argv[4] 33 | problemType = sys.argv[5] 34 | bestSearchScore = float(sys.argv[6]) 35 | 36 | sys.path.append(globalArgs['machineJSLocation'] + '/pySetup/parameterMakers') 37 | import paramMakers 38 | 39 | import makeBigClassifiers 40 | import extendedTrainingList 41 | 42 | dev = False 43 | if( globalArgs['dev'] ): 44 | dev = True 45 | 46 | def load_sparse_csr(filename): 47 | loader = np.load(filename) 48 | return csr_matrix(( loader['data'], loader['indices'], loader['indptr']), shape=loader['shape']) 49 | 50 | 51 | classifierCreater = makeClassifiers(globalArgs, dev, problemType) 52 | 53 | X = [] 54 | y = [] 55 | headerRow = [] 56 | 57 | 58 | # for the validationRound, we have saved the data into the dataFile property of globalArgs 59 | if globalArgs['validationRound']: 60 | X_file_name = globalArgs['dataFile'] 61 | # for neural networks, we need to train on data normalized to the range of {0,1} or {-1,1} 62 | # data-formatter did that for us already, so we just have to load in the correct feature data 63 | elif( classifierName[0:4] == 'clnn' ): 64 | X_file_name = fileNames['X_train_nntrainingData'] 65 | else: 66 | X_file_name = fileNames['X_traintrainingData'] 67 | 68 | if globalArgs['validationRound']: 69 | y_file_name = globalArgs['validationYs'] 70 | else: 71 | # for neural networks, the y values do not need to be normalized 72 | y_file_name = fileNames['y_traintrainingData'] 73 | 74 | 75 | try: 76 | X = load_sparse_csr(X_file_name) 77 | 78 | # the following block works for dense arrays 79 | except: 80 | # our X_train file has a header row, so the user can see the results of data-formatter in a pretty way if they'd like. 
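# e.g. (illustrative) the first few lines of such an X_train csv:
#   age,height,gender_female
#   34,1.62,1
#   27,1.8,0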
81 | # we need to remove this row from our actual dataset
82 | # none of our other files from data-formatter have header rows
83 | with open(X_file_name, 'rU') as openInputFile:
84 |     inputRows = csv.reader(openInputFile)
85 |     firstRow = False
86 |     for row in inputRows:
87 |         if(firstRow):
88 |             rowAsFloats = []
89 |             # make sure that floats that were saved as scientific notation are actually read in as floats
90 |             # this should be non-controversial, as by this point we should have turned all categorical data into binary representation (0 or 1).
91 |             for idx, val in enumerate(row):
92 |                 try:
93 |                     val = float(val)
94 |                 except:
95 |                     printParent(headerRow[idx])
96 |                     printParent(val)
97 |                 rowAsFloats.append( val )
98 |             X.append(rowAsFloats)  # keep the float-converted version of the row, not the raw string version
99 |         else:
100 |             headerRow = row
101 |             firstRow = True
102 | 
103 | 
104 | X = np.array(X)
105 | 
106 | try:
107 |     y = load_sparse_csr(y_file_name)
108 | 
109 | except:
110 |     # supports dense input, which is used in the validationRound
111 |     with open(y_file_name, 'rU') as openOutputFile:
112 |         outputRows = csv.reader(openOutputFile)
113 |         # this might be unnecessary now that we have run our data through data-formatter
114 |         # we might be able to load in the y_train data directly
115 |         firstRow = False
116 |         for row in outputRows:
117 |             if firstRow:
118 |                 try:
119 |                     row[0] = float(row[0])
120 |                 except:
121 |                     row[0] = row[0]  # leave non-numeric labels as they are
122 |                 y.append(row[0])
123 |             else:
124 |                 # ignore the first row as it holds our header
125 |                 firstRow = True
126 |     y = np.array(y)
127 | 
128 | try:
129 |     if y.shape[0] == 1:
130 |         y = y.todense().tolist()[0]
131 | except:
132 |     pass
133 | 
134 | if fileNames['testingDataLength'] < 100000:
135 |     # train on all the available (non-validation) data
136 |     testSize = 0
137 |     # a small data set should have many rounds of cross-validation. this will take longer to train, but it means we will be training on more data
138 |     cvRounds = 3
139 | elif fileNames['testingDataLength'] < 200000:
140 |     testSize = .25
141 |     cvRounds = 2
142 | else:
143 |     # if this is the stage 0 round:
144 |     # we have already separated out our validation data (currently 30% of the entire training data set by default)
145 |     # the data that we have loaded in here is the 70% that is not our validation data
146 |     # we want to have 30% of our entire training data set used as our "search" data set, meaning it is ~43% of this 70% data set
147 |     # the number we must give, though, is how much we want saved for testing, which is 1 - .43 = .57
148 |     testSize = .57
149 |     cvRounds = 2
150 | 
151 | if globalArgs['validationRound']:
152 |     # if this is the validation round, we do not want to split our data out any further.
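    # (illustrative numbers: with 1,000 validation rows stacked on top of 41,088 test rows, the code below computes combinedLength = 42,088, validationLength = 1,000, and keeps just those top 1,000 rows)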
153 |     # take only the validation portion of these datasets
154 |     # right now they are the combined validation + test datasets
155 |     # we want them to only be the validation portions
156 |     combinedLength = X.shape[0]
157 |     validationLength = combinedLength - fileNames['testingDataLength']
158 |     validationIndices = range( validationLength )
159 | 
160 |     # slicing the X array to only contain the training data
161 |     X_train = X[validationIndices , : ]
162 | 
163 |     # unless we are doing multi-category or multi-label predictions, we have converted y to be a list, meaning we have to slice it differently
164 |     try:
165 |         # slicing sparse matrices, if y is for multi-label or multi-category predictions
166 |         y_train = y[validationIndices , : ]
167 |     except:
168 |         # slicing standard python lists
169 |         y_train = y[ 0 : validationLength ]
170 | 
171 |     # set X and y equal to the versions of themselves that only have the validation data
172 |     # this makes our lives easier later on when we go to train the big classifier on the "full" dataset
173 |     X = X_train
174 |     y = y_train
175 | 
176 | else:
177 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=0)
178 | 
179 | # if we're developing, train on only a small percentage of the dataset, and do not train the final large classifier (where we significantly bump up the number of estimators).
180 | if dev:
181 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, random_state=0)
182 | 
183 | # instantiate a new classifier, given the type passed in to us
184 | classifier = classifierCreater[classifierName]
185 | 
186 | # if possible, have the algorithm warm_start, taking advantage of the training it's done previously and then simply building on top of that
187 | try:
188 |     classifier.set_params(warm_start=True)
189 | except:
190 |     pass
191 | 
192 | # XGBoost requires data to be in its own particular format.
193 | if classifierName == 'clXGBoost':
194 |     try:
195 |         X_train = classifier.DMatrix( X_train )
196 |         X = classifier.DMatrix( X )
197 |     except:
198 |         pass  # if this classifier object does not expose DMatrix, just keep the original matrices
199 | 
200 | # create hyperparameter search spaces that are custom to the size of the input data.
201 | # Each individual parameterMaker file sits in the parameterMakers folder. If you want to modify what the parameters are, or submit a PR with a better combination of parameters to try, that is the place to start.
202 | allParams = paramMakers.makeAll(X,y,globalArgs, dev, problemType)
203 | parameters_to_try = allParams[classifierName]
204 | 
205 | printParent('we are about to run a cross-validated search for the best hyperparameters for ' + classifierName)
206 | 
207 | try:
208 |     if randomizedSearchCVList[classifierName]:
209 |         # error_score=0 means that if some combinations of parameters fail to train properly, the rest of the search process will still work.
210 |         # numIterationsPerRound defaults to 8, unless the user has passed in a more specific value.
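        # (illustrative) RandomizedSearchCV samples n_iter combinations from the distributions in parameters_to_try (e.g. 8 draws from a scipy.stats.expon for alpha), instead of exhaustively trying every combination the way GridSearchCV does below.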
211 |         n_iter = globalArgs['numIterationsPerRound']
212 |         if classifierName in ['clSGDClassifier']:
213 |             # these algorithms train very quickly, and have many parameters to try, so they get more attempts than other algorithms
214 |             n_iter = n_iter * 2
215 |         searchCV = RandomizedSearchCV(classifier, parameters_to_try, n_jobs=globalArgs['numCPUs'], error_score=0, n_iter=n_iter, refit=True, cv=cvRounds)
216 |     else:
217 |         searchCV = GridSearchCV(classifier, parameters_to_try, n_jobs=globalArgs['numCPUs'], error_score=0, refit=True, cv=cvRounds)
218 | except:
219 |     searchCV = GridSearchCV(classifier, parameters_to_try, n_jobs=globalArgs['numCPUs'], error_score=0, refit=True, cv=cvRounds)
220 | 
221 | searchCV.fit(X_train, y_train )
222 | printParent('\n')
223 | printParent('*********************************************************************************************************')
224 | printParent(classifierName + "'s best score from the hyperparameter search attempts is:")
225 | printParent(searchCV.best_score_)
226 | printParent('*********************************************************************************************************')
227 | printParent(classifierName + "'s best parameters this time are:")
228 | printParent(searchCV.best_params_)
229 | printParent('\n')
230 | 
231 | printParent(classifierName + "'s total hyperparameter searching time is:")
232 | # this will give the time in minutes, to one decimal place
233 | finishTrainTime = time.time()
234 | printParent( round((finishTrainTime - startTime)/60, 1) )
235 | 
236 | 
237 | longTrainThreshold = bestSearchScore * globalArgs['longTrainThreshold']
238 | messageObj = {
239 |     "searchScore": searchCV.best_score_,
240 |     "algoName": classifierName
241 | }
242 | 
243 | # only put in the (oftentimes considerable) effort of longTraining this algorithm if it meets the threshold defined by longTrainThreshold
244 | # and do not long-train anything until we have a meaningful bestSearchScore to compare against (that is what the longTrainThreshold > 0 check below guards); long training is a time-consuming process for an algorithm that is probably not yet very well optimized
245 | # Get info on whether this algo supports creating a larger version of itself.
246 | # for example, a random forest you can train with more trees, a neural network you can train for more epochs, etc.
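# worked example (illustrative): with the default longTrainThreshold of .97, if our best algo so far scored 0.90, this algo only earns the long training run if its search score beat 0.90 * .97 = 0.873.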
247 | extendedTraining = extendedTrainingList.getAll()[classifierName]
248 | if ((searchCV.best_score_ > longTrainThreshold and longTrainThreshold > 0) or globalArgs['validationRound']) and extendedTraining:
249 | 
250 |     allBigClassifiers = makeBigClassifiers.makeAll(globalArgs, dev, problemType)
251 |     longTrainClassifier = allBigClassifiers[classifierName]
252 | 
253 |     longTrainClassifier.set_params(**searchCV.best_params_)
254 | 
255 | # grab the best estimator from our searchCV
256 | else:
257 |     longTrainClassifier = searchCV.best_estimator_
258 | 
259 | startLongTrainTime = time.time()
260 | 
261 | # when doing the cross-validated search, we have potentially been holding out a significant portion of the dataset
262 | # once we have found the best hyperparameters, train on the entire dataset
263 | # we have already verified that this is the best set of hyperparameters using cross-validation
264 | if X.shape[0] != X_train.shape[0] or extendedTraining:
265 |     longTrainClassifier.fit(X, y)
266 | 
267 | finishLongTrainTime = time.time()
268 | printParent(classifierName + "'s training on the longer data set took:")
269 | printParent( round((finishLongTrainTime - startLongTrainTime)/60, 1) )
270 | 
271 | 
272 | longTrainClassifierScore = longTrainClassifier.score(X, y)
273 | printParent(classifierName + "'s score against the larger training data set is:")
274 | printParent(longTrainClassifierScore)
275 | messageObj['longTrainScore'] = longTrainClassifierScore
276 | 
277 | 
278 | # save our classifiers from the validationRound to a separate folder
279 | if globalArgs['validationRound']:
280 |     classifierFolder = path.join(globalArgs['bestClassifiersFolder'], 'ensemblingAlgos', 'best' + classifierName)
281 | else:
282 |     classifierFolder = path.join(globalArgs['bestClassifiersFolder'], 'best' + classifierName)
283 | 
284 | if not os.path.exists(classifierFolder):
285 |     os.makedirs(classifierFolder)
286 | 
287 | joblib.dump(longTrainClassifier, path.join(classifierFolder, 'best' + classifierName + '.pkl') )
288 | 
289 | messageParent(messageObj, 'trainingResults')
290 | 
-------------------------------------------------------------------------------- /pySetup/makePredictions.py: --------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import os.path as path
4 | import sys
5 | import csv
6 | import time
7 | import joblib
8 | import numpy as np
9 | import pandas as pd
10 | import logging
11 | import xgboost
12 | 
13 | from scipy.sparse import csr_matrix, vstack
14 | 
15 | from sendMessages import printParent
16 | from sendMessages import messageParent
17 | from sendMessages import obviousPrint
18 | 
19 | logging.basicConfig()
20 | 
21 | fileNames = json.loads(sys.argv[4])
22 | classifierName = sys.argv[5]
23 | argv = json.loads(sys.argv[3])
24 | problemType = sys.argv[6]
25 | trainingScore = sys.argv[7]
26 | copyValidationData = sys.argv[8]
27 | searchScore = sys.argv[9]
28 | 
29 | if argv['validationRound']:
30 |     X_file_name = argv['dataFile']
31 |     nn = False  # nn is checked near the end of this script; default it here so the validation round does not hit a NameError
32 | else:
33 |     if( classifierName[0:4] == 'clnn' ):
34 |         nn = True
35 |         X_file_name = fileNames['X_test_nn']
36 |     else:
37 |         nn = False
38 |         X_file_name = fileNames['X_test']
39 | 
40 | id_file_name = fileNames['id_test']
41 | 
42 | 
43 | XTest = []
44 | testIDColumn = []
45 | 
46 | # load up the prediction data set, without the header row
47 | try:
48 |     def load_sparse_csr(filename):
49 |         loader = np.load(filename)
50 |         return csr_matrix(( loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
51 | 
52 |     XTest = load_sparse_csr(X_file_name)
53 | except:
54 |     with open(X_file_name, 'rU') as x_file:
55 |         inputRows = csv.reader(x_file)
56 |         headerRow = False
57 |         for row in inputRows:
58 |             if(headerRow):
59 |                 XTest.append(row)
60 |             else:
61 |                 headerRow = True
62 | 
63 | if argv['validationRound']:
64 |     # in the validation file, we have combined the validationData and the test data
65 |     # split out to only have the test data
66 |     testLength = fileNames['testingDataLength']
67 |     combinedValidationLength = XTest.shape[0]
68 |     testIndices = range( combinedValidationLength - testLength, combinedValidationLength)
69 | 
70 |     XTest = XTest[ testIndices , : ]
71 | 
72 | # it should be pretty safe to convert the testIDColumn to a list, since it is always going to be a single value per row
73 | # to get a single vector (in this case, our ID column) to be saved as a sparse matrix, we have to do some vaguely hacky stuff
74 | # the following line converts it to a normal python list
75 | testIDColumn = load_sparse_csr( id_file_name ).todense().tolist()[0]
76 | 
77 | y_file_name = fileNames.get('y_train')  # assumption: only used by the fallback below, which expects a csv copy of the y values
78 | try:
79 |     idHeader = fileNames['idHeader']
80 |     outputHeader = fileNames['outputHeader']
81 | except:
82 |     idHeader = 'id'  # fallback when fileNames does not store the headers: use a generic id header, and read the pretty output header from the first row of the y file
83 |     with open(y_file_name, 'rU') as y_file:
84 |         inputRows = csv.reader(y_file)
85 |         outputHeader = False
86 |         for row in inputRows:
87 |             if outputHeader == False:
88 |                 outputHeader = row[0]
89 |             else:
90 |                 pass
91 | 
92 | if argv['validationRound']:
93 |     classifierFile = path.join( argv['bestClassifiersFolder'], 'ensemblingAlgos', 'best' + classifierName, 'best' + classifierName + '.pkl')
94 | else:
95 |     classifierFile = path.join( argv['bestClassifiersFolder'], 'best' + classifierName, 'best' + classifierName + '.pkl')
96 | 
97 | # load up the previously trained (and tuned!) classifier
98 | classifier = joblib.load( classifierFile )
99 | 
100 | try:
101 |     classifier.set_params(n_jobs=-1)  # use every available core for predictions, when the classifier supports n_jobs
102 | except:
103 |     pass
104 | 
105 | 
106 | # get predictions for each item in the prediction data set
107 | if problemType == 'category':
108 |     try:
109 |         testDataPredictions = classifier.predict_proba(XTest)
110 |     except:
111 |         # perceptron does not support predict_proba
112 |         # and MultinomialNB does not do probability predictions all that well
113 |         testDataPredictions = classifier.predict(XTest)
114 | 
115 | # else will handle both regression and multi-category predictions at the moment.
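# (illustrative) predict_proba returns one column per class, e.g. [[0.03, 0.97], ...] for a binary problem, while predict returns one value per row, e.g. [1, 0, ...]; the file writers below handle both shapes.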
116 | else:
117 |     testDataPredictions = classifier.predict(XTest)
118 | 
119 | 
120 | if not argv['validationRound']:
121 |     validationFile = fileNames['X_trainvalidationData']
122 |     validationData = load_sparse_csr(validationFile)
123 |     validationIdFile = fileNames['id_trainvalidationData']
124 |     validationIDs = load_sparse_csr( validationIdFile ).todense().tolist()[0]
125 | 
126 |     if nn:
127 |         validationYFile = fileNames['y_train_nnvalidationData']
128 |     else:
129 |         validationYFile = fileNames['y_trainvalidationData']
130 |     validationY = load_sparse_csr(validationYFile).todense().tolist()[0]
131 | 
132 | 
133 |     if problemType == 'category':
134 |         try:
135 |             validationPredictions = classifier.predict_proba(validationData)
136 |         except:
137 |             validationPredictions = classifier.predict(validationData)
138 | 
139 |     else:
140 |         # else will handle both regression and multi-category predictions for now
141 |         validationPredictions = classifier.predict(validationData)
142 | 
143 |     validationScore = classifier.score(validationData,validationY)
144 | 
145 |     printParent('\n')
146 |     printParent('***************')
147 |     printParent(classifierName + "'s score on the validation set is:")
148 |     printParent(validationScore)
149 |     printParent('***************')
150 | else:
151 |     # we still need something to write to the file, so we will write the score from the hyperparameter search, which is the cross-validation score on the holdout data from that search. in that way, it's actually a pretty accurate score to be using.
152 |     validationScore = searchScore
153 | 
154 | # write our predictions on the test data to a file
155 | if argv['validationRound']:
156 |     predictionsPath = path.join( argv['predictionsFolder'], 'ensembledPredictions' )
157 | 
158 | else:
159 |     predictionsPath = argv['predictionsFolder']
160 | 
161 | 
162 | # using the outputFileName here so that if people have different input files (different feature engineering), that will show up in our file names.
163 | predictionsFileName = argv['outputFileName'] + classifierName + str(time.time()) + '.csv'
164 | 
165 | # create the directory if it doesn't exist already
166 | if not os.path.exists(predictionsPath):
167 |     os.makedirs(predictionsPath)
168 | 
169 | with open( path.join(predictionsPath, predictionsFileName) , 'w+') as predictionsFile:
170 |     csvwriter = csv.writer(predictionsFile)
171 | 
172 |     # we are going to have to modify this when we allow it to make categorical predictions too.
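    # (illustrative) each predictions file therefore starts like:
    #   0.9731,0.9812        <- validation score, then training score
    #   Id,sales             <- the pretty header row
    #   1,4091.2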
173 |     # write the scores to the top row
174 |     csvwriter.writerow([validationScore, trainingScore])
175 |     csvwriter.writerow([idHeader,outputHeader])
176 |     for idx, prediction in enumerate(testDataPredictions):
177 |         rowID = testIDColumn[idx]
178 | 
179 |         try:
180 |             len(prediction)
181 |             csvwriter.writerow([int(rowID),prediction[1]])
182 |         except:
183 |             csvwriter.writerow([int(rowID),prediction])
184 | 
185 | if not argv['validationRound']:
186 | 
187 |     # write our validation predictions to a file too
188 |     validationPath = path.join( predictionsPath, 'validation')
189 |     validationFileName = argv['outputFileName'] + classifierName + str(time.time()) + '.csv'
190 | 
191 |     # to keep things super consistent, we will combine our test and validation data, so there's no risk of order getting mixed up in ensembler
192 |     totalPredictions = np.concatenate( (validationPredictions, testDataPredictions), axis=0 )
193 |     validationAndTestIDs = np.concatenate( (validationIDs, testIDColumn), axis=0 )
194 | 
195 |     with open( path.join(validationPath, validationFileName) , 'w+') as validationFile:
196 |         csvwriter = csv.writer(validationFile)
197 | 
198 |         # at the top of each validation file, write the score for that classifier on the validation set
199 |         csvwriter.writerow([validationScore, trainingScore])
200 | 
201 |         # we are going to have to modify this when we allow it to make categorical predictions too.
202 |         csvwriter.writerow([idHeader,outputHeader])
203 |         for idx, prediction in enumerate(totalPredictions):
204 |             rowID = validationAndTestIDs[idx]
205 |             try:
206 |                 len(prediction)
207 |                 csvwriter.writerow([int(rowID),prediction[1]])
208 |             except:
209 |                 csvwriter.writerow([int(rowID),prediction])
210 | 
211 | # continued callout to the person originally responsible for this function:
212 | # http://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format
213 | def save_sparse_csr(filename,array):
214 |     np.savez(filename,data=array.data ,indices=array.indices, indptr=array.indptr, shape=array.shape )
215 | 
216 | if copyValidationData == 'true' and nn == False:  # copyValidationData arrives on the command line as a string, like the other boolean flags checked below
217 |     allValidationDataFile = path.join( validationPath, 'validationData.npz')
218 |     allValidationIDsFile = path.join( validationPath, 'validationIDs.npz')
219 |     allValidationYsFile = path.join( validationPath, 'validationYs.npz')
220 | 
221 |     # to make sure we keep everything consistent, we write the combined validation data and test data to a file
222 |     allValidationData = vstack( [validationData, XTest] )
223 |     save_sparse_csr(allValidationDataFile, allValidationData)
224 | 
225 |     # we already loaded in this data, but then immediately converted it to a dense list.
226 |     # so we are going to load it in again, this time as a sparse csr matrix, and then immediately save it as a sparse csr matrix elsewhere
227 |     # we could just as easily copy the original file to a new location, but since we're not copying anywhere else, this is slightly more consistent stylistically
228 |     validationSparseIDs = load_sparse_csr( validationIdFile )
229 |     save_sparse_csr( allValidationIDsFile, validationSparseIDs )
230 | 
231 |     validationSparseYs = load_sparse_csr(validationYFile)
232 |     save_sparse_csr( allValidationYsFile, validationSparseYs )
233 | 
234 |     # with open( path.join(validationPath, 'validationIDsAndY.csv') , 'w+') as validationFile:
235 |     #     csvwriter = csv.writer(validationFile)
236 | 
237 |     #     # we are going to have to modify this when we allow it to make categorical predictions too.
238 |     #     csvwriter.writerow([idHeader,outputHeader])
239 |     #     for idx, rowID in enumerate(validationAndTestIDs):
240 |     #         # our test data will not have y values attached, so we will try to find a y value for this ID, but if we can't, we assume it is a test value, and we set the y value to None
241 |     #         try:
242 |     #             yValue = validationY[idx]
243 |     #         except:
244 |     #             yValue = None
245 |     #         try:
246 |     #             len(yValue)
247 |     #             csvwriter.writerow([int(rowID),yValue[1]])
248 |     #         except:
249 |     #             csvwriter.writerow([int(rowID),yValue])
250 | 
251 | # The following sections write our output in a format that the user requested. This output is not used for anything else later down the line in machineJS or ensembler; it is solely for the user.
252 | 
253 | 
254 | # if the final output is binary, create a separate file at this stage that can be easily uploaded to kaggle by rounding the predicted value to the nearest int
255 | # We will use the actual probability in ensembler, but it's nice at this stage to be able to upload results to kaggle and get some feedback
256 | if argv['binaryOutput'] == 'true':
257 | 
258 |     # write these files into the kaggleBinaryOutput folder to make it obvious that they are for kaggle
259 |     # this also keeps the rest of our files consistent for ensembler
260 |     kagglePath = argv['kaggleBinaryOutputFolder']
261 |     kaggleFileName = argv['outputFileName'] + classifierName + str(time.time()) + '.csv'
262 |     with open( path.join(kagglePath, kaggleFileName) , 'w+') as predictionsFile:
263 |         csvwriter = csv.writer(predictionsFile)
264 | 
265 |         csvwriter.writerow([idHeader,outputHeader])
266 |         for idx, prediction in enumerate(testDataPredictions):
267 | 
268 |             rowID = testIDColumn[idx]
269 |             # predict_proba returns one probability per class, so for binary problems prediction[1] is the probability of the positive class
270 |             # plain predict returns a single scalar instead, which is what the except below handles
271 |             try:
272 |                 len(prediction)
273 |                 prediction = int( round( prediction[1] ) )
274 |             except:
275 |                 prediction = int( round( prediction ) )
276 |                 pass
277 |             csvwriter.writerow( [rowID,prediction] )
278 | 
279 | # for multi-category data, we can choose to output a single column with all the categories contained in that column, or we can translate that into a set of binary columns, where each column represents a single categorical value.
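# (illustrative) pd.get_dummies is what does that translation below: pd.get_dummies(pd.Series([3, 8, 3]), prefix='triptype') yields columns triptype_3 and triptype_8 with rows [1, 0], [0, 1], [1, 0].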
280 | # if the final output is matrixOutput, create a separate file at this stage that can be easily referenced by the user
281 | # We will use the single categorical column in ensembler, but it's nice at this stage to be able to view results in the expected format and get some feedback
282 | if argv['matrixOutput'] == 'true':
283 | 
284 |     # convert our predictions on the test set to a pandas series
285 |     pdPredictions = pd.Series(testDataPredictions)
286 | 
287 |     # take our single column of category predictions, and turn it into a matrix, where each column represents a yes or no for a single category
288 |     # prefix puts our outputHeader in front of each of the values for our header row
289 |     matrixPredictions = pd.get_dummies(pdPredictions, prefix=outputHeader)
290 |     # get the header row from the data frame:
291 |     matrixHeaderRow = matrixPredictions.columns.values.tolist()
292 |     # convert from a pandas data frame to a python list
293 |     matrixPredictions = matrixPredictions.values.tolist()
294 | 
295 |     # add the id to the header row
296 |     outputFileHeaderRow = [idHeader] + matrixHeaderRow
297 | 
298 |     # write these files into the matrixOutput folder to make it obvious what they are
299 |     # this also keeps the rest of our files consistent for ensembler
300 |     matrixPath = argv['matrixOutputFolder']
301 |     matrixFileName = argv['outputFileName'] + classifierName + str(time.time()) + '.csv'
302 |     with open( path.join(matrixPath, matrixFileName) , 'w+') as predictionsFile:
303 |         csvwriter = csv.writer(predictionsFile)
304 | 
305 |         csvwriter.writerow(outputFileHeaderRow)
306 |         for idx, listOfMatrixPredictions in enumerate(matrixPredictions):
307 | 
308 |             rowID = testIDColumn[idx]
309 |             csvwriter.writerow( [rowID] + listOfMatrixPredictions )
310 | 
-------------------------------------------------------------------------------- /pySetup/testingFileNames.js: --------------------------------------------------------------------------------
1 | // right now, machineJS is designed to console.log the list of fileNames
2 | // if you copy/paste them in here, you can add in the --alreadyFormatted flag
3 | // this will allow you to skip over repeating the data-formatter part of the process if you've already run it
4 | // make sure you name your new object after the training data set you passed in
5 | // right now, that name can be found as the last word after the "_" before the ".npz" file extension
6 | // to add your own entry: look at the examples below, note each object's property name, find where that name appears inside each file name, and use the matching part of your own file names as your property name.
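// e.g. (illustrative): for files like X_train_rossmantrain.npz, the piece between the final "_" and ".npz" is "rossmantrain", so the matching entry below is keyed rossmantrain.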
7 | 8 | module.exports = { 9 | 10 | rossmantrain: { 11 | idHeader: 'Id', 12 | outputHeader: 'sales', 13 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_rossmantrain.npz', 14 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_rossmantrain.npz', 15 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_rossmantestrossmantrain.npz', 16 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_rossmantestrossmantrain.npz', 17 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_rossmantrain.npz', 18 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_rossmantrain.npz', 19 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_rossmantrain.npz', 20 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_rossmantestrossmantrain.npz', 21 | testingDataLength: 41088, 22 | trainingDataLength: 1017209, 23 | problemType: 'regression' 24 | }, 25 | 26 | numerai_training_data_tournament: { 27 | idHeader: "t_id", 28 | outputHeader: "target", 29 | id_train: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_numerai_training_data_tournament.npz", 30 | y_train: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_numerai_training_data_tournament.npz", 31 | id_test: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_numerai_tournament_datanumerai_training_data_tournament.npz", 32 | X_test: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_numerai_tournament_datanumerai_training_data_tournament.npz", 33 | X_train: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_numerai_training_data_tournament.npz", 34 | X_train_nn: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_numerai_training_data_tournament.npz", 35 | y_train_nn: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_numerai_training_data_tournament.npz", 36 | X_test_nn: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_numerai_tournament_datanumerai_training_data_tournament.npz", 37 | testingDataLength: 19461, 38 | trainingDataLength: 55038, 39 | problemType: "category" 40 | }, 41 | 42 | homesitetrain: { 43 | idHeader: 'quotenumber', 44 | outputHeader: 'quoteconversion_flag', 45 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_homesitetrain.npz', 46 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_homesitetrain.npz', 47 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_homesitetesthomesitetrain.npz', 48 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_homesitetesthomesitetrain.npz', 49 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_homesitetrain.npz', 50 | X_train_nn: 
'/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_homesitetrain.npz', 51 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_homesitetrain.npz', 52 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_homesitetesthomesitetrain.npz', 53 | testingDataLength: 173836, 54 | trainingDataLength: 260753, 55 | problemType: 'category' 56 | }, 57 | 58 | walmarttrain: { 59 | idHeader: 'visitnumber', 60 | outputHeader: 'triptype', 61 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_walmarttrain.npz', 62 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_walmarttrain.npz', 63 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_walmarttestwalmarttrain.npz', 64 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_walmarttestwalmarttrain.npz', 65 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_walmarttrain.npz', 66 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_walmarttrain.npz', 67 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_walmarttrain.npz', 68 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_walmarttestwalmarttrain.npz', 69 | testingDataLength: 9935, 70 | trainingDataLength: 95674, 71 | problemType: 'multi-category' 72 | }, 73 | 74 | shortTrain: { 75 | idHeader: 'visitnumber', 76 | outputHeader: 'triptype', 77 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_shortTrain.npz', 78 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_shortTrain.npz', 79 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_shortTestshortTrain.npz', 80 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_shortTestshortTrain.npz', 81 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_shortTrain.npz', 82 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_shortTrain.npz', 83 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_shortTrain.npz', 84 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_shortTestshortTrain.npz', 85 | testingDataLength: 9935, 86 | trainingDataLength: 10115, 87 | problemType: 'multi-category' 88 | }, 89 | 90 | telstratrain: { 91 | idHeader: 'id', 92 | outputHeader: 'fault_severity', 93 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_telstratrain.npz', 94 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_telstratrain.npz', 95 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_telstratesttelstratrain.npz', 96 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_telstratesttelstratrain.npz', 
97 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_telstratrain.npz', 98 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_telstratrain.npz', 99 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_telstratrain.npz', 100 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_telstratesttelstratrain.npz', 101 | testingDataLength: 11171, 102 | trainingDataLength: 7381, 103 | problemType: 'multi-category' 104 | }, 105 | 106 | numerai_training_data: { idHeader: 't_id', 107 | outputHeader: 'target', 108 | id_train: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/id_train_numerai_training_data.npz', 109 | y_train: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/y_train_numerai_training_data.npz', 110 | id_test: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/id_test_numerai_tournament_datanumerai_training_data.npz', 111 | X_test: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/X_test_numerai_tournament_datanumerai_training_data.npz', 112 | X_train: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/X_train_numerai_training_data.npz', 113 | X_train_nn: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/X_train_nn_numerai_training_data.npz', 114 | y_train_nn: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/y_train_nn_numerai_training_data.npz', 115 | X_test_nn: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/X_test_nn_numerai_tournament_datanumerai_training_data.npz', 116 | testingDataLength: 19461, 117 | trainingDataLength: 55038, 118 | problemType: 'category' }, 119 | 120 | // here's an example of what the fileNames will look like if you engaged in labelEncoding 121 | evenTrain: { labelMapping: 122 | { 'Arbitration': 0, 123 | 'Billing disputes': 1, 124 | 'Adding money': 8, 125 | 'Overdraft, savings or rewards features': 3, 126 | 'Money was not available when promised': 14, 127 | 'Unexpected/Other fees': 5, 128 | 'Credit monitoring or identity protection': 6, 129 | 'Making/receiving payments, sending money': 7, 130 | 'Loan servicing, payments, escrow account': 2, 131 | 'Unsolicited issuance of credit card': 9, 132 | 'Improper use of my credit report': 10, 133 | 'Lender sold the property': 11, 134 | 'Sale of account': 12, 135 | 'Problems when you are unable to pay': 13, 136 | 'Improper contact or sharing of info': 4, 137 | 'Shopping for a loan or lease': 15, 138 | 'Getting a loan': 16, 139 | 'Excessive fees': 17, 140 | 'Identity theft / Fraud / Embezzlement': 18, 141 | 'Advertising and marketing': 19, 142 | 'Bankruptcy': 20, 143 | 'Communication tactics': 21, 144 | 'Cont\'d attempts collect debt not owed': 22, 145 | 'Charged fees or interest I didn\'t expect': 23, 146 | 'Lost or stolen check': 24, 147 | 'Lender damaged or destroyed vehicle': 25, 148 | 'Forbearance / Workout plans': 26, 149 | 'Taking/threatening an illegal action': 27, 150 | 'Received a loan I didn\'t apply for': 28, 151 | 'Fraud or scam': 29, 152 | 'Account terms and changes': 30, 153 | 'Application, originator, mortgage broker': 31, 154 | 'Other fee': 32, 155 | 'Convenience checks': 33, 156 | 'Incorrect/missing disclosures or info': 34, 157 | 'Incorrect information on credit report': 35, 158 | 'Can\'t contact lender': 36, 159 | 'Taking out the loan or lease': 37, 160 | 'Application processing 
delay': 38, 161 | 'Using a debit or ATM card': 39, 162 | 'Advertising, marketing or disclosures': 40, 163 | 'Credit card protection / Debt protection': 41, 164 | 'Late fee': 42, 165 | 'Credit reporting company\'s investigation': 43, 166 | 'Managing the loan or lease': 44, 167 | 'Can\'t repay my loan': 45, 168 | 'Other transaction issues': 46, 169 | 'Privacy': 47, 170 | 'Payment to acct not credited': 48, 171 | 'Balance transfer': 71, 172 | 'Transaction issue': 50, 173 | 'Disclosure verification of debt': 51, 174 | 'Rewards': 52, 175 | 'Incorrect exchange rate': 53, 176 | 'Credit decision / Underwriting': 54, 177 | 'Lost or stolen money order': 55, 178 | 'Unauthorized transactions/trans. issues': 56, 179 | 'Lender repossessed or sold the vehicle': 57, 180 | 'Shopping for a line of credit': 58, 181 | 'Deposits and withdrawals': 59, 182 | 'Account opening, closing, or management': 60, 183 | 'Can\'t stop charges to bank account': 61, 184 | 'Balance transfer fee': 62, 185 | 'Wrong amount charged or received': 63, 186 | 'Customer service / Customer relations': 64, 187 | 'Applied for loan/did not receive money': 65, 188 | 'Credit determination': 66, 189 | 'Fees': 67, 190 | 'Disclosures': 68, 191 | 'Managing, opening, or closing account': 69, 192 | 'APR or interest rate': 70, 193 | 'Closing/Cancelling account': 49, 194 | 'Loan modification,collection,foreclosure': 72, 195 | 'Dealing with my lender or servicer': 73, 196 | 'Other': 74, 197 | 'Managing the line of credit': 75, 198 | 'Charged bank acct wrong day or amt': 76, 199 | 'Overlimit fee': 77, 200 | 'Unable to get credit report/credit score': 78, 201 | 'Delinquent account': 79, 202 | 'Cash advance fee': 80, 203 | 'Problems caused by my funds being low': 81, 204 | 'Other service issues': 82, 205 | 'False statements or representation': 83, 206 | 'Cash advance': 84, 207 | 'Credit line increase/decrease': 85, 208 | 'Customer service/Customer relations': 86, 209 | 'Settlement process and costs': 87, 210 | 'Payoff process': 88, 211 | 'Billing statement': 89 }, 212 | labelEncoded: true, 213 | idHeader: 'id', 214 | outputHeader: 'issue', 215 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_evenTrain.npz', 216 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_evenTestevenTrain.npz', 217 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_evenTrain.npz', 218 | validation_split_column: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/validation_split_column_evenTrain.npz', 219 | hasCustomValidationSplit: false, 220 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_evenTestevenTrain.npz', 221 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_evenTrain.npz', 222 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_evenTrain.npz', 223 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_evenTrain.npz', 224 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_evenTestevenTrain.npz', 225 | testingDataLength: 5710, 226 | trainingDataLength: 51399, 227 | problemType: 'multi-category' } 228 | }; 229 | 230 | --------------------------------------------------------------------------------
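As a closing reference, here is a minimal, self-contained sketch (not part of the repo) of reading one of the .npz feature files listed above back into memory; it simply mirrors the load_sparse_csr helper defined in pySetup/training.py and pySetup/makePredictions.py, and the file path is a placeholder you would swap for any of the X_train entries above.

import numpy as np
from scipy.sparse import csr_matrix

def load_sparse_csr(filename):
    # the npz holds the three csr component arrays plus the shape, exactly as save_sparse_csr wrote them
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])

X_train = load_sparse_csr('pySetup/data-formatterResults/X_train_rossmantrain.npz')  # placeholder path
print(X_train.shape)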