├── pySetup
│   ├── parameterMakers
│   │   ├── __init__.py
│   │   ├── clMultinomialNB.py
│   │   ├── clVowpalWabbit.py
│   │   ├── clPerceptron.py
│   │   ├── clSGDClassifier.py
│   │   ├── clKnn.py
│   │   ├── svcShrinking.py
│   │   ├── svcFirstParameterMaker.py
│   │   ├── clLogisticRegression.py
│   │   ├── clnnSknn3Layer.py
│   │   ├── clAdaLossAll.py
│   │   ├── clnnSknn.py
│   │   ├── clnnNoLearn.py
│   │   ├── clAdaBoost.py
│   │   ├── clRfBootstrapBoth.py
│   │   ├── rfGiniParamMaker.py
│   │   ├── clExtraTrees.py
│   │   ├── clnnSklearnMLP.py
│   │   ├── paramMakers.py
│   │   ├── clXGBoost.py
│   │   └── rfEntropyParamMaker.py
│   ├── extendedTrainingList.py
│   ├── sendMessages.py
│   ├── randomizedSearchList.py
│   ├── stepsToAddNewClassifier.txt
│   ├── makeBigClassifiers.py
│   ├── utilsPyShell.js
│   ├── makeClassifiers.py
│   ├── classifierList.js
│   ├── utils.js
│   ├── controllerPython.js
│   ├── splitDatasets.py
│   ├── training.py
│   ├── makePredictions.py
│   └── testingFileNames.js
├── requirements.txt
├── .gitignore
├── test
│   └── regression
│       ├── test.js
│       ├── trainAlgorithms.js
│       ├── deleteRemnantsAndRunNewTest.js
│       ├── splitDataset.js
│       └── makePredictions.js
├── machineJS.js
├── shutDown.js
├── advancedAPI.md
├── package.json
├── README.md
└── processArgs.js

--------------------------------------------------------------------------------
/pySetup/parameterMakers/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
joblib
numpy
pandas
scipy
cython
xgboost
python-dateutil

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clMultinomialNB.py:
--------------------------------------------------------------------------------
import scipy
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    parameters_to_try = {
        'alpha': np.random.uniform(0,1,1000)
    }

    return parameters_to_try

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules
npm-debug.log
kaggleTrainingData.csv
formattingData*.txt
formattedData*.txt
bestNet*.txt
kagglePredictions*.txt
*.csv
*.p
*.pkl
randomForest/bestRF/*
*.zip
*.npy
*.pyc
predictions/*
*.xls
*.xlsx
data/*
*.npz
*.txt
src/*
.npmignore
pySetup/testingFileNames

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clVowpalWabbit.py:
--------------------------------------------------------------------------------

# some comments on good parameters:
# https://www.reddit.com/r/MachineLearning/comments/1mq8fb/why_i_love_scikitlearn/

# a sklearn wrapper for vw:
# https://github.com/josephreisinger/vowpal_porpoise/blob/master/examples/example_sklearn.py

# presentation with some good explanations of vw params:
# http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clPerceptron.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # I am not yet confident in eta0
    parameters_to_try = {
        "penalty": [None,'l2','l1','elasticnet'],
        "alpha": scipy.stats.expon(.00001,.001),
        "shuffle": [True,False],
        # "eta0": scipy.stats.expon(.0001,1),
    }

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clSGDClassifier.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # I am not yet confident in eta0
    parameters_to_try = {
        "loss": ['hinge','log','modified_huber','squared_hinge','squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'],
        "penalty": ['none','l2','l1','elasticnet'],
        "alpha": scipy.stats.expon(.00001,.001),
        "shuffle": [True,False],
        "epsilon": scipy.stats.expon(.001,1)
        # "eta0": scipy.stats.expon(.0001,1),
    }

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/extendedTrainingList.py:
--------------------------------------------------------------------------------

def getAll():
    return {
        'clRfEntropy':True,
        'clRfGini':True,
        'clRfBootstrapTrue':True,
        'clSVCFirst':False,
        'clSVCShrinking':False,
        'clKnn':False,
        'clLogisticRegression':False,
        'clnnSknn3Layer':True,
        'clnnSknn':True,
        'clAdaBoost':False,
        'clAdaLossLinear':False,
        'clAdaLossSquare':False,
        'clAdaLossExponential':False,
        'clXGBoost':False,
        'clMultinomialNB':False,
        'clPerceptron':False,
        'clSGDClassifier':True,
        'clExtraTrees':True,
        'clnnSklearnMLP':True
    }

--------------------------------------------------------------------------------
/pySetup/sendMessages.py:
--------------------------------------------------------------------------------
import json

def printParent(text):
    messageObj = {
        'text': text,
        'type': 'console.log'
    }
    print json.dumps(messageObj)


def messageParent(messageText, type):
    messageObj = {
        'text': messageText,
        'type': type
    }
    print json.dumps(messageObj)


def obviousPrint(label, obj):
    printParent('#######################################################################################################################')
    printParent('#######################################################################################################################')
    printParent(label)
    printParent(obj)
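# A minimal sketch (illustrative, not part of the original file) of how a child
# training script is meant to use these helpers. utilsPyShell.js on the Node side
# logs any 'console.log' message as "snake says: ...", and treats the types in its
# expectedMessages list, such as 'trainingResults', as structured output:
#
#     from sendMessages import printParent, messageParent
#
#     printParent('starting the hyperparameter search')   # logged by the Node parent
#     messageParent(bestParams, 'trainingResults')        # consumed as a structured message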

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clKnn.py:
--------------------------------------------------------------------------------
def makeParams(X, y, globalArgs, dev, problemType):

    # TODO: knn breaks with sparse matrices. it consumes huge amounts of memory.
    # https://github.com/ClimbsRocks/machineJS/issues/74

    # leaf size only applies to ball or kd tree, so I'm not sure if we can include it in grid search or not
    parameters_to_try = {
        # 'algorithm': ['ball_tree','kd_tree','brute'],
        # 'weights': ['uniform','distance'],
        # 'leaf_size': [15,30,60,120],
        'n_neighbors': [2,5,10,25,100]
    }

    if dev:
        parameters_to_try.pop('n_neighbors', None)
        # parameters_to_try.pop('max_features', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/svcShrinking.py:
--------------------------------------------------------------------------------
import math

def makeParams(X, y, globalArgs, dev, problemType):

    # at some point in the future, when we have figured out whether we have a probability or a classification problem, we can set 'probability' equal to True only when we have a probability problem. that property just enables us to invoke predict_proba, but it slows down training time noticeably
    # an easy way to split this out would be to have one svm that is shrinking, and one that is not
    # shrinking is actually set for us in makeClassifiers.py, but we are keeping it here just to make it obvious
    parameters_to_try = {
        # 'shrinking': True,
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    }

    if dev:
        parameters_to_try.pop('C', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/svcFirstParameterMaker.py:
--------------------------------------------------------------------------------
import math

def makeParams(X, y, globalArgs, dev, problemType):

    # at some point in the future, when we have figured out whether we have a probability or a classification problem, we can set 'probability' equal to True only when we have a probability problem. that property just enables us to invoke predict_proba, but it slows down training time noticeably
    # an easy way to split this out would be to have one svm that is shrinking, and one that is not
    # shrinking is actually set for us in makeClassifiers.py, but we are keeping it here just to make it obvious
    parameters_to_try = {
        # 'shrinking': False,
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    }

    if dev:
        parameters_to_try.pop('C', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clLogisticRegression.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # Other parameters to try: tol, class_weight, penalty
    # I do not want to spend long optimizing logistic regressions, as we have other classifiers that are generally considered more effective across many different problem types.
    # TODO: break all of these out into their own classifiers- newton-cg, lbfgs, and liblinear
    # we are spending all the time training them as their own separate instances anyways (that's what gridsearch does), we might as well make use of that output for our creative ensembling
    parameters_to_try = {
        'C': scipy.stats.expon(.001,1),
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    }

    if dev:
        parameters_to_try.pop('C', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clnnSknn3Layer.py:
--------------------------------------------------------------------------------
def makeParams(X, y, globalArgs, dev, problemType):

    # TODO: figure out more interesting parameters to try
    # follow a similar pattern to what we did for brainjs, basing the number of nodes on the size of the input
    # test number of hidden layers
    # TODO: break out each type into its own classifier
    try:
        # if dense
        numFeatures = len(X[0])
    except:
        # if sparse
        numFeatures = X.shape[1]
    parameters_to_try = {
        'learning_rate': [0.001, 0.01],
        'hidden0__units': [ numFeatures / 2, numFeatures ],
        'hidden1__units': [ numFeatures / 2, numFeatures ],
        'hidden2__units': [ numFeatures / 2, numFeatures ]
    }

    if dev:
        parameters_to_try.pop('learning_rate', None)
        # parameters_to_try.pop('hidden0__units', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/randomizedSearchList.py:
--------------------------------------------------------------------------------
# this file simply holds a list of all the classifiers we have enabled RandomizedSearchCV for.
# if you would like to have more control over the process and use GridSearchCV, please modify this file to say False for the algorithm you want to run GridSearchCV on.

def rsList():
    return {
        'clnnSknn': False,
        'clnnNoLearn': False,
        'clKnn': False,
        'clSVCFirst': False,
        'clSVCShrinking': False,
        'clnnSknn3Layer': False,
        'clRfEntropy': True,
        'clLogisticRegression': True,
        'clXGBoost': True,
        'clRfGini': True,
        'clRfBootstrapTrue': True,
        'clAdaBoost': True,
        'clAdaLossLinear': True,
        'clAdaLossSquare': True,
        'clAdaLossExponential': True,
        'clMultinomialNB': True,
        'clPerceptron': True,
        'clSGDClassifier': True,
        'clExtraTrees': True,
        'clnnSklearnMLP': True
    }

--------------------------------------------------------------------------------
/test/regression/test.js:
--------------------------------------------------------------------------------
global.rTest = {};
var expect = require('chai').expect;
var mocha = require('mocha');
var path = require('path');
var fs = require('fs');

var makePredictions = require('./makePredictions');
var splitDataset = require('./splitDataset');
var trainAlgorithms = require('./trainAlgorithms');
var deleteRemnantsAndRunNewTest = require('./deleteRemnantsAndRunNewTest');


// this block will contain all the tests for the entire machineJS package
describe('regression problems', function() {
  // this timeout should be long enough to handle tests on a variety of machines. If you are getting a timeout error, consider bumping this up even more.
  this.timeout(600000);

  rTest.startTime = Date.now();

  before(deleteRemnantsAndRunNewTest);


  // TODO: run this separately for each type of problem we're solving (regression, category, then eventually multi-label, etc.)

  // setDefaultArgs();

  trainAlgorithms();

  makePredictions();

  splitDataset();

});

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clAdaLossAll.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # GridSearchCV parameters:
    # parameters_to_try = {
    #     'n_estimators': [5,50,150],
    #     'learning_rate': [.1, .3],
    #     'algorithm':['SAMME','SAMME.R']
    # }

    # RandomizedSearchCV parameters:
    # ideally, I think this would be a gamma distribution most likely.


    parameters_to_try = {
        "n_estimators": scipy.stats.randint(25,500),
        "learning_rate": scipy.stats.expon(.001, 2),
        # "loss": ['linear','square','exponential'],
        "algorithm": ['SAMME','SAMME.R']
    }

    if problemType not in ['category', 'multi-category']:
        try:
            parameters_to_try.pop('algorithm', None)
        except:
            pass

    if dev:
        parameters_to_try.pop('learning_rate', None)
        try:
            parameters_to_try.pop('algorithm', None)
        except:
            pass

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clnnSknn.py:
--------------------------------------------------------------------------------
def makeParams(X, y, globalArgs, dev, problemType):

    # guidance on params:
    # http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1

    # TODO: figure out more interesting parameters to try
    # follow a similar pattern to what we did for brainjs, basing the number of nodes on the size of the input
    # test number of hidden layers
    # TODO: break out each type into its own classifier
    try:
        # if dense
        numFeatures = len(X[0])
    except:
        # if sparse
        numFeatures = X.shape[1]

    parameters_to_try = {
        'learning_rate': [0.001, 0.01],
        'hidden0__units': [ numFeatures / 2, numFeatures ]
        # 'hidden1__units': [ numFeatures / 2, numFeatures, numFeatures * 3 ],
        # 'hidden2__units': [ numFeatures / 2, numFeatures, numFeatures * 3 ]
    }

    if dev:
        # parameters_to_try.pop('learning_rate', None)
        parameters_to_try['learning_rate'] = [.001,.01]
        parameters_to_try.pop('hidden0__units', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clnnNoLearn.py:
--------------------------------------------------------------------------------
def makeParams(X, y, globalArgs, dev, problemType):

    # guidance on params:
    # http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1

    # TODO: figure out more interesting parameters to try
    # follow a similar pattern to what we did for brainjs, basing the number of nodes on the size of the input
    # test number of hidden layers
    # TODO: break out each type into its own classifier
    try:
        # if dense
        numFeatures = len(X[0])
    except:
        # if sparse
        numFeatures = X.shape[1]

    parameters_to_try = {
        'learning_rate': [0.001, 0.01],
        'hidden0__units': [ numFeatures / 2, numFeatures ]
        # 'hidden1__units': [ numFeatures / 2, numFeatures, numFeatures * 3 ],
        # 'hidden2__units': [ numFeatures / 2, numFeatures, numFeatures * 3 ]
    }

    if dev:
        # parameters_to_try.pop('learning_rate', None)
        parameters_to_try['learning_rate'] = [.001,.01]
        parameters_to_try.pop('hidden0__units', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clAdaBoost.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):

    # GridSearchCV parameters:
    # parameters_to_try = {
    #     'n_estimators': [5,50,150],
    #     'learning_rate': [.1, .3],
    #     'algorithm':['SAMME','SAMME.R']
    # }

    # RandomizedSearchCV parameters:
    # ideally, I think this would be a gamma distribution most likely.


    parameters_to_try = {
        "n_estimators": scipy.stats.randint(25,500),
        "learning_rate": scipy.stats.expon(.001, 2),
        "loss": ['linear','square','exponential'],
        "algorithm": ['SAMME','SAMME.R']
    }

    if problemType not in ['category', 'multi-category']:
        try:
            parameters_to_try.pop('algorithm', None)
        except:
            pass
    else:
        parameters_to_try.pop('loss', None)


    if dev:
        parameters_to_try.pop('learning_rate', None)
        try:
            parameters_to_try.pop('algorithm', None)
        except:
            pass

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clRfBootstrapBoth.py:
--------------------------------------------------------------------------------
import math
import scipy.stats
import numpy as np

from sendMessages import printParent

def makeParams(X, y, globalArgs, dev, problemType):

    try:
        # if dense
        numColumns = len(X[0])
    except:
        # if sparse
        numColumns = X.shape[1]

    sqrtNum = int(math.sqrt(numColumns))
    # GridSearchCV parameters:

    # max_features_to_try = [sqrtNum + x for x in (-2,0,2)]
    # max_features_to_try.append('log2')
    # max_features_to_try.append(None)

    # parameters_to_try = {
    #     'max_features': max_features_to_try,
    #     'min_samples_leaf':[1,2,5,25,50,100,150]
    # }


    maxFeaturesList = np.random.lognormal(sqrtNum, 2, 10)
    # if using lognormal, check out this link:
    # http://stackoverflow.com/questions/12937824/lognormal-random-numbers-centered-around-a-high-value
    # 'max_features': scipy.stats.lognorm([sqrtNum/5], int(sqrtNum)),

    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'max_features': scipy.stats.randint(1,numColumns),
        'min_samples_leaf': scipy.stats.randint(1,200),
        'min_samples_split': scipy.stats.randint(2,20),
    }

    if dev:
        parameters_to_try.pop('min_samples_leaf', None)
        parameters_to_try.pop('max_features', None)
        parameters_to_try['max_features'] = [sqrtNum, 'log2']

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/parameterMakers/rfGiniParamMaker.py:
--------------------------------------------------------------------------------
import math
import scipy.stats
import numpy as np

from sendMessages import printParent

def makeParams(X, y, globalArgs, dev, problemType):

    try:
        # if dense
        numColumns = len(X[0])
    except:
        # if sparse
        numColumns = X.shape[1]

    sqrtNum = int(math.sqrt(numColumns))
    # GridSearchCV parameters:

    # max_features_to_try = [sqrtNum + x for x in (-2,0,2)]
    # max_features_to_try.append('log2')
    # max_features_to_try.append(None)

    # parameters_to_try = {
    #     'max_features': max_features_to_try,
    #     'min_samples_leaf':[1,2,5,25,50,100,150]
    # }


    maxFeaturesList = np.random.lognormal(sqrtNum, 2, 10)
    # if using lognormal, check out this link:
    # http://stackoverflow.com/questions/12937824/lognormal-random-numbers-centered-around-a-high-value
    # 'max_features': scipy.stats.lognorm([sqrtNum/5], int(sqrtNum)),

    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'max_features': scipy.stats.randint(1,numColumns),
        'min_samples_leaf': scipy.stats.randint(1,200),
        'min_samples_split': scipy.stats.randint(2,20),
        'bootstrap': [True,False]
    }

    if dev:
        parameters_to_try.pop('min_samples_leaf', None)
        parameters_to_try.pop('max_features', None)
        parameters_to_try['max_features'] = [sqrtNum, 'log2']

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/stepsToAddNewClassifier.txt:
--------------------------------------------------------------------------------
Before each step: copy the exact classifier name to your clipboard so we can be consistent.

1. classifierList.js- just add the name, in either universal, classifierOnly, or regressionOnly
2. parameterMakers/classifierName: create the parameters, and make them available through a function called makeParams that returns a dict that can be passed directly into the classifier.
3. paramMakers.py: add the new file as a module to be imported and as a part of the returned dict. make sure to add a comma in between properties :)
4. makeClassifiers.py: add the instantiated classifier here with the parameters it should have. make sure to add a comma in between properties :)
5. parameterMakers folder: if this is just a new split of the same algorithm (rf with gini, and rf with entropy), go back to parameterMakers/classifierName and make sure the new one is mutually exclusive with the old one
6. randomizedSearchList.py: add in this classifier, and whether it supports RandomizedSearchCV (it should- training goes much faster that way!)
7. extendedTrainingList.py: state whether there is an extended training version of this classifier available. This step is somewhat redundant, but makes it clear that a classifier not being in makeBigClassifiers.py is intentional
8. makeBigClassifiers.py: add in a 'larger' version of the classifier- typically by bumping up n_estimators dramatically
9. Update test suites to support this new algorithm
    A. For each problemType that is supported:
        a) add it in as an expected file
        b) run the test dataset against this algorithm, to find its expected error rates
        c) create a new test expecting the error rate to be roughly that
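As a concrete illustration of step 2, a parameter maker for a hypothetical new classifier (the name clGradientBoosting and the exact ranges below are invented for this example) would follow the same shape as the existing files in parameterMakers/:

    import scipy.stats

    def makeParams(X, y, globalArgs, dev, problemType):

        # distributions rather than fixed lists, so RandomizedSearchCV can sample from them
        parameters_to_try = {
            'n_estimators': scipy.stats.randint(25,500),
            'learning_rate': scipy.stats.expon(.001, 2)
        }

        if dev:
            parameters_to_try.pop('n_estimators', None)

        return parameters_to_try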

--------------------------------------------------------------------------------
/pySetup/makeBigClassifiers.py:
--------------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor


def makeAll(globalArgs, dev, problemType):
    estimator_count=200
    if dev:
        estimator_count=120

    iterationCount=20
    if dev:
        iterationCount=2

    if problemType == 'category' or problemType == 'multi-category':

        return {
            'clRfGini': RandomForestClassifier(n_estimators=estimator_count, n_jobs=-1, criterion='gini'),
            'clRfBootstrapTrue': RandomForestClassifier(n_estimators=estimator_count, n_jobs=-1, bootstrap=True),
            'clRfEntropy': RandomForestClassifier(n_estimators=estimator_count, n_jobs=-1, criterion='entropy'),
            'clSGDClassifier': SGDClassifier(n_iter=iterationCount),
            'clExtraTrees': ExtraTreesClassifier(n_estimators=estimator_count, n_jobs=-1),
            'clnnSklearnMLP': MLPClassifier(max_iter=iterationCount*20)
        }

    else:

        return {
            'clRfGini': RandomForestRegressor(n_estimators=estimator_count, n_jobs=-1),
            'clRfBootstrapTrue': RandomForestRegressor(n_estimators=estimator_count, n_jobs=-1, bootstrap=True),
            'clRfEntropy': RandomForestRegressor(n_estimators=estimator_count, n_jobs=-1),
            'clExtraTrees': ExtraTreesRegressor(n_estimators=estimator_count, n_jobs=-1),
        }

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clExtraTrees.py:
--------------------------------------------------------------------------------
import math
import scipy.stats
import numpy as np

from sendMessages import printParent

def makeParams(X, y, globalArgs, dev, problemType):

    try:
        # if dense
        numColumns = len(X[0])
    except:
        # if sparse
        numColumns = X.shape[1]

    sqrtNum = int(math.sqrt(numColumns))
    # GridSearchCV parameters:

    # max_features_to_try = [sqrtNum + x for x in (-2,0,2)]
    # max_features_to_try.append('log2')
    # max_features_to_try.append(None)

    # parameters_to_try = {
    #     'max_features': max_features_to_try,
    #     'min_samples_leaf':[1,2,5,25,50,100,150]
    # }


    maxFeaturesList = np.random.lognormal(sqrtNum, 2, 10)
    # if using lognormal, check out this link:
    # http://stackoverflow.com/questions/12937824/lognormal-random-numbers-centered-around-a-high-value
    # 'max_features': scipy.stats.lognorm([sqrtNum/5], int(sqrtNum)),

    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'criterion': ['gini','entropy'],
        'max_features': scipy.stats.randint(1,numColumns),
        'min_samples_split': scipy.stats.randint(2,20),
        'min_samples_leaf': scipy.stats.randint(1,100),
        'bootstrap': [True,False]
    }

    if problemType not in ['category', 'multi-category']:
        parameters_to_try.pop('criterion', None)

    # if dev:
    #     parameters_to_try.pop('min_samples_leaf', None)
    #     parameters_to_try.pop('max_features', None)
    #     parameters_to_try['max_features'] = [sqrtNum, 'log2']

    return parameters_to_try

--------------------------------------------------------------------------------
/machineJS.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node

// (function() {
module.exports = function(argsObj) {
  if(argsObj !== undefined) {
    for(var key in argsObj) {
      global.argv[key] = argsObj[key];
    }
  }

  // if( global.argv.validationRound ) {
  //   console.log('global.argv before processArgs in machineJS validationRound');
  //   console.log(global.argv);
  // }

  if( argv.validationRound !== true ) {
    processArgs();
  } else {
    argv.ensemblerArgs.validationRound = false;
  }

  if (argv.devEnsemble || argv.ensemble) {
    ensembler.createEnsemble( argv.ensemblerArgs );
  } else if( argv.makePredictions ) {
    controllerPython.makeAllPredictions( argv.makePredictions );
  } else {
    controllerPython.startTraining(argv);
  }

  shutDown(controllerPython);

};

var path = require('path');
global.rootDir = path.dirname(__filename);
global.argv = {};

var controllerPython = require('./pySetup/controllerPython.js');
var shutDown = require('./shutDown.js');
var processArgs = require('./processArgs.js');

var ensembler = require('ensembler');

console.log('thanks for inviting us along on your machine learning journey!\n');


// allow the module to be invoked from the command line
// since this is all wrapped in an IIFE, this if statement will execute and check if machineJS was invoked from another module, or without a parent (from the command line)
if( !module.parent ) {
  var userArgs = require('minimist')(process.argv.slice(1));
  for( var key in userArgs ) {
    global.argv[key] = userArgs[key];
  }

  module.exports();
}

// })();

--------------------------------------------------------------------------------
/shutDown.js:
--------------------------------------------------------------------------------
var exec = require('child_process').execSync;

module.exports = function(controllerPython) {


  // kills off all the child processes if the parent process faces an uncaught exception and crashes.
  // this prevents you from having zombie child processes running indefinitely.
  // lifted directly from: https://www.exratione.com/2013/05/die-child-process-die/
  // This is a somewhat ugly approach, but it has the advantage of working
  // in conjunction with most of what third parties might choose to do with
  // uncaughtException listeners, while preserving whatever the exception is.
  process.once("uncaughtException", function (error) {
    // If this was the last of the listeners, then shut down the child and rethrow.
    // Our assumption here is that any other code listening for an uncaught
    // exception is going to do the sensible thing and call process.exit().
    if (process.listeners("uncaughtException").length === 0) {
      console.log('we heard an unexpected shutdown event that is causing everything to close');
      controllerPython.killAll();
      throw error;
    }
  });

  if (process.platform === "win32") {
    var rl = require("readline").createInterface({
      input: process.stdin,
      output: process.stdout
    });

    rl.on("SIGINT", function () {
      process.emit("SIGINT");
    });
  }

  process.on("SIGINT", function () {
    // graceful shutdown
    console.log('heard sigint in machineJS');
    controllerPython.killAll();

    // if we hear a Ctrl + c, we can safely assume the user wants to exit.
    // exec('pkill -9 node');
    process.exit();
  });

  process.on("killAll", function() {
    controllerPython.killAll();
    process.exit();

  });

};

--------------------------------------------------------------------------------
/test/regression/trainAlgorithms.js:
--------------------------------------------------------------------------------
var expect = require('chai').expect;
var mocha = require('mocha');
var fs = require('fs');
var path = require('path');
var rTest = global.rTest;

module.exports = function() {

  describe('training and tuning algorithms', function() {

    it('should successfully train one instance of all algorithms in classifierList, for this problemType', function() {

      var classifierList = require(path.join(rTest.mjsLocation, 'pySetup', 'classifierList'));
      classifierList = Object.keys(classifierList.longDataSet);
      var trainedAlgos = fs.readdirSync(rTest.bestClassifiersTestLocation);

      function verifyAllClassifiersTrained() {
        var foundClassifiers = [];
        // for each classifier we expected to train:
        for( var i = 0; i < classifierList.length; i++ ) {

          // compare it against the ones that finished:
          for( var j = 0; j < trainedAlgos.length; j++ ){
            if( trainedAlgos[j].indexOf( classifierList[i] ) !== -1 ) {
              foundClassifiers.push(classifierList[i]);
            }
          }
        }

        // return the list of classifiers we found; if training succeeded, it will match classifierList exactly
        return foundClassifiers;
      }

      expect( verifyAllClassifiersTrained() ).to.deep.equal(classifierList);
    });

    // it('should write each algorithm to a file in the correct directory', function() {

    // });

    // it('should name the trained algorithm file after the algorithm\'s name', function() {

    // });

    // it('should have acceptably low error rates for each algorithm', function() {
    //   // TODO: probably break this out into separate tests, one for each algo. that way it will be easier to find which one failed.
    // });


  });

}

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clnnSklearnMLP.py:
--------------------------------------------------------------------------------
import math
import numpy as np
import scipy.stats

def makeParams(X, y, globalArgs, dev, problemType):

    # guidance on params:
    # http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1



    try:
        # if dense
        numFeatures = len(X[0])
        XLength = len(X)
    except:
        # if sparse
        numFeatures = X.shape[1]
        XLength = X.shape[0]

    if XLength < 10000:
        # according to the docs, l-bfgs performs best for small datasets (thousands of items)
        algorithmType = 'l-bfgs'
    else:
        # adam is optimized stochastic gradient descent which performs well for large datasets
        algorithmType = 'adam'

    # we want to test up to 10 layers, with anywhere from .5 * numFeatures to 10*numFeatures per layer
    hiddenLayers = []

    # for each hidden layer, we will have these numbers * numFeatures number of nodes
    nodeMultipliers = [.5,1,2,3,5,10]

    # we will create options for each number of hidden layers listed below
    hiddenLayerMultipliers = [1,2,3,5,10]

    for hlNum in hiddenLayerMultipliers:
        for nodeNum in nodeMultipliers:
            # int() keeps layer sizes whole when the multiplier is fractional
            hiddenLayers.append( [ int(nodeNum * numFeatures) for x in range(hlNum) ])


    # parameters we are not searching currently:
    # tol
    # shuffle
    # batch_size

    parameters_to_try = {
        'hidden_layer_sizes': hiddenLayers,
        'activation': ['logistic','tanh','relu'],
        'algorithm': [algorithmType],
        'alpha': scipy.stats.expon(.00001,.001),
        'learning_rate': ['constant','invscaling','adaptive'],
        'learning_rate_init': scipy.stats.expon(.01,.0001),
        'early_stopping': [True],
        'validation_fraction': np.random.uniform(0.8,1,1000),
        'epsilon': scipy.stats.expon( math.pow(10,-7), math.pow(10,-9))
    }

    return parameters_to_try

--------------------------------------------------------------------------------
/test/regression/deleteRemnantsAndRunNewTest.js:
--------------------------------------------------------------------------------
var path = require('path');
var execSync = require('child_process').execSync;
var rimraf = require('rimraf');
var fs = require('fs');

// set default values
// since node.js executes this code as soon as the file is loaded in, these lines will run every time, regardless of whether we invoke the module.exports function or not
global.rTest.testFileLocation = path.dirname(__filename);
global.rTest.mjsLocation = path.join(rTest.testFileLocation, '..','..');
global.rTest.dataLocation = path.join(rTest.mjsLocation,'node_modules','data-for-tests','rossman');

rTest.dfTestResultsLocation = path.join(rTest.testFileLocation, 'dfTestResults');
rTest.rTestPredictionsLocation = path.join(rTest.testFileLocation, 'rTestPredictions');
rTest.bestClassifiersTestLocation = path.join(rTest.testFileLocation, 'bestClassifiersTest');
rTest.validationIndicesLocation = path.join(rTest.dataLocation, 'dfValidationIndicesrossmantest.pkl');


module.exports = function() {
  try {
    // remove any folders we might have created when running the test suite previously
    // rimraf is `rm -rf` for node
    rimraf.sync(rTest.dfTestResultsLocation);
    rimraf.sync(rTest.rTestPredictionsLocation);
    rimraf.sync(rTest.bestClassifiersTestLocation);
    fs.unlinkSync(rTest.validationIndicesLocation);
  } catch(err) {
    // do nothing! There is nothing to delete
  }

  // to see detailed output while running the tests, use node-inspector.
  // npm install -g node-inspector
  // change "node" below to be "node-debug"
  execSync('node machineJS.js '
    + path.join(rTest.dataLocation,'tinyTrain.csv')
    + ' --predict ' + path.join(rTest.dataLocation,'test.csv')
    + ' --join ' + path.join(rTest.dataLocation, 'store.csv')
    + ' --dfOutputFolder ' + path.join(rTest.testFileLocation, 'dfTestResults')
    + ' --predictionsFolder ' + path.join(rTest.testFileLocation, 'rTestPredictions')
    + ' --ensemblerOutputFolder ' + rTest.testFileLocation
    + ' --bestClassifiersFolder ' + path.join(rTest.testFileLocation, 'bestClassifiersTest')
  );
};

--------------------------------------------------------------------------------
/test/regression/splitDataset.js:
--------------------------------------------------------------------------------
var expect = require('chai').expect;
var mocha = require('mocha');
var fs = require('fs');
var path = require('path');
var execSync = require('child_process').execSync;
var csv = require('csv');

module.exports = function() {

  describe('splitting the formatted dataset', function() {

    it('should create a new validation split when we do not have one already', function() {
      var fileStats = fs.statSync(path.join(rTest.dataLocation, 'dfValidationIndicesrossmantest.pkl'));
      expect( new Date(fileStats.ctime) ).to.be.above(rTest.startTime);
    });


    it('should copy the validation dataset to the validation folder in predictions', function(done) {
      fs.readFile(path.join(rTest.rTestPredictionsLocation, 'validation','validationData.npz'), function(err, data) {
        console.log('err',err);
        expect(err).to.be.null;
        done();
      });
    });


    it('should copy the validation IDs and Y to the validation folder in predictions', function(done) {
      var validationFilePath = path.join(rTest.rTestPredictionsLocation, 'validation','validationIDsAndY.csv');
      fs.readFile(validationFilePath, function(err, data) {

        expect(err).to.be.null;

        data = csv.parse(data.toString('utf8'), function(err, output) {

          expect(output.length).to.be.within(51000 - 200, 51000 + 200);
          done();

        });
      });
    });


    it('should use the existing validation split if it already exists', function() {
      var secondStartTime = Date.now();
      // TODO: try running the test again, and
      execSync('node machineJS.js '
        + path.join(rTest.dataLocation,'tinyTrain.csv')
        + ' --predict ' + path.join(rTest.dataLocation,'test.csv')
        + ' --join ' + path.join(rTest.dataLocation, 'store.csv')
        + ' --dfOutputFolder ' + path.join(rTest.testFileLocation, 'dfTestResults')
        + ' --predictionsFolder ' + path.join(rTest.testFileLocation, 'rTestPredictions')
        + ' --ensemblerOutputFolder ' + rTest.testFileLocation
        + ' --bestClassifiersFolder ' + path.join(rTest.testFileLocation, 'bestClassifiersTest')
        + ' --splitDataTest true'
      );

      var fileStats = fs.statSync(path.join(rTest.dataLocation, 'dfValidationIndicesrossmantest.pkl'));
      expect( new Date(fileStats.ctime) ).to.be.below(secondStartTime);

    });

  });

}

--------------------------------------------------------------------------------
/pySetup/utilsPyShell.js:
--------------------------------------------------------------------------------
var py = global.pythonNamespace;

var path = require('path');
var PythonShell = require('python-shell');

var pySetupLocation = py.pySetupLocation;

module.exports = {

  // these are messages we expect to get from our python shell.
  // anything not in this list is likely an error.
  expectedMessages: {
    dictVectMapping: true,
    fileNames: true,
    trainingResults: true,
    splitFileNames: true
  },

  attachLogListener: function(referenceToShell) {
    referenceToShell.on('message', function(message) {
      if(message.type === 'console.log') {
        console.log('snake says:',message.text);
      }
      else if ( !module.exports.expectedMessages[ message.type ] ){
        console.log('heard a message:',message);
      }
    });
  },

  generatePythonOptions: function(fileNameFromRoot, otherArgs) {
    // the first argument for all python shells is going to be a path to a file, relative to the root of machineJS
    var fullPathToFile = path.join(global.rootDir, fileNameFromRoot);
    var args = [];
    args = args.concat(fullPathToFile, otherArgs);

    var pySetupLocation = path.join(argv.machineJSLocation, 'pySetup');

    return {
      scriptPath: pySetupLocation,
      args: args,
      mode:'json'
    };
  },

  startPythonShell: function(scriptName, callback, pythonOptions) {
    var pyShell = PythonShell.run(scriptName, pythonOptions, function (err, results) {
      if (err) {
        // TODO: add in logging of the error message if verbosity is set to a higher level
        // right now we get error messages for a bunch of things the user should not concern themselves with, including:
        // deprecation warnings (we're optionally using a pre-release version of sklearn; we'll refactor to take care of those deprecation warnings once they're merged into an officially released version)
        // searches that fail to converge
        // to avoid distracting the user, we're only logging error messages with an exit code that is not 0, meaning that the process failed to finish executing
        if( err.exitCode !== 0 ) {
          console.error(err);
        } else {
          callback();
        }
      } else {
        console.log('successfully finished running',scriptName + '!');
        callback();

      }
    });

    module.exports.attachLogListener(pyShell);
    py.referencesToChildren.push(pyShell);

    return pyShell;
  }

}
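// A minimal sketch (illustrative, not part of the original module) of how a caller
// such as controllerPython.js presumably drives these helpers; argv.dataFile is a
// hypothetical argument name used only for this example:
//
//   var utils = require('./utilsPyShell.js');
//   var pyOptions = utils.generatePythonOptions(argv.dataFile, [JSON.stringify(argv)]);
//   utils.startPythonShell('training.py', function() {
//     console.log('training round finished');
//   }, pyOptions);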

--------------------------------------------------------------------------------
/pySetup/parameterMakers/paramMakers.py:
--------------------------------------------------------------------------------
# ok, unfortunately, this is how it probably has to work:
# 1. we manually (hard code the names in here) import all the individual parameterMaker files here
# 2. we have a master calculateParams function

# 3. into that function we will pass X and y
# 4. that function will then go off and invoke all the individual parameterMaker functions, saving their results into a dictionary with keys that mirror 'clRandomForest'
# 5. that function will then return the dictionary
# 6. then back in training.py we can look up the classifierName within that dictionary to get the parameters

import rfGiniParamMaker
import rfEntropyParamMaker
import svcFirstParameterMaker
import svcShrinking
import clnnSknn
import clnnSknn3Layer
import clKnn
import clLogisticRegression
import clAdaBoost
import clXGBoost
import clRfBootstrapBoth
import clAdaLossAll
import clMultinomialNB
import clPerceptron
import clSGDClassifier
import clExtraTrees
import clnnSklearnMLP
from sendMessages import printParent

def makeAll(X,y,globalArgs, dev, problemType):
    returnDict = {
        'clRfGini':rfGiniParamMaker.makeParams(X,y,globalArgs, dev, problemType),
        'clRfEntropy':rfEntropyParamMaker.makeParams(X,y,globalArgs, dev, problemType),
        'clSVCFirst':svcFirstParameterMaker.makeParams(X,y,globalArgs, dev, problemType),
        'clSVCShrinking':svcShrinking.makeParams(X,y,globalArgs, dev, problemType),
        'clKnn':clKnn.makeParams(X,y,globalArgs, dev, problemType),
        'clLogisticRegression':clLogisticRegression.makeParams(X,y,globalArgs, dev, problemType),
        'clnnSknn3Layer':clnnSknn3Layer.makeParams(X,y,globalArgs, dev, problemType),
        'clnnSknn':clnnSknn.makeParams(X,y,globalArgs, dev, problemType),
        'clAdaBoost':clAdaBoost.makeParams(X,y,globalArgs, dev, problemType),
        'clAdaLossLinear':clAdaLossAll.makeParams(X,y,globalArgs, dev, problemType),
        'clAdaLossSquare':clAdaLossAll.makeParams(X,y,globalArgs, dev, problemType),
        'clAdaLossExponential':clAdaLossAll.makeParams(X,y,globalArgs, dev, problemType),
        'clXGBoost':clXGBoost.makeParams(X,y,globalArgs, dev, problemType),
        'clRfBootstrapTrue': clRfBootstrapBoth.makeParams(X,y,globalArgs, dev, problemType),
        'clMultinomialNB': clMultinomialNB.makeParams(X,y,globalArgs,dev,problemType),
        'clPerceptron': clPerceptron.makeParams(X,y,globalArgs,dev,problemType),
        'clSGDClassifier': clSGDClassifier.makeParams(X,y,globalArgs,dev,problemType),
        'clExtraTrees': clExtraTrees.makeParams(X,y,globalArgs,dev,problemType),
        'clnnSklearnMLP': clnnSklearnMLP.makeParams(X,y,globalArgs,dev,problemType)
    }
    return returnDict
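# Illustrative sketch (not part of the original file) of the lookup described in
# the numbered comments above, as training.py presumably performs it:
#
#     import paramMakers
#
#     allParams = paramMakers.makeAll(X, y, globalArgs, dev, problemType)
#     parameters_to_try = allParams[classifierName]   # e.g. allParams['clRfGini']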

--------------------------------------------------------------------------------
/pySetup/parameterMakers/clXGBoost.py:
--------------------------------------------------------------------------------
import scipy.stats
import numpy as np

def makeParams(X, y, globalArgs, dev, problemType):



    # great overall walkthrough of xgb. probably the best starting point.
    # http://www.slideshare.net/ShangxuanZhang/kaggle-winning-solution-xgboost-algorithm-let-us-learn-from-its-author


    # list of params that can be tuned:
    # https://www.kaggle.com/forums/f/15/kaggle-forum/t/17120/how-to-tuning-xgboost-in-an-efficient-way


    # discussion by xgb library itself:
    # https://github.com/dmlc/xgboost/blob/master/doc/param_tuning.md

    # other parameters to investigate that might only exist in the sklearn implementation:
    # learning_rate
    # n_estimators (i have a feeling this is num_boost_round)
    # subsample
    # max_features

    # Other params to investigate:
    # Split on:
    # booster [default=gbtree]
    # which booster to use, can be gbtree or gblinear. gbtree uses a tree based model while gblinear uses a linear function.

    # param_space = {'max_depth': [2,4,6,8,10], 'n_estimators': [200,300,400,500,600,700,800], 'learning_rate' : uniform(loc=0.001,scale=0.2), 'subsample': uniform(loc=0.6,scale=0.39), 'colsample_bytree':uniform(loc=0.6,scale=0.39), }

    # param_dist = {
    #     'max_depth': randint(2, 8),
    #     'gamma': uniform(0.2, 0.6),
    #     'subsample': beta(10, 1),
    # }
    # and then do a randomized grid search like this

    # clf = xgb.XGBClassifier(n_estimators = 20)
    # n_iter_search = 100
    # random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, scoring='roc_auc', verbose=10)
    # random_search.fit(X_train, y_train)



    # {'max_depth': [2,4,6],
    # 'n_estimators': [50,100,200]}

    # official docs:
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

    # samuel reuther had a good reply:
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/13910/xgboost-parameter-tuning

    # forum that talks about specific numbers:
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/12947/achieve-0-50776-on-the-leaderboard-in-a-minute-with-xgboost/76028

    # slide 12 has exact param recommendations:
    # http://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1





    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'max_depth': scipy.stats.randint(1,150),
        'subsample': np.random.uniform(.80,1,1000),
        'colsample_bytree': np.random.uniform(.80,1,1000)
    }

    # TODO: create two separate XGBoosts, one for gbtree and one for gblinear
    # 'booster': ['gbtree','gblinear']

    if dev:
        parameters_to_try.pop('subsample', None)
        parameters_to_try.pop('colsample_bytree', None)
        # parameters_to_try.pop('num_round', None)
        # parameters_to_try.pop('eta', None)

    return parameters_to_try

--------------------------------------------------------------------------------
/pySetup/makeClassifiers.py:
--------------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

import xgboost
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor

# other splits for SVC kernel:
# linear, poly, rbf, sigmoid, precomputed

def makeClassifiers(globalArgs, dev, problemType):

    n_iter=10
    n_estimators=20
    if(dev):
        n_iter=2
        n_estimators=5

    if problemType == 'category' or problemType == 'multi-category':
        return {
            'clRfGini': RandomForestClassifier(n_estimators=n_estimators, n_jobs=1, criterion='gini'),
            'clRfBootstrapTrue': RandomForestClassifier(n_estimators=n_estimators, n_jobs=1, bootstrap=True),
            'clRfEntropy': RandomForestClassifier(n_estimators=n_estimators, n_jobs=1, criterion='entropy'),
            'clSVCFirst': SVC(probability=True, shrinking=False),
            'clSVCShrinking': SVC(probability=True, shrinking=True),
            'clKnn': KNeighborsClassifier(),
            'clLogisticRegression': LogisticRegression(penalty='l2', dual=False, max_iter=100, warm_start=True),
            'clAdaBoost': AdaBoostClassifier(),
            'clXGBoost': xgboost.XGBClassifier(),
            'clMultinomialNB': MultinomialNB(),
            'clPerceptron': Perceptron(),
            'clSGDClassifier': SGDClassifier(n_iter=n_iter),
            'clExtraTrees': ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=1),
            'clnnSklearnMLP': MLPClassifier(),
        }

    # Regression models
    else:
        return {
            'clRfGini': RandomForestRegressor(n_estimators=n_estimators, n_jobs=1),
            'clRfBootstrapTrue': RandomForestRegressor(n_estimators=n_estimators, n_jobs=1, bootstrap=True),
            # 'clRfEntropy': RandomForestRegressor(n_estimators=n_estimators, n_jobs=1, criterion='entropy'),
            'clSVCFirst': SVR(shrinking=False),
            'clSVCShrinking': SVR(shrinking=True),
            'clKnn': KNeighborsRegressor(),
            'clLogisticRegression': LinearRegression(),
            'clAdaBoost': AdaBoostRegressor(),
            'clAdaLossLinear': AdaBoostRegressor(loss='linear'),
            'clAdaLossSquare': AdaBoostRegressor(loss='square'),
            'clAdaLossExponential': AdaBoostRegressor(loss='exponential'),
            'clXGBoost': xgboost.XGBRegressor(),
            'clExtraTrees': ExtraTreesRegressor(n_estimators=n_estimators, n_jobs=1),
        }
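# Illustrative sketch only: roughly how a classifier from this dict and its matching
# parameter dict from paramMakers.py presumably get wired together during search.
# (RandomizedSearchCV lived in sklearn.grid_search in the sklearn versions of this era;
# the n_iter value below is an assumption, not taken from the repo.)
#
#     from sklearn.grid_search import RandomizedSearchCV
#
#     classifiers = makeClassifiers(globalArgs, dev, problemType)
#     searchCV = RandomizedSearchCV(classifiers['clRfGini'], allParams['clRfGini'], n_iter=10, n_jobs=-1)
#     searchCV.fit(X, y)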

--------------------------------------------------------------------------------
/pySetup/parameterMakers/rfEntropyParamMaker.py:
--------------------------------------------------------------------------------
import math
import scipy.stats
import numpy as np

from sendMessages import printParent

def makeParams(X, y, globalArgs, dev, problemType):

    try:
        # if dense
        numColumns = len(X[0])
    except:
        # if sparse
        numColumns = X.shape[1]

    sqrtNum = int(math.sqrt(numColumns))
    # GridSearchCV parameters:

    # max_features_to_try = [sqrtNum + x for x in (-2,0,2)]
    # max_features_to_try.append('log2')
    # max_features_to_try.append(None)

    # parameters_to_try = {
    #     'max_features': max_features_to_try,
    #     'min_samples_leaf':[1,2,5,25,50,100,150]
    # }


    maxFeaturesList = np.random.lognormal(sqrtNum, 2, 10)
    # if using lognormal, check out this link:
    # http://stackoverflow.com/questions/12937824/lognormal-random-numbers-centered-around-a-high-value
    # 'max_features': scipy.stats.lognorm([sqrtNum/5], int(sqrtNum)),

    # RandomizedSearchCV parameters:
    parameters_to_try = {
        'max_features': scipy.stats.randint(1,numColumns),
        'min_samples_leaf': scipy.stats.randint(1,200),
        'min_samples_split': scipy.stats.randint(2,20),
        'bootstrap': [True,False]
    }

    if dev:
        parameters_to_try.pop('min_samples_leaf', None)
        parameters_to_try.pop('max_features', None)
        parameters_to_try['max_features'] = [sqrtNum, 'log2']

    return parameters_to_try

'''
determine which parameters we want to mess with
https://www.kaggle.com/forums/f/15/kaggle-forum/t/4092/how-to-tune-rf-parameters-in-practice
A. M-Try (number of features it tries at each decision point in a tree). Starts at square root of features available, but tweak it up and down by a few (probably no more than 3 in each direction; it seems even 1 or 2 is enough)
B. Number of folds for cross-validation: 10 is what most people use, but more gives you better accuracy (likely at the cost of compute time). again, returns are pretty rapidly diminishing.
C. platt scaling of the results to increase overall accuracy at the cost of outliers (which sounds perfect for an ensemble)
D. preprocessing the data might help- FUTURE
E. Principal Component Analysis to decrease dependence between features
F. Number of trees
G. Possibly ensemble different random forests together. this is where the creative ensembling comes into play!
H. Splitting criteria
I. AdaBoost
J. Can bump up nodesize as much as possible to decrease training time (split)
    consider doing this first, finding what node size we finally start decreasing accuracy on, then use that node size for the rest of the testing we do, then possibly bumping it down a bit again at the end.
    https://www.kaggle.com/c/the-analytics-edge-mit-15-071x/forums/t/7890/node-size-in-random-forest
K. min_samples_leaf- smaller leaf makes you more prone to capturing noise from the training data. Try for at least 50??
    http://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
L. random_state: adds reliability. Would be a good one to split on if ensembling different RFs together.
M. oob_score: something about intelligent cross-validation.
N. allusions to regularization, or what I think they mean- feature selection.

'''

--------------------------------------------------------------------------------
/advancedAPI.md:
--------------------------------------------------------------------------------
# Advanced Options:

If you've done ML before, `machineJS` is incredibly useful in that it already puts in place most of the structure for all the parts of the process you have to repeat each time.

As much as possible, we've tried to avoid hardcoding in values, instead allowing the user to pass in values, or setting default values if the user doesn't pass in any arguments.

To get the best idea of all the options available to you, please check out `processArgs.js`, where we set many of the [default values](https://github.com/ClimbsRocks/machineJS/blob/master/processArgs.js) that you'd want to modify. In the meantime, here are some of the more widely used options.

- `--alreadyFormatted`: A boolean value 'true' or 'false', noting if your data has already been formatted. Useful if you're just tweaking parameters and don't want to repeat the oftentimes time-expensive data formatting process again. If you pass in this flag, make sure your files are included in the `pySetup/testingFileNames.js` json list. I've included a couple of examples. You can get the fileNames from `pySetup/utils.js`, inside of the formatData function. Just copy-paste the fileNames obj data-formatter gives to the callback into the `testingFileNames.js` file, giving it a property of whatever the 'outputFileName' property is. You should be able to pick up the pattern pretty easily :)
- `--join`: a path to a data file that will be joined in with your training and testing data, in the same way you'd join SQL tables.
- `--predict`: see above (Format of Prediction File).
- `--dev`: This flag indicates that you are doing engineering work on machineJS itself. It does things like:
    a) set the number of rounds to a third of what it normally is
    b) assume we already have data formatted
    c) if no data is passed in, automatically use the kaggleGiveCredit.csv dataset
- `--devKaggle`: Does all the same things as `--dev`, but also runs `--predict` on the default dataset kaggleGiveCreditTest.csv
- `--devEnsemble`: Assumes that we already have predictions made for us by the rest of the module and present in predictions/*.csv. Allows you to focus on assembling your ensemble without having to retrain the models each time :)
- `--dfOutputFolder`: if, for some reason, you want the results of `data-formatter` written to a different directory. We use this for the test suite, but it probably isn't useful for much other than that.
- `--ensemblerOutputFolder`: much like the `dfOutputFolder` option above, you can choose to overwrite the default location for the output results. Used in our test suite, but probably not useful for many other cases.
- `--bestClassifiersFolder`: much like the `dfOutputFolder` option above, you can choose to overwrite the default location for the bestClassifier. Used in our test suite, but probably not useful for many other cases.
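As an example, several of these flags compose naturally on the command line; the same pattern appears throughout the `package.json` scripts (the file paths here are placeholders):

    node machineJS.js data/train.csv --predict data/test.csv --join data/store.csv --alreadyFormatted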
It does things like:
13 | a) set the number of rounds to a third of what it normally is
14 | b) assume we already have data formatted
15 | c) if no data is passed in, automatically use the kaggleGiveCredit.csv dataset
16 | - `--devKaggle`: Does all the same things as `--dev`, but also runs `--predict` on the default dataset kaggleGiveCreditTest.csv
17 | - `--devEnsemble`: Assumes that we already have predictions made for us by the rest of the module and present in predictions/*.csv. Allows you to focus on assembling your ensemble without having to retrain the models each time :)
18 | - `--dfOutputFolder`: if, for some reason, you want the results of `data-formatter` written to a different directory. We use this for the test suite, but it probably isn't useful for much other than that.
19 | - `--ensemblerOutputFolder`: much like the `dfOutputFolder` option above, you can choose to override the default location for the output results. Used in our test suite, but probably not useful for many other cases.
20 | - `--bestClassifiersFolder`: much like the `dfOutputFolder` option above, you can choose to override the default location for the bestClassifier. Used in our test suite, but probably not useful for many other cases.
21 | 
22 | ### Validation Splits
23 | The `ensembler` module, which uses machine learning to aggregate together all the results of each trained algorithm, will always benefit from more information, and thus, more trained algorithms.
24 | 
25 | To support this, we use a consistent validation data split for a given test.csv dataset. This means you can change your training.csv data (new feature engineering, new ways of normalizing the data, etc.), but still use the predictions from previous training data sets. The only stipulation is that the rows must be in the same order. What you put into each row is entirely up to you!
26 | 
27 | If you ever want to ask machineJS to create a new validation split for you, simply delete the `*validationData.npz` files from the data-formatterResults directory.
28 | 
29 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "machinejs",
3 |   "version": "0.9.5",
4 |   "description": "Automated machine learning structure. Ensembles random forests, XGBoost, neural networks, AdaBoost, KNN, SVM together, handling data formatting, ensembling, and running predictions for you.
Feel free to tweak the settings if you want a lot of control, or just start it up and let it run if you're looking for a push-button MVP solution.", 5 | "main": "machineJS.js", 6 | "scripts": { 7 | "test": "npm run test:regression", 8 | "test:regression": "mocha test/regression/test.js", 9 | "test:categorical": "mocha test/categorical/test.js", 10 | "dev": "node machineJS.js data/rossman/short/rossShortTrainDev.csv --predict data/rossman/short/test.csv --join data/rossman/short/store.csv --alreadyFormatted --dev", 11 | "devEnsemble": "node machineJS.js data/rossman/rossShortTrainDev.csv --predict data/rossman/test.csv --join data/rossman/store.csv --alreadyFormatted --devEnsemble", 12 | "train:rossman": "node machineJS.js data/rossman/tran_filled_gap.csv --predict data/rossman/test.csv --join data/rossman/store.csv --alreadyFormatted", 13 | "train:rossShort": "node machineJS.js data/rossman/short/rossShortTrainDev.csv --predict data/rossman/short/test.csv --join store.csv", 14 | "train:numeraiDev": "node machineJS.js data/numerai/numerai_training_data.csv --predict data/numerai/numerai_test_data.csv --binaryOutput --alreadyFormatted", 15 | "train:numerai": "node machineJS.js data/numerai/numerai_training_data_tournament.csv --predict data/numerai/numerai_tournament_data.csv --alreadyFormatted", 16 | "ensemble:rossShort": "node machineJS.js data/rossman/short/rossShortTrainDev.csv --predict data/rossman/short/test.csv --join data/rossman/store.csv --alreadyFormatted --devEnsemble", 17 | "ensemble:numerai": "node machineJS.js data/numerai/numerai_training_data_tournament.csv --predict data/numerai/numerai_tournament_data.csv --alreadyFormatted --devEnsemble", 18 | "train:giveCredit": "node machineJS.js data/giveCredit/train.csv --predict data/giveCredit/test.csv", 19 | "train:homesite": "node machineJS.js data/homesite/train.csv --predict data/homesite/test.csv", 20 | "train:homesiteShort": "node machineJS.js data/homesite/shortTrain.csv --predict data/homesite/shortTest.csv", 21 | "train:telstra": "node machineJS.js data/telstra/train.csv --predict data/telstra/test.csv" 22 | }, 23 | "repository": { 24 | "type": "git", 25 | "url": "http://github.com/ClimbsRocks/machinejs.git" 26 | }, 27 | "keywords": [ 28 | "neuralNet", 29 | "neural network", 30 | "machine learning", 31 | "ml", 32 | "algorithms", 33 | "random forest", 34 | "svm", 35 | "naive bayes", 36 | "bagging", 37 | "optimization", 38 | "data science", 39 | "brainjs", 40 | "date night", 41 | "scikit-learn", 42 | "sklearn", 43 | "ensemble", 44 | "data formatting", 45 | "javascript", 46 | "js", 47 | "XGBoost", 48 | "scikit-neuralnetwork", 49 | "KNN", 50 | "K nearest neighbors", 51 | "GridSearch", 52 | "GridSearchCV", 53 | "grid search", 54 | "python", 55 | "RandomizedSearchCV", 56 | "preprocessing", 57 | "data-formatter", 58 | "SVM", 59 | "kaggle", 60 | "kaggle competition" 61 | ], 62 | "author": "Preston Parry", 63 | "license": "MIT", 64 | "bin": { 65 | "machineJS": "machineJS.js" 66 | }, 67 | "bugs": { 68 | "url": "https://github.com/ClimbsRocks/machineJS/issues" 69 | }, 70 | "homepage": "https://github.com/ClimbsRocks/machineJS", 71 | "dependencies": { 72 | "babyparse": "^0.4.3", 73 | "data-formatter": "latest", 74 | "ensembler": "latest", 75 | "fast-csv": "^0.6.0", 76 | "longjohn": "^0.2.9", 77 | "minimist": "^1.1.2", 78 | "mkdirp": "^0.5.1", 79 | "python-shell": "^0.2.0" 80 | }, 81 | "devDependencies": { 82 | "chai": "^3.4.1", 83 | "csv": "^0.4.6", 84 | "data-for-tests": "0.0.3", 85 | "mocha": "^2.3.3", 86 | "rimraf": "^2.4.3" 87 | } 
88 | }
89 | 
--------------------------------------------------------------------------------
/pySetup/classifierList.js:
--------------------------------------------------------------------------------
1 | var argv = global.argv;
2 | 
3 | /*
4 | classifier summary descriptions
5 | 'clRfEntropy', randomForest using the entropy criterion
6 | 'clRfGini', randomForest using the Gini criterion
7 | 'clSVCFirst', first SVC. SVC models train in quadratic time, and should only be used on datasets with fewer than a few tens of thousands of rows
8 | 'clSVCShrinking' sets the shrinking parameter equal to true. SVC models train in quadratic time, and should only be used on datasets with fewer than a few tens of thousands of rows
9 | 'clnnSknn' scikit-neuralnetwork's wrapper around pyLearn2's neural network. this is designed to be compatible with scikit-learn. It had very active development through mid 2015, but does not appear to be supported since then.
10 | 'clKnn': k-nearest-neighbors. a relatively different way of approaching the problem
11 | 'clLogisticRegression': standard logistic regression. right now though it is trying to cast from float(64) to S(32), and choking on that, so it is commented out for future development at some later point in time.
12 | 'clAdaBoost': trains multiple classifiers, where each additional classifier focuses on the difficult test cases for the previous one.
13 | */
14 | 
15 | module.exports = function(problemType, dataLength) {
16 |   // these algorithms work for all problemTypes and dataLengths we have encountered so far
17 |   var universalAlgorithms = {
18 |     clRfGini: 'clRfGini',
19 |     clXGBoost: 'clXGBoost',
20 |     clRfBootstrapTrue: 'clRfBootstrapTrue',
21 |     clAdaBoost: 'clAdaBoost',
22 |     clExtraTrees: 'clExtraTrees'
23 |   };
24 | 
25 |   // these algorithms only work on classification problems, due to being instantiated with classification-specific parameters
26 |   var classifierOnlyAlgorithms = {
27 |     clLogisticRegression: 'clLogisticRegression',
28 |     clMultinomialNB: 'clMultinomialNB',
29 |     clRfEntropy: 'clRfEntropy',
30 |     clPerceptron: 'clPerceptron',
31 |     clnnSklearnMLP: 'clnnSklearnMLP',
32 |     clSGDClassifier: 'clSGDClassifier'
33 |   }
34 | 
35 |   var regressionOnlyAlgorithms = {
36 |     clAdaLossLinear: 'clAdaLossLinear',
37 |     clAdaLossSquare: 'clAdaLossSquare',
38 |     clAdaLossExponential: 'clAdaLossExponential'
39 |   }
40 | 
41 |   // these algorithms have a time complexity that is prohibitive for long data sets
42 |   var delForLongDatasets = {
43 |     clSVCFirst: 'clSVCFirst',
44 |     clSVCShrinking: 'clSVCShrinking'
45 |   };
46 | 
47 |   // these algorithms just aren't working right now for one reason or another
48 |   var brokenRegressionAlgorithms = {
49 |     clKnn: 'clKnn'
50 |   };
51 | 
52 |   // the clnn algos may not be broken, but we're probably going to deprecate them pretty shortly since sklearn launched their own MLP
53 |   var brokenClassifierAlgorithms = {
54 |     clKnn: 'clKnn'
55 |   };
56 | 
57 |   // these are algorithms we are in the process of implementing now or shortly
58 |   var notImplementedYetAlgorithms = {
59 |     clnnNoLearn: 'clnnNoLearn',
60 |     clLinearRegression: 'clLinearRegression',
61 |     clLinearSVC: 'clLinearSVC'
62 |   }
63 | 
64 |   // this entire next section is dedicated to extending the universalAlgorithms object, which we will eventually return
65 | 
66 |   // we use the 'all' flag inside processArgs to set initial placeholder values for all possible classifiers we may end up training
67 |   // then, once data-formatter has run, we will know the problemType and only return those classifiers
68 | 
if( problemType === 'category' || problemType === 'multi-category' || problemType === 'all') { 69 | for(var key in classifierOnlyAlgorithms) { 70 | universalAlgorithms[key] = classifierOnlyAlgorithms[key]; 71 | } 72 | } 73 | 74 | if( problemType === 'multi-category' ) { 75 | delete universalAlgorithms['clnnSklearnMLP']; 76 | } 77 | 78 | if(problemType === 'regression' || problemType === 'all') { 79 | for(var key in regressionOnlyAlgorithms) { 80 | universalAlgorithms[key] = regressionOnlyAlgorithms[key]; 81 | } 82 | } 83 | 84 | if( dataLength === 'longDataSet' ) { 85 | for( var key in delForLongDatasets ) { 86 | delete universalAlgorithms[key]; 87 | } 88 | } 89 | 90 | if( argv.devEnsemble ) { 91 | delete universalAlgorithms['clXGBoost'] 92 | delete universalAlgorithms['clRfBootstrapTrue'] 93 | } 94 | 95 | // scikit-learn's MLP is only available in v^0.18.0 96 | // if the user has not installed that version, we want to make sure to remove that from our classifierList 97 | // try{ 98 | 99 | // } 100 | 101 | return universalAlgorithms; 102 | 103 | } 104 | 105 | // module.exports = { 106 | // dev: { 107 | // clRfGini: 'clRfGini', 108 | // clXGBoost: 'clXGBoost', 109 | // clRfBootstrapTrue: 'clRfBootstrapTrue' 110 | // }, 111 | // shortDataSet: { 112 | // clXGBoost: 'clXGBoost', 113 | // clRfEntropy: 'clRfEntropy', 114 | // clAdaBoost: 'clAdaBoost', 115 | // clRfGini: 'clRfGini', 116 | // // clLogisticRegression: 'clLogisticRegression' 117 | // }, 118 | // longDataSet: { 119 | // // clSVCFirst: 'clSVCFirst', 120 | // // clSVCShrinking: 'clSVCShrinking', 121 | // // clnnNoLearn: 'clnnNoLearn', 122 | // // clnnSknn3Layer: 'clnnSknn3Layer', 123 | // // clnnSknn: 'clnnSknn', 124 | // // clKnn: 'clKnn', 125 | // // clRfEntropy: 'clRfEntropy', 126 | // clLogisticRegression: 'clLogisticRegression', 127 | // clAdaBoost: 'clAdaBoost', 128 | // // clAdaLossLinear: 'clAdaLossLinear', 129 | // // clAdaLossSquare: 'clAdaLossSquare', 130 | // // clAdaLossExponential: 'clAdaLossExponential', 131 | // clRfGini: 'clRfGini', 132 | // clXGBoost: 'clXGBoost', 133 | // clRfBootstrapTrue: 'clRfBootstrapTrue' 134 | // } 135 | // }; 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # machineJS 2 | > a fully-featured default process for machine learning- all the parts are here and have functional default values in place. Modify to your heart's delight so you can focus on the important parts for your dataset, or run it all the way through with the default values to have fully automated machine learning! 3 | 4 | # [`auto_ml`](https://github.com/ClimbsRocks/auto_ml) - machineJS, but better! 5 | I just built out [v2 of this project](https://github.com/ClimbsRocks/auto_ml) that now gives you analytics info from your models, and is production-ready. machineJS is an amazing research project that clearly proved there's a hunger for automated machine learning. 6 | 7 | [auto_ml](https://github.com/ClimbsRocks/auto_ml) tackles this exact same goal, but with more features, cleaner code, and the ability to be copy/pasted into production. 8 | 9 | Check it out! 10 | https://github.com/ClimbsRocks/auto_ml 11 | 12 | ## What is machineJS? 13 | `machineJS` provides a fully automated framework for applying machine learning to a dataset. 
14 | 
15 | All you have to do is give it a .csv file, with some basic information about each column in the first row, and it will go off and do all the machine learning for you!
16 | 
17 | If you've already done this kind of thing before, it's useful as an outline, putting in place a working structure for you to make modifications within, rather than having to build from scratch again every time.
18 | 
19 | machineJS will tell you:
20 | 
21 | - Which algorithms are going to be most effective for this problem
22 | - Which features are most useful
23 | - Whether this problem is solvable by machine learning at all (useful if you're not sure you've collected enough data yet)
24 | - How effective machine learning can be with this problem, to compare against other potential solutions (like just taking a grouped average)
25 | 
26 | If you haven't done much (or any) machine learning before- it does some fairly advanced stuff for you!
27 | 
28 | ## Installation:
29 | 
30 | ### As a standalone directory (recommended)
31 | If you want to install this in its own standalone repo, and work on the source code directly, then from the command line, type the following:
32 | 
33 | 1. `git clone https://github.com/ClimbsRocks/machineJS.git`
34 | 2. `cd machineJS`
35 | 3. `npm install`
36 | 4. `pip install -r requirements.txt`
37 | 5. `git clone https://github.com/scikit-learn/scikit-learn.git`
38 | 6. `cd scikit-learn`
39 | 7. `python setup.py build`
40 | 8. `sudo python setup.py install`
41 | 
42 | 
43 | 
50 | 
53 | ### From the command line
54 | `node machineJS.js path/to/trainData.csv --predict path/to/testData.csv`
55 | 
56 | 
65 | ## Format of Data Files:
66 | We use the `data-formatter` module to automatically format your data, and even perform some basic feature engineering on it.
67 | Please refer to `data-formatter`'s [docs](https://github.com/ClimbsRocks/data-formatter) for information on how to label each column to be ready for `machineJS`.
68 | 
69 | ## How to customize/dive in deeper:
70 | machineJS is designed to be super easy to use without diving into any of the internals. Be a conjurer- just give it data and let it run!
71 | That said, it's super powerful once you start customizing it.
72 | 
73 | It's designed to be relatively easy to modify, and well-documented. The [obvious place to start](https://github.com/ClimbsRocks/machineJS/blob/master/processArgs.js) is inside `processArgs.js`. Here we set nearly all the parameters that are used throughout the project.
74 | 
75 | The other obvious area many people will be interested in is adding in new models, and different hyperparameter search spaces. This can be found in the `pySetup` folder. The [exact steps](https://github.com/ClimbsRocks/machineJS/blob/master/pySetup/stepsToAddNewClassifier.txt) are listed in `stepsToAddNewClassifier.txt`.
76 | 
77 | ## What types of problems does this library work on?
78 | `machineJS` works on both regression and categorical problems, as long as there is a single output column in the training data. This includes multi-category (frequently called multi-class) problems, where the category you are predicting is one of many possible categories.
79 | There are no immediate plans to support multiple output columns in the training data. If you have three output columns you're interested in predicting, and they cannot be combined into a single column in the training data, you could run `machineJS` once for each of those three columns.
80 | 
81 | This library is well-tested on Macs.
I've designed it to work on PCs as well, but I haven't tested that at all yet. If you're a PC user, I'd love some issues or Pull Requests to make this work for PCs!
82 | 
83 | 
84 | #### Note: This library is designed to run across all but one core on the host machine. What this means for you:
85 | 1. Please plug in.
86 | 2. Close all programs and restart right before invoking (this will clear out as much RAM as possible).
87 | 3. Expect some noise from your fan- you're finally putting your computer to use!
88 | 4. Don't expect to be able to do anything intense while this is running. Internet browsing or code editing is fine, but watching a movie may get challenging.
89 | 5. Please don't run any other Python scripts while this is running.
90 | 
91 | Thanks for inviting us along on your machine learning journey!
92 | 
93 | 
94 | 
95 | 
--------------------------------------------------------------------------------
/test/regression/makePredictions.js:
--------------------------------------------------------------------------------
1 | var expect = require('chai').expect;
2 | var mocha = require('mocha');
3 | var fs = require('fs');
4 | var path = require('path');
5 | var rTest = global.rTest;
6 | var csv = require('csv');
7 | 
8 | 
9 | module.exports = function() {
10 | 
11 |   describe('the predictions for each classifier', function() {
12 | 
13 |     // If you have added a new classifier, and it works for regressions, add it here!
14 |     // ******************************************************************************
15 |     var expectedMinimumTrainingScores = {
16 |       // clRfGini: 0.955,
17 |       // clXGBoost: 0.87,
18 |       clAdaBoost: 0.535
19 |     };
20 | 
21 |     var expectedMinimumValidationScores = {
22 |       clRfGini: 0.845,
23 |       clXGBoost: 0.74,
24 |       clAdaBoost: 0.525
25 |     };
26 |     // ******************************************************************************
27 | 
28 | 
29 |     // run the tests for each classifier we expect to have trained
30 |     for( var clName in expectedMinimumTrainingScores ) {
31 |       (function testSingleAlgo(clName) {
32 | 
33 |         describe('predictions for ' + clName, function() {
34 | 
35 |           var validationFileName;
36 |           var predictionFileName;
37 |           var validationData;
38 |           var predictionsData;
39 | 
40 |           before(function(done) {
41 |             var validationFiles = fs.readdirSync(path.join(rTest.rTestPredictionsLocation, 'validation'));
42 |             var predictionsFiles = fs.readdirSync(rTest.rTestPredictionsLocation);
43 | 
44 | 
45 |             for(var i = 0; i < validationFiles.length; i++) {
46 |               if( validationFiles[i].indexOf(clName) !== -1 ) {
47 |                 validationFileName = validationFiles[i];
48 |               }
49 |             }
50 | 
51 |             for(var i = 0; i < predictionsFiles.length; i++) {
52 |               if( predictionsFiles[i].indexOf(clName) !== -1 ) {
53 |                 predictionFileName = predictionsFiles[i];
54 |               }
55 |             }
56 | 
57 |             // read in both our predictions data and our validation data
58 |             fs.readFile(path.join(rTest.rTestPredictionsLocation, 'validation', validationFileName), function(err, data) {
59 |               if(err) {
60 |                 console.error(err);
61 |                 done();
62 |               }
63 |               csv.parse(data, function(err, output) {
64 |                 if(err) {
65 |                   console.error(err);
66 |                 }
67 |                 validationData = output;
68 | 
69 | 
70 |                 fs.readFile(path.join(rTest.rTestPredictionsLocation, predictionFileName), function(err, data) {
71 |                   if(err) {
72 |                     console.error(err);
73 |                     done();
74 |                   }
75 |                   csv.parse(data, function(err, output) {
76 |                     if(err) {
77 |                       console.error(err);
78 |                     }
79 |                     predictionsData = output;
80 |                     done();
81 |                   });
82 |                 });
83 | 
84 | 
85 |               });
86 |             });
87 | 
88 |           });
89 | 
90 |           var errorRow;
91 | 
92 | 
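          // to orient the assertions below, the validation csv read in above looks roughly
          // like this (values are illustrative, not real output):
          //   0.85,0.95     <- first row: validation error, then training error
          //   Id,Sales      <- second row: the pretty column names for this dataset
          //   1,4512.7      <- remaining rows: one id and one prediction each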
it('should have validation error and training error in the first row of the validationData', function() { 93 | errorRow = validationData.shift(); 94 | // console.log(errorRow[0]); 95 | // console.log(errorRow[1]); 96 | expect(parseFloat(errorRow[0], 10)).to.be.a('number'); 97 | expect(parseFloat(errorRow[1], 10)).to.be.a('number'); 98 | }); 99 | 100 | it('should have done at least as well as it has in the past', function() { 101 | console.log('\n'); 102 | console.log('this classifier\'s expected Validation Error:', expectedMinimumValidationScores[clName], 'this classifier\'s observed Validation Error:', Math.round(errorRow[0] * 1000) / 1000); 103 | console.log('this classifier\'s expected Training Error:', expectedMinimumTrainingScores[clName], 'this classifier\'s observed Training Error:', Math.round(errorRow[1] * 1000) / 1000); 104 | console.log('\n'); 105 | expect(errorRow[0]).to.be.above(expectedMinimumValidationScores[clName]); 106 | expect(errorRow[1]).to.be.above(expectedMinimumTrainingScores[clName]); 107 | }); 108 | 109 | it('should have the pretty names for this dataset in the second row of the validationData', function() { 110 | var headerRow = validationData.shift(); 111 | expect(headerRow[0].toLowerCase()).to.equal('id'); 112 | expect(headerRow[1].toLowerCase()).to.equal('sales'); 113 | }); 114 | 115 | it('should make predictions against the validation data set', function() { 116 | expect(validationData.length).to.be.within(51000 - 300, 51000 + 300); 117 | 118 | var shortestRowLength = Infinity; 119 | for(var i = 0; i < validationData.length; i++) { 120 | if( validationData[i].length < shortestRowLength ) { 121 | shortestRowLength = validationData[i].length; 122 | } 123 | } 124 | 125 | expect(shortestRowLength).to.equal(2); 126 | 127 | }); 128 | 129 | it('should have the pretty names for this dataset in the first row of the predictionsData', function() { 130 | var headerRow = predictionsData.shift(); 131 | expect(headerRow[0].toLowerCase()).to.equal('id'); 132 | expect(headerRow[1].toLowerCase()).to.equal('sales'); 133 | }); 134 | 135 | it('should make predictions against the test data set', function() { 136 | expect(predictionsData.length).to.equal(41088) 137 | 138 | var shortestRowLength = Infinity; 139 | for(var i = 0; i < predictionsData.length; i++) { 140 | if( predictionsData[i].length < shortestRowLength ) { 141 | shortestRowLength = predictionsData[i].length; 142 | } 143 | } 144 | 145 | expect(shortestRowLength).to.equal(2); 146 | 147 | }); 148 | 149 | 150 | }); 151 | 152 | after(function() { 153 | predictionsData = null; 154 | validationData = null; 155 | }) 156 | 157 | })(clName); 158 | 159 | } 160 | 161 | }); 162 | 163 | 164 | } 165 | 166 | -------------------------------------------------------------------------------- /pySetup/utils.js: -------------------------------------------------------------------------------- 1 | var py = global.pythonNamespace; 2 | var argv = global.argv; 3 | var path = require('path'); 4 | var utilsPyShell = require('./utilsPyShell.js'); 5 | var df = require('data-formatter'); 6 | 7 | module.exports = { 8 | dictVectMapping: { 9 | // this will be given to us by DictVectorizer, a python module that takes dictionaries and turns them into arrays. Obviously since dictionaries are not ordered, we need to keep track of which fields end up in which indices. 10 | }, 11 | 12 | fileNames: { 13 | // this will be given to us by dataFormatting.py once it has created the files with the formatted data. 
14 |     // ID
15 |     // X_train
16 |     // y_train
17 |     // X_test
18 |     // y_test
19 |     // X_train_nn- used by neural networks. we will use the same ID and y_train files as the rest of the dataset. It is only the input features that have to be normalized, not the output features.
20 |     // X_test_nn- used by neural networks. we will use the same ID and y_train files as the rest of the dataset. It is only the input features that have to be normalized, not the output features.
21 |     // trainingDataLength- technically not a file name, but fits much more logically here than reading in that file again in node.js
22 |     // problemType: 'regression' or 'category' or 'multi-category'
23 |   },
24 | 
25 |   splitData: function(callback) {
26 |     var dfArgs = {
27 |       fileNames: module.exports.fileNames,
28 |       searchPercent: argv.searchPercent,
29 |       validationPercent: argv.validationPercent,
30 |     };
31 | 
32 |     // generatePythonOptions assumes the first input is the name of a data file that training.py or makePredictions.py will be run on. Pass in ignoreMe.csv for now until we refactor that.
33 |     var pythonOptions = utilsPyShell.generatePythonOptions('ignoreMe.csv', [JSON.stringify(argv), JSON.stringify(module.exports.fileNames)
34 |     ] );
35 | 
36 |     if( argv.splitDataTest ) {
37 |       // if this is being run from within our test suite, pass in a blank callback to halt execution after splitDatasets
38 |       callback = function() {};
39 |     }
40 | 
41 |     var pyShell = utilsPyShell.startPythonShell('splitDatasets.py', callback, pythonOptions);
42 |     pyShell.on('message', function(message) {
43 |       if(message.type === 'splitFileNames') {
44 |         for( var key in message.text) {
45 |           module.exports.fileNames[key] = message.text[key];
46 |         }
47 |         global.argv.fileNames = module.exports.fileNames;
48 |       }
49 |     });
50 | 
51 |   },
52 | 
53 |   formatData: function( callback ) {
54 |     // the callback function will be invoked with an object that holds the fileNames needed by module.exports.fileNames
55 | 
56 |     var dataFormatterArgs = {
57 |       trainingData: argv.dataFile,
58 |       testingData: argv.predict,
59 |       trainingPrettyName: argv.outputFileName,
60 |       testingPrettyName: argv.testOutputFileName,
61 |       joinFileName: argv.join,
62 |       on: argv.on,
63 |       allFeatureCombinations: argv.allFeatureCombinations,
64 |       keepAllFeatures: argv.keepAllFeatures
65 |     };
66 | 
67 |     if( argv.dfOutputFolder ) {
68 |       dataFormatterArgs.outputFolder = argv.dfOutputFolder;
69 |     }
70 | 
71 |     df(dataFormatterArgs, function(fileNames) {
72 |       console.log('Here are the fileNames from data-formatter.
If you want to skip the data-formatter part next time you want to play with this dataset, copy and paste this object into machineJS/pySetup/testingFileNames.js, following the instructions included in that file.'); 73 | console.log(fileNames); 74 | // df takes in a callback function that will be invoked with the fileNames object, holding the names and locations of the files it saved the data into 75 | module.exports.fileNames = fileNames; 76 | callback(); 77 | }); 78 | 79 | }, 80 | 81 | kickOffTraining: function( callback, classifierName) { 82 | var pythonOptions = utilsPyShell.generatePythonOptions(argv.dataFile, [JSON.stringify(argv), JSON.stringify(module.exports.fileNames), classifierName, module.exports.fileNames.problemType, global.bestSearchScore]); 83 | 84 | var emitFinishedTrainingCallback = function() { 85 | global.finishedAlgos++; 86 | process.emit('algoFinishedTraining'); 87 | callback(); 88 | }; 89 | 90 | 91 | var pyShell = utilsPyShell.startPythonShell('training.py', emitFinishedTrainingCallback, pythonOptions); 92 | pyShell.on('message', function(message) { 93 | 94 | // once we get a message back with the trained results, 95 | if(message.type === 'trainingResults') { 96 | var classifierName = message.text.algoName; 97 | 98 | // save it into our allResults array 99 | global.allTrainingResults.push(message.text); 100 | global.trainedAlgoCounts[classifierName]++; 101 | 102 | // see if this is the best searchScore we've encountered so far 103 | if( message.text.searchScore > global.bestSearchScore ) { 104 | global.bestSearchScore = message.text.searchScore; 105 | } 106 | 107 | // see if this is the best search result for that algorithm so far 108 | var prevBestResult = global.trainingResultsSummary[classifierName]; 109 | if( message.text.searchScore > prevBestResult || prevBestResult === undefined ) { 110 | global.trainingResultsSummary[classifierName] = message.text.searchScore; 111 | } 112 | // global.trainedAlgos[classifierName] = message.text; 113 | } 114 | }); 115 | }, 116 | 117 | makePredictions: function( callback, classifierName) { 118 | console.log('kicking off the process of making predictions on the predicting data set for:', classifierName); 119 | 120 | var startPredictionsScript = function() { 121 | if( global.copyValidationData && classifierName.slice(0,4) !== 'clnn' ) { 122 | var copyValidationData = true; 123 | global.copyValidationData = false; 124 | } else { 125 | var copyValidationData = false; 126 | } 127 | 128 | var classifierTrainingObj = global.allTrainingResults[global.allTrainingResults.length -1]; 129 | var classifierTrainingScore = classifierTrainingObj.longTrainScore; 130 | var classifierSearchScore = classifierTrainingObj.searchScore 131 | 132 | var pythonOptions = utilsPyShell.generatePythonOptions(argv.predict, [module.exports.dictVectMapping, JSON.stringify(argv), JSON.stringify(module.exports.fileNames), classifierName, module.exports.fileNames.problemType, classifierTrainingScore, copyValidationData, classifierSearchScore ]); 133 | 134 | // if this hyperparameter search did not yield an algorithm that was close enough to our best that it was worth investing in a longTraining, we did not train it and gave it a score of 0. 
135 |       // therefore, we only want to make predictions using this classifier if we actually trained an algorithm successfully (classifierTrainingScore > 0)
136 |       // if( classifierTrainingScore > 0 ) {
137 |         utilsPyShell.startPythonShell('makePredictions.py', callback, pythonOptions);
138 |       // } else {
139 |       //   // ensembler needs to know to not listen for predictions results from this algorithm
140 |       //   process.emit('algoSkippedTraining');
141 |       // }
142 | 
143 |     };
144 | 
145 |     startPredictionsScript();
146 | 
147 |   }
148 | 
149 | }
150 | 
--------------------------------------------------------------------------------
/pySetup/controllerPython.js:
--------------------------------------------------------------------------------
1 | var py = global.pythonNamespace = {};
2 | var exec = require('child_process').exec;
3 | var ensembler = require('ensembler');
4 | 
5 | var path = require('path');
6 | var fs = require('fs');
7 | var pySetupLocation = path.dirname(__filename);
8 | py.pySetupLocation= pySetupLocation;
9 | py.referencesToChildren= [];
10 | var utils = require('./utils.js');
11 | var classifierOptions = require('./classifierList.js');
12 | 
13 | argv = global.argv;
14 | 
15 | var startOneClassifier = function(classifierList) {
16 | 
17 |   if( classifierList.length > 0 ) {
18 |     // for our last classifier, tell it to run on all cores on the machine
19 |     // this way, when the second-to-last classifier finishes, and those half the machine cores are empty, we can put them to use!
20 |     if( classifierList.length === 1 ) {
21 |       argv.numCPUs = argv.computerTotalCPUs;
22 |     }
23 | 
24 |     var classifierName = classifierList.shift();
25 | 
26 |     var algosBestScore = global.trainingResultsSummary[classifierName];
27 | 
28 |     // if we have trained more than two of this algorithm, and its best score is not within X percent of the best we've found so far, don't bother training another one.
29 |     // during the ensemble round, we are intentionally skipping over any algorithms that did not perform well during the earlier round of training. this should save significant amounts of time and make sure we only have high quality results at the end.
30 |     // if you want to train all classifiers during the ensemble round, simply add in an expression to the boolean phrase below
31 |     if( global.trainedAlgoCounts[classifierName] < 2 || algosBestScore > global.bestSearchScore * argv.continueToTrainThreshold ) {
32 |       // kick off training, and then, once that is done, invoke the callback, which starts the process of making predictions
33 |       utils.kickOffTraining( function() {
34 |         module.exports.makePredictions(classifierName);
35 |       }, classifierName);
36 | 
37 |     } else {
38 |       // since we said at the start to expect a certain number of algorithms to be trained, we must still emit an event to notify ensembler that we are skipping over an algorithm
39 |       process.emit('algoSkippedTraining');
40 |     }
41 | 
42 | 
43 |   }
44 | };
45 | 
46 | 
47 | module.exports = {
48 |   killAll: function() {
49 |     // kill all child processes
50 |     for (var i = 0; i < py.referencesToChildren.length; i++) {
51 |       py.referencesToChildren[i].childProcess.kill();
52 |     }
53 | 
54 |     // calling the .kill() routine on each child frequently does not kill all the child processes of that child process. so if our python shell is running 8 other python scripts to spread the training out around all the cores, those 8 other python scripts are continuing to run after the above.
55 |     // the following command will be executed on the command line and will kill all Python processes.
56 |     // the unfortunate side effect is that any unrelated Python processes running on this machine will also be killed. But since this library takes up all the cores on the machine anyways, the user would likely have a very hard time running other Python scripts simultaneously regardless.
57 |     exec('pkill -9 Python');
58 |   },
59 | 
60 |   startClassifiers: function(classifierList) {
61 |     var classifiersByRound = module.exports.makeClassifierList();
62 | 
63 |     startOneClassifier(classifiersByRound);
64 |     startOneClassifier(classifiersByRound);
65 | 
66 | 
67 |     // whenever one estimator finishes training (or has not performed well enough in training so far to justify training another instance of it), we want to start training another!
68 |     process.on('algoFinishedTraining', function() {
69 |       startOneClassifier(classifiersByRound);
70 |     });
71 | 
72 |     process.on('algoSkippedTraining', function() {
73 |       startOneClassifier(classifiersByRound);
74 |     });
75 | 
76 |   },
77 | 
78 |   makeClassifierList: function() {
79 |     var classifierList = classifierOptions(utils.fileNames.problemType, utils.fileNames.trainingDataLength);
80 | 
81 |     classifierList = Object.keys( classifierList );
82 |     var classifiersByRound = [];
83 | 
84 |     // we are going to get many trained classifiers from this!
85 |     // let's talk through an example:
86 |     // say we want to run 100 iterations of RandomizedSearchCV
87 |     // we could run a single round of rsCV with 100 iterations, and get a single trained classifier out of it at the end
88 |     // or, we could run 10 rounds, with 10 iterations each, and have 10 trained classifiers at the end!
89 |     // ensembler works best when it has more predictions to work with, so this second option is immediately appealing
90 |     // the second option is also appealing in that we will have a bunch of midway results
91 |     // say running 100 iterations takes 100 hours
92 |     // and we end up only having 90 hours
93 |     // if we split this up into multiple rounds, we will now have 8 or 9 algorithms trained by this point, one of which is likely the best one
94 |     // whereas if it were an all-or-nothing game of having to get to all 100, we would have nothing.
95 |     // another thing that's appealing about running multiple rounds is it lets us test more algorithms against the validation data set. it's somewhat difficult to predict how each algorithm is going to generalize, so having a chance to actually test them against the validation data set gives us more options
96 |     // the drawback is that it will take more time (training a "bigger" version of the selected algorithm 10 times is not trivial, nor is running 10 rounds of predictions against the validation and test data sets)
97 | 
98 |     if(argv.devEnsemble) {
99 |       argv.numRounds = 1;
100 |     }
101 | 
102 |     for( var i = 0; i < argv.numRounds; i++) {
103 |       for( var j = 0; j < classifierList.length; j++) {
104 |         classifiersByRound.push(classifierList[j]);
105 |       }
106 |     }
107 | 
108 |     numberOfClassifiers = classifiersByRound.length;
109 | 
110 |     // tell ensembler how many algos to wait for before ensembler takes over
111 |     ensembler.startListeners( numberOfClassifiers, argv.ensemblerArgs);
112 | 
113 |     return classifiersByRound;
114 |   },
115 | 
116 |   startTraining: function() {
117 | 
118 |     if( argv.validationRound ) {
119 |       module.exports.startClassifiers();
120 | 
121 |     } else if( argv.alreadyFormatted ) {
122 |       // if we have already formatted the data, skip over repeating that step.
This allows us to train more classifiers rapidly without repeating the oftentimes lengthy data formatting process. 123 | utils.splitData(function() { 124 | module.exports.startClassifiers(); 125 | }); 126 | } else { 127 | // here is where we invoke data-formatter to handle all our data formatting needs 128 | // for more information, please check out that repo! 129 | // https://github.com/ClimbsRocks/data-formatter 130 | utils.formatData( function() { 131 | utils.splitData(function() { 132 | module.exports.startClassifiers(); 133 | }); 134 | }); 135 | } 136 | 137 | }, 138 | 139 | 140 | makePredictions: function(classifierName) { 141 | 142 | utils.makePredictions( function() { 143 | process.emit('algoFinishedPredicting'); 144 | }, classifierName); 145 | } 146 | 147 | }; 148 | -------------------------------------------------------------------------------- /pySetup/splitDatasets.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import random 4 | from os import path 5 | import ntpath 6 | import cPickle as pickle 7 | 8 | import numpy as np 9 | from scipy.sparse import csr_matrix, csc_matrix 10 | 11 | from sendMessages import printParent 12 | from sendMessages import messageParent 13 | from sendMessages import obviousPrint 14 | 15 | args = json.loads(sys.argv[2]) 16 | fileNames = json.loads(sys.argv[3]) 17 | XFileName = fileNames['X_train'] 18 | XnnFileName = fileNames['X_train_nn'] 19 | ynnFileName = fileNames['y_train_nn'] 20 | idFileName = fileNames['id_train'] 21 | yTrainFileName = fileNames['y_train'] 22 | validationSplitColumnFileName = fileNames['validation_split_column'] 23 | hasCustomValidationSplit = fileNames['hasCustomValidationSplit'] 24 | 25 | outputDirectory = path.dirname(XFileName) 26 | 27 | # what percent of our dataset to not train on, but to set aside for validation and stacking/blending? 28 | validationPercent = args['validationPercent'] 29 | 30 | 31 | # we are not supporting dense matrices at the moment. 32 | def load_sparse_csr(filename): 33 | loader = np.load(filename) 34 | return csr_matrix(( loader['data'], loader['indices'], loader['indptr']), shape=loader['shape']) 35 | 36 | X = load_sparse_csr(XFileName) 37 | 38 | 39 | numRows = X.shape[0] 40 | 41 | includeOrNot = [random.random() for x in range(0,numRows)] 42 | 43 | # we want to save the validation indices with the test data. 
that way we can have multiple different versions of the same training data set scattered throughout a computer, but still use these same validationIndices for all of them
44 | validationIndexFolder = path.dirname(args['predict'])
45 | validationIndexFileName = 'dfValidationIndices' + args['testOutputFileName'] + '.pkl'
46 | validationIndicesFile = path.join( validationIndexFolder, validationIndexFileName )
47 | 
48 | 
49 | writeToFile = True
50 | createNewSplit = False
51 | 
52 | if hasCustomValidationSplit:
53 |     # load the validation split column
54 |     validationSplitColumn = load_sparse_csr(validationSplitColumnFileName)
55 |     # create both training and validation indices
56 |     # validationIndices are rows we will hold out as the validation data set
57 |     # trainingIndices are rows we will include in the training data set
58 |     validationIndices = []
59 |     trainingIndices = []
60 |     for idx, item in enumerate(validationSplitColumn.todense().tolist()[0]):
61 |         if item == 1:
62 |             validationIndices.append(idx)
63 |         else:
64 |             trainingIndices.append(idx)
65 | 
66 | else:
67 |     # try to load in existing validationIndices
68 |     try:
69 |         with open(validationIndicesFile, 'rb') as openFile:
70 |             validationIndices = pickle.load(openFile)
71 | 
72 |         # check to make sure that the validation length is less than the length of our X dataset
73 |         if len(validationIndices) > numRows * ( validationPercent + .02):
74 |             printParent('validationIndices too long')
75 |             # if it isn't, create a new validationIndices for this dataset, but do not write it to file
76 |             # this lets us keep our larger validationIndices split (for the full training data set), while still having something to work with for this smaller dataset we're currently testing on.
77 |             writeToFile = False
78 |             raise IndexError("this dataset is shorter than the one we built the validation split on previously")
79 | 
80 |         # check to make sure that the validation length is within a few percentage points of our validationPercent number (in other words, if X is 10,000 rows long, and the length of the validationIndices is only 1,200, then we know validationIndices was built on a smaller test dataset earlier.)
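        # a worked example of the two length checks in this try block (numbers are illustrative):
        # with numRows = 10,000 and validationPercent = .3, a loaded split is 'too long' above
        # 10,000 * (.3 + .02) = 3,200 rows, and 'too short' below 10,000 * .3 * .98 = 2,940 rows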
81 | elif len(validationIndices) < numRows * validationPercent * .98: 82 | printParent('validationIndices too short') 83 | # If it is not, create a new validationIndices and write that to file 84 | raise IndexError("this dataset is longer than the one we built the validation split on previously") 85 | 86 | # In both cases, fall into the except state below 87 | # but create a variable that lays out whether to write that new validationIndices to file or not in the try block, and then use that in the except block below 88 | 89 | # if we found existing validationIndices that meet the criteria above, we still want to split our incoming dataset on those indices 90 | # this allows us to change our feature engineering on a training dataset, and pass those features through to machineJS 91 | trainingIndices = [] 92 | validationIndicesCopy = validationIndices[:] 93 | # it should already be sorted, but we're being safe here in case of future changes 94 | validationIndicesCopy.sort() 95 | validationIndicesCounter = 0 96 | 97 | # linear comparison of two lists to only put indices into trainingIndices if they are not in validationIndices 98 | for x in range(0,numRows): 99 | if x == validationIndicesCopy[validationIndicesCounter]: 100 | validationIndicesCounter += 1 101 | else: 102 | trainingIndices.append(x) 103 | del validationIndicesCopy 104 | 105 | 106 | # in the case that we were not able to load in validationIndices successfully, we want to write our validationIndices to file for all future runs to use 107 | except: 108 | createNewSplit = True 109 | validationIndices = [] 110 | trainingIndices = [] 111 | for idx, randomNum in enumerate(includeOrNot): 112 | if randomNum < validationPercent: 113 | validationIndices.append(idx) 114 | else: 115 | trainingIndices.append(idx) 116 | 117 | if writeToFile: 118 | with open(validationIndicesFile, 'wb') as writeFile: 119 | # now save that file as a .pkl next to where our test data sits. 120 | pickle.dump(validationIndices, writeFile) 121 | 122 | 123 | # continued callout to the person originally responsible for this function: 124 | # http://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format 125 | def save_sparse_csr(filename,array): 126 | np.savez(filename,data=array.data ,indices=array.indices, indptr=array.indptr, shape=array.shape ) 127 | 128 | 129 | # we want to write the splits of the training data every time 130 | # but only create a new validationIndices in certain circumstances 131 | def splitDataset(data, name, fileCategory): 132 | 133 | # uses slicing, one of the most useful and least-well-known features of scipy sparse matrices 134 | # you pass in a list of row indices you want to keep, and it will create a sliced copy that includes only those rows 135 | # slicing also works on column indices 136 | # callout to the person who first opened my eyes to them: 137 | # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best 138 | 139 | # if this "sparse" matrix only has a single value for each row, we have to treat it as a column matrix, and slice it accordingly 140 | # this is the case for our idColumn, and frequently our y values as well. 
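    # a tiny standalone illustration of that slicing (shapes are made up, not from this dataset):
    #   import numpy as np
    #   from scipy.sparse import csr_matrix
    #   m = csr_matrix(np.arange(12).reshape(4, 3))
    #   m[[0, 2], :]    # row slicing: keeps rows 0 and 2, shape (2, 3)
    #   m[:, [1, 2]]    # column slicing: keeps columns 1 and 2, shape (4, 2)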
141 | if data.shape[0] == 1: 142 | validation = data[:,validationIndices] 143 | trainingData = data[:,trainingIndices] 144 | 145 | else: 146 | validation = data[validationIndices,:] 147 | trainingData = data[trainingIndices,:] 148 | 149 | # ntpath theoretically works really well across systems 150 | name = ntpath.basename(name) 151 | # remove the file extension 152 | name = name[0:-4] 153 | 154 | validationFile = path.join(outputDirectory, name + 'validationData.npz') 155 | trainingDataFile = path.join(outputDirectory, name + 'trainingData.npz') 156 | 157 | save_sparse_csr(trainingDataFile, trainingData) 158 | save_sparse_csr(validationFile, validation) 159 | 160 | # send the file names back to the parent process, where we aggregate and save them 161 | fileNameDict = { 162 | fileCategory + 'trainingData': trainingDataFile, 163 | fileCategory + 'validationData': validationFile 164 | } 165 | messageParent(fileNameDict, 'splitFileNames') 166 | 167 | 168 | # we are going to repeat this process several times: 169 | # idColumn 170 | # X_train 171 | # y_train 172 | # X_train_nn 173 | # they are just slightly different enough that i don't want to loop through them. The code below is super readable 174 | 175 | splitDataset(X, XFileName, 'X_train') 176 | del X 177 | 178 | idColumn = load_sparse_csr(idFileName) 179 | splitDataset(idColumn, idFileName, 'id_train') 180 | del idColumn 181 | 182 | yColumn = load_sparse_csr(yTrainFileName) 183 | splitDataset(yColumn, yTrainFileName, 'y_train') 184 | del yColumn 185 | 186 | Xnn = load_sparse_csr(XnnFileName) 187 | splitDataset(Xnn, XnnFileName, 'X_train_nn') 188 | del Xnn 189 | 190 | ynn = load_sparse_csr(ynnFileName) 191 | splitDataset(ynn, ynnFileName, 'y_train_nn') 192 | del ynn 193 | 194 | -------------------------------------------------------------------------------- /processArgs.js: -------------------------------------------------------------------------------- 1 | var path = require('path'); 2 | var mkdirp = require('mkdirp'); 3 | // we will soon save path.dirname(__filename) into argv.machineJSLocation, but to get all this started by loading our require statements, we'll type it in directly here 4 | var utils = require(path.join(path.dirname(__filename), 'pySetup','utils.js')); 5 | var classifierListOptions = require(path.join(path.dirname(__filename), 'pySetup', 'classifierList.js')); 6 | 7 | module.exports = function() { 8 | if(argv.dev || argv.devKaggle || argv.devEnsemble) { 9 | argv.dev = true; 10 | } else { 11 | argv.dev = false; 12 | } 13 | 14 | 15 | var dataFile = global.argv.dataFile || process.argv[2]; 16 | argv.computerTotalCPUs = require('os').cpus().length; 17 | argv.machineJSLocation = path.dirname(__filename); 18 | 19 | // setting defaults if using the --dev or --devKaggle flags (speeds up development time when doing engineering work on the machineJS library itself) 20 | if( argv.dev ) { 21 | require('longjohn'); 22 | if (dataFile === undefined) { 23 | dataFile = 'rossShortTrainDev.csv'; 24 | } 25 | if ( (argv.devKaggle && !argv.predict) || argv.devEnsemble) { 26 | argv.predict = argv.predict || 'rossmantest.csv'; 27 | } 28 | } 29 | 30 | argv.dataFile = dataFile; 31 | argv.dataFileName = path.basename( argv.dataFile ); 32 | argv.dataFilePretty = argv.dataFileName.slice(0,-4); 33 | argv.binaryOutput = argv.binaryOutput || false; //python doesn't like undefined, so explicitly set this to false if it does not exist 34 | argv.outputFileName = argv.dataFilePretty; 35 | if( argv.outputFileName === 'train' ) { 36 | dataFileFolder = 
path.parse(argv.dataFile).dir.split(path.sep).pop(); 37 | argv.outputFileName = dataFileFolder + argv.dataFilePretty; 38 | } 39 | 40 | // python throws a keyError if you try to look up a key that doesn't exist, so we are explicitly giving it a blank value to ensure the key will exist when we need it later 41 | argv.join = argv.join || ''; 42 | argv.on = argv.on || ''; 43 | argv.allFeatureCombinations = argv.allFeatureCombinations || ''; 44 | argv.keepAllFeatures = argv.keepAllFeatures || ''; 45 | argv.dfOutputFolder = argv.dfOutputFolder || path.join(argv.machineJSLocation,'pySetup','data-formatterResults'); 46 | argv.matrixOutput = argv.matrixOutput || ''; 47 | 48 | 49 | argv.testFileName = path.basename( argv.predict ); 50 | argv.testFilePretty = argv.testFileName.slice(0,-4); 51 | argv.testOutputFileName = argv.testFilePretty; 52 | 53 | if( argv.testOutputFileName === 'test' ) { 54 | dataFileFolder = path.parse(argv.dataFile).dir.split(path.sep).pop(); 55 | argv.testOutputFileName = dataFileFolder + argv.testFilePretty; 56 | } 57 | 58 | /* 59 | in splitDatasets.py, we are going to break our data out into three groups: 60 | 1. The group we run the hyperparameter search over (GridSearchCV or RandomizedSearchCV). 61 | Since the best hyperparameters for a random subset of the data are going to be the same as the entire dataset, 62 | we run the search on only a subset of the data to drastically speed up search time 63 | 2. The training data we will train our (now-optimized) algorithm on. 64 | Now that we have our best hyperparameters, create an algorithm with those parameters, and train it on a larger portion of our overall dataset. 65 | 3. The validation set. This is a holdout set we do not include in the training set. 66 | We use this to test how well our algorithm generalizes to data it hasn't seen yet. 67 | We also use this, later down the road, for ensembler to create stacked/blended ensembles with. 68 | For a given test.csv dataset, we will determine the validation dataset once, and then use that each time. 69 | This means that we can include all the algorithms you've trained on this dataset in our ensembling. 70 | This lets you change how you format the data (normalization, scaling, new feature engineering, etc.), and still use all these algorithms in the final ensemble. 71 | You can easily start over with a new validation set by simply deleting the validation.pkl file saved next to your test.csv file. 72 | */ 73 | if( argv.dev ) { 74 | argv.searchPercent = argv.searchPercent || .1; 75 | argv.validationPercent = argv.validationPercent || .85; 76 | } else { 77 | argv.searchPercent = argv.searchPercent || .3; 78 | argv.validationPercent = argv.validationPercent || .3; 79 | } 80 | 81 | /* 82 | set out how many combinations of parameters we want to try. 83 | numRounds is how many different times we will run RandomizedSearchCV for that algorithm. 84 | so if we have numRounds = 20, we will search for optimal hyperparameters for each algorithm 20 times 85 | numIterationsPerRound is how many different combinations of hyperparameters we will attempt for each of those rounds 86 | so numIterationsPerRound = 10 means we will try 10 different combinations of hyperparameters each round. 87 | for competitions, more numRounds and lower numIterationsPerRound is ideal. In that case, we have more material to feed into ensembler, since we will have more algos trained at the end. 
For production environments, fewer numRounds and much higher numIterationsPerRound means that each of the algos we train will be higher quality. We will probably miss out on accuracy to a tiny degree, but we will need far fewer algos to accomplish this, which will be much more efficient in a production environment. 88 | bumping up these values will increase accuracy at the cost of compute time 89 | */ 90 | 91 | if( argv.dev ) { 92 | argv.numRounds = argv.numRounds || 2; 93 | argv.numIterationsPerRound = argv.numIterationsPerRound || 5; 94 | 95 | } else { 96 | argv.numRounds = argv.numRounds || 3; 97 | argv.numIterationsPerRound = argv.numIterationsPerRound || 8; 98 | 99 | } 100 | 101 | 102 | // keep track of where we will be saving data during all of the intermediate stages 103 | argv.predictionsFolder = argv.predictionsFolder || path.join(argv.machineJSLocation, 'predictions', argv.testOutputFileName); 104 | argv.validationFolder = path.join(argv.predictionsFolder, 'validation'); 105 | argv.bestClassifiersFolder = argv.bestClassifiersFolder || path.join(argv.machineJSLocation, 'pySetup','bestClassifiers',argv.outputFileName); 106 | // create these folders if they do not already exist 107 | mkdirp(argv.predictionsFolder); 108 | mkdirp(argv.validationFolder); 109 | mkdirp(argv.bestClassifiersFolder); 110 | 111 | // allow the user to specify a different location for the output 112 | argv.ensemblerOutputFolder = argv.ensemblerOutputFolder || argv.machineJSLocation; 113 | 114 | /* 115 | the first time we run machineJS, it will just make predictions for a ton of different algos 116 | then ensembler will add all the predictions of these algo to the validation data. 117 | in other words, for each row of data, we will now have the original input data (height = 5'2", gender = female, etc.), as well as the predictions from all the stage 0 predictors (randomForest says .99 probability, MLP says .997 probability, perceptron says .97, etc.). 118 | then ensembler asks machineJS to try to train a new algo that takes these stage 0 predictions into account 119 | */ 120 | // keep track of whether this is the validation round or the original stage 0 round 121 | if( argv.validationRound !== true ) { 122 | argv.validationRound = false; 123 | } 124 | var nextValidationRound = !argv.validationRound; 125 | 126 | // these are the arguments we will pass to ensembler 127 | argv.ensemblerArgs = { 128 | inputFolder: argv.predictionsFolder, 129 | outputFolder: argv.ensemblerOutputFolder, 130 | validationFolder: argv.validationFolder, 131 | fileNameIdentifier: argv.outputFileName, 132 | validationRound: nextValidationRound 133 | }; 134 | 135 | // sometimes we want the probability (.97), sometimes we just want a binary yes or no (1) 136 | if( argv.binaryOutput ) { 137 | argv.kaggleBinaryOutputFolder = path.join(argv.predictionsFolder, 'kaggleBinaryOutput'); 138 | mkdirp(argv.kaggleBinaryOutputFolder); 139 | } 140 | 141 | // sometimes we want matrix output. This is useful when we are trying to, say, categorize a shopper into one of 12 different categories. With matrix output, each of those 12 categories will come out as their own column, with a 0 or 1, as opposed to a single column with values from 1 - 12. 142 | // this hasn't been tested in a while. 
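  // as an illustration with made-up numbers: for a category column with possible values {1, 2, 3},
  // a prediction of category 2 comes out as the row [0, 1, 0] in matrix output, rather than as the single value 2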
143 | if( argv.matrixOutput ) { 144 | argv.matrixOutputFolder = path.join(argv.predictionsFolder, 'matrixOutput'); 145 | mkdirp(argv.matrixOutputFolder); 146 | } 147 | 148 | // store information on all the algos we've trained so far 149 | global.allTrainingResults = []; 150 | global.trainingResultsSummary = {}; 151 | global.trainedAlgoCounts = {}; 152 | global.bestSearchScore = 0; 153 | global.finishedAlgos = 0; 154 | global.copyValidationData = true; 155 | 156 | // each classifier is only allowed to take up half the CPUs on the machine. 157 | // we will be training two in parallel 158 | // this way, if a single classifier takes so long to train that it effectively fails, we can still train classifiers on the other cores 159 | argv.numCPUs = argv.numCPUs || Math.round( argv.computerTotalCPUs / 2 ) + 1; 160 | 161 | // we have several different objects in our classifierListOptions, depending on the length of dataset we're training against. 162 | // rather than trying to build in the logic of figuring out which ones we want before we have formatted and understood our data, just add in all the possible options as keys. 163 | classifierListOptions = classifierListOptions('all'); 164 | for( var algo in classifierListOptions ) { 165 | global.trainedAlgoCounts[algo] = 0; 166 | } 167 | 168 | 169 | // we are setting the minimum threshold an algorithm must hit in order to justify us training that algorithm for an extended period of time. 170 | // this comes into play for algorithms that have a considerably longer longTraining time than testing time, such as our random forests with 1200 trees. 171 | // it takes only ~3 minutes to do the hyperparameter search, but ~40 to do the long training. we obviously don't want to undertake that long training unless that algo is "good enough". 172 | // in this case, good enough is defined as being within 3% of our best algo at that stage in the process. 173 | argv.longTrainThreshold = argv.longTrainThreshold || .97; 174 | argv.continueToTrainThreshold = argv.continueToTrainThreshold || argv.longTrainThreshold; 175 | 176 | // formatting our data can take a long time. Unless you're performing additional feature engineering, the results are basically the same every time we run data-formatter. So, we can save ourselves a lot of time by just using the previously calculated results from data-formatter. 177 | // the entire process the user follows when using previously formatted data is exactly the same as formatting the data anew. You must pass in the exact same arguments to machineJS as you would to run data-formatter from scratch- we depend on that information you're passing in. 178 | // take in a flag to tell machineJS that we want to use previously formatted data. This is always the case when the dev flags have been passed in. 179 | if( argv.alreadyFormatted === undefined ) { 180 | if( argv.dev || argv.ensemble ) { 181 | argv.alreadyFormatted = true; 182 | } else { 183 | argv.alreadyFormatted = false; 184 | } 185 | } 186 | 187 | // if we are using previously formatted data, load in the names of the right files from machineJS/pySetup/testingFileNames.js. Follow instructions in that file for more information on the exact format expected. 
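  // a hypothetical sketch of the shape testingFileNames.js is expected to export -- one property
  // per outputFileName, whose value is the fileNames object data-formatter printed for that dataset
  // (the key and values below are placeholders, not real output):
  //   module.exports = {
  //     rossShortTrainDev: {
  //       X_train: '<path to the X_train .npz file>',
  //       y_train: '<path to the y_train .npz file>',
  //       problemType: 'regression',
  //       trainingDataLength: 50000
  //     }
  //   };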
188 | if( argv.alreadyFormatted ) { 189 | 190 | if( argv.fileNames !== undefined ) { 191 | utils.fileNames = argv.fileNames; 192 | } else { 193 | var fileNamesOptions = require(path.join(argv.machineJSLocation,'pySetup','testingFileNames.js')); 194 | utils.fileNames = fileNamesOptions[argv.outputFileName]; 195 | argv.fileNames = utils.fileNames; 196 | } 197 | 198 | try{ 199 | utils.fileNames = JSON.parse(utils.fileNames); 200 | } catch(err) { 201 | 202 | } 203 | } 204 | 205 | }; 206 | -------------------------------------------------------------------------------- /pySetup/training.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import os 4 | import os.path as path 5 | import json 6 | import joblib 7 | import logging 8 | import time 9 | 10 | import numpy as np 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 13 | from scipy.sparse import csr_matrix, vstack 14 | 15 | from sendMessages import printParent 16 | from sendMessages import messageParent 17 | from sendMessages import obviousPrint 18 | 19 | logging.basicConfig() 20 | 21 | import warnings 22 | startTime = time.time() 23 | 24 | from randomizedSearchList import rsList 25 | randomizedSearchCVList = rsList() 26 | 27 | # these lines will give us an object with keys for each classifier name, and values that will return classifiers to us. 28 | from makeClassifiers import makeClassifiers 29 | globalArgs = json.loads(sys.argv[2]) 30 | fileNames = json.loads(sys.argv[3]) 31 | 32 | classifierName = sys.argv[4] 33 | problemType = sys.argv[5] 34 | bestSearchScore = float(sys.argv[6]) 35 | 36 | sys.path.append(globalArgs['machineJSLocation'] + '/pySetup/parameterMakers') 37 | import paramMakers 38 | 39 | import makeBigClassifiers 40 | import extendedTrainingList 41 | 42 | dev = False 43 | if( globalArgs['dev'] ): 44 | dev = True 45 | 46 | def load_sparse_csr(filename): 47 | loader = np.load(filename) 48 | return csr_matrix(( loader['data'], loader['indices'], loader['indptr']), shape=loader['shape']) 49 | 50 | 51 | classifierCreater = makeClassifiers(globalArgs, dev, problemType) 52 | 53 | X = [] 54 | y = [] 55 | headerRow = [] 56 | 57 | 58 | # for the validationRound, we have saved the data into the dataFile property of globalArgs 59 | if globalArgs['validationRound']: 60 | X_file_name = globalArgs['dataFile'] 61 | # for neural networks, we need to train on data normalized to the range of {0,1} or {-1,1} 62 | # data-formatter did that for us already, so we just have to load in the correct feature data 63 | elif( classifierName[0:4] == 'clnn' ): 64 | X_file_name = fileNames['X_train_nntrainingData'] 65 | else: 66 | X_file_name = fileNames['X_traintrainingData'] 67 | 68 | if globalArgs['validationRound']: 69 | y_file_name = globalArgs['validationYs'] 70 | else: 71 | # for neural networks, the y values do not need to be normalized 72 | y_file_name = fileNames['y_traintrainingData'] 73 | 74 | 75 | try: 76 | X = load_sparse_csr(X_file_name) 77 | 78 | # the following block works for dense arrays 79 | except: 80 | # our X_train file has a header row, so the user can see the results of data-formatter in a pretty way if they'd like. 
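# e.g. (illustrative) the first few lines of such an X_train csv:
#   age,height,gender_female
#   34,1.62,1
#   27,1.8,0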
81 | # we need to remove this row from our actual dataset
82 | # none of our other files from data-formatter have header rows
83 | with open(X_file_name, 'rU') as openInputFile:
84 |     inputRows = csv.reader(openInputFile)
85 |     firstRow = False
86 |     for row in inputRows:
87 |         if(firstRow):
88 |             rowAsFloats = []
89 |             # make sure that floats that were saved as scientific notation are actually read in as floats
90 |             # this should be non-controversial, as by this point we should have turned all categorical data into binary representation (0 or 1).
91 |             for idx, val in enumerate(row):
92 |                 try:
93 |                     val = float(val)
94 |                 except:
95 |                     printParent(headerRow[idx])
96 |                     printParent(val)
97 |                 rowAsFloats.append( val )
98 |             X.append(rowAsFloats)  # keep the float-converted version of the row, not the raw string version
99 |         else:
100 |             headerRow = row
101 |             firstRow = True
102 | 
103 | 
104 | X = np.array(X)
105 | 
106 | try:
107 |     y = load_sparse_csr(y_file_name)
108 | 
109 | except:
110 |     # supports dense input, which is used in the validationRound
111 |     with open(y_file_name, 'rU') as openOutputFile:
112 |         outputRows = csv.reader(openOutputFile)
113 |         # this might be unnecessary now that we have run our data through data-formatter
114 |         # we might be able to load in the y_train data directly
115 |         firstRow = False
116 |         for row in outputRows:
117 |             if firstRow:
118 |                 try:
119 |                     row[0] = float(row[0])
120 |                 except:
121 |                     row[0] = row[0]  # leave non-numeric labels as they are
122 |                 y.append(row[0])
123 |             else:
124 |                 # ignore the first row as it holds our header
125 |                 firstRow = True
126 |     y = np.array(y)
127 | 
128 | try:
129 |     if y.shape[0] == 1:
130 |         y = y.todense().tolist()[0]
131 | except:
132 |     pass
133 | 
134 | if fileNames['testingDataLength'] < 100000:
135 |     # train on all the available (non-validation) data
136 |     testSize = 0
137 |     # a small data set should have many rounds of cross-validation. this will take longer to train, but it means we will be training on more data
138 |     cvRounds = 3
139 | elif fileNames['testingDataLength'] < 200000:
140 |     testSize = .25
141 |     cvRounds = 2
142 | else:
143 |     # if this is the stage 0 round:
144 |     # we have already separated out our validation data (currently 30% of the entire training data set by default)
145 |     # the data that we have loaded in here is the 70% that is not our validation data
146 |     # we want to have 30% of our entire training data set used as our "search" data set, meaning it is ~43% of this 70% data set
147 |     # the number we must give, though, is how much we want saved for testing, which is 1 - .43 = .57
148 |     testSize = .57
149 |     cvRounds = 2
150 | 
151 | if globalArgs['validationRound']:
152 |     # if this is the validation round, we do not want to split our data out any further.
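    # (illustrative numbers: with 1,000 validation rows stacked on top of 41,088 test rows, the code below computes combinedLength = 42,088, validationLength = 1,000, and keeps just those top 1,000 rows)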
153 |     # take only the validation portion of these datasets
154 |     # right now they are the combined validation + test datasets
155 |     # we want them to only be the validation portions
156 |     combinedLength = X.shape[0]
157 |     validationLength = combinedLength - fileNames['testingDataLength']
158 |     validationIndices = range( validationLength )
159 | 
160 |     # slicing the X array to only contain the training data
161 |     X_train = X[validationIndices , : ]
162 | 
163 |     # unless we are doing multi-category or multi-label predictions, we have converted y to be a list, meaning we have to slice it differently
164 |     try:
165 |         # slicing sparse matrices, if y is for multi-label or multi-category predictions
166 |         y_train = y[validationIndices , : ]
167 |     except:
168 |         # slicing standard python lists
169 |         y_train = y[ 0 : validationLength ]
170 | 
171 |     # set X and y equal to the versions of themselves that only have the validation data
172 |     # this makes our lives easier later on when we go to train the big classifier on the "full" dataset
173 |     X = X_train
174 |     y = y_train
175 | 
176 | else:
177 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=0)
178 | 
179 | # if we're developing, train on only a small percentage of the dataset, and do not train the final large classifier (where we significantly bump up the number of estimators).
180 | if dev:
181 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, random_state=0)
182 | 
183 | # instantiate a new classifier, given the type passed in to us
184 | classifier = classifierCreater[classifierName]
185 | 
186 | # if possible, have the algorithm warm_start, taking advantage of the training it's done previously and then simply building on top of that
187 | try:
188 |     classifier.set_params(warm_start=True)
189 | except:
190 |     pass
191 | 
192 | # XGBoost requires data to be in its own particular format.
193 | if classifierName == 'clXGBoost':
194 |     try:
195 |         X_train = classifier.DMatrix( X_train )
196 |         X = classifier.DMatrix( X )
197 |     except:
198 |         pass  # if this classifier object does not expose DMatrix, just keep the original matrices
199 | 
200 | # create hyperparameter search spaces that are custom to the size of the input data.
201 | # Each individual parameterMaker file sits in the parameterMakers folder. If you want to modify what the parameters are, or submit a PR with a better combination of parameters to try, that is the place to start.
202 | allParams = paramMakers.makeAll(X,y,globalArgs, dev, problemType)
203 | parameters_to_try = allParams[classifierName]
204 | 
205 | printParent('we are about to run a cross-validated search for the best hyperparameters for ' + classifierName)
206 | 
207 | try:
208 |     if randomizedSearchCVList[classifierName]:
209 |         # error_score=0 means that if some combinations of parameters fail to train properly, the rest of the search process will still work.
210 |         # numIterationsPerRound defaults to 8, unless the user has passed in a more specific value.
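        # (illustrative) RandomizedSearchCV samples n_iter combinations from the distributions in parameters_to_try (e.g. 8 draws from a scipy.stats.expon for alpha), instead of exhaustively trying every combination the way GridSearchCV does below.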
211 |         n_iter = globalArgs['numIterationsPerRound']
212 |         if classifierName in ['clSGDClassifier']:
213 |             # these algorithms train very quickly, and have many parameters to try, so they get more attempts than other algorithms
214 |             n_iter = n_iter * 2
215 |         searchCV = RandomizedSearchCV(classifier, parameters_to_try, n_jobs=globalArgs['numCPUs'], error_score=0, n_iter=n_iter, refit=True, cv=cvRounds)
216 |     else:
217 |         searchCV = GridSearchCV(classifier, parameters_to_try, n_jobs=globalArgs['numCPUs'], error_score=0, refit=True, cv=cvRounds)
218 | except:
219 |     searchCV = GridSearchCV(classifier, parameters_to_try, n_jobs=globalArgs['numCPUs'], error_score=0, refit=True, cv=cvRounds)
220 | 
221 | searchCV.fit(X_train, y_train )
222 | printParent('\n')
223 | printParent('*********************************************************************************************************')
224 | printParent(classifierName + "'s best score from the hyperparameter search attempts is:")
225 | printParent(searchCV.best_score_)
226 | printParent('*********************************************************************************************************')
227 | printParent(classifierName + "'s best parameters this time are:")
228 | printParent(searchCV.best_params_)
229 | printParent('\n')
230 | 
231 | printParent(classifierName + "'s total hyperparameter searching time is:")
232 | # this will give the time in minutes, to one decimal place
233 | finishTrainTime = time.time()
234 | printParent( round((finishTrainTime - startTime)/60, 1) )
235 | 
236 | 
237 | longTrainThreshold = bestSearchScore * globalArgs['longTrainThreshold']
238 | messageObj = {
239 |     "searchScore": searchCV.best_score_,
240 |     "algoName": classifierName
241 | }
242 | 
243 | # only put in the (oftentimes considerable) effort of longTraining this algorithm if it meets the threshold defined by longTrainThreshold
244 | # and do not long-train anything until we have a meaningful bestSearchScore to compare against (that is what the longTrainThreshold > 0 check below guards); long training is a time-consuming process for an algorithm that is probably not yet very well optimized
245 | # Get info on whether this algo supports creating a larger version of itself.
246 | # for example, a random forest you can train with more trees, a neural network you can train for more epochs, etc.
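# worked example (illustrative): with the default longTrainThreshold of .97, if our best algo so far scored 0.90, this algo only earns the long training run if its search score beat 0.90 * .97 = 0.873.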
247 | extendedTraining = extendedTrainingList.getAll()[classifierName]
248 | if ((searchCV.best_score_ > longTrainThreshold and longTrainThreshold > 0) or globalArgs['validationRound']) and extendedTraining:
249 | 
250 |     allBigClassifiers = makeBigClassifiers.makeAll(globalArgs, dev, problemType)
251 |     longTrainClassifier = allBigClassifiers[classifierName]
252 | 
253 |     longTrainClassifier.set_params(**searchCV.best_params_)
254 | 
255 | # grab the best estimator from our searchCV
256 | else:
257 |     longTrainClassifier = searchCV.best_estimator_
258 | 
259 | startLongTrainTime = time.time()
260 | 
261 | # when doing the cross-validated search, we have potentially been holding out a significant portion of the dataset
262 | # once we have found the best hyperparameters, train on the entire dataset
263 | # we have already verified that this is the best set of hyperparameters using cross-validation
264 | if X.shape[0] != X_train.shape[0] or extendedTraining:
265 |     longTrainClassifier.fit(X, y)
266 | 
267 | finishLongTrainTime = time.time()
268 | printParent(classifierName + "'s training on the longer data set took:")
269 | printParent( round((finishLongTrainTime - startLongTrainTime)/60, 1) )
270 | 
271 | 
272 | longTrainClassifierScore = longTrainClassifier.score(X, y)
273 | printParent(classifierName + "'s score against the larger training data set is:")
274 | printParent(longTrainClassifierScore)
275 | messageObj['longTrainScore'] = longTrainClassifierScore
276 | 
277 | 
278 | # save our classifiers from the validationRound to a separate folder
279 | if globalArgs['validationRound']:
280 |     classifierFolder = path.join(globalArgs['bestClassifiersFolder'], 'ensemblingAlgos', 'best' + classifierName)
281 | else:
282 |     classifierFolder = path.join(globalArgs['bestClassifiersFolder'], 'best' + classifierName)
283 | 
284 | if not os.path.exists(classifierFolder):
285 |     os.makedirs(classifierFolder)
286 | 
287 | joblib.dump(longTrainClassifier, path.join(classifierFolder, 'best' + classifierName + '.pkl') )
288 | 
289 | messageParent(messageObj, 'trainingResults')
290 | 
-------------------------------------------------------------------------------- /pySetup/makePredictions.py: --------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import os.path as path
4 | import sys
5 | import csv
6 | import time
7 | import joblib
8 | import numpy as np
9 | import pandas as pd
10 | import logging
11 | import xgboost
12 | 
13 | from scipy.sparse import csr_matrix, vstack
14 | 
15 | from sendMessages import printParent
16 | from sendMessages import messageParent
17 | from sendMessages import obviousPrint
18 | 
19 | logging.basicConfig()
20 | 
21 | fileNames = json.loads(sys.argv[4])
22 | classifierName = sys.argv[5]
23 | argv = json.loads(sys.argv[3])
24 | problemType = sys.argv[6]
25 | trainingScore = sys.argv[7]
26 | copyValidationData = sys.argv[8]
27 | searchScore = sys.argv[9]
28 | 
29 | if argv['validationRound']:
30 |     X_file_name = argv['dataFile']
31 |     nn = False  # nn is checked near the end of this script; default it here so the validation round does not hit a NameError
32 | else:
33 |     if( classifierName[0:4] == 'clnn' ):
34 |         nn = True
35 |         X_file_name = fileNames['X_test_nn']
36 |     else:
37 |         nn = False
38 |         X_file_name = fileNames['X_test']
39 | 
40 | id_file_name = fileNames['id_test']
41 | 
42 | 
43 | XTest = []
44 | testIDColumn = []
45 | 
46 | # load up the prediction data set, without the header row
47 | try:
48 |     def load_sparse_csr(filename):
49 |         loader = np.load(filename)
50 |         return csr_matrix(( loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
51 | 
52 |     XTest = load_sparse_csr(X_file_name)
53 | except:
54 |     with open(X_file_name, 'rU') as x_file:
55 |         inputRows = csv.reader(x_file)
56 |         headerRow = False
57 |         for row in inputRows:
58 |             if(headerRow):
59 |                 XTest.append(row)
60 |             else:
61 |                 headerRow = True
62 | 
63 | if argv['validationRound']:
64 |     # in the validation file, we have combined the validationData and the test data
65 |     # split out to only have the test data
66 |     testLength = fileNames['testingDataLength']
67 |     combinedValidationLength = XTest.shape[0]
68 |     testIndices = range( combinedValidationLength - testLength, combinedValidationLength)
69 | 
70 |     XTest = XTest[ testIndices , : ]
71 | 
72 | # it should be pretty safe to convert the testIDColumn to a list, since it is always going to be a single value per row
73 | # to get a single vector (in this case, our ID column) to be saved as a sparse matrix, we have to do some vaguely hacky stuff
74 | # the following line converts it to a normal python list
75 | testIDColumn = load_sparse_csr( id_file_name ).todense().tolist()[0]
76 | 
77 | y_file_name = fileNames.get('y_train')  # assumption: only used by the fallback below, which expects a csv copy of the y values
78 | try:
79 |     idHeader = fileNames['idHeader']
80 |     outputHeader = fileNames['outputHeader']
81 | except:
82 |     idHeader = 'id'  # fallback when fileNames does not store the headers: use a generic id header, and read the pretty output header from the first row of the y file
83 |     with open(y_file_name, 'rU') as y_file:
84 |         inputRows = csv.reader(y_file)
85 |         outputHeader = False
86 |         for row in inputRows:
87 |             if outputHeader == False:
88 |                 outputHeader = row[0]
89 |             else:
90 |                 pass
91 | 
92 | if argv['validationRound']:
93 |     classifierFile = path.join( argv['bestClassifiersFolder'], 'ensemblingAlgos', 'best' + classifierName, 'best' + classifierName + '.pkl')
94 | else:
95 |     classifierFile = path.join( argv['bestClassifiersFolder'], 'best' + classifierName, 'best' + classifierName + '.pkl')
96 | 
97 | # load up the previously trained (and tuned!) classifier
98 | classifier = joblib.load( classifierFile )
99 | 
100 | try:
101 |     classifier.set_params(n_jobs=-1)  # use every available core for predictions, when the classifier supports n_jobs
102 | except:
103 |     pass
104 | 
105 | 
106 | # get predictions for each item in the prediction data set
107 | if problemType == 'category':
108 |     try:
109 |         testDataPredictions = classifier.predict_proba(XTest)
110 |     except:
111 |         # perceptron does not support predict_proba
112 |         # and MultinomialNB does not do probability predictions all that well
113 |         testDataPredictions = classifier.predict(XTest)
114 | 
115 | # else will handle both regression and multi-category predictions at the moment.
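# (illustrative) predict_proba returns one column per class, e.g. [[0.03, 0.97], ...] for a binary problem, while predict returns one value per row, e.g. [1, 0, ...]; the file writers below handle both shapes.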
116 | else:
117 |     testDataPredictions = classifier.predict(XTest)
118 | 
119 | 
120 | if not argv['validationRound']:
121 |     validationFile = fileNames['X_trainvalidationData']
122 |     validationData = load_sparse_csr(validationFile)
123 |     validationIdFile = fileNames['id_trainvalidationData']
124 |     validationIDs = load_sparse_csr( validationIdFile ).todense().tolist()[0]
125 | 
126 |     if nn:
127 |         validationYFile = fileNames['y_train_nnvalidationData']
128 |     else:
129 |         validationYFile = fileNames['y_trainvalidationData']
130 |     validationY = load_sparse_csr(validationYFile).todense().tolist()[0]
131 | 
132 | 
133 |     if problemType == 'category':
134 |         try:
135 |             validationPredictions = classifier.predict_proba(validationData)
136 |         except:
137 |             validationPredictions = classifier.predict(validationData)
138 | 
139 |     else:
140 |         # else will handle both regression and multi-category predictions for now
141 |         validationPredictions = classifier.predict(validationData)
142 | 
143 |     validationScore = classifier.score(validationData,validationY)
144 | 
145 |     printParent('\n')
146 |     printParent('***************')
147 |     printParent(classifierName + "'s score on the validation set is:")
148 |     printParent(validationScore)
149 |     printParent('***************')
150 | else:
151 |     # we still need something to write to the file, so we will write the score from the hyperparameter search, which is the cross-validation score on the holdout data from that search. in that way, it's actually a pretty accurate score to be using.
152 |     validationScore = searchScore
153 | 
154 | # write our predictions on the test data to a file
155 | if argv['validationRound']:
156 |     predictionsPath = path.join( argv['predictionsFolder'], 'ensembledPredictions' )
157 | 
158 | else:
159 |     predictionsPath = argv['predictionsFolder']
160 | 
161 | 
162 | # using the outputFileName here so that if people have different input files (different feature engineering), that will show up in our file names.
163 | predictionsFileName = argv['outputFileName'] + classifierName + str(time.time()) + '.csv'
164 | 
165 | # create the directory if it doesn't exist already
166 | if not os.path.exists(predictionsPath):
167 |     os.makedirs(predictionsPath)
168 | 
169 | with open( path.join(predictionsPath, predictionsFileName) , 'w+') as predictionsFile:
170 |     csvwriter = csv.writer(predictionsFile)
171 | 
172 |     # we are going to have to modify this when we allow it to make categorical predictions too.
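    # (illustrative) each predictions file therefore starts like:
    #   0.9731,0.9812        <- validation score, then training score
    #   Id,sales             <- the pretty header row
    #   1,4091.2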
173 |     # write the scores to the top row
174 |     csvwriter.writerow([validationScore, trainingScore])
175 |     csvwriter.writerow([idHeader,outputHeader])
176 |     for idx, prediction in enumerate(testDataPredictions):
177 |         rowID = testIDColumn[idx]
178 | 
179 |         try:
180 |             len(prediction)
181 |             csvwriter.writerow([int(rowID),prediction[1]])
182 |         except:
183 |             csvwriter.writerow([int(rowID),prediction])
184 | 
185 | if not argv['validationRound']:
186 | 
187 |     # write our validation predictions to a file too
188 |     validationPath = path.join( predictionsPath, 'validation')
189 |     validationFileName = argv['outputFileName'] + classifierName + str(time.time()) + '.csv'
190 | 
191 |     # to keep things super consistent, we will combine our test and validation data, so there's no risk of order getting mixed up in ensembler
192 |     totalPredictions = np.concatenate( (validationPredictions, testDataPredictions), axis=0 )
193 |     validationAndTestIDs = np.concatenate( (validationIDs, testIDColumn), axis=0 )
194 | 
195 |     with open( path.join(validationPath, validationFileName) , 'w+') as validationFile:
196 |         csvwriter = csv.writer(validationFile)
197 | 
198 |         # at the top of each validation file, write the score for that classifier on the validation set
199 |         csvwriter.writerow([validationScore, trainingScore])
200 | 
201 |         # we are going to have to modify this when we allow it to make categorical predictions too.
202 |         csvwriter.writerow([idHeader,outputHeader])
203 |         for idx, prediction in enumerate(totalPredictions):
204 |             rowID = validationAndTestIDs[idx]
205 |             try:
206 |                 len(prediction)
207 |                 csvwriter.writerow([int(rowID),prediction[1]])
208 |             except:
209 |                 csvwriter.writerow([int(rowID),prediction])
210 | 
211 | # continued callout to the person originally responsible for this function:
212 | # http://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format
213 | def save_sparse_csr(filename,array):
214 |     np.savez(filename,data=array.data ,indices=array.indices, indptr=array.indptr, shape=array.shape )
215 | 
216 | if copyValidationData == 'true' and nn == False:  # copyValidationData arrives on the command line as a string, like the other boolean flags checked below
217 |     allValidationDataFile = path.join( validationPath, 'validationData.npz')
218 |     allValidationIDsFile = path.join( validationPath, 'validationIDs.npz')
219 |     allValidationYsFile = path.join( validationPath, 'validationYs.npz')
220 | 
221 |     # to make sure we keep everything consistent, we write the combined validation data and test data to a file
222 |     allValidationData = vstack( [validationData, XTest] )
223 |     save_sparse_csr(allValidationDataFile, allValidationData)
224 | 
225 |     # we already loaded in this data, but then immediately converted it to a dense list.
226 |     # so we are going to load it in again, this time as a sparse csr matrix, and then immediately save it as a sparse csr matrix elsewhere
227 |     # we could just as easily copy the original file to a new location, but since we're not copying anywhere else, this is slightly more consistent stylistically
228 |     validationSparseIDs = load_sparse_csr( validationIdFile )
229 |     save_sparse_csr( allValidationIDsFile, validationSparseIDs )
230 | 
231 |     validationSparseYs = load_sparse_csr(validationYFile)
232 |     save_sparse_csr( allValidationYsFile, validationSparseYs )
233 | 
234 |     # with open( path.join(validationPath, 'validationIDsAndY.csv') , 'w+') as validationFile:
235 |     #     csvwriter = csv.writer(validationFile)
236 | 
237 |     #     # we are going to have to modify this when we allow it to make categorical predictions too.
238 |     #     csvwriter.writerow([idHeader,outputHeader])
239 |     #     for idx, rowID in enumerate(validationAndTestIDs):
240 |     #         # our test data will not have y values attached, so we will try to find a y value for this ID, but if we can't, we assume it is a test value, and we set the y value to None
241 |     #         try:
242 |     #             yValue = validationY[idx]
243 |     #         except:
244 |     #             yValue = None
245 |     #         try:
246 |     #             len(yValue)
247 |     #             csvwriter.writerow([int(rowID),yValue[1]])
248 |     #         except:
249 |     #             csvwriter.writerow([int(rowID),yValue])
250 | 
251 | # The following sections write our output in a format that the user requested. This output is not used for anything else later down the line in machineJS or ensembler; it is solely for the user.
252 | 
253 | 
254 | # if the final output is binary, create a separate file at this stage that can be easily uploaded to kaggle by rounding the predicted value to the nearest int
255 | # We will use the actual probability in ensembler, but it's nice at this stage to be able to upload results to kaggle and get some feedback
256 | if argv['binaryOutput'] == 'true':
257 | 
258 |     # write these files into the kaggleBinaryOutput folder to make it obvious that they are for kaggle
259 |     # this also keeps the rest of our files consistent for ensembler
260 |     kagglePath = argv['kaggleBinaryOutputFolder']
261 |     kaggleFileName = argv['outputFileName'] + classifierName + str(time.time()) + '.csv'
262 |     with open( path.join(kagglePath, kaggleFileName) , 'w+') as predictionsFile:
263 |         csvwriter = csv.writer(predictionsFile)
264 | 
265 |         csvwriter.writerow([idHeader,outputHeader])
266 |         for idx, prediction in enumerate(testDataPredictions):
267 | 
268 |             rowID = testIDColumn[idx]
269 |             # predict_proba returns one probability per class, so for binary problems prediction[1] is the probability of the positive class
270 |             # plain predict returns a single scalar instead, which is what the except below handles
271 |             try:
272 |                 len(prediction)
273 |                 prediction = int( round( prediction[1] ) )
274 |             except:
275 |                 prediction = int( round( prediction ) )
276 |                 pass
277 |             csvwriter.writerow( [rowID,prediction] )
278 | 
279 | # for multi-category data, we can choose to output a single column with all the categories contained in that column, or we can translate that into a set of binary columns, where each column represents a single categorical value.
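# (illustrative) pd.get_dummies is what does that translation below: pd.get_dummies(pd.Series([3, 8, 3]), prefix='triptype') yields columns triptype_3 and triptype_8 with rows [1, 0], [0, 1], [1, 0].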
280 | # if the final output is matrixOutput, create a separate file at this stage that can be easily referenced by the user
281 | # We will use the single categorical column in ensembler, but it's nice at this stage to be able to view results in the expected format and get some feedback
282 | if argv['matrixOutput'] == 'true':
283 | 
284 |     # convert our predictions on the test set to a pandas series
285 |     pdPredictions = pd.Series(testDataPredictions)
286 | 
287 |     # take our single column of category predictions, and turn it into a matrix, where each column represents a yes or no for a single category
288 |     # prefix puts our outputHeader in front of each of the values for our header row
289 |     matrixPredictions = pd.get_dummies(pdPredictions, prefix=outputHeader)
290 |     # get the header row from the data frame:
291 |     matrixHeaderRow = matrixPredictions.columns.values.tolist()
292 |     # convert from a pandas data frame to a python list
293 |     matrixPredictions = matrixPredictions.values.tolist()
294 | 
295 |     # add the id to the header row
296 |     outputFileHeaderRow = [idHeader] + matrixHeaderRow
297 | 
298 |     # write these files into the matrixOutput folder to make it obvious what they are
299 |     # this also keeps the rest of our files consistent for ensembler
300 |     matrixPath = argv['matrixOutputFolder']
301 |     matrixFileName = argv['outputFileName'] + classifierName + str(time.time()) + '.csv'
302 |     with open( path.join(matrixPath, matrixFileName) , 'w+') as predictionsFile:
303 |         csvwriter = csv.writer(predictionsFile)
304 | 
305 |         csvwriter.writerow(outputFileHeaderRow)
306 |         for idx, listOfMatrixPredictions in enumerate(matrixPredictions):
307 | 
308 |             rowID = testIDColumn[idx]
309 |             csvwriter.writerow( [rowID] + listOfMatrixPredictions )
310 | 
-------------------------------------------------------------------------------- /pySetup/testingFileNames.js: --------------------------------------------------------------------------------
1 | // right now, machineJS is designed to console.log the list of fileNames
2 | // if you copy/paste them in here, you can add in the --alreadyFormatted flag
3 | // this will allow you to skip over repeating the data-formatter part of the process if you've already run it
4 | // make sure you name your new object after the training data set you passed in
5 | // right now, that name can be found as the last word after the "_" before the ".npz" file extension
6 | // to add your own entry: look at the examples below, note each object's property name, find where that name appears inside each file name, and use the matching part of your own file names as your property name.
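// e.g. (illustrative): for files like X_train_rossmantrain.npz, the piece between the final "_" and ".npz" is "rossmantrain", so the matching entry below is keyed rossmantrain.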
7 | 8 | module.exports = { 9 | 10 | rossmantrain: { 11 | idHeader: 'Id', 12 | outputHeader: 'sales', 13 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_rossmantrain.npz', 14 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_rossmantrain.npz', 15 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_rossmantestrossmantrain.npz', 16 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_rossmantestrossmantrain.npz', 17 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_rossmantrain.npz', 18 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_rossmantrain.npz', 19 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_rossmantrain.npz', 20 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_rossmantestrossmantrain.npz', 21 | testingDataLength: 41088, 22 | trainingDataLength: 1017209, 23 | problemType: 'regression' 24 | }, 25 | 26 | numerai_training_data_tournament: { 27 | idHeader: "t_id", 28 | outputHeader: "target", 29 | id_train: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_numerai_training_data_tournament.npz", 30 | y_train: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_numerai_training_data_tournament.npz", 31 | id_test: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_numerai_tournament_datanumerai_training_data_tournament.npz", 32 | X_test: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_numerai_tournament_datanumerai_training_data_tournament.npz", 33 | X_train: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_numerai_training_data_tournament.npz", 34 | X_train_nn: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_numerai_training_data_tournament.npz", 35 | y_train_nn: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_numerai_training_data_tournament.npz", 36 | X_test_nn: "/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_numerai_tournament_datanumerai_training_data_tournament.npz", 37 | testingDataLength: 19461, 38 | trainingDataLength: 55038, 39 | problemType: "category" 40 | }, 41 | 42 | homesitetrain: { 43 | idHeader: 'quotenumber', 44 | outputHeader: 'quoteconversion_flag', 45 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_homesitetrain.npz', 46 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_homesitetrain.npz', 47 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_homesitetesthomesitetrain.npz', 48 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_homesitetesthomesitetrain.npz', 49 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_homesitetrain.npz', 50 | X_train_nn: 
'/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_homesitetrain.npz', 51 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_homesitetrain.npz', 52 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_homesitetesthomesitetrain.npz', 53 | testingDataLength: 173836, 54 | trainingDataLength: 260753, 55 | problemType: 'category' 56 | }, 57 | 58 | walmarttrain: { 59 | idHeader: 'visitnumber', 60 | outputHeader: 'triptype', 61 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_walmarttrain.npz', 62 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_walmarttrain.npz', 63 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_walmarttestwalmarttrain.npz', 64 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_walmarttestwalmarttrain.npz', 65 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_walmarttrain.npz', 66 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_walmarttrain.npz', 67 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_walmarttrain.npz', 68 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_walmarttestwalmarttrain.npz', 69 | testingDataLength: 9935, 70 | trainingDataLength: 95674, 71 | problemType: 'multi-category' 72 | }, 73 | 74 | shortTrain: { 75 | idHeader: 'visitnumber', 76 | outputHeader: 'triptype', 77 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_shortTrain.npz', 78 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_shortTrain.npz', 79 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_shortTestshortTrain.npz', 80 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_shortTestshortTrain.npz', 81 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_shortTrain.npz', 82 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_shortTrain.npz', 83 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_shortTrain.npz', 84 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_shortTestshortTrain.npz', 85 | testingDataLength: 9935, 86 | trainingDataLength: 10115, 87 | problemType: 'multi-category' 88 | }, 89 | 90 | telstratrain: { 91 | idHeader: 'id', 92 | outputHeader: 'fault_severity', 93 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_telstratrain.npz', 94 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_telstratrain.npz', 95 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_telstratesttelstratrain.npz', 96 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_telstratesttelstratrain.npz', 
97 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_telstratrain.npz', 98 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_telstratrain.npz', 99 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_telstratrain.npz', 100 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_telstratesttelstratrain.npz', 101 | testingDataLength: 11171, 102 | trainingDataLength: 7381, 103 | problemType: 'multi-category' 104 | }, 105 | 106 | numerai_training_data: { idHeader: 't_id', 107 | outputHeader: 'target', 108 | id_train: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/id_train_numerai_training_data.npz', 109 | y_train: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/y_train_numerai_training_data.npz', 110 | id_test: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/id_test_numerai_tournament_datanumerai_training_data.npz', 111 | X_test: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/X_test_numerai_tournament_datanumerai_training_data.npz', 112 | X_train: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/X_train_numerai_training_data.npz', 113 | X_train_nn: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/X_train_nn_numerai_training_data.npz', 114 | y_train_nn: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/y_train_nn_numerai_training_data.npz', 115 | X_test_nn: '/Users/preston/Desktop/machineJS/pySetup/data-formatterResults/X_test_nn_numerai_tournament_datanumerai_training_data.npz', 116 | testingDataLength: 19461, 117 | trainingDataLength: 55038, 118 | problemType: 'category' }, 119 | 120 | // here's an example of what the fileNames will look like if you engaged in labelEncoding 121 | evenTrain: { labelMapping: 122 | { 'Arbitration': 0, 123 | 'Billing disputes': 1, 124 | 'Adding money': 8, 125 | 'Overdraft, savings or rewards features': 3, 126 | 'Money was not available when promised': 14, 127 | 'Unexpected/Other fees': 5, 128 | 'Credit monitoring or identity protection': 6, 129 | 'Making/receiving payments, sending money': 7, 130 | 'Loan servicing, payments, escrow account': 2, 131 | 'Unsolicited issuance of credit card': 9, 132 | 'Improper use of my credit report': 10, 133 | 'Lender sold the property': 11, 134 | 'Sale of account': 12, 135 | 'Problems when you are unable to pay': 13, 136 | 'Improper contact or sharing of info': 4, 137 | 'Shopping for a loan or lease': 15, 138 | 'Getting a loan': 16, 139 | 'Excessive fees': 17, 140 | 'Identity theft / Fraud / Embezzlement': 18, 141 | 'Advertising and marketing': 19, 142 | 'Bankruptcy': 20, 143 | 'Communication tactics': 21, 144 | 'Cont\'d attempts collect debt not owed': 22, 145 | 'Charged fees or interest I didn\'t expect': 23, 146 | 'Lost or stolen check': 24, 147 | 'Lender damaged or destroyed vehicle': 25, 148 | 'Forbearance / Workout plans': 26, 149 | 'Taking/threatening an illegal action': 27, 150 | 'Received a loan I didn\'t apply for': 28, 151 | 'Fraud or scam': 29, 152 | 'Account terms and changes': 30, 153 | 'Application, originator, mortgage broker': 31, 154 | 'Other fee': 32, 155 | 'Convenience checks': 33, 156 | 'Incorrect/missing disclosures or info': 34, 157 | 'Incorrect information on credit report': 35, 158 | 'Can\'t contact lender': 36, 159 | 'Taking out the loan or lease': 37, 160 | 'Application processing 
delay': 38, 161 | 'Using a debit or ATM card': 39, 162 | 'Advertising, marketing or disclosures': 40, 163 | 'Credit card protection / Debt protection': 41, 164 | 'Late fee': 42, 165 | 'Credit reporting company\'s investigation': 43, 166 | 'Managing the loan or lease': 44, 167 | 'Can\'t repay my loan': 45, 168 | 'Other transaction issues': 46, 169 | 'Privacy': 47, 170 | 'Payment to acct not credited': 48, 171 | 'Balance transfer': 71, 172 | 'Transaction issue': 50, 173 | 'Disclosure verification of debt': 51, 174 | 'Rewards': 52, 175 | 'Incorrect exchange rate': 53, 176 | 'Credit decision / Underwriting': 54, 177 | 'Lost or stolen money order': 55, 178 | 'Unauthorized transactions/trans. issues': 56, 179 | 'Lender repossessed or sold the vehicle': 57, 180 | 'Shopping for a line of credit': 58, 181 | 'Deposits and withdrawals': 59, 182 | 'Account opening, closing, or management': 60, 183 | 'Can\'t stop charges to bank account': 61, 184 | 'Balance transfer fee': 62, 185 | 'Wrong amount charged or received': 63, 186 | 'Customer service / Customer relations': 64, 187 | 'Applied for loan/did not receive money': 65, 188 | 'Credit determination': 66, 189 | 'Fees': 67, 190 | 'Disclosures': 68, 191 | 'Managing, opening, or closing account': 69, 192 | 'APR or interest rate': 70, 193 | 'Closing/Cancelling account': 49, 194 | 'Loan modification,collection,foreclosure': 72, 195 | 'Dealing with my lender or servicer': 73, 196 | 'Other': 74, 197 | 'Managing the line of credit': 75, 198 | 'Charged bank acct wrong day or amt': 76, 199 | 'Overlimit fee': 77, 200 | 'Unable to get credit report/credit score': 78, 201 | 'Delinquent account': 79, 202 | 'Cash advance fee': 80, 203 | 'Problems caused by my funds being low': 81, 204 | 'Other service issues': 82, 205 | 'False statements or representation': 83, 206 | 'Cash advance': 84, 207 | 'Credit line increase/decrease': 85, 208 | 'Customer service/Customer relations': 86, 209 | 'Settlement process and costs': 87, 210 | 'Payoff process': 88, 211 | 'Billing statement': 89 }, 212 | labelEncoded: true, 213 | idHeader: 'id', 214 | outputHeader: 'issue', 215 | id_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_train_evenTrain.npz', 216 | id_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/id_test_evenTestevenTrain.npz', 217 | y_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_evenTrain.npz', 218 | validation_split_column: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/validation_split_column_evenTrain.npz', 219 | hasCustomValidationSplit: false, 220 | X_test: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_evenTestevenTrain.npz', 221 | X_train: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_evenTrain.npz', 222 | X_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_train_nn_evenTrain.npz', 223 | y_train_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/y_train_nn_evenTrain.npz', 224 | X_test_nn: '/Users/preston/ghLocal/machineLearningWork/machineJS/pySetup/data-formatterResults/X_test_nn_evenTestevenTrain.npz', 225 | testingDataLength: 5710, 226 | trainingDataLength: 51399, 227 | problemType: 'multi-category' } 228 | }; 229 | 230 | --------------------------------------------------------------------------------
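As a closing reference, here is a minimal, self-contained sketch (not part of the repo) of reading one of the .npz feature files listed above back into memory; it simply mirrors the load_sparse_csr helper defined in pySetup/training.py and pySetup/makePredictions.py, and the file path is a placeholder you would swap for any of the X_train entries above.

import numpy as np
from scipy.sparse import csr_matrix

def load_sparse_csr(filename):
    # the npz holds the three csr component arrays plus the shape, exactly as save_sparse_csr wrote them
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])

X_train = load_sparse_csr('pySetup/data-formatterResults/X_train_rossmantrain.npz')  # placeholder path
print(X_train.shape)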