├── .gitignore ├── LICENSE ├── README.md └── otto ├── __init__.py ├── model ├── __init__.py ├── model_01_bagging_linear │ ├── __init__.py │ └── bagging_linear.py ├── model_02_random_forest │ ├── __init__.py │ └── random_forest.py ├── model_03_svm │ ├── __init__.py │ └── svm.py ├── model_04_rgf │ ├── __init__.py │ └── rgf.py ├── model_05_bagging_nn_rmsprop │ ├── __init__.py │ └── bagging_nn_rmsprop.py ├── model_06_xgboost │ ├── __init__.py │ └── xgboost.py ├── model_07_bagging_nn_nesterov │ ├── __init__.py │ └── bagging_nn_nesterov.py ├── model_08_random_forest_calibrated │ ├── __init__.py │ └── random_forest_calibrated.py ├── model_09_nn_adagrad │ ├── __init__.py │ └── nn_adagrad.py ├── model_10_nn_adagrad_pca │ ├── __init__.py │ └── nn_adagrad_pca.py ├── model_11_xgboost_poly │ ├── __init__.py │ └── xgboost_poly.py ├── model_12_nn_rmsprop_pca │ ├── __init__.py │ └── nn_rmsprop_pca.py ├── model_13_nn_rmsprop_features │ ├── __init__.py │ └── nn_rmsprop_features.py ├── model_14_bagging_xgboost │ ├── __init__.py │ └── bagging_xgboost.py ├── model_15_nn_adagrad_pca │ ├── __init__.py │ └── nn_adagrad_pca.py ├── model_16_random_forest_calibrated_feature_selection │ ├── __init__.py │ └── random_forest_calibrated_feature_selection.py └── model_17_nn_adagrad_log │ ├── __init__.py │ └── nn_adagrad_log.py └── otto_utils ├── __init__.py ├── blender.py ├── consts.py ├── ensembler.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python compiled and temporary files 2 | *.py[c~] 3 | 4 | # PyCharm project files 5 | otto/.idea/* 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Adam 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of kaggle_otto nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Otto Group Product Classification Challenge 2 | 3 | Solution for achieving place 66th/3514 on private leaderboard. 4 | 5 | It contains: 6 | * Neural Networks 7 | * XGBoost 8 | * Random Forest 9 | * SVM 10 | * Regularized Greedy Forest 11 | * Linear model 12 | 13 | However only top four kind of algorithms were used to build final ensemble. 14 | 15 | You can find more information on my [blog](http://blog.aicry.com/kaggle-otto-group-product-classification-challenge/). 16 | -------------------------------------------------------------------------------- /otto/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_01_bagging_linear/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_01_bagging_linear/bagging_linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | It achieves around 0.52914588084 log loss on holdout set 3 | """ 4 | 5 | import numpy as np 6 | import os 7 | 8 | from sklearn import ensemble, feature_extraction, linear_model, preprocessing 9 | from sklearn.svm import LinearSVC 10 | 11 | from otto_utils import consts, utils 12 | 13 | 14 | MODEL_NAME = 'model_01_bagging_linear' 15 | MODE = 'holdout' # cv|submission|holdout 16 | 17 | # import data 18 | train, labels, test, _, _ = utils.load_data() 19 | 20 | # polynomial features 21 | poly_feat = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True) 22 | train = poly_feat.fit_transform(train, labels) 23 | test = poly_feat.transform(test) 24 | 25 | print train.shape 26 | 27 | # transform counts to TFIDF features 28 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 29 | train = tfidf.fit_transform(train).toarray() 30 | test = tfidf.transform(test).toarray() 31 | 32 | # feature selection 33 | feat_selector = LinearSVC(C=0.3, penalty='l1', dual=False) 34 | train = feat_selector.fit_transform(train, labels) 35 | test = feat_selector.transform(test) 36 | 37 | print train.shape 38 | 39 | # encode labels 40 | lbl_enc = preprocessing.LabelEncoder() 41 | labels = lbl_enc.fit_transform(labels) 42 | 43 | # train classifier 44 | linear_clf = linear_model.LogisticRegression(C=1, penalty='l1', 45 | fit_intercept=True, random_state=23) 46 | 47 | clf = ensemble.BaggingClassifier(base_estimator=linear_clf, n_estimators=40, 48 | max_samples=1., max_features=1., bootstrap=True, 49 | n_jobs=5, verbose=True, random_state=23) 50 | 51 | if MODE == 'cv': 52 | scores, predictions = utils.make_blender_cv(clf, train, labels) 53 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 54 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 55 | elif MODE == 'submission': 56 | clf.fit(train, labels) 57 | predictions = clf.predict_proba(test) 58 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 59 | 
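# Editor's note (not part of the original bagging_linear.py): judging by the
# constant names, DATA_SAMPLE_SUBMISSION_PATH is presumably the Kaggle
# sample-submission CSV used as a template for the output format, and each
# model's predictions are written into ENSEMBLE_PATH so the otto_utils scripts
# (blender.py / ensembler.py) can combine them later. The same
# cv/submission/holdout MODE switch is repeated in every model_* script below.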
os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 60 | predictions) 61 | elif MODE == 'holdout': 62 | score = utils.hold_out_evaluation(clf, train, labels) 63 | print 'Log loss:', score 64 | else: 65 | print 'Unknown mode' 66 | 67 | 68 | -------------------------------------------------------------------------------- /otto/model/model_02_random_forest/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_02_random_forest/random_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | from sklearn import ensemble, feature_extraction, preprocessing 5 | 6 | from otto_utils import consts, utils 7 | 8 | 9 | MODEL_NAME = 'model_02_random_forest' 10 | MODE = 'holdout' # cv|submission|holdout 11 | 12 | # import data 13 | train, labels, test, _, _ = utils.load_data() 14 | 15 | # transform counts to TFIDF features 16 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 17 | train = tfidf.fit_transform(train).toarray() 18 | test = tfidf.transform(test).toarray() 19 | 20 | # encode labels 21 | lbl_enc = preprocessing.LabelEncoder() 22 | labels = lbl_enc.fit_transform(labels) 23 | 24 | # train classifier 25 | clf = ensemble.ExtraTreesClassifier(n_jobs=4, n_estimators=2000, max_features=20, min_samples_split=3, 26 | bootstrap=False, verbose=3, random_state=23) 27 | 28 | if MODE == 'cv': 29 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 30 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 31 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 32 | elif MODE == 'submission': 33 | clf.fit(train, labels) 34 | predictions = clf.predict_proba(test) 35 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 36 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 37 | predictions) 38 | elif MODE == 'holdout': 39 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 40 | print 'Log loss:', score 41 | else: 42 | print 'Unknown mode' 43 | -------------------------------------------------------------------------------- /otto/model/model_03_svm/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_03_svm/svm.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold CV - log loss 0.513778609795 3 | """ 4 | import numpy as np 5 | import os 6 | 7 | from hyperopt import fmin, hp, tpe 8 | 9 | from sklearn import feature_extraction, preprocessing, svm 10 | from sklearn.calibration import CalibratedClassifierCV 11 | from sklearn.multiclass import OneVsRestClassifier 12 | 13 | from otto_utils import consts, utils 14 | 15 | 16 | MODEL_NAME = 'model_03_svm' 17 | MODE = 'cv' # cv|submission|holdout|tune 18 | 19 | # import data 20 | train, labels, test, _, _ = utils.load_data() 21 | 22 | # transform counts to TFIDF features 23 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 24 | train = tfidf.fit_transform(train).toarray() 25 | test = tfidf.transform(test).toarray() 26 | 27 | # encode labels 28 | lbl_enc = preprocessing.LabelEncoder() 29 | labels = lbl_enc.fit_transform(labels) 30 | 31 | # train classifier 32 | clf = 
OneVsRestClassifier(svm.SVC(C=4.919646+2., kernel='rbf', tol=.001, 33 | verbose=True, probability=True, gamma=0.646508+.3, random_state=23)) 34 | 35 | if MODE == 'cv': 36 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True) 37 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 38 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 39 | elif MODE == 'submission': 40 | calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels)) 41 | fitted_classifier = calibrated_classifier.fit(train, labels) 42 | predictions = fitted_classifier.predict_proba(test) 43 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 44 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 45 | predictions) 46 | elif MODE == 'holdout': 47 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False, test_size=0.9) 48 | print 'Log loss:', score 49 | elif MODE == 'tune': 50 | train, labels, valid, valid_labels = utils.stratified_split(train, labels, test_size=.8) 51 | from sklearn.metrics import log_loss 52 | # Objective function 53 | def objective(args): 54 | c, gamma = args 55 | clf = OneVsRestClassifier(svm.SVC(C=c, kernel='rbf', tol=.001, gamma=gamma, 56 | probability=True, random_state=23)) 57 | score1 = 0 58 | score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 59 | score = log_loss(valid_labels, clf.predict_proba(valid)) 60 | print 'C=%f, gamma=%f, score1=%f, score2=%f, score=%f' % (c, gamma, score1, score2, score) 61 | return score 62 | # Searching space 63 | space = ( 64 | hp.uniform('c', 4, 10), 65 | hp.uniform('gamma', 0.3, 3) 66 | ) 67 | 68 | best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=200) 69 | print 'Best solution:', best_sln 70 | else: 71 | print 'Unknown mode' 72 | -------------------------------------------------------------------------------- /otto/model/model_04_rgf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahara/kaggle_otto/2b7861d052529d7a3f78c053088450f15278ac42/otto/model/model_04_rgf/__init__.py -------------------------------------------------------------------------------- /otto/model/model_04_rgf/rgf.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import numpy as np 3 | import os 4 | import subprocess 5 | import shutil 6 | 7 | from sklearn import feature_extraction 8 | from sklearn.base import BaseEstimator, ClassifierMixin 9 | from sklearn.metrics import log_loss 10 | 11 | from otto_utils import consts, utils 12 | 13 | 14 | MODEL_NAME = 'model_04_rgf' 15 | MODE = 'cv' # cv|submission|holdout|tune 16 | 17 | 18 | class RGF(BaseEstimator, ClassifierMixin): 19 | def __init__(self, verbose=True, random_state=None): 20 | self.n_classes_ = None 21 | self.rgf_path = None 22 | self.files_location_ = None 23 | self.files_location_data_ = None 24 | self.files_location_output_ = None 25 | self.models = None 26 | self.verbose = verbose 27 | self.random_state = random_state 28 | 29 | def fit(self, X, y): 30 | self.n_classes_ = 9 31 | self.rgf_path = '/home/adam/Tools/rgf1.2/bin/rgf' 32 | self.files_location_ = '/home/adam/Projects/otto/model/model_04_rgf' 33 | self.files_location_data_ = os.path.join(self.files_location_, 'data') 34 | self.files_location_output_ = os.path.join(self.files_location_, 'output') 35 | self.models = dict() 36 | 37 | shutil.rmtree(self.files_location_output_) 38 | 39 | train_index = 
np.array(range(X.shape[0])) 40 | np.random.shuffle(train_index) 41 | x_train, y_train = X[train_index], y[train_index] 42 | 43 | self._train(x_train, y_train) 44 | 45 | return self 46 | 47 | def predict(self, X): 48 | preds = self.predict_proba(X) 49 | return np.argmax(preds, 1) 50 | 51 | def predict_proba(self, X): 52 | return self._predict(X) 53 | 54 | def score(self, X, y, sample_weight=None): 55 | return log_loss(y, self.predict_proba(X)) 56 | 57 | def get_params(self, deep=True): 58 | return {'verbose': self.verbose, 'random_state': self.random_state} 59 | 60 | def set_params(self, **parameters): 61 | for parameter, value in parameters.items(): 62 | self.setattr(parameter, value) 63 | return self 64 | 65 | def fit_transform(self, X, y): 66 | self.fit(X, y) 67 | return self.predict_proba(X) 68 | 69 | def transform(self, X): 70 | return self.predict_proba(X) 71 | 72 | # Private methods 73 | def write_into_files(self, prefix, x, y=None): 74 | if not os.path.exists(self.files_location_data_): 75 | os.makedirs(self.files_location_data_) 76 | # Write file with X 77 | data_location = os.path.join(self.files_location_data_, '%s.data.x' % prefix) 78 | np.savetxt(data_location, x, delimiter='\t', fmt='%.5f') 79 | 80 | paths = dict(x=data_location, y=[]) 81 | 82 | if y is not None: 83 | for i in range(self.n_classes_): 84 | labels = map(lambda l: ['+1'] if i == l else ['-1'], y) 85 | labels_location = os.path.join(self.files_location_data_, '%s.data.y.%d' % (prefix, i)) 86 | np.savetxt(labels_location, labels, delimiter='\t', fmt='%s') 87 | paths['y'].append(labels_location) 88 | 89 | return paths 90 | 91 | def get_params_string(self, train_x_fn=None, train_y_fn=None, test_x_fn=None, test_y_fn=None, 92 | model_fn=None, model_fn_prefix=None, evaluation_fn=None, prediction_fn=None, 93 | reg_L2=None, reg_sL2=None, algorithm=None, loss=None, 94 | test_interval=None, max_tree=None, max_leaf_forest=None): 95 | frame = inspect.currentframe() 96 | args, _, _, values = inspect.getargvalues(frame) 97 | params_string = '' 98 | 99 | for arg in args: 100 | if values[arg] is not None and arg != 'self': 101 | params_string += '%s=%s,' % (arg, values[arg]) 102 | 103 | return params_string 104 | 105 | def _train(self, x_train, y_train): 106 | prefix_train, prefix_model = 'train', 'model' 107 | cmd = self.rgf_path + ' train %s' 108 | 109 | if not os.path.exists(self.files_location_output_): 110 | os.makedirs(self.files_location_output_) 111 | 112 | # Write files in RGF format 113 | paths = dict() 114 | paths[prefix_train] = self.write_into_files(prefix_train, x_train, y_train) 115 | 116 | for i in range(self.n_classes_): 117 | # Train and test model 118 | params_string = self.get_params_string(train_x_fn=paths[prefix_train]['x'], 119 | train_y_fn=paths[prefix_train]['y'][i], 120 | model_fn_prefix=os.path.join(self.files_location_output_, 121 | '%s_class_%s' % (prefix_model, i)), 122 | reg_L2=.01, # Should be 0.001 123 | loss='Log', # Maybe LS will work better 124 | test_interval=200, # Should be 2000 125 | max_tree=1200, # Should be 1000 126 | max_leaf_forest=6000 # Should be 10000 127 | ) 128 | print 'Running', cmd % params_string 129 | process = subprocess.Popen((cmd % params_string).split(), stdout=subprocess.PIPE) 130 | output = process.communicate()[0] 131 | if self.verbose: 132 | print output 133 | 134 | # Read list of generated models 135 | models = [m for m in os.listdir(self.files_location_output_) if ('%s_class_%s' % (prefix_model, i)) in m] 136 | models.sort() 137 | self.models[i] = models[-1] 138 | 
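# --- Editor's note (illustrative addition, not part of the original rgf.py) ---
# Two remarks on this wrapper class:
#   * `set_params` above calls `self.setattr(parameter, value)`; plain Python
#     objects have no such method, so the conventional spelling would be
#     `setattr(self, parameter, value)`.
#   * `_predict` below turns the nine one-vs-rest RGF margins into class
#     probabilities by applying a sigmoid per binary model and renormalising
#     each row to sum to one. A minimal, self-contained sketch of that
#     combination step (dummy margins and shapes are hypothetical):

import numpy as np

margins = np.random.randn(5, 9)             # 5 samples, 9 one-vs-rest scores
probs = 1.0 / (1.0 + np.exp(-margins))      # sigmoid of each binary margin
probs /= probs.sum(axis=1, keepdims=True)   # rows now sum to 1 (class probabilities)
# --- end of editor's note ------------------------------------------------------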
139 | def _predict(self, x_test): 140 | prefix_test, prefix_preds = 'test', 'preds' 141 | cmd = self.rgf_path + ' predict %s' 142 | 143 | if not os.path.exists(self.files_location_output_): 144 | os.makedirs(self.files_location_output_) 145 | 146 | # Write files in RGF format 147 | paths = dict() 148 | paths[prefix_test] = self.write_into_files(prefix_test, x_test) 149 | 150 | all_predictions = [] 151 | 152 | for i in range(self.n_classes_): 153 | # Make predictions and collect it 154 | preds_file = os.path.join(self.files_location_output_, '%s_class_%s' % (prefix_preds, i)) 155 | 156 | params_string = self.get_params_string(test_x_fn=paths[prefix_test]['x'], 157 | model_fn=os.path.join(self.files_location_output_, 158 | self.models[i]), 159 | prediction_fn=preds_file 160 | ) 161 | print 'Running', cmd % params_string 162 | process = subprocess.Popen((cmd % params_string).split(), stdout=subprocess.PIPE) 163 | output = process.communicate()[0] 164 | if self.verbose: 165 | print output 166 | 167 | # Read generated predictions 168 | preds = np.loadtxt(preds_file) 169 | preds = 1. / (1. + np.exp(-preds)) # For Log, Expo 170 | 171 | all_predictions.append(preds) 172 | 173 | # Join all predictions 174 | all_predictions = np.array(all_predictions).T 175 | all_predictions /= np.sum(all_predictions, axis=1)[:, None] 176 | 177 | return all_predictions 178 | 179 | 180 | if __name__ == '__main__': 181 | train, labels, test, _, _ = utils.load_data() 182 | 183 | # Preprocess data - transform counts to TFIDF features 184 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 185 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 186 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 187 | 188 | clf = RGF(verbose=False, random_state=23) 189 | 190 | if MODE == 'cv': 191 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 192 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 193 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 194 | elif MODE == 'submission': 195 | clf.fit(train, labels) 196 | predictions = clf.predict_proba(test) 197 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 198 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 199 | predictions) 200 | elif MODE == 'holdout': 201 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 202 | print 'Log loss:', score 203 | else: 204 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_05_bagging_nn_rmsprop/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_05_bagging_nn_rmsprop/bagging_nn_rmsprop.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.461596760113 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import rmsprop 18 | 19 | from sklearn import feature_extraction, ensemble 20 | from sklearn.base import 
BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_05_bagging_nn_rmsprop' 28 | MODE = 'cv' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, rho=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.rho = rho 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 
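# Editor's note: the loop below consumes `self.train(...)`, a generator that
# yields one dict of statistics per epoch. Inside `train` (further down) the
# best snapshot of `output_layer` is kept via `copy.deepcopy` -- when
# `use_valid` is set, the snapshot with the lowest validation loss; otherwise
# the model from the final epoch. Catching KeyboardInterrupt means a run can be
# stopped by hand while still keeping the last saved snapshot.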
92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_out = DenseLayer(l_hidden2_dropout, num_units=self.n_classes_, nonlinearity=softmax) 143 | 144 | return l_out 145 | 146 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 147 | batch_index = T.iscalar('batch_index') 148 | X_batch = X_tensor_type('x') 149 | y_batch = T.ivector('y') 150 | 151 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 152 | 153 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 154 | 155 | loss_train = objective.get_loss(X_batch, target=y_batch) 156 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 157 | 158 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 159 | proba = output_layer.get_output(X_batch, deterministic=True) 160 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 161 | 162 | all_params = get_all_params(output_layer) 163 | updates = rmsprop(loss_train, all_params, self.lr, self.rho) 164 | 165 | iter_train = theano.function( 166 | [batch_index], loss_train, 167 | updates=updates, 168 | givens={ 169 | X_batch: dataset['X_train'][batch_slice], 170 | y_batch: dataset['y_train'][batch_slice], 171 | }, 172 | on_unused_input='ignore', 173 | ) 174 | 175 | iter_valid = None 176 | if self.use_valid: 177 | iter_valid = theano.function( 178 | [batch_index], [loss_eval, accuracy, proba], 179 | givens={ 180 | X_batch: dataset['X_valid'][batch_slice], 181 | y_batch: dataset['y_valid'][batch_slice], 182 | }, 183 | ) 184 | 185 | return dict(train=iter_train, valid=iter_valid) 186 | 187 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 188 | batch_index = T.iscalar('batch_index') 189 | X_batch = X_tensor_type('x') 190 | 191 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 192 | 193 | pred = 
T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 194 | proba = output_layer.get_output(X_batch, deterministic=True) 195 | 196 | iter_test = theano.function( 197 | [batch_index], [pred, proba], 198 | givens={ 199 | X_batch: dataset['X_test'][batch_slice], 200 | }, 201 | ) 202 | 203 | return dict(test=iter_test) 204 | 205 | def train(self, iter_funcs, dataset, output_layer): 206 | num_batches_train = dataset['num_examples_train'] // self.batch_size 207 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 208 | 209 | best_val_err = 100 210 | best_val_iter = -1 211 | 212 | for epoch in itertools.count(1): 213 | batch_train_losses = [] 214 | for b in range(num_batches_train): 215 | batch_train_loss = iter_funcs['train'](b) 216 | batch_train_losses.append(batch_train_loss) 217 | 218 | avg_train_loss = np.mean(batch_train_losses) 219 | 220 | batch_valid_losses = [] 221 | batch_valid_accuracies = [] 222 | batch_valid_probas = [] 223 | 224 | if self.use_valid: 225 | for b in range(num_batches_valid): 226 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 227 | batch_valid_losses.append(batch_valid_loss) 228 | batch_valid_accuracies.append(batch_valid_accuracy) 229 | batch_valid_probas.append(batch_valid_proba) 230 | 231 | avg_valid_loss = np.mean(batch_valid_losses) 232 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 233 | 234 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 235 | (epoch == self.max_epochs and not self.use_valid): 236 | best_val_err = avg_valid_loss 237 | best_val_iter = epoch 238 | # Save model 239 | self.score_ = best_val_err 240 | self.model = copy.deepcopy(output_layer) 241 | 242 | 243 | yield { 244 | 'number': epoch, 245 | 'train_loss': avg_train_loss, 246 | 'valid_loss': avg_valid_loss, 247 | 'valid_accuracy': avg_valid_accuracy, 248 | 'best_val_error': best_val_err, 249 | 'best_val_iter': best_val_iter, 250 | } 251 | 252 | def make_predictions(self, data): 253 | dataset = dict( 254 | X_test=theano.shared(lasagne.utils.floatX(data)), 255 | num_examples_test=data.shape[0], 256 | input_dim=data.shape[1], 257 | output_dim=self.n_classes_, 258 | ) 259 | 260 | iter_funcs = self.create_test_function(dataset, self.model) 261 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 262 | 263 | test_preds, test_probas = np.array([]), None 264 | 265 | for b in range(num_batches_test): 266 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 267 | test_preds = np.append(test_preds, batch_test_pred) 268 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 269 | 270 | return test_preds, test_probas 271 | 272 | 273 | if __name__ == '__main__': 274 | train, labels, test, _, _ = utils.load_data() 275 | 276 | # Preprocess data - transform counts to TFIDF features 277 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 278 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 279 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 280 | 281 | clf_nn = NeuralNetwork(600, 110, 200, 0.00012503331803251808, 0.9864830676545417, 0.3245683842495481, 282 | .05, True, 10, random_state=23) 283 | 284 | clf = ensemble.BaggingClassifier(base_estimator=clf_nn, n_estimators=10, 285 | max_samples=1., max_features=1., 286 | random_state=23) 287 | 288 | if MODE == 'cv': 289 | scores, predictions = utils.make_blender_cv(clf, train, labels, 
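# Editor's note: judging from its use here, `utils.make_blender_cv` appears to
# run a cross-validation loop that returns per-fold log losses plus
# out-of-fold probability predictions; `write_blender_data` then stores those
# predictions under BLEND_PATH so the second-stage blender
# (otto_utils/blender.py) can reuse them. The helper itself lives in
# otto_utils/utils.py, which is not shown in this excerpt.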
calibrate=False) 290 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 291 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 292 | elif MODE == 'submission': 293 | clf.fit(train, labels) 294 | predictions = clf.predict_proba(test) 295 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 296 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 297 | predictions) 298 | elif MODE == 'holdout': 299 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 300 | print 'Log loss:', score 301 | else: 302 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_06_xgboost/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_06_xgboost/xgboost.py: -------------------------------------------------------------------------------- 1 | import graphlab as gl 2 | import numpy as np 3 | import logging 4 | import os 5 | 6 | from hyperopt import fmin, hp, tpe 7 | 8 | from sklearn.base import BaseEstimator 9 | from sklearn import preprocessing 10 | 11 | from otto_utils import consts, utils 12 | 13 | 14 | MODEL_NAME = 'model_06_xgboost' 15 | MODE = 'holdout' # cv|submission|holdout|tune 16 | 17 | logging.disable(logging.INFO) 18 | 19 | 20 | class XGBoost(BaseEstimator): 21 | def __init__(self, max_iterations=50, max_depth=9, min_child_weight=4, row_subsample=.75, 22 | min_loss_reduction=1., column_subsample=.8, step_size=.3, verbose=True): 23 | self.n_classes_ = 9 24 | self.max_iterations = max_iterations 25 | self.max_depth = max_depth 26 | self.min_child_weight = min_child_weight 27 | self.row_subsample = row_subsample 28 | self.min_loss_reduction = min_loss_reduction 29 | self.column_subsample = column_subsample 30 | self.step_size = step_size 31 | self.verbose = verbose 32 | self.model = None 33 | 34 | def fit(self, X, y, sample_weight=None): 35 | sf = self._array_to_sframe(X, y) 36 | self.model = gl.boosted_trees_classifier.create(sf, target='target', 37 | max_iterations=self.max_iterations, 38 | max_depth=self.max_depth, 39 | min_child_weight=self.min_child_weight, 40 | row_subsample=self.row_subsample, 41 | min_loss_reduction=self.min_loss_reduction, 42 | column_subsample=self.column_subsample, 43 | step_size=self.step_size, 44 | verbose=self.verbose) 45 | 46 | return self 47 | 48 | def predict(self, X): 49 | preds = self.predict_proba(X) 50 | return np.argmax(preds, axis=1) 51 | 52 | def predict_proba(self, X): 53 | sf = self._array_to_sframe(X) 54 | preds = self.model.predict_topk(sf, output_type='probability', k=self.n_classes_) 55 | 56 | return self._preds_to_array(preds) 57 | 58 | # Private methods 59 | def _array_to_sframe(self, data, targets=None): 60 | d = dict() 61 | for i in xrange(data.shape[1]): 62 | d['feat_%d' % (i + 1)] = gl.SArray(data[:, i]) 63 | if targets is not None: 64 | d['target'] = gl.SArray(targets) 65 | 66 | return gl.SFrame(d) 67 | 68 | def _preds_to_array(self, preds): 69 | p = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') 70 | p['id'] = p['id'].astype(int) + 1 71 | p = p.sort('id') 72 | del p['id'] 73 | preds_array = np.array(p.to_dataframe(), dtype=float) 74 | 75 | return preds_array 76 | 77 | 78 | if __name__ == '__main__': 79 | train, labels, test, _, _ = utils.load_data() 80 | 81 | clf = XGBoost(max_iterations=4800, max_depth=12, 
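# Editor's note: these oddly precise values (min_child_weight, row_subsample,
# min_loss_reduction, column_subsample) match the search space of the hyperopt
# TPE tuning branch further down in this file (MODE == 'tune'), so they were
# presumably copied from its best solution; the small step_size of 0.009 is
# paired with a correspondingly large max_iterations of 4800.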
min_child_weight=4.9208250938262745, 82 | row_subsample=.9134478530382129, min_loss_reduction=.5132278416508804, 83 | column_subsample=.730128689911957, step_size=.009) 84 | 85 | if MODE == 'cv': 86 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 87 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 88 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 89 | elif MODE == 'submission': 90 | clf.fit(train, labels) 91 | predictions = clf.predict_proba(test) 92 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 93 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 94 | predictions) 95 | elif MODE == 'holdout': 96 | train, labels, _, _ = utils.stratified_split(train, labels, test_size=.7) 97 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 98 | print 'Log loss:', score 99 | elif MODE == 'tune': 100 | # Objective function 101 | def objective(args): 102 | max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args 103 | clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight, 104 | row_subsample=row_subsample, min_loss_reduction=min_loss_reduction, 105 | column_subsample=column_subsample, verbose=False) 106 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 107 | print 'max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss' 108 | print args, score 109 | return score 110 | # Searching space 111 | space = ( 112 | hp.quniform('max_depth', 2, 14, 1), 113 | hp.uniform('min_child_weight', .5, 10.), 114 | hp.uniform('row_subsample', .3, 1.), 115 | hp.uniform('min_loss_reduction', .1, 3.), 116 | hp.uniform('column_subsample', .1, 1.), 117 | ) 118 | 119 | best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=500) 120 | print 'Best solution:', best_sln 121 | else: 122 | print 'Unknown mode' 123 | -------------------------------------------------------------------------------- /otto/model/model_07_bagging_nn_nesterov/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_07_bagging_nn_nesterov/bagging_nn_nesterov.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.461952014711 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | 18 | from sklearn import decomposition, ensemble 19 | from sklearn.base import BaseEstimator 20 | from sklearn.cross_validation import StratifiedShuffleSplit 21 | from sklearn.utils import check_random_state 22 | 23 | from otto_utils import consts, utils 24 | 25 | 26 | MODEL_NAME = 'model_07_bagging_nn_nesterov' 27 | MODE = 'submission' # cv|submission|holdout|tune 28 | 29 | 30 | class NeuralNetwork(BaseEstimator): 31 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 32 | lr=0.01, momentum=0.9, dropout=0.5, valid_ratio=0.0, 33 | use_valid=False, verbose=0, random_state=None): 34 | self.n_hidden = n_hidden 35 | self.max_epochs = max_epochs 36 | self.batch_size = 
batch_size 37 | self.lr = lr 38 | self.momentum = momentum 39 | self.dropout = dropout 40 | self.valid_ratio = valid_ratio 41 | self.use_valid = use_valid 42 | self.verbose = verbose 43 | self.random_state = random_state 44 | # State 45 | self.score_ = None 46 | self.classes_ = None 47 | self.n_classes_ = None 48 | self.model = None 49 | 50 | def fit(self, data, targets, sample_weight=None): 51 | self.classes_, indices = np.unique(targets, return_inverse=True) 52 | self.n_classes_ = self.classes_.shape[0] 53 | 54 | random_state = check_random_state(self.random_state) 55 | 56 | # Shuffle data and eventually split on train and validation sets 57 | if self.valid_ratio > 0: 58 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 59 | n_iter=1, random_state=self.random_state) 60 | train_index, valid_index = [s for s in strat_shuffled_split][0] 61 | X_train, y_train = data[train_index], targets[train_index] 62 | X_valid, y_valid = data[valid_index], targets[valid_index] 63 | else: 64 | X_train, y_train = data, targets 65 | X_valid, y_valid = np.array([]), np.array([]) 66 | 67 | if self.verbose > 5: 68 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 69 | if self.use_valid: 70 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 71 | 72 | # Prepare theano variables 73 | dataset = dict( 74 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 75 | y_train=T.cast(theano.shared(y_train), 'int32'), 76 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 77 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 78 | num_examples_train=X_train.shape[0], 79 | num_examples_valid=X_valid.shape[0], 80 | input_dim=X_train.shape[1], 81 | output_dim=self.n_classes_, 82 | ) 83 | 84 | if self.verbose > 0: 85 | print "Building model and compiling functions..." 86 | output_layer = self.build_model(dataset['input_dim']) 87 | iter_funcs = self.create_iter_functions(dataset, output_layer) 88 | 89 | if self.verbose > 0: 90 | print "Starting training..." 
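# Editor's note: apart from the update rule, this class mirrors the network in
# model_05_bagging_nn_rmsprop almost line for line -- the differences are the
# nesterov_momentum updates used in create_iter_functions below, the PCA
# preprocessing applied in __main__ instead of appended TF-IDF features, and a
# different bagging seed (29 instead of 23).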
91 | now = time.time() 92 | results = [] 93 | try: 94 | for epoch in self.train(iter_funcs, dataset, output_layer): 95 | if self.verbose > 1: 96 | print "Epoch {} of {} took {:.3f}s".format( 97 | epoch['number'], self.max_epochs, time.time() - now) 98 | now = time.time() 99 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 100 | if self.verbose > 1: 101 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 102 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 103 | print " validation accuracy:\t\t{:.2f} %%".format( 104 | epoch['valid_accuracy'] * 100) 105 | 106 | if epoch['number'] >= self.max_epochs: 107 | break 108 | 109 | if self.verbose > 0: 110 | print 'Minimum validation error: %f (epoch %d)' % \ 111 | (epoch['best_val_error'], epoch['best_val_iter']) 112 | 113 | except KeyboardInterrupt: 114 | pass 115 | 116 | return self 117 | 118 | def predict(self, data): 119 | preds, _ = self.make_predictions(data) 120 | 121 | return preds 122 | 123 | def predict_proba(self, data): 124 | _, proba = self.make_predictions(data) 125 | 126 | return proba 127 | 128 | def score(self): 129 | return self.score_ 130 | 131 | # Private methods 132 | def build_model(self, input_dim): 133 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 134 | 135 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 136 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 137 | 138 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 139 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 140 | 141 | l_out = DenseLayer(l_hidden2_dropout, num_units=self.n_classes_, nonlinearity=softmax) 142 | 143 | return l_out 144 | 145 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 146 | batch_index = T.iscalar('batch_index') 147 | X_batch = X_tensor_type('x') 148 | y_batch = T.ivector('y') 149 | 150 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 151 | 152 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 153 | 154 | loss_train = objective.get_loss(X_batch, target=y_batch) 155 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 156 | 157 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 158 | proba = output_layer.get_output(X_batch, deterministic=True) 159 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 160 | 161 | all_params = get_all_params(output_layer) 162 | updates = lasagne.updates.nesterov_momentum(loss_train, all_params, self.lr, self.momentum) 163 | 164 | iter_train = theano.function( 165 | [batch_index], loss_train, 166 | updates=updates, 167 | givens={ 168 | X_batch: dataset['X_train'][batch_slice], 169 | y_batch: dataset['y_train'][batch_slice], 170 | }, 171 | on_unused_input='ignore', 172 | ) 173 | 174 | iter_valid = None 175 | if self.use_valid: 176 | iter_valid = theano.function( 177 | [batch_index], [loss_eval, accuracy, proba], 178 | givens={ 179 | X_batch: dataset['X_valid'][batch_slice], 180 | y_batch: dataset['y_valid'][batch_slice], 181 | }, 182 | ) 183 | 184 | return dict(train=iter_train, valid=iter_valid) 185 | 186 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 187 | batch_index = T.iscalar('batch_index') 188 | X_batch = X_tensor_type('x') 189 | 190 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 191 | 192 | pred = 
T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 193 | proba = output_layer.get_output(X_batch, deterministic=True) 194 | 195 | iter_test = theano.function( 196 | [batch_index], [pred, proba], 197 | givens={ 198 | X_batch: dataset['X_test'][batch_slice], 199 | }, 200 | ) 201 | 202 | return dict(test=iter_test) 203 | 204 | def train(self, iter_funcs, dataset, output_layer): 205 | num_batches_train = dataset['num_examples_train'] // self.batch_size 206 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 207 | 208 | best_val_err = 100 209 | best_val_iter = -1 210 | 211 | for epoch in itertools.count(1): 212 | batch_train_losses = [] 213 | for b in range(num_batches_train): 214 | batch_train_loss = iter_funcs['train'](b) 215 | batch_train_losses.append(batch_train_loss) 216 | 217 | avg_train_loss = np.mean(batch_train_losses) 218 | 219 | batch_valid_losses = [] 220 | batch_valid_accuracies = [] 221 | batch_valid_probas = [] 222 | 223 | if self.use_valid: 224 | for b in range(num_batches_valid): 225 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 226 | batch_valid_losses.append(batch_valid_loss) 227 | batch_valid_accuracies.append(batch_valid_accuracy) 228 | batch_valid_probas.append(batch_valid_proba) 229 | 230 | avg_valid_loss = np.mean(batch_valid_losses) 231 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 232 | 233 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 234 | (epoch == self.max_epochs and not self.use_valid): 235 | best_val_err = avg_valid_loss 236 | best_val_iter = epoch 237 | # Save model 238 | self.score_ = best_val_err 239 | self.model = copy.deepcopy(output_layer) 240 | 241 | 242 | yield { 243 | 'number': epoch, 244 | 'train_loss': avg_train_loss, 245 | 'valid_loss': avg_valid_loss, 246 | 'valid_accuracy': avg_valid_accuracy, 247 | 'best_val_error': best_val_err, 248 | 'best_val_iter': best_val_iter, 249 | } 250 | 251 | def make_predictions(self, data): 252 | dataset = dict( 253 | X_test=theano.shared(lasagne.utils.floatX(data)), 254 | num_examples_test=data.shape[0], 255 | input_dim=data.shape[1], 256 | output_dim=self.n_classes_, 257 | ) 258 | 259 | iter_funcs = self.create_test_function(dataset, self.model) 260 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 261 | 262 | test_preds, test_probas = np.array([]), None 263 | 264 | for b in range(num_batches_test): 265 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 266 | test_preds = np.append(test_preds, batch_test_pred) 267 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 268 | 269 | return test_preds, test_probas 270 | 271 | 272 | if __name__ == '__main__': 273 | train, labels, test, _, _ = utils.load_data() 274 | 275 | # PCA 276 | pp = decomposition.PCA() 277 | train = pp.fit_transform(train) 278 | test = pp.transform(test) 279 | 280 | clf_nn = NeuralNetwork(750, 110, 116, 0.0012503331803251808, 0.9544425038759606, 0.3992570325984604, 281 | .05, True, 10, random_state=23) 282 | 283 | clf = ensemble.BaggingClassifier(base_estimator=clf_nn, n_estimators=10, 284 | max_samples=1., max_features=1., 285 | random_state=29) 286 | 287 | if MODE == 'cv': 288 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 289 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 290 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 291 | elif MODE == 
'submission': 292 | clf.fit(train, labels) 293 | predictions = clf.predict_proba(test) 294 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 295 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 296 | predictions) 297 | elif MODE == 'holdout': 298 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 299 | print 'Log loss:', score 300 | else: 301 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_08_random_forest_calibrated/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_08_random_forest_calibrated/random_forest_calibrated.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold CV - log loss 0.4687 3 | """ 4 | import numpy as np 5 | import os 6 | 7 | from sklearn import ensemble, feature_extraction, preprocessing 8 | from sklearn.calibration import CalibratedClassifierCV 9 | 10 | from otto_utils import consts, utils 11 | 12 | 13 | MODEL_NAME = 'model_08_random_forest_calibrated' 14 | MODE = 'holdout' # cv|submission|holdout 15 | 16 | # import data 17 | train, labels, test, _, _ = utils.load_data() 18 | 19 | # transform counts to TFIDF features 20 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 21 | train = tfidf.fit_transform(train).toarray() 22 | test = tfidf.transform(test).toarray() 23 | 24 | # encode labels 25 | lbl_enc = preprocessing.LabelEncoder() 26 | labels = lbl_enc.fit_transform(labels) 27 | 28 | # train classifier 29 | clf = ensemble.ExtraTreesClassifier(n_jobs=5, n_estimators=600, max_features=20, min_samples_split=3, 30 | bootstrap=False, verbose=3, random_state=23) 31 | 32 | if MODE == 'cv': 33 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True) 34 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 35 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 36 | elif MODE == 'submission': 37 | calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels)) 38 | fitted_classifier = calibrated_classifier.fit(train, labels) 39 | predictions = fitted_classifier.predict_proba(test) 40 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 41 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 42 | predictions) 43 | elif MODE == 'holdout': 44 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=True) 45 | print 'Log loss:', score 46 | else: 47 | print 'Unknown mode' 48 | -------------------------------------------------------------------------------- /otto/model/model_09_nn_adagrad/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_09_nn_adagrad/nn_adagrad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.480065955962 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import 
categorical_crossentropy, Objective 17 | from lasagne.updates import adagrad 18 | 19 | from sklearn import feature_extraction 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_09_nn_adagrad' 28 | MODE = 'cv' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, epsilon=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.epsilon = epsilon 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 
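# Editor's note: this model swaps the optimiser for AdaGrad (see
# create_iter_functions below) and uses three hidden layers in build_model
# rather than the two used by models 05 and 07. AdaGrad keeps a running sum of
# squared gradients and scales each step by it -- in Lasagne's implementation
# roughly: accu += grad**2; param -= lr * grad / sqrt(accu + epsilon) -- which
# is why the constructor exposes an `epsilon` argument here instead of
# `rho`/`momentum`.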
92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden, nonlinearity=rectify) 143 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 144 | 145 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 146 | 147 | return l_out 148 | 149 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 150 | batch_index = T.iscalar('batch_index') 151 | X_batch = X_tensor_type('x') 152 | y_batch = T.ivector('y') 153 | 154 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 155 | 156 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 157 | 158 | loss_train = objective.get_loss(X_batch, target=y_batch) 159 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 160 | 161 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 162 | proba = output_layer.get_output(X_batch, deterministic=True) 163 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 164 | 165 | all_params = get_all_params(output_layer) 166 | updates = adagrad(loss_train, all_params, self.lr, self.epsilon) 167 | 168 | iter_train = theano.function( 169 | [batch_index], loss_train, 170 | updates=updates, 171 | givens={ 172 | X_batch: dataset['X_train'][batch_slice], 173 | y_batch: dataset['y_train'][batch_slice], 174 | }, 175 | on_unused_input='ignore', 176 | ) 177 | 178 | iter_valid = None 179 | if self.use_valid: 180 | iter_valid = theano.function( 181 | [batch_index], [loss_eval, accuracy, proba], 182 | givens={ 183 | X_batch: dataset['X_valid'][batch_slice], 184 | y_batch: dataset['y_valid'][batch_slice], 185 | }, 186 | ) 187 | 188 | return dict(train=iter_train, valid=iter_valid) 189 | 190 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 191 | batch_index = T.iscalar('batch_index') 192 | X_batch = 
X_tensor_type('x') 193 | 194 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 195 | 196 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 197 | proba = output_layer.get_output(X_batch, deterministic=True) 198 | 199 | iter_test = theano.function( 200 | [batch_index], [pred, proba], 201 | givens={ 202 | X_batch: dataset['X_test'][batch_slice], 203 | }, 204 | ) 205 | 206 | return dict(test=iter_test) 207 | 208 | def train(self, iter_funcs, dataset, output_layer): 209 | num_batches_train = dataset['num_examples_train'] // self.batch_size 210 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 211 | 212 | best_val_err = 100 213 | best_val_iter = -1 214 | 215 | for epoch in itertools.count(1): 216 | batch_train_losses = [] 217 | for b in range(num_batches_train): 218 | batch_train_loss = iter_funcs['train'](b) 219 | batch_train_losses.append(batch_train_loss) 220 | 221 | avg_train_loss = np.mean(batch_train_losses) 222 | 223 | batch_valid_losses = [] 224 | batch_valid_accuracies = [] 225 | batch_valid_probas = [] 226 | 227 | if self.use_valid: 228 | for b in range(num_batches_valid): 229 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 230 | batch_valid_losses.append(batch_valid_loss) 231 | batch_valid_accuracies.append(batch_valid_accuracy) 232 | batch_valid_probas.append(batch_valid_proba) 233 | 234 | avg_valid_loss = np.mean(batch_valid_losses) 235 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 236 | 237 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 238 | (epoch == self.max_epochs and not self.use_valid): 239 | best_val_err = avg_valid_loss 240 | best_val_iter = epoch 241 | # Save model 242 | self.score_ = best_val_err 243 | self.model = copy.deepcopy(output_layer) 244 | 245 | 246 | yield { 247 | 'number': epoch, 248 | 'train_loss': avg_train_loss, 249 | 'valid_loss': avg_valid_loss, 250 | 'valid_accuracy': avg_valid_accuracy, 251 | 'best_val_error': best_val_err, 252 | 'best_val_iter': best_val_iter, 253 | } 254 | 255 | def make_predictions(self, data): 256 | dataset = dict( 257 | X_test=theano.shared(lasagne.utils.floatX(data)), 258 | num_examples_test=data.shape[0], 259 | input_dim=data.shape[1], 260 | output_dim=self.n_classes_, 261 | ) 262 | 263 | iter_funcs = self.create_test_function(dataset, self.model) 264 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 265 | 266 | test_preds, test_probas = np.array([]), None 267 | 268 | for b in range(num_batches_test): 269 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 270 | test_preds = np.append(test_preds, batch_test_pred) 271 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 272 | 273 | return test_preds, test_probas 274 | 275 | 276 | if __name__ == '__main__': 277 | train, labels, test, _, _ = utils.load_data() 278 | 279 | # Preprocess data - transform counts to TFIDF features 280 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 281 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 282 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 283 | 284 | clf = NeuralNetwork(512, 110, 128, 0.004438538808932511, 1.6674644616533133e-14, 0.2137591043893735, 285 | .02, True, 10, random_state=23) 286 | 287 | if MODE == 'cv': 288 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 289 
| print 'CV:', scores, 'Mean log loss:', np.mean(scores) 290 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 291 | elif MODE == 'submission': 292 | clf.fit(train, labels) 293 | predictions = clf.predict_proba(test) 294 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 295 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 296 | predictions) 297 | elif MODE == 'holdout': 298 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 299 | print 'Log loss:', score 300 | else: 301 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_10_nn_adagrad_pca/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_10_nn_adagrad_pca/nn_adagrad_pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.476382621152 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import adagrad 18 | 19 | from sklearn import decomposition, feature_extraction 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_10_nn_adagrad_pca' 28 | MODE = 'holdout' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, epsilon=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.epsilon = epsilon 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano 
variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden, nonlinearity=rectify) 143 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 144 | 145 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 146 | 147 | return l_out 148 | 149 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 150 | batch_index = T.iscalar('batch_index') 151 | X_batch = X_tensor_type('x') 152 | y_batch = T.ivector('y') 153 | 154 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 155 | 156 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 157 | 158 | loss_train = objective.get_loss(X_batch, target=y_batch) 159 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 160 | 161 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 162 | proba = output_layer.get_output(X_batch, deterministic=True) 163 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 164 | 165 | all_params = get_all_params(output_layer) 166 | updates = adagrad(loss_train, all_params, self.lr, self.epsilon) 167 | 168 | iter_train = theano.function( 169 | 
[batch_index], loss_train, 170 | updates=updates, 171 | givens={ 172 | X_batch: dataset['X_train'][batch_slice], 173 | y_batch: dataset['y_train'][batch_slice], 174 | }, 175 | on_unused_input='ignore', 176 | ) 177 | 178 | iter_valid = None 179 | if self.use_valid: 180 | iter_valid = theano.function( 181 | [batch_index], [loss_eval, accuracy, proba], 182 | givens={ 183 | X_batch: dataset['X_valid'][batch_slice], 184 | y_batch: dataset['y_valid'][batch_slice], 185 | }, 186 | ) 187 | 188 | return dict(train=iter_train, valid=iter_valid) 189 | 190 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 191 | batch_index = T.iscalar('batch_index') 192 | X_batch = X_tensor_type('x') 193 | 194 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 195 | 196 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 197 | proba = output_layer.get_output(X_batch, deterministic=True) 198 | 199 | iter_test = theano.function( 200 | [batch_index], [pred, proba], 201 | givens={ 202 | X_batch: dataset['X_test'][batch_slice], 203 | }, 204 | ) 205 | 206 | return dict(test=iter_test) 207 | 208 | def train(self, iter_funcs, dataset, output_layer): 209 | num_batches_train = dataset['num_examples_train'] // self.batch_size 210 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 211 | 212 | best_val_err = 100 213 | best_val_iter = -1 214 | 215 | for epoch in itertools.count(1): 216 | batch_train_losses = [] 217 | for b in range(num_batches_train): 218 | batch_train_loss = iter_funcs['train'](b) 219 | batch_train_losses.append(batch_train_loss) 220 | 221 | avg_train_loss = np.mean(batch_train_losses) 222 | 223 | batch_valid_losses = [] 224 | batch_valid_accuracies = [] 225 | batch_valid_probas = [] 226 | 227 | if self.use_valid: 228 | for b in range(num_batches_valid): 229 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 230 | batch_valid_losses.append(batch_valid_loss) 231 | batch_valid_accuracies.append(batch_valid_accuracy) 232 | batch_valid_probas.append(batch_valid_proba) 233 | 234 | avg_valid_loss = np.mean(batch_valid_losses) 235 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 236 | 237 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 238 | (epoch == self.max_epochs and not self.use_valid): 239 | best_val_err = avg_valid_loss 240 | best_val_iter = epoch 241 | # Save model 242 | self.score_ = best_val_err 243 | self.model = copy.deepcopy(output_layer) 244 | 245 | 246 | yield { 247 | 'number': epoch, 248 | 'train_loss': avg_train_loss, 249 | 'valid_loss': avg_valid_loss, 250 | 'valid_accuracy': avg_valid_accuracy, 251 | 'best_val_error': best_val_err, 252 | 'best_val_iter': best_val_iter, 253 | } 254 | 255 | def make_predictions(self, data): 256 | dataset = dict( 257 | X_test=theano.shared(lasagne.utils.floatX(data)), 258 | num_examples_test=data.shape[0], 259 | input_dim=data.shape[1], 260 | output_dim=self.n_classes_, 261 | ) 262 | 263 | iter_funcs = self.create_test_function(dataset, self.model) 264 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 265 | 266 | test_preds, test_probas = np.array([]), None 267 | 268 | for b in range(num_batches_test): 269 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 270 | test_preds = np.append(test_preds, batch_test_pred) 271 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 272 | 
273 | return test_preds, test_probas 274 | 275 | 276 | if __name__ == '__main__': 277 | train, labels, test, _, _ = utils.load_data() 278 | 279 | # Preprocess data - transform counts to TFIDF features 280 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 281 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 282 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 283 | 284 | # PCA 285 | pp = decomposition.PCA() 286 | train = pp.fit_transform(train) 287 | test = pp.transform(test) 288 | 289 | clf = NeuralNetwork(512, 110, 128, 0.004438538808932511, 1.6674644616533133e-14, 0.2137591043893735, 290 | .02, True, 10, random_state=23) 291 | 292 | if MODE == 'cv': 293 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 294 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 295 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 296 | elif MODE == 'submission': 297 | clf.fit(train, labels) 298 | predictions = clf.predict_proba(test) 299 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 300 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 301 | predictions) 302 | elif MODE == 'holdout': 303 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 304 | print 'Log loss:', score 305 | else: 306 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_11_xgboost_poly/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_11_xgboost_poly/xgboost_poly.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold cv - log loss 0.468809065953 3 | """ 4 | import graphlab as gl 5 | import numpy as np 6 | import logging 7 | import os 8 | 9 | from hyperopt import fmin, hp, tpe 10 | 11 | from sklearn.base import BaseEstimator 12 | from sklearn.svm import LinearSVC 13 | from sklearn import preprocessing 14 | 15 | from otto_utils import consts, utils 16 | 17 | 18 | MODEL_NAME = 'model_11_xgboost_poly' 19 | MODE = 'cv' # cv|submission|holdout|tune 20 | 21 | logging.disable(logging.INFO) 22 | 23 | 24 | class XGBoost(BaseEstimator): 25 | def __init__(self, max_iterations=50, max_depth=9, min_child_weight=4, row_subsample=.75, 26 | min_loss_reduction=1., column_subsample=.8, step_size=.3, verbose=True): 27 | self.n_classes_ = 9 28 | self.max_iterations = max_iterations 29 | self.max_depth = max_depth 30 | self.min_child_weight = min_child_weight 31 | self.row_subsample = row_subsample 32 | self.min_loss_reduction = min_loss_reduction 33 | self.column_subsample = column_subsample 34 | self.step_size = step_size 35 | self.verbose = verbose 36 | self.model = None 37 | 38 | def fit(self, X, y, sample_weight=None): 39 | sf = self._array_to_sframe(X, y) 40 | self.model = gl.boosted_trees_classifier.create(sf, target='target', 41 | max_iterations=self.max_iterations, 42 | max_depth=self.max_depth, 43 | min_child_weight=self.min_child_weight, 44 | row_subsample=self.row_subsample, 45 | min_loss_reduction=self.min_loss_reduction, 46 | column_subsample=self.column_subsample, 47 | step_size=self.step_size, 48 | validation_set=None, 49 | verbose=self.verbose) 50 | 51 | return self 52 | 53 | def predict(self, X): 54 | preds = self.predict_proba(X) 55 | return np.argmax(preds, axis=1) 56 | 57 | def 
predict_proba(self, X): 58 | sf = self._array_to_sframe(X) 59 | preds = self.model.predict_topk(sf, output_type='probability', k=self.n_classes_) 60 | 61 | return self._preds_to_array(preds) 62 | 63 | # Private methods 64 | def _array_to_sframe(self, data, targets=None): 65 | d = dict() 66 | for i in xrange(data.shape[1]): 67 | d['feat_%d' % (i + 1)] = gl.SArray(data[:, i]) 68 | if targets is not None: 69 | d['target'] = gl.SArray(targets) 70 | 71 | return gl.SFrame(d) 72 | 73 | def _preds_to_array(self, preds): 74 | p = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') 75 | p['id'] = p['id'].astype(int) + 1 76 | p = p.sort('id') 77 | del p['id'] 78 | preds_array = np.array(p.to_dataframe(), dtype=float) 79 | 80 | return preds_array 81 | 82 | 83 | if __name__ == '__main__': 84 | train, labels, test, _, _ = utils.load_data() 85 | 86 | # polynomial features 87 | poly_feat = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True) 88 | train = poly_feat.fit_transform(train, labels) 89 | test = poly_feat.transform(test) 90 | 91 | print train.shape 92 | 93 | # feature selection 94 | feat_selector = LinearSVC(C=0.0001, penalty='l1', dual=False) 95 | train = feat_selector.fit_transform(train, labels) 96 | test = feat_selector.transform(test) 97 | 98 | print train.shape 99 | 100 | clf = XGBoost(max_iterations=4800, max_depth=12, min_child_weight=4.9208250938262745, row_subsample=.9134478530382129, 101 | min_loss_reduction=.5132278416508804, column_subsample=.730128689911957, step_size=.009) 102 | 103 | 104 | if MODE == 'cv': 105 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 106 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 107 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 108 | elif MODE == 'submission': 109 | clf.fit(train, labels) 110 | predictions = clf.predict_proba(test) 111 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 112 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 113 | predictions) 114 | elif MODE == 'holdout': 115 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 116 | print 'Log loss:', score 117 | elif MODE == 'tune': 118 | # Objective function 119 | def objective(args): 120 | max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args 121 | clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight, 122 | row_subsample=row_subsample, min_loss_reduction=min_loss_reduction, 123 | column_subsample=column_subsample, verbose=False) 124 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 125 | print 'max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss' 126 | print args, score 127 | return score 128 | # Searching space 129 | space = ( 130 | hp.quniform('max_depth', 2, 14, 1), 131 | hp.uniform('min_child_weight', .5, 10.), 132 | hp.uniform('row_subsample', .3, 1.), 133 | hp.uniform('min_loss_reduction', .1, 3.), 134 | hp.uniform('column_subsample', .1, 1.), 135 | ) 136 | 137 | best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=500) 138 | print 'Best solution:', best_sln 139 | else: 140 | print 'Unknown mode' 141 | -------------------------------------------------------------------------------- /otto/model/model_12_nn_rmsprop_pca/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | 
-------------------------------------------------------------------------------- /otto/model/model_12_nn_rmsprop_pca/nn_rmsprop_pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.476282022208 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import rmsprop 18 | 19 | from sklearn import decomposition 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_12_nn_rmsprop_pca' 28 | MODE = 'submission' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, rho=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.rho = rho 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 
92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden / 2, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden / 4, nonlinearity=rectify) 143 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 144 | 145 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 146 | 147 | return l_out 148 | 149 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 150 | batch_index = T.iscalar('batch_index') 151 | X_batch = X_tensor_type('x') 152 | y_batch = T.ivector('y') 153 | 154 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 155 | 156 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 157 | 158 | loss_train = objective.get_loss(X_batch, target=y_batch) 159 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 160 | 161 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 162 | proba = output_layer.get_output(X_batch, deterministic=True) 163 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 164 | 165 | all_params = get_all_params(output_layer) 166 | updates = rmsprop(loss_train, all_params, self.lr, self.rho) 167 | 168 | iter_train = theano.function( 169 | [batch_index], loss_train, 170 | updates=updates, 171 | givens={ 172 | X_batch: dataset['X_train'][batch_slice], 173 | y_batch: dataset['y_train'][batch_slice], 174 | }, 175 | on_unused_input='ignore', 176 | ) 177 | 178 | iter_valid = None 179 | if self.use_valid: 180 | iter_valid = theano.function( 181 | [batch_index], [loss_eval, accuracy, proba], 182 | givens={ 183 | X_batch: dataset['X_valid'][batch_slice], 184 | y_batch: dataset['y_valid'][batch_slice], 185 | }, 186 | ) 187 | 188 | return dict(train=iter_train, valid=iter_valid) 189 | 190 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 191 | batch_index = T.iscalar('batch_index') 192 | X_batch = 
X_tensor_type('x') 193 | 194 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 195 | 196 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 197 | proba = output_layer.get_output(X_batch, deterministic=True) 198 | 199 | iter_test = theano.function( 200 | [batch_index], [pred, proba], 201 | givens={ 202 | X_batch: dataset['X_test'][batch_slice], 203 | }, 204 | ) 205 | 206 | return dict(test=iter_test) 207 | 208 | def train(self, iter_funcs, dataset, output_layer): 209 | num_batches_train = dataset['num_examples_train'] // self.batch_size 210 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 211 | 212 | best_val_err = 100 213 | best_val_iter = -1 214 | 215 | for epoch in itertools.count(1): 216 | batch_train_losses = [] 217 | for b in range(num_batches_train): 218 | batch_train_loss = iter_funcs['train'](b) 219 | batch_train_losses.append(batch_train_loss) 220 | 221 | avg_train_loss = np.mean(batch_train_losses) 222 | 223 | batch_valid_losses = [] 224 | batch_valid_accuracies = [] 225 | batch_valid_probas = [] 226 | 227 | if self.use_valid: 228 | for b in range(num_batches_valid): 229 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 230 | batch_valid_losses.append(batch_valid_loss) 231 | batch_valid_accuracies.append(batch_valid_accuracy) 232 | batch_valid_probas.append(batch_valid_proba) 233 | 234 | avg_valid_loss = np.mean(batch_valid_losses) 235 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 236 | 237 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 238 | (epoch == self.max_epochs and not self.use_valid): 239 | best_val_err = avg_valid_loss 240 | best_val_iter = epoch 241 | # Save model 242 | self.score_ = best_val_err 243 | self.model = copy.deepcopy(output_layer) 244 | 245 | 246 | yield { 247 | 'number': epoch, 248 | 'train_loss': avg_train_loss, 249 | 'valid_loss': avg_valid_loss, 250 | 'valid_accuracy': avg_valid_accuracy, 251 | 'best_val_error': best_val_err, 252 | 'best_val_iter': best_val_iter, 253 | } 254 | 255 | def make_predictions(self, data): 256 | dataset = dict( 257 | X_test=theano.shared(lasagne.utils.floatX(data)), 258 | num_examples_test=data.shape[0], 259 | input_dim=data.shape[1], 260 | output_dim=self.n_classes_, 261 | ) 262 | 263 | iter_funcs = self.create_test_function(dataset, self.model) 264 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 265 | 266 | test_preds, test_probas = np.array([]), None 267 | 268 | for b in range(num_batches_test): 269 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 270 | test_preds = np.append(test_preds, batch_test_pred) 271 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 272 | 273 | return test_preds, test_probas 274 | 275 | 276 | if __name__ == '__main__': 277 | train, labels, test, _, _ = utils.load_data() 278 | 279 | # PCA 280 | pp = decomposition.PCA() 281 | train = pp.fit_transform(train) 282 | test = pp.transform(test) 283 | 284 | clf = NeuralNetwork(1024, 110, 128, 7.218018732952578e-05, 0.9385973679339745, 0.3848935494155976, 285 | .02, True, 10, random_state=23) 286 | 287 | if MODE == 'cv': 288 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 289 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 290 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 291 | elif MODE == 
'submission': 292 | clf.fit(train, labels) 293 | predictions = clf.predict_proba(test) 294 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 295 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 296 | predictions) 297 | elif MODE == 'holdout': 298 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 299 | print 'Log loss:', score 300 | else: 301 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_13_nn_rmsprop_features/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_13_nn_rmsprop_features/nn_rmsprop_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.485363808092 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import rmsprop 18 | 19 | from sklearn import feature_extraction 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_13_nn_rmsprop_features' 28 | MODE = 'submission' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, rho=0.96, epsilon=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.rho = rho 40 | self.epsilon = epsilon 41 | self.dropout = dropout 42 | self.valid_ratio = valid_ratio 43 | self.use_valid = use_valid 44 | self.verbose = verbose 45 | self.random_state = random_state 46 | # State 47 | self.score_ = None 48 | self.classes_ = None 49 | self.n_classes_ = None 50 | self.model = None 51 | 52 | def fit(self, data, targets, sample_weight=None): 53 | self.classes_, indices = np.unique(targets, return_inverse=True) 54 | self.n_classes_ = self.classes_.shape[0] 55 | 56 | random_state = check_random_state(self.random_state) 57 | 58 | # Shuffle data and eventually split on train and validation sets 59 | if self.valid_ratio > 0: 60 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 61 | n_iter=1, random_state=self.random_state) 62 | train_index, valid_index = [s for s in strat_shuffled_split][0] 63 | X_train, y_train = data[train_index], targets[train_index] 64 | X_valid, y_valid = data[valid_index], targets[valid_index] 65 | else: 66 | X_train, y_train = data, targets 67 | X_valid, y_valid = np.array([]), np.array([]) 68 | 69 | if self.verbose > 5: 70 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 71 | if self.use_valid: 72 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 73 | 74 | # Prepare theano variables 75 | dataset = dict( 76 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 77 | 
y_train=T.cast(theano.shared(y_train), 'int32'), 78 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 79 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 80 | num_examples_train=X_train.shape[0], 81 | num_examples_valid=X_valid.shape[0], 82 | input_dim=X_train.shape[1], 83 | output_dim=self.n_classes_, 84 | ) 85 | 86 | if self.verbose > 0: 87 | print "Building model and compiling functions..." 88 | output_layer = self.build_model(dataset['input_dim']) 89 | iter_funcs = self.create_iter_functions(dataset, output_layer) 90 | 91 | if self.verbose > 0: 92 | print "Starting training..." 93 | now = time.time() 94 | results = [] 95 | try: 96 | for epoch in self.train(iter_funcs, dataset, output_layer): 97 | if self.verbose > 1: 98 | print "Epoch {} of {} took {:.3f}s".format( 99 | epoch['number'], self.max_epochs, time.time() - now) 100 | now = time.time() 101 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 102 | if self.verbose > 1: 103 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 104 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 105 | print " validation accuracy:\t\t{:.2f} %%".format( 106 | epoch['valid_accuracy'] * 100) 107 | 108 | if epoch['number'] >= self.max_epochs: 109 | break 110 | 111 | if self.verbose > 0: 112 | print 'Minimum validation error: %f (epoch %d)' % \ 113 | (epoch['best_val_error'], epoch['best_val_iter']) 114 | 115 | except KeyboardInterrupt: 116 | pass 117 | 118 | return self 119 | 120 | def predict(self, data): 121 | preds, _ = self.make_predictions(data) 122 | 123 | return preds 124 | 125 | def predict_proba(self, data): 126 | _, proba = self.make_predictions(data) 127 | 128 | return proba 129 | 130 | def score(self): 131 | return self.score_ 132 | 133 | # Private methods 134 | def build_model(self, input_dim): 135 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 136 | 137 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden / 2, nonlinearity=rectify) 138 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 139 | 140 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 141 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 142 | 143 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden / 2, nonlinearity=rectify) 144 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 145 | 146 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 147 | 148 | return l_out 149 | 150 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 151 | batch_index = T.iscalar('batch_index') 152 | X_batch = X_tensor_type('x') 153 | y_batch = T.ivector('y') 154 | 155 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 156 | 157 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 158 | 159 | loss_train = objective.get_loss(X_batch, target=y_batch) 160 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 161 | 162 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 163 | proba = output_layer.get_output(X_batch, deterministic=True) 164 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 165 | 166 | all_params = get_all_params(output_layer) 167 | updates = rmsprop(loss_train, all_params, self.lr, self.rho) 168 | 169 | iter_train = theano.function( 170 | [batch_index], loss_train, 171 | updates=updates, 172 | givens={ 173 | X_batch: 
dataset['X_train'][batch_slice], 174 | y_batch: dataset['y_train'][batch_slice], 175 | }, 176 | on_unused_input='ignore', 177 | ) 178 | 179 | iter_valid = None 180 | if self.use_valid: 181 | iter_valid = theano.function( 182 | [batch_index], [loss_eval, accuracy, proba], 183 | givens={ 184 | X_batch: dataset['X_valid'][batch_slice], 185 | y_batch: dataset['y_valid'][batch_slice], 186 | }, 187 | ) 188 | 189 | return dict(train=iter_train, valid=iter_valid) 190 | 191 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 192 | batch_index = T.iscalar('batch_index') 193 | X_batch = X_tensor_type('x') 194 | 195 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 196 | 197 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 198 | proba = output_layer.get_output(X_batch, deterministic=True) 199 | 200 | iter_test = theano.function( 201 | [batch_index], [pred, proba], 202 | givens={ 203 | X_batch: dataset['X_test'][batch_slice], 204 | }, 205 | ) 206 | 207 | return dict(test=iter_test) 208 | 209 | def train(self, iter_funcs, dataset, output_layer): 210 | num_batches_train = dataset['num_examples_train'] // self.batch_size 211 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 212 | 213 | best_val_err = 100 214 | best_val_iter = -1 215 | 216 | for epoch in itertools.count(1): 217 | batch_train_losses = [] 218 | for b in range(num_batches_train): 219 | batch_train_loss = iter_funcs['train'](b) 220 | batch_train_losses.append(batch_train_loss) 221 | 222 | avg_train_loss = np.mean(batch_train_losses) 223 | 224 | batch_valid_losses = [] 225 | batch_valid_accuracies = [] 226 | batch_valid_probas = [] 227 | 228 | if self.use_valid: 229 | for b in range(num_batches_valid): 230 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 231 | batch_valid_losses.append(batch_valid_loss) 232 | batch_valid_accuracies.append(batch_valid_accuracy) 233 | batch_valid_probas.append(batch_valid_proba) 234 | 235 | avg_valid_loss = np.mean(batch_valid_losses) 236 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 237 | 238 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 239 | (epoch == self.max_epochs and not self.use_valid): 240 | best_val_err = avg_valid_loss 241 | best_val_iter = epoch 242 | # Save model 243 | self.score_ = best_val_err 244 | self.model = copy.deepcopy(output_layer) 245 | 246 | 247 | yield { 248 | 'number': epoch, 249 | 'train_loss': avg_train_loss, 250 | 'valid_loss': avg_valid_loss, 251 | 'valid_accuracy': avg_valid_accuracy, 252 | 'best_val_error': best_val_err, 253 | 'best_val_iter': best_val_iter, 254 | } 255 | 256 | def make_predictions(self, data): 257 | dataset = dict( 258 | X_test=theano.shared(lasagne.utils.floatX(data)), 259 | num_examples_test=data.shape[0], 260 | input_dim=data.shape[1], 261 | output_dim=self.n_classes_, 262 | ) 263 | 264 | iter_funcs = self.create_test_function(dataset, self.model) 265 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 266 | 267 | test_preds, test_probas = np.array([]), None 268 | 269 | for b in range(num_batches_test): 270 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 271 | test_preds = np.append(test_preds, batch_test_pred) 272 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 273 | 274 | return test_preds, test_probas 275 | 276 | 277 | if __name__ == 
'__main__': 278 | train, labels, test, _, _ = utils.load_data(os.path.join(consts.DATA_PATH, 'fe_train.csv'), 279 | os.path.join(consts.DATA_PATH, 'fe_test.csv')) 280 | 281 | from sklearn import decomposition 282 | # PCA 283 | pp = decomposition.PCA() 284 | train = pp.fit_transform(train) 285 | test = pp.transform(test) 286 | 287 | clf = NeuralNetwork(1024, 110, 128, 0.00013934891814068934, 0.9724490021642429, 288 | 6.238206486137665e-05, 0.3081052487919688, 289 | .02, True, 10, random_state=21) 290 | 291 | if MODE == 'cv': 292 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 293 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 294 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 295 | elif MODE == 'submission': 296 | clf.fit(train, labels) 297 | predictions = clf.predict_proba(test) 298 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 299 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 300 | predictions) 301 | elif MODE == 'holdout': 302 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 303 | print 'Log loss:', score 304 | else: 305 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_14_bagging_xgboost/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_14_bagging_xgboost/bagging_xgboost.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold cv - log loss 0.447489661199 3 | """ 4 | import graphlab as gl 5 | import numpy as np 6 | import logging 7 | import os 8 | 9 | from hyperopt import fmin, hp, tpe 10 | 11 | from sklearn.base import BaseEstimator 12 | from sklearn import ensemble 13 | 14 | from otto_utils import consts, utils 15 | 16 | 17 | MODEL_NAME = 'model_14_bagging_xgboost' 18 | MODE = 'cv' # cv|submission|holdout|tune 19 | 20 | logging.disable(logging.INFO) 21 | 22 | 23 | class XGBoost(BaseEstimator): 24 | def __init__(self, max_iterations=50, max_depth=9, min_child_weight=4, row_subsample=.75, 25 | min_loss_reduction=1., column_subsample=.8, step_size=.3, verbose=True): 26 | self.n_classes_ = None 27 | self.classes_ = None 28 | self.max_iterations = max_iterations 29 | self.max_depth = max_depth 30 | self.min_child_weight = min_child_weight 31 | self.row_subsample = row_subsample 32 | self.min_loss_reduction = min_loss_reduction 33 | self.column_subsample = column_subsample 34 | self.step_size = step_size 35 | self.verbose = verbose 36 | self.model = None 37 | 38 | def fit(self, X, y, sample_weight=None): 39 | self.classes_, indices = np.unique(y, return_inverse=True) 40 | self.n_classes_ = self.classes_.shape[0] 41 | 42 | sf = self._array_to_sframe(X, y) 43 | self.model = gl.boosted_trees_classifier.create(sf, target='target', 44 | max_iterations=self.max_iterations, 45 | max_depth=self.max_depth, 46 | min_child_weight=self.min_child_weight, 47 | row_subsample=self.row_subsample, 48 | min_loss_reduction=self.min_loss_reduction, 49 | column_subsample=self.column_subsample, 50 | step_size=self.step_size, 51 | validation_set=None, 52 | verbose=self.verbose) 53 | 54 | return self 55 | 56 | def predict(self, X): 57 | preds = self.predict_proba(X) 58 | return np.argmax(preds, axis=1) 59 | 60 | def predict_proba(self, X): 61 | sf = self._array_to_sframe(X) 62 | preds = 
self.model.predict_topk(sf, output_type='probability', k=self.n_classes_) 63 | 64 | return self._preds_to_array(preds) 65 | 66 | # Private methods 67 | def _array_to_sframe(self, data, targets=None): 68 | d = dict() 69 | for i in xrange(data.shape[1]): 70 | d['feat_%d' % (i + 1)] = gl.SArray(data[:, i]) 71 | if targets is not None: 72 | d['target'] = gl.SArray(targets) 73 | 74 | return gl.SFrame(d) 75 | 76 | def _preds_to_array(self, preds): 77 | p = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') 78 | p['id'] = p['id'].astype(int) + 1 79 | p = p.sort('id') 80 | del p['id'] 81 | preds_array = np.array(p.to_dataframe(), dtype=float) 82 | 83 | return preds_array 84 | 85 | 86 | if __name__ == '__main__': 87 | train, labels, test, _, _ = utils.load_data() 88 | 89 | clf_xgb = XGBoost(max_iterations=300, max_depth=12, min_child_weight=4.9208250938262745, row_subsample=.9134478530382129, 90 | min_loss_reduction=.5132278416508804, column_subsample=.730128689911957, step_size=.1) 91 | clf = ensemble.BaggingClassifier(base_estimator=clf_xgb, n_estimators=20, 92 | max_samples=1., max_features=1., bootstrap=True, 93 | n_jobs=1, verbose=True, random_state=23) 94 | 95 | if MODE == 'cv': 96 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 97 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 98 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 99 | elif MODE == 'submission': 100 | clf.fit(train, labels) 101 | predictions = clf.predict_proba(test) 102 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 103 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 104 | predictions) 105 | elif MODE == 'holdout': 106 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 107 | print 'Log loss:', score 108 | elif MODE == 'tune': 109 | # Objective function 110 | def objective(args): 111 | max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args 112 | clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight, 113 | row_subsample=row_subsample, min_loss_reduction=min_loss_reduction, 114 | column_subsample=column_subsample, verbose=False) 115 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 116 | print 'max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss' 117 | print args, score 118 | return score 119 | # Searching space 120 | space = ( 121 | hp.quniform('max_depth', 12, 12, 1), 122 | hp.uniform('min_child_weight', .5, 10.), 123 | hp.uniform('row_subsample', .3, 1.), 124 | hp.uniform('min_loss_reduction', .1, 3.), 125 | hp.uniform('column_subsample', .1, 1.), 126 | ) 127 | 128 | best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=500) 129 | print 'Best solution:', best_sln 130 | else: 131 | print 'Unknown mode' 132 | -------------------------------------------------------------------------------- /otto/model/model_15_nn_adagrad_pca/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_15_nn_adagrad_pca/nn_adagrad_pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.478792791749 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 
12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import adagrad 18 | 19 | from sklearn import decomposition, feature_extraction 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_15_nn_adagrad_pca' 28 | MODE = 'cv' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, epsilon=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.epsilon = epsilon 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 
92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden / 2, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden, nonlinearity=rectify) 143 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 144 | 145 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 146 | 147 | return l_out 148 | 149 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 150 | batch_index = T.iscalar('batch_index') 151 | X_batch = X_tensor_type('x') 152 | y_batch = T.ivector('y') 153 | 154 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 155 | 156 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 157 | 158 | loss_train = objective.get_loss(X_batch, target=y_batch) 159 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 160 | 161 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 162 | proba = output_layer.get_output(X_batch, deterministic=True) 163 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 164 | 165 | all_params = get_all_params(output_layer) 166 | updates = adagrad(loss_train, all_params, self.lr, self.epsilon) 167 | 168 | iter_train = theano.function( 169 | [batch_index], loss_train, 170 | updates=updates, 171 | givens={ 172 | X_batch: dataset['X_train'][batch_slice], 173 | y_batch: dataset['y_train'][batch_slice], 174 | }, 175 | on_unused_input='ignore', 176 | ) 177 | 178 | iter_valid = None 179 | if self.use_valid: 180 | iter_valid = theano.function( 181 | [batch_index], [loss_eval, accuracy, proba], 182 | givens={ 183 | X_batch: dataset['X_valid'][batch_slice], 184 | y_batch: dataset['y_valid'][batch_slice], 185 | }, 186 | ) 187 | 188 | return dict(train=iter_train, valid=iter_valid) 189 | 190 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 191 | batch_index = T.iscalar('batch_index') 192 | X_batch = 
X_tensor_type('x') 193 | 194 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 195 | 196 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 197 | proba = output_layer.get_output(X_batch, deterministic=True) 198 | 199 | iter_test = theano.function( 200 | [batch_index], [pred, proba], 201 | givens={ 202 | X_batch: dataset['X_test'][batch_slice], 203 | }, 204 | ) 205 | 206 | return dict(test=iter_test) 207 | 208 | def train(self, iter_funcs, dataset, output_layer): 209 | num_batches_train = dataset['num_examples_train'] // self.batch_size 210 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 211 | 212 | best_val_err = 100 213 | best_val_iter = -1 214 | 215 | for epoch in itertools.count(1): 216 | batch_train_losses = [] 217 | for b in range(num_batches_train): 218 | batch_train_loss = iter_funcs['train'](b) 219 | batch_train_losses.append(batch_train_loss) 220 | 221 | avg_train_loss = np.mean(batch_train_losses) 222 | 223 | batch_valid_losses = [] 224 | batch_valid_accuracies = [] 225 | batch_valid_probas = [] 226 | 227 | if self.use_valid: 228 | for b in range(num_batches_valid): 229 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 230 | batch_valid_losses.append(batch_valid_loss) 231 | batch_valid_accuracies.append(batch_valid_accuracy) 232 | batch_valid_probas.append(batch_valid_proba) 233 | 234 | avg_valid_loss = np.mean(batch_valid_losses) 235 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 236 | 237 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 238 | (epoch == self.max_epochs and not self.use_valid): 239 | best_val_err = avg_valid_loss 240 | best_val_iter = epoch 241 | # Save model 242 | self.score_ = best_val_err 243 | self.model = copy.deepcopy(output_layer) 244 | 245 | 246 | yield { 247 | 'number': epoch, 248 | 'train_loss': avg_train_loss, 249 | 'valid_loss': avg_valid_loss, 250 | 'valid_accuracy': avg_valid_accuracy, 251 | 'best_val_error': best_val_err, 252 | 'best_val_iter': best_val_iter, 253 | } 254 | 255 | def make_predictions(self, data): 256 | dataset = dict( 257 | X_test=theano.shared(lasagne.utils.floatX(data)), 258 | num_examples_test=data.shape[0], 259 | input_dim=data.shape[1], 260 | output_dim=self.n_classes_, 261 | ) 262 | 263 | iter_funcs = self.create_test_function(dataset, self.model) 264 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 265 | 266 | test_preds, test_probas = np.array([]), None 267 | 268 | for b in range(num_batches_test): 269 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 270 | test_preds = np.append(test_preds, batch_test_pred) 271 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 272 | 273 | return test_preds, test_probas 274 | 275 | 276 | if __name__ == '__main__': 277 | train, labels, test, _, _ = utils.load_data() 278 | 279 | # Preprocess data - transform counts to TFIDF features 280 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 281 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 282 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 283 | 284 | # PCA 285 | pp = decomposition.PCA() 286 | train = pp.fit_transform(train) 287 | test = pp.transform(test) 288 | 289 | clf = NeuralNetwork(1024, 110, 150, 0.0010954104605473447, 5.003481345255732e-15, 0.1, 290 | .02, True, 10, random_state=18) 291 | 292 | if 
MODE == 'cv': 293 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 294 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 295 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 296 | elif MODE == 'submission': 297 | clf.fit(train, labels) 298 | predictions = clf.predict_proba(test) 299 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 300 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 301 | predictions) 302 | elif MODE == 'holdout': 303 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 304 | print 'Log loss:', score 305 | else: 306 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_16_random_forest_calibrated_feature_selection/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_16_random_forest_calibrated_feature_selection/random_forest_calibrated_feature_selection.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold CV - log loss 0.463244260386 3 | """ 4 | import numpy as np 5 | import os 6 | 7 | from sklearn import ensemble, feature_extraction, preprocessing 8 | from sklearn.calibration import CalibratedClassifierCV 9 | from sklearn.svm import LinearSVC 10 | 11 | from otto_utils import consts, utils 12 | 13 | 14 | MODEL_NAME = 'model_16_random_forest_calibrated_feature_selection' 15 | MODE = 'cv' # cv|submission|holdout 16 | 17 | # import data 18 | train, labels, test, _, _ = utils.load_data() 19 | 20 | # transform counts to TFIDF features 21 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 22 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 23 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 24 | 25 | # feature selection 26 | feat_selector = LinearSVC(C=0.095, penalty='l1', dual=False) 27 | train = feat_selector.fit_transform(train, labels) 28 | test = feat_selector.transform(test) 29 | 30 | print train.shape 31 | 32 | # encode labels 33 | lbl_enc = preprocessing.LabelEncoder() 34 | labels = lbl_enc.fit_transform(labels) 35 | 36 | 37 | 38 | # train classifier 39 | clf = ensemble.ExtraTreesClassifier(n_jobs=3, n_estimators=600, max_features=20, min_samples_split=3, 40 | bootstrap=False, verbose=3, random_state=23) 41 | 42 | if MODE == 'cv': 43 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True) 44 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 45 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 46 | elif MODE == 'submission': 47 | calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels)) 48 | fitted_classifier = calibrated_classifier.fit(train, labels) 49 | predictions = fitted_classifier.predict_proba(test) 50 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 51 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 52 | predictions) 53 | elif MODE == 'holdout': 54 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=True) 55 | print 'Log loss:', score 56 | else: 57 | print 'Unknown mode' 58 | -------------------------------------------------------------------------------- /otto/model/model_17_nn_adagrad_log/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_17_nn_adagrad_log/nn_adagrad_log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.488150595136 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import adagrad 18 | 19 | from sklearn.base import BaseEstimator 20 | from sklearn.cross_validation import StratifiedShuffleSplit 21 | from sklearn.utils import check_random_state 22 | 23 | from otto_utils import consts, utils 24 | 25 | 26 | MODEL_NAME = 'model_17_nn_adagrad_log' 27 | MODE = 'submission' # cv|submission|holdout|tune 28 | 29 | 30 | class NeuralNetwork(BaseEstimator): 31 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 32 | lr=0.01, rho=0.9, dropout=0.5, valid_ratio=0.0, 33 | use_valid=False, verbose=0, random_state=None): 34 | self.n_hidden = n_hidden 35 | self.max_epochs = max_epochs 36 | self.batch_size = batch_size 37 | self.lr = lr 38 | self.rho = rho 39 | self.dropout = dropout 40 | self.valid_ratio = valid_ratio 41 | self.use_valid = use_valid 42 | self.verbose = verbose 43 | self.random_state = random_state 44 | # State 45 | self.score_ = None 46 | self.classes_ = None 47 | self.n_classes_ = None 48 | self.model = None 49 | 50 | def fit(self, data, targets, sample_weight=None): 51 | self.classes_, indices = np.unique(targets, return_inverse=True) 52 | self.n_classes_ = self.classes_.shape[0] 53 | 54 | random_state = check_random_state(self.random_state) 55 | 56 | # Shuffle data and eventually split on train and validation sets 57 | if self.valid_ratio > 0: 58 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 59 | n_iter=1, random_state=self.random_state) 60 | train_index, valid_index = [s for s in strat_shuffled_split][0] 61 | X_train, y_train = data[train_index], targets[train_index] 62 | X_valid, y_valid = data[valid_index], targets[valid_index] 63 | else: 64 | X_train, y_train = data, targets 65 | X_valid, y_valid = np.array([]), np.array([]) 66 | 67 | if self.verbose > 5: 68 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 69 | if self.use_valid: 70 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 71 | 72 | # Prepare theano variables 73 | dataset = dict( 74 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 75 | y_train=T.cast(theano.shared(y_train), 'int32'), 76 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 77 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 78 | num_examples_train=X_train.shape[0], 79 | num_examples_valid=X_valid.shape[0], 80 | input_dim=X_train.shape[1], 81 | output_dim=self.n_classes_, 82 | ) 83 | 84 | if self.verbose > 0: 85 | print "Building model and compiling functions..." 86 | output_layer = self.build_model(dataset['input_dim']) 87 | iter_funcs = self.create_iter_functions(dataset, output_layer) 88 | 89 | if self.verbose > 0: 90 | print "Starting training..." 
91 | now = time.time() 92 | results = [] 93 | try: 94 | for epoch in self.train(iter_funcs, dataset, output_layer): 95 | if self.verbose > 1: 96 | print "Epoch {} of {} took {:.3f}s".format( 97 | epoch['number'], self.max_epochs, time.time() - now) 98 | now = time.time() 99 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 100 | if self.verbose > 1: 101 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 102 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 103 | print " validation accuracy:\t\t{:.2f} %".format( 104 | epoch['valid_accuracy'] * 100) 105 | 106 | if epoch['number'] >= self.max_epochs: 107 | break 108 | 109 | if self.verbose > 0: 110 | print 'Minimum validation error: %f (epoch %d)' % \ 111 | (epoch['best_val_error'], epoch['best_val_iter']) 112 | 113 | except KeyboardInterrupt: 114 | pass 115 | 116 | return self 117 | 118 | def predict(self, data): 119 | preds, _ = self.make_predictions(data) 120 | 121 | return preds 122 | 123 | def predict_proba(self, data): 124 | _, proba = self.make_predictions(data) 125 | 126 | return proba 127 | 128 | def score(self): 129 | return self.score_ 130 | 131 | # Private methods 132 | def build_model(self, input_dim): 133 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 134 | 135 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 136 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 137 | 138 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden / 2, nonlinearity=rectify) 139 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 140 | 141 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden / 4, nonlinearity=rectify) 142 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 143 | 144 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 145 | 146 | return l_out 147 | 148 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 149 | batch_index = T.iscalar('batch_index') 150 | X_batch = X_tensor_type('x') 151 | y_batch = T.ivector('y') 152 | 153 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 154 | 155 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 156 | 157 | loss_train = objective.get_loss(X_batch, target=y_batch) 158 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 159 | 160 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 161 | proba = output_layer.get_output(X_batch, deterministic=True) 162 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 163 | 164 | all_params = get_all_params(output_layer) 165 | updates = adagrad(loss_train, all_params, self.lr, self.rho) 166 | 167 | iter_train = theano.function( 168 | [batch_index], loss_train, 169 | updates=updates, 170 | givens={ 171 | X_batch: dataset['X_train'][batch_slice], 172 | y_batch: dataset['y_train'][batch_slice], 173 | }, 174 | on_unused_input='ignore', 175 | ) 176 | 177 | iter_valid = None 178 | if self.use_valid: 179 | iter_valid = theano.function( 180 | [batch_index], [loss_eval, accuracy, proba], 181 | givens={ 182 | X_batch: dataset['X_valid'][batch_slice], 183 | y_batch: dataset['y_valid'][batch_slice], 184 | }, 185 | ) 186 | 187 | return dict(train=iter_train, valid=iter_valid) 188 | 189 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 190 | batch_index = T.iscalar('batch_index') 191 | X_batch = 
X_tensor_type('x') 192 | 193 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 194 | 195 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 196 | proba = output_layer.get_output(X_batch, deterministic=True) 197 | 198 | iter_test = theano.function( 199 | [batch_index], [pred, proba], 200 | givens={ 201 | X_batch: dataset['X_test'][batch_slice], 202 | }, 203 | ) 204 | 205 | return dict(test=iter_test) 206 | 207 | def train(self, iter_funcs, dataset, output_layer): 208 | num_batches_train = dataset['num_examples_train'] // self.batch_size 209 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 210 | 211 | best_val_err = 100 212 | best_val_iter = -1 213 | 214 | for epoch in itertools.count(1): 215 | batch_train_losses = [] 216 | for b in range(num_batches_train): 217 | batch_train_loss = iter_funcs['train'](b) 218 | batch_train_losses.append(batch_train_loss) 219 | 220 | avg_train_loss = np.mean(batch_train_losses) 221 | 222 | batch_valid_losses = [] 223 | batch_valid_accuracies = [] 224 | batch_valid_probas = [] 225 | 226 | if self.use_valid: 227 | for b in range(num_batches_valid): 228 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 229 | batch_valid_losses.append(batch_valid_loss) 230 | batch_valid_accuracies.append(batch_valid_accuracy) 231 | batch_valid_probas.append(batch_valid_proba) 232 | 233 | avg_valid_loss = np.mean(batch_valid_losses) 234 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 235 | 236 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 237 | (epoch == self.max_epochs and not self.use_valid): 238 | best_val_err = avg_valid_loss 239 | best_val_iter = epoch 240 | # Save model 241 | self.score_ = best_val_err 242 | self.model = copy.deepcopy(output_layer) 243 | 244 | 245 | yield { 246 | 'number': epoch, 247 | 'train_loss': avg_train_loss, 248 | 'valid_loss': avg_valid_loss, 249 | 'valid_accuracy': avg_valid_accuracy, 250 | 'best_val_error': best_val_err, 251 | 'best_val_iter': best_val_iter, 252 | } 253 | 254 | def make_predictions(self, data): 255 | dataset = dict( 256 | X_test=theano.shared(lasagne.utils.floatX(data)), 257 | num_examples_test=data.shape[0], 258 | input_dim=data.shape[1], 259 | output_dim=self.n_classes_, 260 | ) 261 | 262 | iter_funcs = self.create_test_function(dataset, self.model) 263 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 264 | 265 | test_preds, test_probas = np.array([]), None 266 | 267 | for b in range(num_batches_test): 268 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 269 | test_preds = np.append(test_preds, batch_test_pred) 270 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 271 | 272 | return test_preds, test_probas 273 | 274 | 275 | if __name__ == '__main__': 276 | train, labels, test, _, _ = utils.load_data() 277 | 278 | train = np.log(train + 1.) 279 | test = np.log(test + 1.) 
280 | 281 | clf = NeuralNetwork(1024, 110, 220, 0.0026294067059507813, 1.1141900388281156e-15, 0.26355646219340834, 282 | .02, True, 10, random_state=23) 283 | 284 | if MODE == 'cv': 285 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 286 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 287 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 288 | elif MODE == 'submission': 289 | clf.fit(train, labels) 290 | predictions = clf.predict_proba(test) 291 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 292 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 293 | predictions) 294 | elif MODE == 'holdout': 295 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 296 | print 'Log loss:', score 297 | else: 298 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/otto_utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/otto_utils/blender.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | from scipy.optimize import fmin_cobyla 5 | from sklearn.cross_validation import StratifiedKFold 6 | from sklearn.metrics import log_loss 7 | 8 | import consts 9 | import utils 10 | 11 | 12 | def blended(c, x): 13 | result = None 14 | for i in range(len(c)): 15 | result = result + c[i] * x[i] if result is not None else c[i] * x[i] 16 | result /= sum(c) 17 | return result 18 | 19 | 20 | def error(p, x, y): 21 | preds = blended(p, x) 22 | err = log_loss(y, preds) 23 | return err 24 | 25 | 26 | def constraint(p, *args): 27 | return min(p) - .0 28 | 29 | 30 | def get_weights(): 31 | # Read validation labels 32 | _, labels, _, _, _ = utils.load_data() 33 | skf = StratifiedKFold(labels, n_folds=5, random_state=23) 34 | test_index = None 35 | for _, test_idx in skf: 36 | test_index = np.append(test_index, test_idx) if test_index is not None else test_idx 37 | val_labels = labels[test_index] 38 | # Read predictions on validation set 39 | val_predictions = [] 40 | prediction_files = utils.get_prediction_files() 41 | for preds_file in prediction_files: 42 | vp = np.genfromtxt(os.path.join(consts.BLEND_PATH, preds_file), delimiter=',') 43 | val_predictions.append(vp) 44 | # Minimize blending function 45 | p0 = [1.] 
* len(prediction_files) 46 | p = fmin_cobyla(error, p0, args=(val_predictions, val_labels), cons=[constraint], rhoend=1e-5) 47 | 48 | return p 49 | 50 | 51 | if __name__ == '__main__': 52 | weights = get_weights() 53 | print weights 54 | print weights / np.sum(weights) -------------------------------------------------------------------------------- /otto/otto_utils/consts.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | PROJECT_PATH = '/home/adam/Projects/otto' 5 | 6 | # Data 7 | DATA_PATH = os.path.join(PROJECT_PATH, 'data') 8 | DATA_TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv') 9 | DATA_TEST_PATH = os.path.join(DATA_PATH, 'test.csv') 10 | DATA_SAMPLE_SUBMISSION_PATH = os.path.join(DATA_PATH, 'sampleSubmission.csv') 11 | 12 | # Models 13 | MODEL_PATH = os.path.join(PROJECT_PATH, 'model') 14 | 15 | # Results 16 | OUTPUT_PATH = os.path.join(PROJECT_PATH, 'output') 17 | 18 | # Blending 19 | BLEND_PATH = os.path.join(PROJECT_PATH, 'blend') 20 | 21 | # Ensembling 22 | ENSEMBLE_PATH = os.path.join(PROJECT_PATH, 'ensemble') 23 | 24 | # Names of prediction files 25 | PREDICTION_FILES = ['03_svm', '05_bagging_nn_rmsprop', 26 | '06_xgboost', '09_nn_adagrad', '11_xgboost_poly', 27 | '12_nn_rmsprop_pca', '13_nn_rmsprop_features', 28 | '16_random_forest_calibrated_feature_selection'] 29 | -------------------------------------------------------------------------------- /otto/otto_utils/ensembler.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import os 4 | 5 | import blender 6 | import consts 7 | import utils 8 | 9 | 10 | if __name__ == '__main__': 11 | weights = blender.get_weights() 12 | prediction_files = utils.get_prediction_files() 13 | 14 | with open(os.path.join(consts.OUTPUT_PATH, 'ensembler_weighted_models.csv'), 'wb') as f_out: 15 | writer = csv.writer(f_out) 16 | readers = [] 17 | f_ins = [] 18 | for fpred in prediction_files: 19 | f_in = open(os.path.join(consts.ENSEMBLE_PATH, fpred), 'rb') 20 | f_ins.append(f_in) 21 | readers.append(csv.reader(f_in)) 22 | # Copy header 23 | writer.writerow(readers[0].next()) 24 | for r in readers[1:]: 25 | r.next() 26 | # Merge content 27 | for line in readers[0]: 28 | file_name = line[0] 29 | preds = weights[0] * np.array(map(float, line[1:])) 30 | for i, r in enumerate(readers[1:]): 31 | preds += weights[i+1] * np.array(map(float, r.next()[1:])) 32 | preds /= np.sum(weights) 33 | writer.writerow([file_name] + list(preds)) 34 | # Close files 35 | for f_in in f_ins: 36 | f_in.close() 37 | -------------------------------------------------------------------------------- /otto/otto_utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code shared between all models for Otto competition. 
3 | """ 4 | 5 | import gc 6 | import numpy as np 7 | import os 8 | import pandas as pd 9 | 10 | from sklearn.calibration import CalibratedClassifierCV 11 | from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit 12 | from sklearn.metrics import log_loss 13 | 14 | import consts 15 | 16 | 17 | def load_data(path_train=consts.DATA_TRAIN_PATH, path_test=consts.DATA_TEST_PATH): 18 | train = pd.read_csv(path_train) 19 | train_labels = [int(v[-1])-1 for v in train.target.values] 20 | train_ids = train.id.values 21 | train = train.drop('id', axis=1) 22 | train = train.drop('target', axis=1) 23 | 24 | test = pd.read_csv(path_test) 25 | test_ids = test.id.values 26 | test = test.drop('id', axis=1) 27 | 28 | return np.array(train, dtype=float), np.array(train_labels), np.array(test, dtype=float),\ 29 | np.array(train_ids), np.array(test_ids) 30 | 31 | 32 | def make_blender_cv(classifier, x, y, calibrate=False): 33 | skf = StratifiedKFold(y, n_folds=5, random_state=23) 34 | scores, predictions = [], None 35 | for train_index, test_index in skf: 36 | if calibrate: 37 | # Make training and calibration 38 | calibrated_classifier = CalibratedClassifierCV(classifier, method='isotonic', cv=get_cv(y[train_index])) 39 | fitted_classifier = calibrated_classifier.fit(x[train_index, :], y[train_index]) 40 | else: 41 | fitted_classifier = classifier.fit(x[train_index, :], y[train_index]) 42 | preds = fitted_classifier.predict_proba(x[test_index, :]) 43 | 44 | # Free memory 45 | calibrated_classifier, fitted_classifier = None, None 46 | gc.collect() 47 | 48 | scores.append(log_loss(y[test_index], preds)) 49 | predictions = np.append(predictions, preds, axis=0) if predictions is not None else preds 50 | return scores, predictions 51 | 52 | 53 | def write_blender_data(path, file_name, predictions): 54 | file_path = os.path.join(path, file_name) 55 | np.savetxt(file_path, predictions, delimiter=',', fmt='%.5f') 56 | 57 | 58 | def save_submission(path_sample_submission, output_file_path, predictions): 59 | sample = pd.read_csv(path_sample_submission) 60 | submission = pd.DataFrame(predictions, index=sample.id.values, columns=sample.columns[1:]) 61 | submission.to_csv(output_file_path, index_label='id') 62 | 63 | 64 | def stratified_split(x, y, test_size=0.2): 65 | strat_shuffled_split = StratifiedShuffleSplit(y, n_iter=1, test_size=test_size, random_state=23) 66 | train_index, valid_index = [s for s in strat_shuffled_split][0] 67 | 68 | x_train, y_train, x_valid, y_valid = x[train_index, :], y[train_index], x[valid_index, :], y[valid_index] 69 | 70 | return x_train, y_train, x_valid, y_valid 71 | 72 | 73 | def hold_out_evaluation(classifier, x, y, test_size=0.2, calibrate=False): 74 | x_train, y_train, x_valid, y_valid = stratified_split(x, y, test_size) 75 | 76 | # Train 77 | if calibrate: 78 | # Make training and calibration 79 | calibrated_classifier = CalibratedClassifierCV(classifier, method='isotonic', cv=get_cv(y_train)) 80 | fitted_classifier = calibrated_classifier.fit(x_train, y_train) 81 | else: 82 | fitted_classifier = classifier.fit(x_train, y_train) 83 | # Evaluate 84 | score = log_loss(y_valid, fitted_classifier.predict_proba(x_valid)) 85 | 86 | return score 87 | 88 | 89 | def get_prediction_files(): 90 | return ['model_%s.csv' % f for f in consts.PREDICTION_FILES] 91 | 92 | 93 | def get_cv(y, n_folds=5): 94 | return StratifiedKFold(y, n_folds=n_folds, random_state=23) 95 | --------------------------------------------------------------------------------