├── .gitignore ├── LICENSE ├── README.md └── otto ├── __init__.py ├── model ├── __init__.py ├── model_01_bagging_linear │ ├── __init__.py │ └── bagging_linear.py ├── model_02_random_forest │ ├── __init__.py │ └── random_forest.py ├── model_03_svm │ ├── __init__.py │ └── svm.py ├── model_04_rgf │ ├── __init__.py │ └── rgf.py ├── model_05_bagging_nn_rmsprop │ ├── __init__.py │ └── bagging_nn_rmsprop.py ├── model_06_xgboost │ ├── __init__.py │ └── xgboost.py ├── model_07_bagging_nn_nesterov │ ├── __init__.py │ └── bagging_nn_nesterov.py ├── model_08_random_forest_calibrated │ ├── __init__.py │ └── random_forest_calibrated.py ├── model_09_nn_adagrad │ ├── __init__.py │ └── nn_adagrad.py ├── model_10_nn_adagrad_pca │ ├── __init__.py │ └── nn_adagrad_pca.py ├── model_11_xgboost_poly │ ├── __init__.py │ └── xgboost_poly.py ├── model_12_nn_rmsprop_pca │ ├── __init__.py │ └── nn_rmsprop_pca.py ├── model_13_nn_rmsprop_features │ ├── __init__.py │ └── nn_rmsprop_features.py ├── model_14_bagging_xgboost │ ├── __init__.py │ └── bagging_xgboost.py ├── model_15_nn_adagrad_pca │ ├── __init__.py │ └── nn_adagrad_pca.py ├── model_16_random_forest_calibrated_feature_selection │ ├── __init__.py │ └── random_forest_calibrated_feature_selection.py └── model_17_nn_adagrad_log │ ├── __init__.py │ └── nn_adagrad_log.py └── otto_utils ├── __init__.py ├── blender.py ├── consts.py ├── ensembler.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python compiled and temporary files 2 | *.py[c~] 3 | 4 | # PyCharm project files 5 | otto/.idea/* 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Adam 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of kaggle_otto nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Otto Group Product Classification Challenge 2 | 3 | Solution for achieving place 66th/3514 on private leaderboard. 4 | 5 | It contains: 6 | * Neural Networks 7 | * XGBoost 8 | * Random Forest 9 | * SVM 10 | * Regularized Greedy Forest 11 | * Linear model 12 | 13 | However only top four kind of algorithms were used to build final ensemble. 14 | 15 | You can find more information on my [blog](http://blog.aicry.com/kaggle-otto-group-product-classification-challenge/). 16 | -------------------------------------------------------------------------------- /otto/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_01_bagging_linear/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_01_bagging_linear/bagging_linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | It achieves around 0.52914588084 log loss on holdout set 3 | """ 4 | 5 | import numpy as np 6 | import os 7 | 8 | from sklearn import ensemble, feature_extraction, linear_model, preprocessing 9 | from sklearn.svm import LinearSVC 10 | 11 | from otto_utils import consts, utils 12 | 13 | 14 | MODEL_NAME = 'model_01_bagging_linear' 15 | MODE = 'holdout' # cv|submission|holdout 16 | 17 | # import data 18 | train, labels, test, _, _ = utils.load_data() 19 | 20 | # polynomial features 21 | poly_feat = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True) 22 | train = poly_feat.fit_transform(train, labels) 23 | test = poly_feat.transform(test) 24 | 25 | print train.shape 26 | 27 | # transform counts to TFIDF features 28 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 29 | train = tfidf.fit_transform(train).toarray() 30 | test = tfidf.transform(test).toarray() 31 | 32 | # feature selection 33 | feat_selector = LinearSVC(C=0.3, penalty='l1', dual=False) 34 | train = feat_selector.fit_transform(train, labels) 35 | test = feat_selector.transform(test) 36 | 37 | print train.shape 38 | 39 | # encode labels 40 | lbl_enc = preprocessing.LabelEncoder() 41 | labels = lbl_enc.fit_transform(labels) 42 | 43 | # train classifier 44 | linear_clf = linear_model.LogisticRegression(C=1, penalty='l1', 45 | fit_intercept=True, random_state=23) 46 | 47 | clf = ensemble.BaggingClassifier(base_estimator=linear_clf, n_estimators=40, 48 | max_samples=1., max_features=1., bootstrap=True, 49 | n_jobs=5, verbose=True, random_state=23) 50 | 51 | if MODE == 'cv': 52 | scores, predictions = utils.make_blender_cv(clf, train, labels) 53 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 54 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 55 | elif MODE == 'submission': 56 | clf.fit(train, labels) 57 | predictions = clf.predict_proba(test) 58 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 59 | 
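# Editor's note (not part of the original bagging_linear.py): judging by the
# constant names, DATA_SAMPLE_SUBMISSION_PATH is presumably the Kaggle
# sample-submission CSV used as a template for the output format, and each
# model's predictions are written into ENSEMBLE_PATH so the otto_utils scripts
# (blender.py / ensembler.py) can combine them later. The same
# cv/submission/holdout MODE switch is repeated in every model_* script below.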
os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 60 | predictions) 61 | elif MODE == 'holdout': 62 | score = utils.hold_out_evaluation(clf, train, labels) 63 | print 'Log loss:', score 64 | else: 65 | print 'Unknown mode' 66 | 67 | 68 | -------------------------------------------------------------------------------- /otto/model/model_02_random_forest/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_02_random_forest/random_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | from sklearn import ensemble, feature_extraction, preprocessing 5 | 6 | from otto_utils import consts, utils 7 | 8 | 9 | MODEL_NAME = 'model_02_random_forest' 10 | MODE = 'holdout' # cv|submission|holdout 11 | 12 | # import data 13 | train, labels, test, _, _ = utils.load_data() 14 | 15 | # transform counts to TFIDF features 16 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 17 | train = tfidf.fit_transform(train).toarray() 18 | test = tfidf.transform(test).toarray() 19 | 20 | # encode labels 21 | lbl_enc = preprocessing.LabelEncoder() 22 | labels = lbl_enc.fit_transform(labels) 23 | 24 | # train classifier 25 | clf = ensemble.ExtraTreesClassifier(n_jobs=4, n_estimators=2000, max_features=20, min_samples_split=3, 26 | bootstrap=False, verbose=3, random_state=23) 27 | 28 | if MODE == 'cv': 29 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 30 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 31 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 32 | elif MODE == 'submission': 33 | clf.fit(train, labels) 34 | predictions = clf.predict_proba(test) 35 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 36 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 37 | predictions) 38 | elif MODE == 'holdout': 39 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 40 | print 'Log loss:', score 41 | else: 42 | print 'Unknown mode' 43 | -------------------------------------------------------------------------------- /otto/model/model_03_svm/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_03_svm/svm.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold CV - log loss 0.513778609795 3 | """ 4 | import numpy as np 5 | import os 6 | 7 | from hyperopt import fmin, hp, tpe 8 | 9 | from sklearn import feature_extraction, preprocessing, svm 10 | from sklearn.calibration import CalibratedClassifierCV 11 | from sklearn.multiclass import OneVsRestClassifier 12 | 13 | from otto_utils import consts, utils 14 | 15 | 16 | MODEL_NAME = 'model_03_svm' 17 | MODE = 'cv' # cv|submission|holdout|tune 18 | 19 | # import data 20 | train, labels, test, _, _ = utils.load_data() 21 | 22 | # transform counts to TFIDF features 23 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 24 | train = tfidf.fit_transform(train).toarray() 25 | test = tfidf.transform(test).toarray() 26 | 27 | # encode labels 28 | lbl_enc = preprocessing.LabelEncoder() 29 | labels = lbl_enc.fit_transform(labels) 30 | 31 | # train classifier 32 | clf = 
OneVsRestClassifier(svm.SVC(C=4.919646+2., kernel='rbf', tol=.001, 33 | verbose=True, probability=True, gamma=0.646508+.3, random_state=23)) 34 | 35 | if MODE == 'cv': 36 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True) 37 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 38 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 39 | elif MODE == 'submission': 40 | calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels)) 41 | fitted_classifier = calibrated_classifier.fit(train, labels) 42 | predictions = fitted_classifier.predict_proba(test) 43 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 44 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 45 | predictions) 46 | elif MODE == 'holdout': 47 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False, test_size=0.9) 48 | print 'Log loss:', score 49 | elif MODE == 'tune': 50 | train, labels, valid, valid_labels = utils.stratified_split(train, labels, test_size=.8) 51 | from sklearn.metrics import log_loss 52 | # Objective function 53 | def objective(args): 54 | c, gamma = args 55 | clf = OneVsRestClassifier(svm.SVC(C=c, kernel='rbf', tol=.001, gamma=gamma, 56 | probability=True, random_state=23)) 57 | score1 = 0 58 | score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 59 | score = log_loss(valid_labels, clf.predict_proba(valid)) 60 | print 'C=%f, gamma=%f, score1=%f, score2=%f, score=%f' % (c, gamma, score1, score2, score) 61 | return score 62 | # Searching space 63 | space = ( 64 | hp.uniform('c', 4, 10), 65 | hp.uniform('gamma', 0.3, 3) 66 | ) 67 | 68 | best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=200) 69 | print 'Best solution:', best_sln 70 | else: 71 | print 'Unknown mode' 72 | -------------------------------------------------------------------------------- /otto/model/model_04_rgf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahara/kaggle_otto/2b7861d052529d7a3f78c053088450f15278ac42/otto/model/model_04_rgf/__init__.py -------------------------------------------------------------------------------- /otto/model/model_04_rgf/rgf.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import numpy as np 3 | import os 4 | import subprocess 5 | import shutil 6 | 7 | from sklearn import feature_extraction 8 | from sklearn.base import BaseEstimator, ClassifierMixin 9 | from sklearn.metrics import log_loss 10 | 11 | from otto_utils import consts, utils 12 | 13 | 14 | MODEL_NAME = 'model_04_rgf' 15 | MODE = 'cv' # cv|submission|holdout|tune 16 | 17 | 18 | class RGF(BaseEstimator, ClassifierMixin): 19 | def __init__(self, verbose=True, random_state=None): 20 | self.n_classes_ = None 21 | self.rgf_path = None 22 | self.files_location_ = None 23 | self.files_location_data_ = None 24 | self.files_location_output_ = None 25 | self.models = None 26 | self.verbose = verbose 27 | self.random_state = random_state 28 | 29 | def fit(self, X, y): 30 | self.n_classes_ = 9 31 | self.rgf_path = '/home/adam/Tools/rgf1.2/bin/rgf' 32 | self.files_location_ = '/home/adam/Projects/otto/model/model_04_rgf' 33 | self.files_location_data_ = os.path.join(self.files_location_, 'data') 34 | self.files_location_output_ = os.path.join(self.files_location_, 'output') 35 | self.models = dict() 36 | 37 | shutil.rmtree(self.files_location_output_) 38 | 39 | train_index = 
np.array(range(X.shape[0])) 40 | np.random.shuffle(train_index) 41 | x_train, y_train = X[train_index], y[train_index] 42 | 43 | self._train(x_train, y_train) 44 | 45 | return self 46 | 47 | def predict(self, X): 48 | preds = self.predict_proba(X) 49 | return np.argmax(preds, 1) 50 | 51 | def predict_proba(self, X): 52 | return self._predict(X) 53 | 54 | def score(self, X, y, sample_weight=None): 55 | return log_loss(y, self.predict_proba(X)) 56 | 57 | def get_params(self, deep=True): 58 | return {'verbose': self.verbose, 'random_state': self.random_state} 59 | 60 | def set_params(self, **parameters): 61 | for parameter, value in parameters.items(): 62 | self.setattr(parameter, value) 63 | return self 64 | 65 | def fit_transform(self, X, y): 66 | self.fit(X, y) 67 | return self.predict_proba(X) 68 | 69 | def transform(self, X): 70 | return self.predict_proba(X) 71 | 72 | # Private methods 73 | def write_into_files(self, prefix, x, y=None): 74 | if not os.path.exists(self.files_location_data_): 75 | os.makedirs(self.files_location_data_) 76 | # Write file with X 77 | data_location = os.path.join(self.files_location_data_, '%s.data.x' % prefix) 78 | np.savetxt(data_location, x, delimiter='\t', fmt='%.5f') 79 | 80 | paths = dict(x=data_location, y=[]) 81 | 82 | if y is not None: 83 | for i in range(self.n_classes_): 84 | labels = map(lambda l: ['+1'] if i == l else ['-1'], y) 85 | labels_location = os.path.join(self.files_location_data_, '%s.data.y.%d' % (prefix, i)) 86 | np.savetxt(labels_location, labels, delimiter='\t', fmt='%s') 87 | paths['y'].append(labels_location) 88 | 89 | return paths 90 | 91 | def get_params_string(self, train_x_fn=None, train_y_fn=None, test_x_fn=None, test_y_fn=None, 92 | model_fn=None, model_fn_prefix=None, evaluation_fn=None, prediction_fn=None, 93 | reg_L2=None, reg_sL2=None, algorithm=None, loss=None, 94 | test_interval=None, max_tree=None, max_leaf_forest=None): 95 | frame = inspect.currentframe() 96 | args, _, _, values = inspect.getargvalues(frame) 97 | params_string = '' 98 | 99 | for arg in args: 100 | if values[arg] is not None and arg != 'self': 101 | params_string += '%s=%s,' % (arg, values[arg]) 102 | 103 | return params_string 104 | 105 | def _train(self, x_train, y_train): 106 | prefix_train, prefix_model = 'train', 'model' 107 | cmd = self.rgf_path + ' train %s' 108 | 109 | if not os.path.exists(self.files_location_output_): 110 | os.makedirs(self.files_location_output_) 111 | 112 | # Write files in RGF format 113 | paths = dict() 114 | paths[prefix_train] = self.write_into_files(prefix_train, x_train, y_train) 115 | 116 | for i in range(self.n_classes_): 117 | # Train and test model 118 | params_string = self.get_params_string(train_x_fn=paths[prefix_train]['x'], 119 | train_y_fn=paths[prefix_train]['y'][i], 120 | model_fn_prefix=os.path.join(self.files_location_output_, 121 | '%s_class_%s' % (prefix_model, i)), 122 | reg_L2=.01, # Should be 0.001 123 | loss='Log', # Maybe LS will work better 124 | test_interval=200, # Should be 2000 125 | max_tree=1200, # Should be 1000 126 | max_leaf_forest=6000 # Should be 10000 127 | ) 128 | print 'Running', cmd % params_string 129 | process = subprocess.Popen((cmd % params_string).split(), stdout=subprocess.PIPE) 130 | output = process.communicate()[0] 131 | if self.verbose: 132 | print output 133 | 134 | # Read list of generated models 135 | models = [m for m in os.listdir(self.files_location_output_) if ('%s_class_%s' % (prefix_model, i)) in m] 136 | models.sort() 137 | self.models[i] = models[-1] 138 | 
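# --- Editor's note (illustrative addition, not part of the original rgf.py) ---
# Two remarks on this wrapper class:
#   * `set_params` above calls `self.setattr(parameter, value)`; plain Python
#     objects have no such method, so the conventional spelling would be
#     `setattr(self, parameter, value)`.
#   * `_predict` below turns the nine one-vs-rest RGF margins into class
#     probabilities by applying a sigmoid per binary model and renormalising
#     each row to sum to one. A minimal, self-contained sketch of that
#     combination step (dummy margins and shapes are hypothetical):

import numpy as np

margins = np.random.randn(5, 9)             # 5 samples, 9 one-vs-rest scores
probs = 1.0 / (1.0 + np.exp(-margins))      # sigmoid of each binary margin
probs /= probs.sum(axis=1, keepdims=True)   # rows now sum to 1 (class probabilities)
# --- end of editor's note ------------------------------------------------------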
139 | def _predict(self, x_test): 140 | prefix_test, prefix_preds = 'test', 'preds' 141 | cmd = self.rgf_path + ' predict %s' 142 | 143 | if not os.path.exists(self.files_location_output_): 144 | os.makedirs(self.files_location_output_) 145 | 146 | # Write files in RGF format 147 | paths = dict() 148 | paths[prefix_test] = self.write_into_files(prefix_test, x_test) 149 | 150 | all_predictions = [] 151 | 152 | for i in range(self.n_classes_): 153 | # Make predictions and collect it 154 | preds_file = os.path.join(self.files_location_output_, '%s_class_%s' % (prefix_preds, i)) 155 | 156 | params_string = self.get_params_string(test_x_fn=paths[prefix_test]['x'], 157 | model_fn=os.path.join(self.files_location_output_, 158 | self.models[i]), 159 | prediction_fn=preds_file 160 | ) 161 | print 'Running', cmd % params_string 162 | process = subprocess.Popen((cmd % params_string).split(), stdout=subprocess.PIPE) 163 | output = process.communicate()[0] 164 | if self.verbose: 165 | print output 166 | 167 | # Read generated predictions 168 | preds = np.loadtxt(preds_file) 169 | preds = 1. / (1. + np.exp(-preds)) # For Log, Expo 170 | 171 | all_predictions.append(preds) 172 | 173 | # Join all predictions 174 | all_predictions = np.array(all_predictions).T 175 | all_predictions /= np.sum(all_predictions, axis=1)[:, None] 176 | 177 | return all_predictions 178 | 179 | 180 | if __name__ == '__main__': 181 | train, labels, test, _, _ = utils.load_data() 182 | 183 | # Preprocess data - transform counts to TFIDF features 184 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 185 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 186 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 187 | 188 | clf = RGF(verbose=False, random_state=23) 189 | 190 | if MODE == 'cv': 191 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 192 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 193 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 194 | elif MODE == 'submission': 195 | clf.fit(train, labels) 196 | predictions = clf.predict_proba(test) 197 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 198 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 199 | predictions) 200 | elif MODE == 'holdout': 201 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 202 | print 'Log loss:', score 203 | else: 204 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_05_bagging_nn_rmsprop/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_05_bagging_nn_rmsprop/bagging_nn_rmsprop.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.461596760113 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import rmsprop 18 | 19 | from sklearn import feature_extraction, ensemble 20 | from sklearn.base import 
BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_05_bagging_nn_rmsprop' 28 | MODE = 'cv' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, rho=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.rho = rho 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 
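# Editor's note: the loop below consumes `self.train(...)`, a generator that
# yields one dict of statistics per epoch. Inside `train` (further down) the
# best snapshot of `output_layer` is kept via `copy.deepcopy` -- when
# `use_valid` is set, the snapshot with the lowest validation loss; otherwise
# the model from the final epoch. Catching KeyboardInterrupt means a run can be
# stopped by hand while still keeping the last saved snapshot.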
92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_out = DenseLayer(l_hidden2_dropout, num_units=self.n_classes_, nonlinearity=softmax) 143 | 144 | return l_out 145 | 146 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 147 | batch_index = T.iscalar('batch_index') 148 | X_batch = X_tensor_type('x') 149 | y_batch = T.ivector('y') 150 | 151 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 152 | 153 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 154 | 155 | loss_train = objective.get_loss(X_batch, target=y_batch) 156 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 157 | 158 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 159 | proba = output_layer.get_output(X_batch, deterministic=True) 160 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 161 | 162 | all_params = get_all_params(output_layer) 163 | updates = rmsprop(loss_train, all_params, self.lr, self.rho) 164 | 165 | iter_train = theano.function( 166 | [batch_index], loss_train, 167 | updates=updates, 168 | givens={ 169 | X_batch: dataset['X_train'][batch_slice], 170 | y_batch: dataset['y_train'][batch_slice], 171 | }, 172 | on_unused_input='ignore', 173 | ) 174 | 175 | iter_valid = None 176 | if self.use_valid: 177 | iter_valid = theano.function( 178 | [batch_index], [loss_eval, accuracy, proba], 179 | givens={ 180 | X_batch: dataset['X_valid'][batch_slice], 181 | y_batch: dataset['y_valid'][batch_slice], 182 | }, 183 | ) 184 | 185 | return dict(train=iter_train, valid=iter_valid) 186 | 187 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 188 | batch_index = T.iscalar('batch_index') 189 | X_batch = X_tensor_type('x') 190 | 191 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 192 | 193 | pred = 
T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 194 | proba = output_layer.get_output(X_batch, deterministic=True) 195 | 196 | iter_test = theano.function( 197 | [batch_index], [pred, proba], 198 | givens={ 199 | X_batch: dataset['X_test'][batch_slice], 200 | }, 201 | ) 202 | 203 | return dict(test=iter_test) 204 | 205 | def train(self, iter_funcs, dataset, output_layer): 206 | num_batches_train = dataset['num_examples_train'] // self.batch_size 207 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 208 | 209 | best_val_err = 100 210 | best_val_iter = -1 211 | 212 | for epoch in itertools.count(1): 213 | batch_train_losses = [] 214 | for b in range(num_batches_train): 215 | batch_train_loss = iter_funcs['train'](b) 216 | batch_train_losses.append(batch_train_loss) 217 | 218 | avg_train_loss = np.mean(batch_train_losses) 219 | 220 | batch_valid_losses = [] 221 | batch_valid_accuracies = [] 222 | batch_valid_probas = [] 223 | 224 | if self.use_valid: 225 | for b in range(num_batches_valid): 226 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 227 | batch_valid_losses.append(batch_valid_loss) 228 | batch_valid_accuracies.append(batch_valid_accuracy) 229 | batch_valid_probas.append(batch_valid_proba) 230 | 231 | avg_valid_loss = np.mean(batch_valid_losses) 232 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 233 | 234 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 235 | (epoch == self.max_epochs and not self.use_valid): 236 | best_val_err = avg_valid_loss 237 | best_val_iter = epoch 238 | # Save model 239 | self.score_ = best_val_err 240 | self.model = copy.deepcopy(output_layer) 241 | 242 | 243 | yield { 244 | 'number': epoch, 245 | 'train_loss': avg_train_loss, 246 | 'valid_loss': avg_valid_loss, 247 | 'valid_accuracy': avg_valid_accuracy, 248 | 'best_val_error': best_val_err, 249 | 'best_val_iter': best_val_iter, 250 | } 251 | 252 | def make_predictions(self, data): 253 | dataset = dict( 254 | X_test=theano.shared(lasagne.utils.floatX(data)), 255 | num_examples_test=data.shape[0], 256 | input_dim=data.shape[1], 257 | output_dim=self.n_classes_, 258 | ) 259 | 260 | iter_funcs = self.create_test_function(dataset, self.model) 261 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 262 | 263 | test_preds, test_probas = np.array([]), None 264 | 265 | for b in range(num_batches_test): 266 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 267 | test_preds = np.append(test_preds, batch_test_pred) 268 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 269 | 270 | return test_preds, test_probas 271 | 272 | 273 | if __name__ == '__main__': 274 | train, labels, test, _, _ = utils.load_data() 275 | 276 | # Preprocess data - transform counts to TFIDF features 277 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 278 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 279 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 280 | 281 | clf_nn = NeuralNetwork(600, 110, 200, 0.00012503331803251808, 0.9864830676545417, 0.3245683842495481, 282 | .05, True, 10, random_state=23) 283 | 284 | clf = ensemble.BaggingClassifier(base_estimator=clf_nn, n_estimators=10, 285 | max_samples=1., max_features=1., 286 | random_state=23) 287 | 288 | if MODE == 'cv': 289 | scores, predictions = utils.make_blender_cv(clf, train, labels, 
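# Editor's note: judging from its use here, `utils.make_blender_cv` appears to
# run a cross-validation loop that returns per-fold log losses plus
# out-of-fold probability predictions; `write_blender_data` then stores those
# predictions under BLEND_PATH so the second-stage blender
# (otto_utils/blender.py) can reuse them. The helper itself lives in
# otto_utils/utils.py, which is not shown in this excerpt.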
calibrate=False) 290 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 291 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 292 | elif MODE == 'submission': 293 | clf.fit(train, labels) 294 | predictions = clf.predict_proba(test) 295 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 296 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 297 | predictions) 298 | elif MODE == 'holdout': 299 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 300 | print 'Log loss:', score 301 | else: 302 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_06_xgboost/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_06_xgboost/xgboost.py: -------------------------------------------------------------------------------- 1 | import graphlab as gl 2 | import numpy as np 3 | import logging 4 | import os 5 | 6 | from hyperopt import fmin, hp, tpe 7 | 8 | from sklearn.base import BaseEstimator 9 | from sklearn import preprocessing 10 | 11 | from otto_utils import consts, utils 12 | 13 | 14 | MODEL_NAME = 'model_06_xgboost' 15 | MODE = 'holdout' # cv|submission|holdout|tune 16 | 17 | logging.disable(logging.INFO) 18 | 19 | 20 | class XGBoost(BaseEstimator): 21 | def __init__(self, max_iterations=50, max_depth=9, min_child_weight=4, row_subsample=.75, 22 | min_loss_reduction=1., column_subsample=.8, step_size=.3, verbose=True): 23 | self.n_classes_ = 9 24 | self.max_iterations = max_iterations 25 | self.max_depth = max_depth 26 | self.min_child_weight = min_child_weight 27 | self.row_subsample = row_subsample 28 | self.min_loss_reduction = min_loss_reduction 29 | self.column_subsample = column_subsample 30 | self.step_size = step_size 31 | self.verbose = verbose 32 | self.model = None 33 | 34 | def fit(self, X, y, sample_weight=None): 35 | sf = self._array_to_sframe(X, y) 36 | self.model = gl.boosted_trees_classifier.create(sf, target='target', 37 | max_iterations=self.max_iterations, 38 | max_depth=self.max_depth, 39 | min_child_weight=self.min_child_weight, 40 | row_subsample=self.row_subsample, 41 | min_loss_reduction=self.min_loss_reduction, 42 | column_subsample=self.column_subsample, 43 | step_size=self.step_size, 44 | verbose=self.verbose) 45 | 46 | return self 47 | 48 | def predict(self, X): 49 | preds = self.predict_proba(X) 50 | return np.argmax(preds, axis=1) 51 | 52 | def predict_proba(self, X): 53 | sf = self._array_to_sframe(X) 54 | preds = self.model.predict_topk(sf, output_type='probability', k=self.n_classes_) 55 | 56 | return self._preds_to_array(preds) 57 | 58 | # Private methods 59 | def _array_to_sframe(self, data, targets=None): 60 | d = dict() 61 | for i in xrange(data.shape[1]): 62 | d['feat_%d' % (i + 1)] = gl.SArray(data[:, i]) 63 | if targets is not None: 64 | d['target'] = gl.SArray(targets) 65 | 66 | return gl.SFrame(d) 67 | 68 | def _preds_to_array(self, preds): 69 | p = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') 70 | p['id'] = p['id'].astype(int) + 1 71 | p = p.sort('id') 72 | del p['id'] 73 | preds_array = np.array(p.to_dataframe(), dtype=float) 74 | 75 | return preds_array 76 | 77 | 78 | if __name__ == '__main__': 79 | train, labels, test, _, _ = utils.load_data() 80 | 81 | clf = XGBoost(max_iterations=4800, max_depth=12, 
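# Editor's note: these oddly precise values (min_child_weight, row_subsample,
# min_loss_reduction, column_subsample) match the search space of the hyperopt
# TPE tuning branch further down in this file (MODE == 'tune'), so they were
# presumably copied from its best solution; the small step_size of 0.009 is
# paired with a correspondingly large max_iterations of 4800.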
min_child_weight=4.9208250938262745, 82 | row_subsample=.9134478530382129, min_loss_reduction=.5132278416508804, 83 | column_subsample=.730128689911957, step_size=.009) 84 | 85 | if MODE == 'cv': 86 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 87 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 88 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 89 | elif MODE == 'submission': 90 | clf.fit(train, labels) 91 | predictions = clf.predict_proba(test) 92 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 93 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 94 | predictions) 95 | elif MODE == 'holdout': 96 | train, labels, _, _ = utils.stratified_split(train, labels, test_size=.7) 97 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 98 | print 'Log loss:', score 99 | elif MODE == 'tune': 100 | # Objective function 101 | def objective(args): 102 | max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args 103 | clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight, 104 | row_subsample=row_subsample, min_loss_reduction=min_loss_reduction, 105 | column_subsample=column_subsample, verbose=False) 106 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 107 | print 'max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss' 108 | print args, score 109 | return score 110 | # Searching space 111 | space = ( 112 | hp.quniform('max_depth', 2, 14, 1), 113 | hp.uniform('min_child_weight', .5, 10.), 114 | hp.uniform('row_subsample', .3, 1.), 115 | hp.uniform('min_loss_reduction', .1, 3.), 116 | hp.uniform('column_subsample', .1, 1.), 117 | ) 118 | 119 | best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=500) 120 | print 'Best solution:', best_sln 121 | else: 122 | print 'Unknown mode' 123 | -------------------------------------------------------------------------------- /otto/model/model_07_bagging_nn_nesterov/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_07_bagging_nn_nesterov/bagging_nn_nesterov.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.461952014711 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | 18 | from sklearn import decomposition, ensemble 19 | from sklearn.base import BaseEstimator 20 | from sklearn.cross_validation import StratifiedShuffleSplit 21 | from sklearn.utils import check_random_state 22 | 23 | from otto_utils import consts, utils 24 | 25 | 26 | MODEL_NAME = 'model_07_bagging_nn_nesterov' 27 | MODE = 'submission' # cv|submission|holdout|tune 28 | 29 | 30 | class NeuralNetwork(BaseEstimator): 31 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 32 | lr=0.01, momentum=0.9, dropout=0.5, valid_ratio=0.0, 33 | use_valid=False, verbose=0, random_state=None): 34 | self.n_hidden = n_hidden 35 | self.max_epochs = max_epochs 36 | self.batch_size = 
batch_size 37 | self.lr = lr 38 | self.momentum = momentum 39 | self.dropout = dropout 40 | self.valid_ratio = valid_ratio 41 | self.use_valid = use_valid 42 | self.verbose = verbose 43 | self.random_state = random_state 44 | # State 45 | self.score_ = None 46 | self.classes_ = None 47 | self.n_classes_ = None 48 | self.model = None 49 | 50 | def fit(self, data, targets, sample_weight=None): 51 | self.classes_, indices = np.unique(targets, return_inverse=True) 52 | self.n_classes_ = self.classes_.shape[0] 53 | 54 | random_state = check_random_state(self.random_state) 55 | 56 | # Shuffle data and eventually split on train and validation sets 57 | if self.valid_ratio > 0: 58 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 59 | n_iter=1, random_state=self.random_state) 60 | train_index, valid_index = [s for s in strat_shuffled_split][0] 61 | X_train, y_train = data[train_index], targets[train_index] 62 | X_valid, y_valid = data[valid_index], targets[valid_index] 63 | else: 64 | X_train, y_train = data, targets 65 | X_valid, y_valid = np.array([]), np.array([]) 66 | 67 | if self.verbose > 5: 68 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 69 | if self.use_valid: 70 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 71 | 72 | # Prepare theano variables 73 | dataset = dict( 74 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 75 | y_train=T.cast(theano.shared(y_train), 'int32'), 76 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 77 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 78 | num_examples_train=X_train.shape[0], 79 | num_examples_valid=X_valid.shape[0], 80 | input_dim=X_train.shape[1], 81 | output_dim=self.n_classes_, 82 | ) 83 | 84 | if self.verbose > 0: 85 | print "Building model and compiling functions..." 86 | output_layer = self.build_model(dataset['input_dim']) 87 | iter_funcs = self.create_iter_functions(dataset, output_layer) 88 | 89 | if self.verbose > 0: 90 | print "Starting training..." 
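# Editor's note: apart from the update rule, this class mirrors the network in
# model_05_bagging_nn_rmsprop almost line for line -- the differences are the
# nesterov_momentum updates used in create_iter_functions below, the PCA
# preprocessing applied in __main__ instead of appended TF-IDF features, and a
# different bagging seed (29 instead of 23).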
91 | now = time.time() 92 | results = [] 93 | try: 94 | for epoch in self.train(iter_funcs, dataset, output_layer): 95 | if self.verbose > 1: 96 | print "Epoch {} of {} took {:.3f}s".format( 97 | epoch['number'], self.max_epochs, time.time() - now) 98 | now = time.time() 99 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 100 | if self.verbose > 1: 101 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 102 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 103 | print " validation accuracy:\t\t{:.2f} %%".format( 104 | epoch['valid_accuracy'] * 100) 105 | 106 | if epoch['number'] >= self.max_epochs: 107 | break 108 | 109 | if self.verbose > 0: 110 | print 'Minimum validation error: %f (epoch %d)' % \ 111 | (epoch['best_val_error'], epoch['best_val_iter']) 112 | 113 | except KeyboardInterrupt: 114 | pass 115 | 116 | return self 117 | 118 | def predict(self, data): 119 | preds, _ = self.make_predictions(data) 120 | 121 | return preds 122 | 123 | def predict_proba(self, data): 124 | _, proba = self.make_predictions(data) 125 | 126 | return proba 127 | 128 | def score(self): 129 | return self.score_ 130 | 131 | # Private methods 132 | def build_model(self, input_dim): 133 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 134 | 135 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 136 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 137 | 138 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 139 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 140 | 141 | l_out = DenseLayer(l_hidden2_dropout, num_units=self.n_classes_, nonlinearity=softmax) 142 | 143 | return l_out 144 | 145 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 146 | batch_index = T.iscalar('batch_index') 147 | X_batch = X_tensor_type('x') 148 | y_batch = T.ivector('y') 149 | 150 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 151 | 152 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 153 | 154 | loss_train = objective.get_loss(X_batch, target=y_batch) 155 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 156 | 157 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 158 | proba = output_layer.get_output(X_batch, deterministic=True) 159 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 160 | 161 | all_params = get_all_params(output_layer) 162 | updates = lasagne.updates.nesterov_momentum(loss_train, all_params, self.lr, self.momentum) 163 | 164 | iter_train = theano.function( 165 | [batch_index], loss_train, 166 | updates=updates, 167 | givens={ 168 | X_batch: dataset['X_train'][batch_slice], 169 | y_batch: dataset['y_train'][batch_slice], 170 | }, 171 | on_unused_input='ignore', 172 | ) 173 | 174 | iter_valid = None 175 | if self.use_valid: 176 | iter_valid = theano.function( 177 | [batch_index], [loss_eval, accuracy, proba], 178 | givens={ 179 | X_batch: dataset['X_valid'][batch_slice], 180 | y_batch: dataset['y_valid'][batch_slice], 181 | }, 182 | ) 183 | 184 | return dict(train=iter_train, valid=iter_valid) 185 | 186 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 187 | batch_index = T.iscalar('batch_index') 188 | X_batch = X_tensor_type('x') 189 | 190 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 191 | 192 | pred = 
T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 193 | proba = output_layer.get_output(X_batch, deterministic=True) 194 | 195 | iter_test = theano.function( 196 | [batch_index], [pred, proba], 197 | givens={ 198 | X_batch: dataset['X_test'][batch_slice], 199 | }, 200 | ) 201 | 202 | return dict(test=iter_test) 203 | 204 | def train(self, iter_funcs, dataset, output_layer): 205 | num_batches_train = dataset['num_examples_train'] // self.batch_size 206 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 207 | 208 | best_val_err = 100 209 | best_val_iter = -1 210 | 211 | for epoch in itertools.count(1): 212 | batch_train_losses = [] 213 | for b in range(num_batches_train): 214 | batch_train_loss = iter_funcs['train'](b) 215 | batch_train_losses.append(batch_train_loss) 216 | 217 | avg_train_loss = np.mean(batch_train_losses) 218 | 219 | batch_valid_losses = [] 220 | batch_valid_accuracies = [] 221 | batch_valid_probas = [] 222 | 223 | if self.use_valid: 224 | for b in range(num_batches_valid): 225 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 226 | batch_valid_losses.append(batch_valid_loss) 227 | batch_valid_accuracies.append(batch_valid_accuracy) 228 | batch_valid_probas.append(batch_valid_proba) 229 | 230 | avg_valid_loss = np.mean(batch_valid_losses) 231 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 232 | 233 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 234 | (epoch == self.max_epochs and not self.use_valid): 235 | best_val_err = avg_valid_loss 236 | best_val_iter = epoch 237 | # Save model 238 | self.score_ = best_val_err 239 | self.model = copy.deepcopy(output_layer) 240 | 241 | 242 | yield { 243 | 'number': epoch, 244 | 'train_loss': avg_train_loss, 245 | 'valid_loss': avg_valid_loss, 246 | 'valid_accuracy': avg_valid_accuracy, 247 | 'best_val_error': best_val_err, 248 | 'best_val_iter': best_val_iter, 249 | } 250 | 251 | def make_predictions(self, data): 252 | dataset = dict( 253 | X_test=theano.shared(lasagne.utils.floatX(data)), 254 | num_examples_test=data.shape[0], 255 | input_dim=data.shape[1], 256 | output_dim=self.n_classes_, 257 | ) 258 | 259 | iter_funcs = self.create_test_function(dataset, self.model) 260 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 261 | 262 | test_preds, test_probas = np.array([]), None 263 | 264 | for b in range(num_batches_test): 265 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 266 | test_preds = np.append(test_preds, batch_test_pred) 267 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 268 | 269 | return test_preds, test_probas 270 | 271 | 272 | if __name__ == '__main__': 273 | train, labels, test, _, _ = utils.load_data() 274 | 275 | # PCA 276 | pp = decomposition.PCA() 277 | train = pp.fit_transform(train) 278 | test = pp.transform(test) 279 | 280 | clf_nn = NeuralNetwork(750, 110, 116, 0.0012503331803251808, 0.9544425038759606, 0.3992570325984604, 281 | .05, True, 10, random_state=23) 282 | 283 | clf = ensemble.BaggingClassifier(base_estimator=clf_nn, n_estimators=10, 284 | max_samples=1., max_features=1., 285 | random_state=29) 286 | 287 | if MODE == 'cv': 288 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 289 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 290 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 291 | elif MODE == 
'submission': 292 | clf.fit(train, labels) 293 | predictions = clf.predict_proba(test) 294 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 295 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 296 | predictions) 297 | elif MODE == 'holdout': 298 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 299 | print 'Log loss:', score 300 | else: 301 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_08_random_forest_calibrated/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_08_random_forest_calibrated/random_forest_calibrated.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold CV - log loss 0.4687 3 | """ 4 | import numpy as np 5 | import os 6 | 7 | from sklearn import ensemble, feature_extraction, preprocessing 8 | from sklearn.calibration import CalibratedClassifierCV 9 | 10 | from otto_utils import consts, utils 11 | 12 | 13 | MODEL_NAME = 'model_08_random_forest_calibrated' 14 | MODE = 'holdout' # cv|submission|holdout 15 | 16 | # import data 17 | train, labels, test, _, _ = utils.load_data() 18 | 19 | # transform counts to TFIDF features 20 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 21 | train = tfidf.fit_transform(train).toarray() 22 | test = tfidf.transform(test).toarray() 23 | 24 | # encode labels 25 | lbl_enc = preprocessing.LabelEncoder() 26 | labels = lbl_enc.fit_transform(labels) 27 | 28 | # train classifier 29 | clf = ensemble.ExtraTreesClassifier(n_jobs=5, n_estimators=600, max_features=20, min_samples_split=3, 30 | bootstrap=False, verbose=3, random_state=23) 31 | 32 | if MODE == 'cv': 33 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True) 34 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 35 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 36 | elif MODE == 'submission': 37 | calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels)) 38 | fitted_classifier = calibrated_classifier.fit(train, labels) 39 | predictions = fitted_classifier.predict_proba(test) 40 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 41 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 42 | predictions) 43 | elif MODE == 'holdout': 44 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=True) 45 | print 'Log loss:', score 46 | else: 47 | print 'Unknown mode' 48 | -------------------------------------------------------------------------------- /otto/model/model_09_nn_adagrad/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_09_nn_adagrad/nn_adagrad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.480065955962 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import 
categorical_crossentropy, Objective 17 | from lasagne.updates import adagrad 18 | 19 | from sklearn import feature_extraction 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_09_nn_adagrad' 28 | MODE = 'cv' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, epsilon=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.epsilon = epsilon 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 
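# Editor's note: this model swaps the optimiser for AdaGrad (see
# create_iter_functions below) and uses three hidden layers in build_model
# rather than the two used by models 05 and 07. AdaGrad keeps a running sum of
# squared gradients and scales each step by it -- in Lasagne's implementation
# roughly: accu += grad**2; param -= lr * grad / sqrt(accu + epsilon) -- which
# is why the constructor exposes an `epsilon` argument here instead of
# `rho`/`momentum`.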
92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden, nonlinearity=rectify) 143 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 144 | 145 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 146 | 147 | return l_out 148 | 149 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 150 | batch_index = T.iscalar('batch_index') 151 | X_batch = X_tensor_type('x') 152 | y_batch = T.ivector('y') 153 | 154 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 155 | 156 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 157 | 158 | loss_train = objective.get_loss(X_batch, target=y_batch) 159 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 160 | 161 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 162 | proba = output_layer.get_output(X_batch, deterministic=True) 163 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 164 | 165 | all_params = get_all_params(output_layer) 166 | updates = adagrad(loss_train, all_params, self.lr, self.epsilon) 167 | 168 | iter_train = theano.function( 169 | [batch_index], loss_train, 170 | updates=updates, 171 | givens={ 172 | X_batch: dataset['X_train'][batch_slice], 173 | y_batch: dataset['y_train'][batch_slice], 174 | }, 175 | on_unused_input='ignore', 176 | ) 177 | 178 | iter_valid = None 179 | if self.use_valid: 180 | iter_valid = theano.function( 181 | [batch_index], [loss_eval, accuracy, proba], 182 | givens={ 183 | X_batch: dataset['X_valid'][batch_slice], 184 | y_batch: dataset['y_valid'][batch_slice], 185 | }, 186 | ) 187 | 188 | return dict(train=iter_train, valid=iter_valid) 189 | 190 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 191 | batch_index = T.iscalar('batch_index') 192 | X_batch = 
X_tensor_type('x') 193 | 194 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 195 | 196 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 197 | proba = output_layer.get_output(X_batch, deterministic=True) 198 | 199 | iter_test = theano.function( 200 | [batch_index], [pred, proba], 201 | givens={ 202 | X_batch: dataset['X_test'][batch_slice], 203 | }, 204 | ) 205 | 206 | return dict(test=iter_test) 207 | 208 | def train(self, iter_funcs, dataset, output_layer): 209 | num_batches_train = dataset['num_examples_train'] // self.batch_size 210 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 211 | 212 | best_val_err = 100 213 | best_val_iter = -1 214 | 215 | for epoch in itertools.count(1): 216 | batch_train_losses = [] 217 | for b in range(num_batches_train): 218 | batch_train_loss = iter_funcs['train'](b) 219 | batch_train_losses.append(batch_train_loss) 220 | 221 | avg_train_loss = np.mean(batch_train_losses) 222 | 223 | batch_valid_losses = [] 224 | batch_valid_accuracies = [] 225 | batch_valid_probas = [] 226 | 227 | if self.use_valid: 228 | for b in range(num_batches_valid): 229 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 230 | batch_valid_losses.append(batch_valid_loss) 231 | batch_valid_accuracies.append(batch_valid_accuracy) 232 | batch_valid_probas.append(batch_valid_proba) 233 | 234 | avg_valid_loss = np.mean(batch_valid_losses) 235 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 236 | 237 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 238 | (epoch == self.max_epochs and not self.use_valid): 239 | best_val_err = avg_valid_loss 240 | best_val_iter = epoch 241 | # Save model 242 | self.score_ = best_val_err 243 | self.model = copy.deepcopy(output_layer) 244 | 245 | 246 | yield { 247 | 'number': epoch, 248 | 'train_loss': avg_train_loss, 249 | 'valid_loss': avg_valid_loss, 250 | 'valid_accuracy': avg_valid_accuracy, 251 | 'best_val_error': best_val_err, 252 | 'best_val_iter': best_val_iter, 253 | } 254 | 255 | def make_predictions(self, data): 256 | dataset = dict( 257 | X_test=theano.shared(lasagne.utils.floatX(data)), 258 | num_examples_test=data.shape[0], 259 | input_dim=data.shape[1], 260 | output_dim=self.n_classes_, 261 | ) 262 | 263 | iter_funcs = self.create_test_function(dataset, self.model) 264 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 265 | 266 | test_preds, test_probas = np.array([]), None 267 | 268 | for b in range(num_batches_test): 269 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 270 | test_preds = np.append(test_preds, batch_test_pred) 271 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 272 | 273 | return test_preds, test_probas 274 | 275 | 276 | if __name__ == '__main__': 277 | train, labels, test, _, _ = utils.load_data() 278 | 279 | # Preprocess data - transform counts to TFIDF features 280 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 281 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 282 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 283 | 284 | clf = NeuralNetwork(512, 110, 128, 0.004438538808932511, 1.6674644616533133e-14, 0.2137591043893735, 285 | .02, True, 10, random_state=23) 286 | 287 | if MODE == 'cv': 288 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 289 
| print 'CV:', scores, 'Mean log loss:', np.mean(scores) 290 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 291 | elif MODE == 'submission': 292 | clf.fit(train, labels) 293 | predictions = clf.predict_proba(test) 294 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 295 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 296 | predictions) 297 | elif MODE == 'holdout': 298 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 299 | print 'Log loss:', score 300 | else: 301 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_10_nn_adagrad_pca/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_10_nn_adagrad_pca/nn_adagrad_pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.476382621152 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import adagrad 18 | 19 | from sklearn import decomposition, feature_extraction 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_10_nn_adagrad_pca' 28 | MODE = 'holdout' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, epsilon=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.epsilon = epsilon 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano 
variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden, nonlinearity=rectify) 143 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 144 | 145 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 146 | 147 | return l_out 148 | 149 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 150 | batch_index = T.iscalar('batch_index') 151 | X_batch = X_tensor_type('x') 152 | y_batch = T.ivector('y') 153 | 154 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 155 | 156 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 157 | 158 | loss_train = objective.get_loss(X_batch, target=y_batch) 159 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 160 | 161 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 162 | proba = output_layer.get_output(X_batch, deterministic=True) 163 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 164 | 165 | all_params = get_all_params(output_layer) 166 | updates = adagrad(loss_train, all_params, self.lr, self.epsilon) 167 | 168 | iter_train = theano.function( 169 | 
[batch_index], loss_train, 170 | updates=updates, 171 | givens={ 172 | X_batch: dataset['X_train'][batch_slice], 173 | y_batch: dataset['y_train'][batch_slice], 174 | }, 175 | on_unused_input='ignore', 176 | ) 177 | 178 | iter_valid = None 179 | if self.use_valid: 180 | iter_valid = theano.function( 181 | [batch_index], [loss_eval, accuracy, proba], 182 | givens={ 183 | X_batch: dataset['X_valid'][batch_slice], 184 | y_batch: dataset['y_valid'][batch_slice], 185 | }, 186 | ) 187 | 188 | return dict(train=iter_train, valid=iter_valid) 189 | 190 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 191 | batch_index = T.iscalar('batch_index') 192 | X_batch = X_tensor_type('x') 193 | 194 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 195 | 196 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 197 | proba = output_layer.get_output(X_batch, deterministic=True) 198 | 199 | iter_test = theano.function( 200 | [batch_index], [pred, proba], 201 | givens={ 202 | X_batch: dataset['X_test'][batch_slice], 203 | }, 204 | ) 205 | 206 | return dict(test=iter_test) 207 | 208 | def train(self, iter_funcs, dataset, output_layer): 209 | num_batches_train = dataset['num_examples_train'] // self.batch_size 210 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 211 | 212 | best_val_err = 100 213 | best_val_iter = -1 214 | 215 | for epoch in itertools.count(1): 216 | batch_train_losses = [] 217 | for b in range(num_batches_train): 218 | batch_train_loss = iter_funcs['train'](b) 219 | batch_train_losses.append(batch_train_loss) 220 | 221 | avg_train_loss = np.mean(batch_train_losses) 222 | 223 | batch_valid_losses = [] 224 | batch_valid_accuracies = [] 225 | batch_valid_probas = [] 226 | 227 | if self.use_valid: 228 | for b in range(num_batches_valid): 229 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 230 | batch_valid_losses.append(batch_valid_loss) 231 | batch_valid_accuracies.append(batch_valid_accuracy) 232 | batch_valid_probas.append(batch_valid_proba) 233 | 234 | avg_valid_loss = np.mean(batch_valid_losses) 235 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 236 | 237 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 238 | (epoch == self.max_epochs and not self.use_valid): 239 | best_val_err = avg_valid_loss 240 | best_val_iter = epoch 241 | # Save model 242 | self.score_ = best_val_err 243 | self.model = copy.deepcopy(output_layer) 244 | 245 | 246 | yield { 247 | 'number': epoch, 248 | 'train_loss': avg_train_loss, 249 | 'valid_loss': avg_valid_loss, 250 | 'valid_accuracy': avg_valid_accuracy, 251 | 'best_val_error': best_val_err, 252 | 'best_val_iter': best_val_iter, 253 | } 254 | 255 | def make_predictions(self, data): 256 | dataset = dict( 257 | X_test=theano.shared(lasagne.utils.floatX(data)), 258 | num_examples_test=data.shape[0], 259 | input_dim=data.shape[1], 260 | output_dim=self.n_classes_, 261 | ) 262 | 263 | iter_funcs = self.create_test_function(dataset, self.model) 264 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 265 | 266 | test_preds, test_probas = np.array([]), None 267 | 268 | for b in range(num_batches_test): 269 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 270 | test_preds = np.append(test_preds, batch_test_pred) 271 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 272 | 
273 | return test_preds, test_probas 274 | 275 | 276 | if __name__ == '__main__': 277 | train, labels, test, _, _ = utils.load_data() 278 | 279 | # Preprocess data - transform counts to TFIDF features 280 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 281 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 282 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 283 | 284 | # PCA 285 | pp = decomposition.PCA() 286 | train = pp.fit_transform(train) 287 | test = pp.transform(test) 288 | 289 | clf = NeuralNetwork(512, 110, 128, 0.004438538808932511, 1.6674644616533133e-14, 0.2137591043893735, 290 | .02, True, 10, random_state=23) 291 | 292 | if MODE == 'cv': 293 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 294 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 295 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 296 | elif MODE == 'submission': 297 | clf.fit(train, labels) 298 | predictions = clf.predict_proba(test) 299 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 300 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 301 | predictions) 302 | elif MODE == 'holdout': 303 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 304 | print 'Log loss:', score 305 | else: 306 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_11_xgboost_poly/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_11_xgboost_poly/xgboost_poly.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold cv - log loss 0.468809065953 3 | """ 4 | import graphlab as gl 5 | import numpy as np 6 | import logging 7 | import os 8 | 9 | from hyperopt import fmin, hp, tpe 10 | 11 | from sklearn.base import BaseEstimator 12 | from sklearn.svm import LinearSVC 13 | from sklearn import preprocessing 14 | 15 | from otto_utils import consts, utils 16 | 17 | 18 | MODEL_NAME = 'model_11_xgboost_poly' 19 | MODE = 'cv' # cv|submission|holdout|tune 20 | 21 | logging.disable(logging.INFO) 22 | 23 | 24 | class XGBoost(BaseEstimator): 25 | def __init__(self, max_iterations=50, max_depth=9, min_child_weight=4, row_subsample=.75, 26 | min_loss_reduction=1., column_subsample=.8, step_size=.3, verbose=True): 27 | self.n_classes_ = 9 28 | self.max_iterations = max_iterations 29 | self.max_depth = max_depth 30 | self.min_child_weight = min_child_weight 31 | self.row_subsample = row_subsample 32 | self.min_loss_reduction = min_loss_reduction 33 | self.column_subsample = column_subsample 34 | self.step_size = step_size 35 | self.verbose = verbose 36 | self.model = None 37 | 38 | def fit(self, X, y, sample_weight=None): 39 | sf = self._array_to_sframe(X, y) 40 | self.model = gl.boosted_trees_classifier.create(sf, target='target', 41 | max_iterations=self.max_iterations, 42 | max_depth=self.max_depth, 43 | min_child_weight=self.min_child_weight, 44 | row_subsample=self.row_subsample, 45 | min_loss_reduction=self.min_loss_reduction, 46 | column_subsample=self.column_subsample, 47 | step_size=self.step_size, 48 | validation_set=None, 49 | verbose=self.verbose) 50 | 51 | return self 52 | 53 | def predict(self, X): 54 | preds = self.predict_proba(X) 55 | return np.argmax(preds, axis=1) 56 | 57 | def 
predict_proba(self, X): 58 | sf = self._array_to_sframe(X) 59 | preds = self.model.predict_topk(sf, output_type='probability', k=self.n_classes_) 60 | 61 | return self._preds_to_array(preds) 62 | 63 | # Private methods 64 | def _array_to_sframe(self, data, targets=None): 65 | d = dict() 66 | for i in xrange(data.shape[1]): 67 | d['feat_%d' % (i + 1)] = gl.SArray(data[:, i]) 68 | if targets is not None: 69 | d['target'] = gl.SArray(targets) 70 | 71 | return gl.SFrame(d) 72 | 73 | def _preds_to_array(self, preds): 74 | p = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') 75 | p['id'] = p['id'].astype(int) + 1 76 | p = p.sort('id') 77 | del p['id'] 78 | preds_array = np.array(p.to_dataframe(), dtype=float) 79 | 80 | return preds_array 81 | 82 | 83 | if __name__ == '__main__': 84 | train, labels, test, _, _ = utils.load_data() 85 | 86 | # polynomial features 87 | poly_feat = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True) 88 | train = poly_feat.fit_transform(train, labels) 89 | test = poly_feat.transform(test) 90 | 91 | print train.shape 92 | 93 | # feature selection 94 | feat_selector = LinearSVC(C=0.0001, penalty='l1', dual=False) 95 | train = feat_selector.fit_transform(train, labels) 96 | test = feat_selector.transform(test) 97 | 98 | print train.shape 99 | 100 | clf = XGBoost(max_iterations=4800, max_depth=12, min_child_weight=4.9208250938262745, row_subsample=.9134478530382129, 101 | min_loss_reduction=.5132278416508804, column_subsample=.730128689911957, step_size=.009) 102 | 103 | 104 | if MODE == 'cv': 105 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 106 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 107 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 108 | elif MODE == 'submission': 109 | clf.fit(train, labels) 110 | predictions = clf.predict_proba(test) 111 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 112 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 113 | predictions) 114 | elif MODE == 'holdout': 115 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 116 | print 'Log loss:', score 117 | elif MODE == 'tune': 118 | # Objective function 119 | def objective(args): 120 | max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args 121 | clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight, 122 | row_subsample=row_subsample, min_loss_reduction=min_loss_reduction, 123 | column_subsample=column_subsample, verbose=False) 124 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 125 | print 'max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss' 126 | print args, score 127 | return score 128 | # Searching space 129 | space = ( 130 | hp.quniform('max_depth', 2, 14, 1), 131 | hp.uniform('min_child_weight', .5, 10.), 132 | hp.uniform('row_subsample', .3, 1.), 133 | hp.uniform('min_loss_reduction', .1, 3.), 134 | hp.uniform('column_subsample', .1, 1.), 135 | ) 136 | 137 | best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=500) 138 | print 'Best solution:', best_sln 139 | else: 140 | print 'Unknown mode' 141 | -------------------------------------------------------------------------------- /otto/model/model_12_nn_rmsprop_pca/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | 
-------------------------------------------------------------------------------- /otto/model/model_12_nn_rmsprop_pca/nn_rmsprop_pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.476282022208 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import rmsprop 18 | 19 | from sklearn import decomposition 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_12_nn_rmsprop_pca' 28 | MODE = 'submission' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, rho=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.rho = rho 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 
92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden / 2, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden / 4, nonlinearity=rectify) 143 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 144 | 145 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 146 | 147 | return l_out 148 | 149 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 150 | batch_index = T.iscalar('batch_index') 151 | X_batch = X_tensor_type('x') 152 | y_batch = T.ivector('y') 153 | 154 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 155 | 156 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 157 | 158 | loss_train = objective.get_loss(X_batch, target=y_batch) 159 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 160 | 161 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 162 | proba = output_layer.get_output(X_batch, deterministic=True) 163 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 164 | 165 | all_params = get_all_params(output_layer) 166 | updates = rmsprop(loss_train, all_params, self.lr, self.rho) 167 | 168 | iter_train = theano.function( 169 | [batch_index], loss_train, 170 | updates=updates, 171 | givens={ 172 | X_batch: dataset['X_train'][batch_slice], 173 | y_batch: dataset['y_train'][batch_slice], 174 | }, 175 | on_unused_input='ignore', 176 | ) 177 | 178 | iter_valid = None 179 | if self.use_valid: 180 | iter_valid = theano.function( 181 | [batch_index], [loss_eval, accuracy, proba], 182 | givens={ 183 | X_batch: dataset['X_valid'][batch_slice], 184 | y_batch: dataset['y_valid'][batch_slice], 185 | }, 186 | ) 187 | 188 | return dict(train=iter_train, valid=iter_valid) 189 | 190 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 191 | batch_index = T.iscalar('batch_index') 192 | X_batch = 
X_tensor_type('x') 193 | 194 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 195 | 196 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 197 | proba = output_layer.get_output(X_batch, deterministic=True) 198 | 199 | iter_test = theano.function( 200 | [batch_index], [pred, proba], 201 | givens={ 202 | X_batch: dataset['X_test'][batch_slice], 203 | }, 204 | ) 205 | 206 | return dict(test=iter_test) 207 | 208 | def train(self, iter_funcs, dataset, output_layer): 209 | num_batches_train = dataset['num_examples_train'] // self.batch_size 210 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 211 | 212 | best_val_err = 100 213 | best_val_iter = -1 214 | 215 | for epoch in itertools.count(1): 216 | batch_train_losses = [] 217 | for b in range(num_batches_train): 218 | batch_train_loss = iter_funcs['train'](b) 219 | batch_train_losses.append(batch_train_loss) 220 | 221 | avg_train_loss = np.mean(batch_train_losses) 222 | 223 | batch_valid_losses = [] 224 | batch_valid_accuracies = [] 225 | batch_valid_probas = [] 226 | 227 | if self.use_valid: 228 | for b in range(num_batches_valid): 229 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 230 | batch_valid_losses.append(batch_valid_loss) 231 | batch_valid_accuracies.append(batch_valid_accuracy) 232 | batch_valid_probas.append(batch_valid_proba) 233 | 234 | avg_valid_loss = np.mean(batch_valid_losses) 235 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 236 | 237 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 238 | (epoch == self.max_epochs and not self.use_valid): 239 | best_val_err = avg_valid_loss 240 | best_val_iter = epoch 241 | # Save model 242 | self.score_ = best_val_err 243 | self.model = copy.deepcopy(output_layer) 244 | 245 | 246 | yield { 247 | 'number': epoch, 248 | 'train_loss': avg_train_loss, 249 | 'valid_loss': avg_valid_loss, 250 | 'valid_accuracy': avg_valid_accuracy, 251 | 'best_val_error': best_val_err, 252 | 'best_val_iter': best_val_iter, 253 | } 254 | 255 | def make_predictions(self, data): 256 | dataset = dict( 257 | X_test=theano.shared(lasagne.utils.floatX(data)), 258 | num_examples_test=data.shape[0], 259 | input_dim=data.shape[1], 260 | output_dim=self.n_classes_, 261 | ) 262 | 263 | iter_funcs = self.create_test_function(dataset, self.model) 264 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 265 | 266 | test_preds, test_probas = np.array([]), None 267 | 268 | for b in range(num_batches_test): 269 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 270 | test_preds = np.append(test_preds, batch_test_pred) 271 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 272 | 273 | return test_preds, test_probas 274 | 275 | 276 | if __name__ == '__main__': 277 | train, labels, test, _, _ = utils.load_data() 278 | 279 | # PCA 280 | pp = decomposition.PCA() 281 | train = pp.fit_transform(train) 282 | test = pp.transform(test) 283 | 284 | clf = NeuralNetwork(1024, 110, 128, 7.218018732952578e-05, 0.9385973679339745, 0.3848935494155976, 285 | .02, True, 10, random_state=23) 286 | 287 | if MODE == 'cv': 288 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 289 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 290 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 291 | elif MODE == 
'submission': 292 | clf.fit(train, labels) 293 | predictions = clf.predict_proba(test) 294 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 295 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 296 | predictions) 297 | elif MODE == 'holdout': 298 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 299 | print 'Log loss:', score 300 | else: 301 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_13_nn_rmsprop_features/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_13_nn_rmsprop_features/nn_rmsprop_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.485363808092 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import rmsprop 18 | 19 | from sklearn import feature_extraction 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_13_nn_rmsprop_features' 28 | MODE = 'submission' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, rho=0.96, epsilon=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.rho = rho 40 | self.epsilon = epsilon 41 | self.dropout = dropout 42 | self.valid_ratio = valid_ratio 43 | self.use_valid = use_valid 44 | self.verbose = verbose 45 | self.random_state = random_state 46 | # State 47 | self.score_ = None 48 | self.classes_ = None 49 | self.n_classes_ = None 50 | self.model = None 51 | 52 | def fit(self, data, targets, sample_weight=None): 53 | self.classes_, indices = np.unique(targets, return_inverse=True) 54 | self.n_classes_ = self.classes_.shape[0] 55 | 56 | random_state = check_random_state(self.random_state) 57 | 58 | # Shuffle data and eventually split on train and validation sets 59 | if self.valid_ratio > 0: 60 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 61 | n_iter=1, random_state=self.random_state) 62 | train_index, valid_index = [s for s in strat_shuffled_split][0] 63 | X_train, y_train = data[train_index], targets[train_index] 64 | X_valid, y_valid = data[valid_index], targets[valid_index] 65 | else: 66 | X_train, y_train = data, targets 67 | X_valid, y_valid = np.array([]), np.array([]) 68 | 69 | if self.verbose > 5: 70 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 71 | if self.use_valid: 72 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 73 | 74 | # Prepare theano variables 75 | dataset = dict( 76 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 77 | 
y_train=T.cast(theano.shared(y_train), 'int32'), 78 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 79 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 80 | num_examples_train=X_train.shape[0], 81 | num_examples_valid=X_valid.shape[0], 82 | input_dim=X_train.shape[1], 83 | output_dim=self.n_classes_, 84 | ) 85 | 86 | if self.verbose > 0: 87 | print "Building model and compiling functions..." 88 | output_layer = self.build_model(dataset['input_dim']) 89 | iter_funcs = self.create_iter_functions(dataset, output_layer) 90 | 91 | if self.verbose > 0: 92 | print "Starting training..." 93 | now = time.time() 94 | results = [] 95 | try: 96 | for epoch in self.train(iter_funcs, dataset, output_layer): 97 | if self.verbose > 1: 98 | print "Epoch {} of {} took {:.3f}s".format( 99 | epoch['number'], self.max_epochs, time.time() - now) 100 | now = time.time() 101 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 102 | if self.verbose > 1: 103 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 104 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 105 | print " validation accuracy:\t\t{:.2f} %%".format( 106 | epoch['valid_accuracy'] * 100) 107 | 108 | if epoch['number'] >= self.max_epochs: 109 | break 110 | 111 | if self.verbose > 0: 112 | print 'Minimum validation error: %f (epoch %d)' % \ 113 | (epoch['best_val_error'], epoch['best_val_iter']) 114 | 115 | except KeyboardInterrupt: 116 | pass 117 | 118 | return self 119 | 120 | def predict(self, data): 121 | preds, _ = self.make_predictions(data) 122 | 123 | return preds 124 | 125 | def predict_proba(self, data): 126 | _, proba = self.make_predictions(data) 127 | 128 | return proba 129 | 130 | def score(self): 131 | return self.score_ 132 | 133 | # Private methods 134 | def build_model(self, input_dim): 135 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 136 | 137 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden / 2, nonlinearity=rectify) 138 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 139 | 140 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden, nonlinearity=rectify) 141 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 142 | 143 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden / 2, nonlinearity=rectify) 144 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 145 | 146 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 147 | 148 | return l_out 149 | 150 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 151 | batch_index = T.iscalar('batch_index') 152 | X_batch = X_tensor_type('x') 153 | y_batch = T.ivector('y') 154 | 155 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 156 | 157 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 158 | 159 | loss_train = objective.get_loss(X_batch, target=y_batch) 160 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 161 | 162 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 163 | proba = output_layer.get_output(X_batch, deterministic=True) 164 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 165 | 166 | all_params = get_all_params(output_layer) 167 | updates = rmsprop(loss_train, all_params, self.lr, self.rho) 168 | 169 | iter_train = theano.function( 170 | [batch_index], loss_train, 171 | updates=updates, 172 | givens={ 173 | X_batch: 
dataset['X_train'][batch_slice], 174 | y_batch: dataset['y_train'][batch_slice], 175 | }, 176 | on_unused_input='ignore', 177 | ) 178 | 179 | iter_valid = None 180 | if self.use_valid: 181 | iter_valid = theano.function( 182 | [batch_index], [loss_eval, accuracy, proba], 183 | givens={ 184 | X_batch: dataset['X_valid'][batch_slice], 185 | y_batch: dataset['y_valid'][batch_slice], 186 | }, 187 | ) 188 | 189 | return dict(train=iter_train, valid=iter_valid) 190 | 191 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 192 | batch_index = T.iscalar('batch_index') 193 | X_batch = X_tensor_type('x') 194 | 195 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 196 | 197 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 198 | proba = output_layer.get_output(X_batch, deterministic=True) 199 | 200 | iter_test = theano.function( 201 | [batch_index], [pred, proba], 202 | givens={ 203 | X_batch: dataset['X_test'][batch_slice], 204 | }, 205 | ) 206 | 207 | return dict(test=iter_test) 208 | 209 | def train(self, iter_funcs, dataset, output_layer): 210 | num_batches_train = dataset['num_examples_train'] // self.batch_size 211 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 212 | 213 | best_val_err = 100 214 | best_val_iter = -1 215 | 216 | for epoch in itertools.count(1): 217 | batch_train_losses = [] 218 | for b in range(num_batches_train): 219 | batch_train_loss = iter_funcs['train'](b) 220 | batch_train_losses.append(batch_train_loss) 221 | 222 | avg_train_loss = np.mean(batch_train_losses) 223 | 224 | batch_valid_losses = [] 225 | batch_valid_accuracies = [] 226 | batch_valid_probas = [] 227 | 228 | if self.use_valid: 229 | for b in range(num_batches_valid): 230 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 231 | batch_valid_losses.append(batch_valid_loss) 232 | batch_valid_accuracies.append(batch_valid_accuracy) 233 | batch_valid_probas.append(batch_valid_proba) 234 | 235 | avg_valid_loss = np.mean(batch_valid_losses) 236 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 237 | 238 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 239 | (epoch == self.max_epochs and not self.use_valid): 240 | best_val_err = avg_valid_loss 241 | best_val_iter = epoch 242 | # Save model 243 | self.score_ = best_val_err 244 | self.model = copy.deepcopy(output_layer) 245 | 246 | 247 | yield { 248 | 'number': epoch, 249 | 'train_loss': avg_train_loss, 250 | 'valid_loss': avg_valid_loss, 251 | 'valid_accuracy': avg_valid_accuracy, 252 | 'best_val_error': best_val_err, 253 | 'best_val_iter': best_val_iter, 254 | } 255 | 256 | def make_predictions(self, data): 257 | dataset = dict( 258 | X_test=theano.shared(lasagne.utils.floatX(data)), 259 | num_examples_test=data.shape[0], 260 | input_dim=data.shape[1], 261 | output_dim=self.n_classes_, 262 | ) 263 | 264 | iter_funcs = self.create_test_function(dataset, self.model) 265 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 266 | 267 | test_preds, test_probas = np.array([]), None 268 | 269 | for b in range(num_batches_test): 270 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 271 | test_preds = np.append(test_preds, batch_test_pred) 272 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 273 | 274 | return test_preds, test_probas 275 | 276 | 277 | if __name__ == 
'__main__': 278 | train, labels, test, _, _ = utils.load_data(os.path.join(consts.DATA_PATH, 'fe_train.csv'), 279 | os.path.join(consts.DATA_PATH, 'fe_test.csv')) 280 | 281 | from sklearn import decomposition 282 | # PCA 283 | pp = decomposition.PCA() 284 | train = pp.fit_transform(train) 285 | test = pp.transform(test) 286 | 287 | clf = NeuralNetwork(1024, 110, 128, 0.00013934891814068934, 0.9724490021642429, 288 | 6.238206486137665e-05, 0.3081052487919688, 289 | .02, True, 10, random_state=21) 290 | 291 | if MODE == 'cv': 292 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 293 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 294 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 295 | elif MODE == 'submission': 296 | clf.fit(train, labels) 297 | predictions = clf.predict_proba(test) 298 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 299 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 300 | predictions) 301 | elif MODE == 'holdout': 302 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 303 | print 'Log loss:', score 304 | else: 305 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_14_bagging_xgboost/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_14_bagging_xgboost/bagging_xgboost.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold cv - log loss 0.447489661199 3 | """ 4 | import graphlab as gl 5 | import numpy as np 6 | import logging 7 | import os 8 | 9 | from hyperopt import fmin, hp, tpe 10 | 11 | from sklearn.base import BaseEstimator 12 | from sklearn import ensemble 13 | 14 | from otto_utils import consts, utils 15 | 16 | 17 | MODEL_NAME = 'model_14_bagging_xgboost' 18 | MODE = 'cv' # cv|submission|holdout|tune 19 | 20 | logging.disable(logging.INFO) 21 | 22 | 23 | class XGBoost(BaseEstimator): 24 | def __init__(self, max_iterations=50, max_depth=9, min_child_weight=4, row_subsample=.75, 25 | min_loss_reduction=1., column_subsample=.8, step_size=.3, verbose=True): 26 | self.n_classes_ = None 27 | self.classes_ = None 28 | self.max_iterations = max_iterations 29 | self.max_depth = max_depth 30 | self.min_child_weight = min_child_weight 31 | self.row_subsample = row_subsample 32 | self.min_loss_reduction = min_loss_reduction 33 | self.column_subsample = column_subsample 34 | self.step_size = step_size 35 | self.verbose = verbose 36 | self.model = None 37 | 38 | def fit(self, X, y, sample_weight=None): 39 | self.classes_, indices = np.unique(y, return_inverse=True) 40 | self.n_classes_ = self.classes_.shape[0] 41 | 42 | sf = self._array_to_sframe(X, y) 43 | self.model = gl.boosted_trees_classifier.create(sf, target='target', 44 | max_iterations=self.max_iterations, 45 | max_depth=self.max_depth, 46 | min_child_weight=self.min_child_weight, 47 | row_subsample=self.row_subsample, 48 | min_loss_reduction=self.min_loss_reduction, 49 | column_subsample=self.column_subsample, 50 | step_size=self.step_size, 51 | validation_set=None, 52 | verbose=self.verbose) 53 | 54 | return self 55 | 56 | def predict(self, X): 57 | preds = self.predict_proba(X) 58 | return np.argmax(preds, axis=1) 59 | 60 | def predict_proba(self, X): 61 | sf = self._array_to_sframe(X) 62 | preds = 
self.model.predict_topk(sf, output_type='probability', k=self.n_classes_) 63 | 64 | return self._preds_to_array(preds) 65 | 66 | # Private methods 67 | def _array_to_sframe(self, data, targets=None): 68 | d = dict() 69 | for i in xrange(data.shape[1]): 70 | d['feat_%d' % (i + 1)] = gl.SArray(data[:, i]) 71 | if targets is not None: 72 | d['target'] = gl.SArray(targets) 73 | 74 | return gl.SFrame(d) 75 | 76 | def _preds_to_array(self, preds): 77 | p = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') 78 | p['id'] = p['id'].astype(int) + 1 79 | p = p.sort('id') 80 | del p['id'] 81 | preds_array = np.array(p.to_dataframe(), dtype=float) 82 | 83 | return preds_array 84 | 85 | 86 | if __name__ == '__main__': 87 | train, labels, test, _, _ = utils.load_data() 88 | 89 | clf_xgb = XGBoost(max_iterations=300, max_depth=12, min_child_weight=4.9208250938262745, row_subsample=.9134478530382129, 90 | min_loss_reduction=.5132278416508804, column_subsample=.730128689911957, step_size=.1) 91 | clf = ensemble.BaggingClassifier(base_estimator=clf_xgb, n_estimators=20, 92 | max_samples=1., max_features=1., bootstrap=True, 93 | n_jobs=1, verbose=True, random_state=23) 94 | 95 | if MODE == 'cv': 96 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 97 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 98 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 99 | elif MODE == 'submission': 100 | clf.fit(train, labels) 101 | predictions = clf.predict_proba(test) 102 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 103 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 104 | predictions) 105 | elif MODE == 'holdout': 106 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 107 | print 'Log loss:', score 108 | elif MODE == 'tune': 109 | # Objective function 110 | def objective(args): 111 | max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args 112 | clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight, 113 | row_subsample=row_subsample, min_loss_reduction=min_loss_reduction, 114 | column_subsample=column_subsample, verbose=False) 115 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 116 | print 'max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss' 117 | print args, score 118 | return score 119 | # Searching space 120 | space = ( 121 | hp.quniform('max_depth', 12, 12, 1), 122 | hp.uniform('min_child_weight', .5, 10.), 123 | hp.uniform('row_subsample', .3, 1.), 124 | hp.uniform('min_loss_reduction', .1, 3.), 125 | hp.uniform('column_subsample', .1, 1.), 126 | ) 127 | 128 | best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=500) 129 | print 'Best solution:', best_sln 130 | else: 131 | print 'Unknown mode' 132 | -------------------------------------------------------------------------------- /otto/model/model_15_nn_adagrad_pca/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_15_nn_adagrad_pca/nn_adagrad_pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.478792791749 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 
12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import adagrad 18 | 19 | from sklearn import decomposition, feature_extraction 20 | from sklearn.base import BaseEstimator 21 | from sklearn.cross_validation import StratifiedShuffleSplit 22 | from sklearn.utils import check_random_state 23 | 24 | from otto_utils import consts, utils 25 | 26 | 27 | MODEL_NAME = 'model_15_nn_adagrad_pca' 28 | MODE = 'cv' # cv|submission|holdout|tune 29 | 30 | 31 | class NeuralNetwork(BaseEstimator): 32 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 33 | lr=0.01, epsilon=0.9, dropout=0.5, valid_ratio=0.0, 34 | use_valid=False, verbose=0, random_state=None): 35 | self.n_hidden = n_hidden 36 | self.max_epochs = max_epochs 37 | self.batch_size = batch_size 38 | self.lr = lr 39 | self.epsilon = epsilon 40 | self.dropout = dropout 41 | self.valid_ratio = valid_ratio 42 | self.use_valid = use_valid 43 | self.verbose = verbose 44 | self.random_state = random_state 45 | # State 46 | self.score_ = None 47 | self.classes_ = None 48 | self.n_classes_ = None 49 | self.model = None 50 | 51 | def fit(self, data, targets, sample_weight=None): 52 | self.classes_, indices = np.unique(targets, return_inverse=True) 53 | self.n_classes_ = self.classes_.shape[0] 54 | 55 | random_state = check_random_state(self.random_state) 56 | 57 | # Shuffle data and eventually split on train and validation sets 58 | if self.valid_ratio > 0: 59 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 60 | n_iter=1, random_state=self.random_state) 61 | train_index, valid_index = [s for s in strat_shuffled_split][0] 62 | X_train, y_train = data[train_index], targets[train_index] 63 | X_valid, y_valid = data[valid_index], targets[valid_index] 64 | else: 65 | X_train, y_train = data, targets 66 | X_valid, y_valid = np.array([]), np.array([]) 67 | 68 | if self.verbose > 5: 69 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 70 | if self.use_valid: 71 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 72 | 73 | # Prepare theano variables 74 | dataset = dict( 75 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 76 | y_train=T.cast(theano.shared(y_train), 'int32'), 77 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 78 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 79 | num_examples_train=X_train.shape[0], 80 | num_examples_valid=X_valid.shape[0], 81 | input_dim=X_train.shape[1], 82 | output_dim=self.n_classes_, 83 | ) 84 | 85 | if self.verbose > 0: 86 | print "Building model and compiling functions..." 87 | output_layer = self.build_model(dataset['input_dim']) 88 | iter_funcs = self.create_iter_functions(dataset, output_layer) 89 | 90 | if self.verbose > 0: 91 | print "Starting training..." 
92 | now = time.time() 93 | results = [] 94 | try: 95 | for epoch in self.train(iter_funcs, dataset, output_layer): 96 | if self.verbose > 1: 97 | print "Epoch {} of {} took {:.3f}s".format( 98 | epoch['number'], self.max_epochs, time.time() - now) 99 | now = time.time() 100 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 101 | if self.verbose > 1: 102 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 103 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 104 | print " validation accuracy:\t\t{:.2f} %%".format( 105 | epoch['valid_accuracy'] * 100) 106 | 107 | if epoch['number'] >= self.max_epochs: 108 | break 109 | 110 | if self.verbose > 0: 111 | print 'Minimum validation error: %f (epoch %d)' % \ 112 | (epoch['best_val_error'], epoch['best_val_iter']) 113 | 114 | except KeyboardInterrupt: 115 | pass 116 | 117 | return self 118 | 119 | def predict(self, data): 120 | preds, _ = self.make_predictions(data) 121 | 122 | return preds 123 | 124 | def predict_proba(self, data): 125 | _, proba = self.make_predictions(data) 126 | 127 | return proba 128 | 129 | def score(self): 130 | return self.score_ 131 | 132 | # Private methods 133 | def build_model(self, input_dim): 134 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 135 | 136 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 137 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 138 | 139 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden / 2, nonlinearity=rectify) 140 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 141 | 142 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden, nonlinearity=rectify) 143 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 144 | 145 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 146 | 147 | return l_out 148 | 149 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 150 | batch_index = T.iscalar('batch_index') 151 | X_batch = X_tensor_type('x') 152 | y_batch = T.ivector('y') 153 | 154 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 155 | 156 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 157 | 158 | loss_train = objective.get_loss(X_batch, target=y_batch) 159 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 160 | 161 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 162 | proba = output_layer.get_output(X_batch, deterministic=True) 163 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 164 | 165 | all_params = get_all_params(output_layer) 166 | updates = adagrad(loss_train, all_params, self.lr, self.epsilon) 167 | 168 | iter_train = theano.function( 169 | [batch_index], loss_train, 170 | updates=updates, 171 | givens={ 172 | X_batch: dataset['X_train'][batch_slice], 173 | y_batch: dataset['y_train'][batch_slice], 174 | }, 175 | on_unused_input='ignore', 176 | ) 177 | 178 | iter_valid = None 179 | if self.use_valid: 180 | iter_valid = theano.function( 181 | [batch_index], [loss_eval, accuracy, proba], 182 | givens={ 183 | X_batch: dataset['X_valid'][batch_slice], 184 | y_batch: dataset['y_valid'][batch_slice], 185 | }, 186 | ) 187 | 188 | return dict(train=iter_train, valid=iter_valid) 189 | 190 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 191 | batch_index = T.iscalar('batch_index') 192 | X_batch = 
X_tensor_type('x') 193 | 194 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 195 | 196 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 197 | proba = output_layer.get_output(X_batch, deterministic=True) 198 | 199 | iter_test = theano.function( 200 | [batch_index], [pred, proba], 201 | givens={ 202 | X_batch: dataset['X_test'][batch_slice], 203 | }, 204 | ) 205 | 206 | return dict(test=iter_test) 207 | 208 | def train(self, iter_funcs, dataset, output_layer): 209 | num_batches_train = dataset['num_examples_train'] // self.batch_size 210 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 211 | 212 | best_val_err = 100 213 | best_val_iter = -1 214 | 215 | for epoch in itertools.count(1): 216 | batch_train_losses = [] 217 | for b in range(num_batches_train): 218 | batch_train_loss = iter_funcs['train'](b) 219 | batch_train_losses.append(batch_train_loss) 220 | 221 | avg_train_loss = np.mean(batch_train_losses) 222 | 223 | batch_valid_losses = [] 224 | batch_valid_accuracies = [] 225 | batch_valid_probas = [] 226 | 227 | if self.use_valid: 228 | for b in range(num_batches_valid): 229 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 230 | batch_valid_losses.append(batch_valid_loss) 231 | batch_valid_accuracies.append(batch_valid_accuracy) 232 | batch_valid_probas.append(batch_valid_proba) 233 | 234 | avg_valid_loss = np.mean(batch_valid_losses) 235 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 236 | 237 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 238 | (epoch == self.max_epochs and not self.use_valid): 239 | best_val_err = avg_valid_loss 240 | best_val_iter = epoch 241 | # Save model 242 | self.score_ = best_val_err 243 | self.model = copy.deepcopy(output_layer) 244 | 245 | 246 | yield { 247 | 'number': epoch, 248 | 'train_loss': avg_train_loss, 249 | 'valid_loss': avg_valid_loss, 250 | 'valid_accuracy': avg_valid_accuracy, 251 | 'best_val_error': best_val_err, 252 | 'best_val_iter': best_val_iter, 253 | } 254 | 255 | def make_predictions(self, data): 256 | dataset = dict( 257 | X_test=theano.shared(lasagne.utils.floatX(data)), 258 | num_examples_test=data.shape[0], 259 | input_dim=data.shape[1], 260 | output_dim=self.n_classes_, 261 | ) 262 | 263 | iter_funcs = self.create_test_function(dataset, self.model) 264 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 265 | 266 | test_preds, test_probas = np.array([]), None 267 | 268 | for b in range(num_batches_test): 269 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 270 | test_preds = np.append(test_preds, batch_test_pred) 271 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 272 | 273 | return test_preds, test_probas 274 | 275 | 276 | if __name__ == '__main__': 277 | train, labels, test, _, _ = utils.load_data() 278 | 279 | # Preprocess data - transform counts to TFIDF features 280 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 281 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 282 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 283 | 284 | # PCA 285 | pp = decomposition.PCA() 286 | train = pp.fit_transform(train) 287 | test = pp.transform(test) 288 | 289 | clf = NeuralNetwork(1024, 110, 150, 0.0010954104605473447, 5.003481345255732e-15, 0.1, 290 | .02, True, 10, random_state=18) 291 | 292 | if 
MODE == 'cv': 293 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 294 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 295 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 296 | elif MODE == 'submission': 297 | clf.fit(train, labels) 298 | predictions = clf.predict_proba(test) 299 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 300 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 301 | predictions) 302 | elif MODE == 'holdout': 303 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 304 | print 'Log loss:', score 305 | else: 306 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/model/model_16_random_forest_calibrated_feature_selection/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_16_random_forest_calibrated_feature_selection/random_forest_calibrated_feature_selection.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5-fold CV - log loss 0.463244260386 3 | """ 4 | import numpy as np 5 | import os 6 | 7 | from sklearn import ensemble, feature_extraction, preprocessing 8 | from sklearn.calibration import CalibratedClassifierCV 9 | from sklearn.svm import LinearSVC 10 | 11 | from otto_utils import consts, utils 12 | 13 | 14 | MODEL_NAME = 'model_16_random_forest_calibrated_feature_selection' 15 | MODE = 'cv' # cv|submission|holdout 16 | 17 | # import data 18 | train, labels, test, _, _ = utils.load_data() 19 | 20 | # transform counts to TFIDF features 21 | tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) 22 | train = np.append(train, tfidf.fit_transform(train).toarray(), axis=1) 23 | test = np.append(test, tfidf.transform(test).toarray(), axis=1) 24 | 25 | # feature selection 26 | feat_selector = LinearSVC(C=0.095, penalty='l1', dual=False) 27 | train = feat_selector.fit_transform(train, labels) 28 | test = feat_selector.transform(test) 29 | 30 | print train.shape 31 | 32 | # encode labels 33 | lbl_enc = preprocessing.LabelEncoder() 34 | labels = lbl_enc.fit_transform(labels) 35 | 36 | 37 | 38 | # train classifier 39 | clf = ensemble.ExtraTreesClassifier(n_jobs=3, n_estimators=600, max_features=20, min_samples_split=3, 40 | bootstrap=False, verbose=3, random_state=23) 41 | 42 | if MODE == 'cv': 43 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True) 44 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 45 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 46 | elif MODE == 'submission': 47 | calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels)) 48 | fitted_classifier = calibrated_classifier.fit(train, labels) 49 | predictions = fitted_classifier.predict_proba(test) 50 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 51 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 52 | predictions) 53 | elif MODE == 'holdout': 54 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=True) 55 | print 'Log loss:', score 56 | else: 57 | print 'Unknown mode' 58 | -------------------------------------------------------------------------------- /otto/model/model_17_nn_adagrad_log/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/model/model_17_nn_adagrad_log/nn_adagrad_log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mean log loss from 5-fold CV: 0.488150595136 3 | """ 4 | import copy 5 | import itertools 6 | import numpy as np 7 | import lasagne 8 | import math 9 | import os 10 | import theano 11 | import theano.tensor as T 12 | import time 13 | 14 | from lasagne.layers import DenseLayer, DropoutLayer, InputLayer, get_all_params 15 | from lasagne.nonlinearities import rectify, softmax 16 | from lasagne.objectives import categorical_crossentropy, Objective 17 | from lasagne.updates import adagrad 18 | 19 | from sklearn.base import BaseEstimator 20 | from sklearn.cross_validation import StratifiedShuffleSplit 21 | from sklearn.utils import check_random_state 22 | 23 | from otto_utils import consts, utils 24 | 25 | 26 | MODEL_NAME = 'model_17_nn_adagrad_log' 27 | MODE = 'submission' # cv|submission|holdout|tune 28 | 29 | 30 | class NeuralNetwork(BaseEstimator): 31 | def __init__(self, n_hidden=20, max_epochs=150, batch_size=200, 32 | lr=0.01, rho=0.9, dropout=0.5, valid_ratio=0.0, 33 | use_valid=False, verbose=0, random_state=None): 34 | self.n_hidden = n_hidden 35 | self.max_epochs = max_epochs 36 | self.batch_size = batch_size 37 | self.lr = lr 38 | self.rho = rho 39 | self.dropout = dropout 40 | self.valid_ratio = valid_ratio 41 | self.use_valid = use_valid 42 | self.verbose = verbose 43 | self.random_state = random_state 44 | # State 45 | self.score_ = None 46 | self.classes_ = None 47 | self.n_classes_ = None 48 | self.model = None 49 | 50 | def fit(self, data, targets, sample_weight=None): 51 | self.classes_, indices = np.unique(targets, return_inverse=True) 52 | self.n_classes_ = self.classes_.shape[0] 53 | 54 | random_state = check_random_state(self.random_state) 55 | 56 | # Shuffle data and eventually split on train and validation sets 57 | if self.valid_ratio > 0: 58 | strat_shuffled_split = StratifiedShuffleSplit(targets, test_size=self.valid_ratio, 59 | n_iter=1, random_state=self.random_state) 60 | train_index, valid_index = [s for s in strat_shuffled_split][0] 61 | X_train, y_train = data[train_index], targets[train_index] 62 | X_valid, y_valid = data[valid_index], targets[valid_index] 63 | else: 64 | X_train, y_train = data, targets 65 | X_valid, y_valid = np.array([]), np.array([]) 66 | 67 | if self.verbose > 5: 68 | print 'X_train: %s, y_train: %s' % (X_train.shape, y_train.shape) 69 | if self.use_valid: 70 | print 'X_valid: %s, y_valid: %s' % (X_valid.shape, y_valid.shape) 71 | 72 | # Prepare theano variables 73 | dataset = dict( 74 | X_train=theano.shared(lasagne.utils.floatX(X_train)), 75 | y_train=T.cast(theano.shared(y_train), 'int32'), 76 | X_valid=theano.shared(lasagne.utils.floatX(X_valid)), 77 | y_valid=T.cast(theano.shared(y_valid), 'int32'), 78 | num_examples_train=X_train.shape[0], 79 | num_examples_valid=X_valid.shape[0], 80 | input_dim=X_train.shape[1], 81 | output_dim=self.n_classes_, 82 | ) 83 | 84 | if self.verbose > 0: 85 | print "Building model and compiling functions..." 86 | output_layer = self.build_model(dataset['input_dim']) 87 | iter_funcs = self.create_iter_functions(dataset, output_layer) 88 | 89 | if self.verbose > 0: 90 | print "Starting training..." 
91 | now = time.time() 92 | results = [] 93 | try: 94 | for epoch in self.train(iter_funcs, dataset, output_layer): 95 | if self.verbose > 1: 96 | print "Epoch {} of {} took {:.3f}s".format( 97 | epoch['number'], self.max_epochs, time.time() - now) 98 | now = time.time() 99 | results.append([epoch['number'], epoch['train_loss'], epoch['valid_loss']]) 100 | if self.verbose > 1: 101 | print " training loss:\t\t{:.6f}".format(epoch['train_loss']) 102 | print " validation loss:\t\t{:.6f}".format(epoch['valid_loss']) 103 | print " validation accuracy:\t\t{:.2f} %".format( 104 | epoch['valid_accuracy'] * 100) 105 | 106 | if epoch['number'] >= self.max_epochs: 107 | break 108 | 109 | if self.verbose > 0: 110 | print 'Minimum validation error: %f (epoch %d)' % \ 111 | (epoch['best_val_error'], epoch['best_val_iter']) 112 | 113 | except KeyboardInterrupt: 114 | pass 115 | 116 | return self 117 | 118 | def predict(self, data): 119 | preds, _ = self.make_predictions(data) 120 | 121 | return preds 122 | 123 | def predict_proba(self, data): 124 | _, proba = self.make_predictions(data) 125 | 126 | return proba 127 | 128 | def score(self): 129 | return self.score_ 130 | 131 | # Private methods 132 | def build_model(self, input_dim): 133 | l_in = InputLayer(shape=(self.batch_size, input_dim)) 134 | 135 | l_hidden1 = DenseLayer(l_in, num_units=self.n_hidden, nonlinearity=rectify) 136 | l_hidden1_dropout = DropoutLayer(l_hidden1, p=self.dropout) 137 | 138 | l_hidden2 = DenseLayer(l_hidden1_dropout, num_units=self.n_hidden / 2, nonlinearity=rectify) 139 | l_hidden2_dropout = DropoutLayer(l_hidden2, p=self.dropout) 140 | 141 | l_hidden3 = DenseLayer(l_hidden2_dropout, num_units=self.n_hidden / 4, nonlinearity=rectify) 142 | l_hidden3_dropout = DropoutLayer(l_hidden3, p=self.dropout) 143 | 144 | l_out = DenseLayer(l_hidden3_dropout, num_units=self.n_classes_, nonlinearity=softmax) 145 | 146 | return l_out 147 | 148 | def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix): 149 | batch_index = T.iscalar('batch_index') 150 | X_batch = X_tensor_type('x') 151 | y_batch = T.ivector('y') 152 | 153 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 154 | 155 | objective = Objective(output_layer, loss_function=categorical_crossentropy) 156 | 157 | loss_train = objective.get_loss(X_batch, target=y_batch) 158 | loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) 159 | 160 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 161 | proba = output_layer.get_output(X_batch, deterministic=True) 162 | accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) 163 | 164 | all_params = get_all_params(output_layer) 165 | updates = adagrad(loss_train, all_params, self.lr, self.rho) 166 | 167 | iter_train = theano.function( 168 | [batch_index], loss_train, 169 | updates=updates, 170 | givens={ 171 | X_batch: dataset['X_train'][batch_slice], 172 | y_batch: dataset['y_train'][batch_slice], 173 | }, 174 | on_unused_input='ignore', 175 | ) 176 | 177 | iter_valid = None 178 | if self.use_valid: 179 | iter_valid = theano.function( 180 | [batch_index], [loss_eval, accuracy, proba], 181 | givens={ 182 | X_batch: dataset['X_valid'][batch_slice], 183 | y_batch: dataset['y_valid'][batch_slice], 184 | }, 185 | ) 186 | 187 | return dict(train=iter_train, valid=iter_valid) 188 | 189 | def create_test_function(self, dataset, output_layer, X_tensor_type=T.matrix): 190 | batch_index = T.iscalar('batch_index') 191 | X_batch = 
X_tensor_type('x') 192 | 193 | batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size) 194 | 195 | pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) 196 | proba = output_layer.get_output(X_batch, deterministic=True) 197 | 198 | iter_test = theano.function( 199 | [batch_index], [pred, proba], 200 | givens={ 201 | X_batch: dataset['X_test'][batch_slice], 202 | }, 203 | ) 204 | 205 | return dict(test=iter_test) 206 | 207 | def train(self, iter_funcs, dataset, output_layer): 208 | num_batches_train = dataset['num_examples_train'] // self.batch_size 209 | num_batches_valid = int(math.ceil(dataset['num_examples_valid'] / float(self.batch_size))) 210 | 211 | best_val_err = 100 212 | best_val_iter = -1 213 | 214 | for epoch in itertools.count(1): 215 | batch_train_losses = [] 216 | for b in range(num_batches_train): 217 | batch_train_loss = iter_funcs['train'](b) 218 | batch_train_losses.append(batch_train_loss) 219 | 220 | avg_train_loss = np.mean(batch_train_losses) 221 | 222 | batch_valid_losses = [] 223 | batch_valid_accuracies = [] 224 | batch_valid_probas = [] 225 | 226 | if self.use_valid: 227 | for b in range(num_batches_valid): 228 | batch_valid_loss, batch_valid_accuracy, batch_valid_proba = iter_funcs['valid'](b) 229 | batch_valid_losses.append(batch_valid_loss) 230 | batch_valid_accuracies.append(batch_valid_accuracy) 231 | batch_valid_probas.append(batch_valid_proba) 232 | 233 | avg_valid_loss = np.mean(batch_valid_losses) 234 | avg_valid_accuracy = np.mean(batch_valid_accuracies) 235 | 236 | if (best_val_err > avg_valid_loss and self.use_valid) or\ 237 | (epoch == self.max_epochs and not self.use_valid): 238 | best_val_err = avg_valid_loss 239 | best_val_iter = epoch 240 | # Save model 241 | self.score_ = best_val_err 242 | self.model = copy.deepcopy(output_layer) 243 | 244 | 245 | yield { 246 | 'number': epoch, 247 | 'train_loss': avg_train_loss, 248 | 'valid_loss': avg_valid_loss, 249 | 'valid_accuracy': avg_valid_accuracy, 250 | 'best_val_error': best_val_err, 251 | 'best_val_iter': best_val_iter, 252 | } 253 | 254 | def make_predictions(self, data): 255 | dataset = dict( 256 | X_test=theano.shared(lasagne.utils.floatX(data)), 257 | num_examples_test=data.shape[0], 258 | input_dim=data.shape[1], 259 | output_dim=self.n_classes_, 260 | ) 261 | 262 | iter_funcs = self.create_test_function(dataset, self.model) 263 | num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) 264 | 265 | test_preds, test_probas = np.array([]), None 266 | 267 | for b in range(num_batches_test): 268 | batch_test_pred, batch_test_proba = iter_funcs['test'](b) 269 | test_preds = np.append(test_preds, batch_test_pred) 270 | test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba 271 | 272 | return test_preds, test_probas 273 | 274 | 275 | if __name__ == '__main__': 276 | train, labels, test, _, _ = utils.load_data() 277 | 278 | train = np.log(train + 1.) 279 | test = np.log(test + 1.) 
280 | 281 | clf = NeuralNetwork(1024, 110, 220, 0.0026294067059507813, 1.1141900388281156e-15, 0.26355646219340834, 282 | .02, True, 10, random_state=23) 283 | 284 | if MODE == 'cv': 285 | scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) 286 | print 'CV:', scores, 'Mean log loss:', np.mean(scores) 287 | utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) 288 | elif MODE == 'submission': 289 | clf.fit(train, labels) 290 | predictions = clf.predict_proba(test) 291 | utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, 292 | os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), 293 | predictions) 294 | elif MODE == 'holdout': 295 | score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) 296 | print 'Log loss:', score 297 | else: 298 | print 'Unknown mode' -------------------------------------------------------------------------------- /otto/otto_utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'adam' 2 | -------------------------------------------------------------------------------- /otto/otto_utils/blender.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | from scipy.optimize import fmin_cobyla 5 | from sklearn.cross_validation import StratifiedKFold 6 | from sklearn.metrics import log_loss 7 | 8 | import consts 9 | import utils 10 | 11 | 12 | def blended(c, x): 13 | result = None 14 | for i in range(len(c)): 15 | result = result + c[i] * x[i] if result is not None else c[i] * x[i] 16 | result /= sum(c) 17 | return result 18 | 19 | 20 | def error(p, x, y): 21 | preds = blended(p, x) 22 | err = log_loss(y, preds) 23 | return err 24 | 25 | 26 | def constraint(p, *args): 27 | return min(p) - .0 28 | 29 | 30 | def get_weights(): 31 | # Read validation labels 32 | _, labels, _, _, _ = utils.load_data() 33 | skf = StratifiedKFold(labels, n_folds=5, random_state=23) 34 | test_index = None 35 | for _, test_idx in skf: 36 | test_index = np.append(test_index, test_idx) if test_index is not None else test_idx 37 | val_labels = labels[test_index] 38 | # Read predictions on validation set 39 | val_predictions = [] 40 | prediction_files = utils.get_prediction_files() 41 | for preds_file in prediction_files: 42 | vp = np.genfromtxt(os.path.join(consts.BLEND_PATH, preds_file), delimiter=',') 43 | val_predictions.append(vp) 44 | # Minimize blending function 45 | p0 = [1.] 
* len(prediction_files) 46 | p = fmin_cobyla(error, p0, args=(val_predictions, val_labels), cons=[constraint], rhoend=1e-5) 47 | 48 | return p 49 | 50 | 51 | if __name__ == '__main__': 52 | weights = get_weights() 53 | print weights 54 | print weights / np.sum(weights) -------------------------------------------------------------------------------- /otto/otto_utils/consts.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | PROJECT_PATH = '/home/adam/Projects/otto' 5 | 6 | # Data 7 | DATA_PATH = os.path.join(PROJECT_PATH, 'data') 8 | DATA_TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv') 9 | DATA_TEST_PATH = os.path.join(DATA_PATH, 'test.csv') 10 | DATA_SAMPLE_SUBMISSION_PATH = os.path.join(DATA_PATH, 'sampleSubmission.csv') 11 | 12 | # Models 13 | MODEL_PATH = os.path.join(PROJECT_PATH, 'model') 14 | 15 | # Results 16 | OUTPUT_PATH = os.path.join(PROJECT_PATH, 'output') 17 | 18 | # Blending 19 | BLEND_PATH = os.path.join(PROJECT_PATH, 'blend') 20 | 21 | # Ensembling 22 | ENSEMBLE_PATH = os.path.join(PROJECT_PATH, 'ensemble') 23 | 24 | # Names of prediction files 25 | PREDICTION_FILES = ['03_svm', '05_bagging_nn_rmsprop', 26 | '06_xgboost', '09_nn_adagrad', '11_xgboost_poly', 27 | '12_nn_rmsprop_pca', '13_nn_rmsprop_features', 28 | '16_random_forest_calibrated_feature_selection'] 29 | -------------------------------------------------------------------------------- /otto/otto_utils/ensembler.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import os 4 | 5 | import blender 6 | import consts 7 | import utils 8 | 9 | 10 | if __name__ == '__main__': 11 | weights = blender.get_weights() 12 | prediction_files = utils.get_prediction_files() 13 | 14 | with open(os.path.join(consts.OUTPUT_PATH, 'ensembler_weighted_models.csv'), 'wb') as f_out: 15 | writer = csv.writer(f_out) 16 | readers = [] 17 | f_ins = [] 18 | for fpred in prediction_files: 19 | f_in = open(os.path.join(consts.ENSEMBLE_PATH, fpred), 'rb') 20 | f_ins.append(f_in) 21 | readers.append(csv.reader(f_in)) 22 | # Copy header 23 | writer.writerow(readers[0].next()) 24 | for r in readers[1:]: 25 | r.next() 26 | # Merge content 27 | for line in readers[0]: 28 | file_name = line[0] 29 | preds = weights[0] * np.array(map(float, line[1:])) 30 | for i, r in enumerate(readers[1:]): 31 | preds += weights[i+1] * np.array(map(float, r.next()[1:])) 32 | preds /= np.sum(weights) 33 | writer.writerow([file_name] + list(preds)) 34 | # Close files 35 | for f_in in f_ins: 36 | f_in.close() 37 | -------------------------------------------------------------------------------- /otto/otto_utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code shared between all models for Otto competition. 
3 | """ 4 | 5 | import gc 6 | import numpy as np 7 | import os 8 | import pandas as pd 9 | 10 | from sklearn.calibration import CalibratedClassifierCV 11 | from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit 12 | from sklearn.metrics import log_loss 13 | 14 | import consts 15 | 16 | 17 | def load_data(path_train=consts.DATA_TRAIN_PATH, path_test=consts.DATA_TEST_PATH): 18 | train = pd.read_csv(path_train) 19 | train_labels = [int(v[-1])-1 for v in train.target.values] 20 | train_ids = train.id.values 21 | train = train.drop('id', axis=1) 22 | train = train.drop('target', axis=1) 23 | 24 | test = pd.read_csv(path_test) 25 | test_ids = test.id.values 26 | test = test.drop('id', axis=1) 27 | 28 | return np.array(train, dtype=float), np.array(train_labels), np.array(test, dtype=float),\ 29 | np.array(train_ids), np.array(test_ids) 30 | 31 | 32 | def make_blender_cv(classifier, x, y, calibrate=False): 33 | skf = StratifiedKFold(y, n_folds=5, random_state=23) 34 | scores, predictions = [], None 35 | for train_index, test_index in skf: 36 | if calibrate: 37 | # Make training and calibration 38 | calibrated_classifier = CalibratedClassifierCV(classifier, method='isotonic', cv=get_cv(y[train_index])) 39 | fitted_classifier = calibrated_classifier.fit(x[train_index, :], y[train_index]) 40 | else: 41 | fitted_classifier = classifier.fit(x[train_index, :], y[train_index]) 42 | preds = fitted_classifier.predict_proba(x[test_index, :]) 43 | 44 | # Free memory 45 | calibrated_classifier, fitted_classifier = None, None 46 | gc.collect() 47 | 48 | scores.append(log_loss(y[test_index], preds)) 49 | predictions = np.append(predictions, preds, axis=0) if predictions is not None else preds 50 | return scores, predictions 51 | 52 | 53 | def write_blender_data(path, file_name, predictions): 54 | file_path = os.path.join(path, file_name) 55 | np.savetxt(file_path, predictions, delimiter=',', fmt='%.5f') 56 | 57 | 58 | def save_submission(path_sample_submission, output_file_path, predictions): 59 | sample = pd.read_csv(path_sample_submission) 60 | submission = pd.DataFrame(predictions, index=sample.id.values, columns=sample.columns[1:]) 61 | submission.to_csv(output_file_path, index_label='id') 62 | 63 | 64 | def stratified_split(x, y, test_size=0.2): 65 | strat_shuffled_split = StratifiedShuffleSplit(y, n_iter=1, test_size=test_size, random_state=23) 66 | train_index, valid_index = [s for s in strat_shuffled_split][0] 67 | 68 | x_train, y_train, x_valid, y_valid = x[train_index, :], y[train_index], x[valid_index, :], y[valid_index] 69 | 70 | return x_train, y_train, x_valid, y_valid 71 | 72 | 73 | def hold_out_evaluation(classifier, x, y, test_size=0.2, calibrate=False): 74 | x_train, y_train, x_valid, y_valid = stratified_split(x, y, test_size) 75 | 76 | # Train 77 | if calibrate: 78 | # Make training and calibration 79 | calibrated_classifier = CalibratedClassifierCV(classifier, method='isotonic', cv=get_cv(y_train)) 80 | fitted_classifier = calibrated_classifier.fit(x_train, y_train) 81 | else: 82 | fitted_classifier = classifier.fit(x_train, y_train) 83 | # Evaluate 84 | score = log_loss(y_valid, fitted_classifier.predict_proba(x_valid)) 85 | 86 | return score 87 | 88 | 89 | def get_prediction_files(): 90 | return ['model_%s.csv' % f for f in consts.PREDICTION_FILES] 91 | 92 | 93 | def get_cv(y, n_folds=5): 94 | return StratifiedKFold(y, n_folds=n_folds, random_state=23) 95 | --------------------------------------------------------------------------------