├── .gitignore ├── BSMan ├── __init__.py ├── ensemble.py └── logistic.py ├── MIT-LICENSE ├── README.md ├── cache ├── .gitignore └── models │ ├── diagnostics │ └── cv_preds │ │ └── .gitignore │ └── main │ └── cv_preds │ └── .gitignore ├── classifier.py ├── combine └── combine.py ├── data ├── .DS_Store ├── test.csv └── train.csv ├── external ├── __init__.py ├── ben.py └── greedy.py ├── helpers ├── __init__.py ├── data.py ├── diagnostics.py ├── feature_extraction.py ├── ml.py └── utils.py ├── history.log ├── plots └── .gitignore ├── saved_params.json └── submissions └── .gitignore /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.pkl 3 | *.log 4 | .DS_Store 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Packages 10 | *.egg 11 | *.egg-info 12 | dist 13 | build 14 | eggs 15 | parts 16 | bin 17 | var 18 | sdist 19 | develop-eggs 20 | .installed.cfg 21 | lib 22 | lib64 23 | 24 | # Installer logs 25 | pip-log.txt 26 | 27 | # Unit test / coverage reports 28 | .coverage 29 | .tox 30 | nosetests.xml 31 | 32 | # Translations 33 | *.mo 34 | 35 | # Mr Developer 36 | .mr.developer.cfg 37 | .project 38 | .pydevproject 39 | -------------------------------------------------------------------------------- /BSMan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyduan/amazonaccess/f8addfefcee80f0ca15e416954af3926f3007d16/BSMan/__init__.py -------------------------------------------------------------------------------- /BSMan/ensemble.py: -------------------------------------------------------------------------------- 1 | """ Amazon Access Challenge Starter Code 2 | 3 | This was built using the code of Paul Duan as a starting 4 | point (thanks to Paul). 5 | 6 | It builds ensemble models using the original dataset and a handful of 7 | extracted features. 8 | 9 | Author: Benjamin Solecki 10 | """ 11 | 12 | from __future__ import division 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier) 17 | from sklearn import (metrics, cross_validation, linear_model, preprocessing) 18 | 19 | SEED = 42 # always use a seed for randomized procedures 20 | 21 | def save_results(predictions, filename): 22 | """Given a vector of predictions, save results in CSV format.""" 23 | with open(filename, 'w') as f: 24 | f.write("id,ACTION\n") 25 | for i, pred in enumerate(predictions): 26 | f.write("%d,%f\n" % (i + 1, pred)) 27 | 28 | 29 | """ 30 | Fit models and make predictions. 31 | We'll use one-hot encoding to transform our categorical features 32 | into binary features. 33 | y and X will be numpy array objects. 
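In outline, the ensembling below: a random forest, an extra-trees model and a
gradient boosting model are each fit on count/frequency features derived from
the raw columns; their held-out probabilities are clipped away from 0 and 1,
converted to log-odds, and used to fit two small logistic regressions (one on
each half of a train/CV split). The summed coefficients of those regressions
then weight the test-time log-odds, i.e. the final prediction is
coefRF * logit(predsRF) + coefXT * logit(predsXT) + coefGB * logit(predsGB).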
34 | """ 35 | # === load data in memory === # 36 | print "loading data" 37 | X = pd.read_csv('data/train.csv') 38 | X = X.drop(['ROLE_CODE'], axis=1) 39 | y = X['ACTION'] 40 | X = X.drop(['ACTION'], axis=1) 41 | X_test = pd.read_csv('data/test.csv', index_col=0) 42 | X_test = X_test.drop(['ROLE_CODE'], axis=1) 43 | X_test['ACTION'] = 0 44 | y_test = X_test['ACTION'] 45 | X_test = X_test.drop(['ACTION'], axis=1) 46 | 47 | modelRF =RandomForestClassifier(n_estimators=1999, max_features='sqrt', max_depth=None, min_samples_split=9, compute_importances=True, random_state=SEED)#8803 48 | modelXT =ExtraTreesClassifier(n_estimators=1999, max_features='sqrt', max_depth=None, min_samples_split=8, compute_importances=True, random_state=SEED) #8903 49 | modelGB =GradientBoostingClassifier(n_estimators=50, learning_rate=0.20, max_depth=20, min_samples_split=9, random_state=SEED) #8749 50 | # 599: 20/90/08 51 | #1999: 24/95/06 52 | 53 | X_all = pd.concat([X_test,X], ignore_index=True) 54 | 55 | # I want to combine role_title as a subset of role_familia and see if same results 56 | X_all['ROLE_TITLE'] = X_all['ROLE_TITLE'] + (1000 * X_all['ROLE_FAMILY']) 57 | X_all['ROLE_ROLLUPS'] = X_all['ROLE_ROLLUP_1'] + (10000 * X_all['ROLE_ROLLUP_2']) 58 | X_all = X_all.drop(['ROLE_ROLLUP_1','ROLE_ROLLUP_2','ROLE_FAMILY'], axis=1) 59 | 60 | # Count/freq 61 | print "Counts" 62 | for col in X_all.columns: 63 | X_all['cnt'+col] = 0 64 | groups = X_all.groupby([col]) 65 | for name, group in groups: 66 | count = group[col].count() 67 | X_all['cnt'+col].ix[group.index] = count 68 | X_all['cnt'+col] = X_all['cnt'+col].apply(np.log) # could check if this is neccesary, I think probably not 69 | 70 | # Percent of dept that is this resource 71 | for col in X_all.columns[1:6]: 72 | X_all['Duse'+col] = 0.0 73 | groups = X_all.groupby([col]) 74 | for name, group in groups: 75 | grps = group.groupby(['RESOURCE']) 76 | for rsrc, grp in grps: 77 | X_all['Duse'+col].ix[grp.index] = float(len(grp.index)) / float(len(group.index) ) 78 | 79 | # Number of resources that a manager manages 80 | for col in X_all.columns[0:1]: 81 | if col == 'MGR_ID': 82 | continue 83 | print col 84 | X_all['Mdeps'+col] = 0 85 | groups = X_all.groupby(['MGR_ID']) 86 | for name, group in groups: 87 | X_all['Mdeps'+col].ix[group.index] = len(group[col].unique()) 88 | 89 | 90 | X = X_all[:][X_all.index>=len(X_test.index)] 91 | X_test = X_all[:][X_all.index0.9999999]=0.9999999 105 | preds[preds<0.0000001]=0.0000001 106 | preds = -np.log((1-preds)/preds) 107 | modelEN1 = linear_model.LogisticRegression() 108 | modelEN1.fit(preds, y_train) 109 | print modelEN1.coef_ 110 | 111 | modelRF.fit(X_train, y_train) 112 | modelXT.fit(X_train, y_train) 113 | modelGB.fit(X_train, y_train) 114 | predsRF = modelRF.predict_proba(X_cv)[:, 1] 115 | predsXT = modelXT.predict_proba(X_cv)[:, 1] 116 | predsGB = modelGB.predict_proba(X_cv)[:, 1] 117 | preds = np.hstack((predsRF, predsXT, predsGB)).reshape(3,len(predsGB)).transpose() 118 | preds[preds>0.9999999]=0.9999999 119 | preds[preds<0.0000001]=0.0000001 120 | preds = -np.log((1-preds)/preds) 121 | modelEN2 = linear_model.LogisticRegression() 122 | modelEN2.fit(preds, y_cv) 123 | print modelEN2.coef_ 124 | 125 | coefRF = modelEN1.coef_[0][0] + modelEN2.coef_[0][0] 126 | coefXT = modelEN1.coef_[0][1] + modelEN2.coef_[0][1] 127 | coefGB = modelEN1.coef_[0][2] + modelEN2.coef_[0][2] 128 | 129 | # === Predictions === # 130 | # When making predictions, retrain the model on the whole training set 131 | modelRF.fit(X, y) 132 | 
modelXT.fit(X, y) 133 | modelGB.fit(X, y) 134 | 135 | ### Combine here 136 | predsRF = modelRF.predict_proba(X_test)[:, 1] 137 | predsXT = modelXT.predict_proba(X_test)[:, 1] 138 | predsGB = modelGB.predict_proba(X_test)[:, 1] 139 | predsRF[predsRF>0.9999999]=0.9999999 140 | predsXT[predsXT>0.9999999]=0.9999999 141 | predsGB[predsGB>0.9999999]=0.9999999 142 | predsRF[predsRF<0.0000001]=0.0000001 143 | predsXT[predsXT<0.0000001]=0.0000001 144 | predsGB[predsGB<0.0000001]=0.0000001 145 | predsRF = -np.log((1-predsRF)/predsRF) 146 | predsXT = -np.log((1-predsXT)/predsXT) 147 | predsGB = -np.log((1-predsGB)/predsGB) 148 | preds = coefRF * predsRF + coefXT * predsXT + coefGB * predsGB 149 | 150 | filename = raw_input("Enter name for submission file: ") 151 | save_results(preds, "submissions/en" + filename + ".csv") 152 | -------------------------------------------------------------------------------- /BSMan/logistic.py: -------------------------------------------------------------------------------- 1 | """ 2 | This program is based on code submitted by Miroslaw Horbal to the Kaggle 3 | forums, which was itself based on an earlier submission from Paul Doan. 4 | My thanks to both. 5 | 6 | Author: Benjamin Solecki 7 | """ 8 | 9 | from numpy import array, hstack 10 | from sklearn import metrics, cross_validation, linear_model 11 | from sklearn import naive_bayes 12 | from sklearn import preprocessing 13 | from scipy import sparse 14 | from itertools import combinations 15 | 16 | from sets import Set 17 | import numpy as np 18 | import pandas as pd 19 | import sys 20 | 21 | #SEED = 55 22 | SEED = int(sys.argv[2]) 23 | 24 | def group_data(data, degree=3, hash=hash): 25 | """ 26 | numpy.array -> numpy.array 27 | 28 | Groups all columns of data into all combinations of triples 29 | """ 30 | new_data = [] 31 | m,n = data.shape 32 | for indicies in combinations(range(n), degree): 33 | if 5 in indicies and 7 in indicies: 34 | print "feature Xd" 35 | elif 2 in indicies and 3 in indicies: 36 | print "feature Xd" 37 | else: 38 | new_data.append([hash(tuple(v)) for v in data[:,indicies]]) 39 | return array(new_data).T 40 | 41 | def OneHotEncoder(data, keymap=None): 42 | """ 43 | OneHotEncoder takes data matrix with categorical columns and 44 | converts it to a sparse binary matrix. 45 | 46 | Returns sparse binary matrix and keymap mapping categories to indicies. 
47 | If a keymap is supplied on input it will be used instead of creating one 48 | and any categories appearing in the data that are not in the keymap are 49 | ignored 50 | """ 51 | if keymap is None: 52 | keymap = [] 53 | for col in data.T: 54 | uniques = set(list(col)) 55 | keymap.append(dict((key, i) for i, key in enumerate(uniques))) 56 | total_pts = data.shape[0] 57 | outdat = [] 58 | for i, col in enumerate(data.T): 59 | km = keymap[i] 60 | num_labels = len(km) 61 | spmat = sparse.lil_matrix((total_pts, num_labels)) 62 | for j, val in enumerate(col): 63 | if val in km: 64 | spmat[j, km[val]] = 1 65 | outdat.append(spmat) 66 | outdat = sparse.hstack(outdat).tocsr() 67 | return outdat, keymap 68 | 69 | def create_test_submission(filename, prediction): 70 | content = ['id,ACTION'] 71 | for i, p in enumerate(prediction): 72 | content.append('%i,%f' %(i+1,p)) 73 | f = open(filename, 'w') 74 | f.write('\n'.join(content)) 75 | f.close() 76 | print 'Saved' 77 | 78 | # This loop essentially from Paul's starter code 79 | # I (Ben) increased the size of train at the expense of test, because 80 | # when train is small many features will not be found in train. 81 | def cv_loop(X, y, model, N): 82 | mean_auc = 0. 83 | for i in range(N): 84 | X_train, X_cv, y_train, y_cv = cross_validation.train_test_split( 85 | X, y, test_size=1.0/float(N), 86 | random_state = i*SEED) 87 | model.fit(X_train, y_train) 88 | preds = model.predict_proba(X_cv)[:,1] 89 | auc = metrics.auc_score(y_cv, preds) 90 | #print "AUC (fold %d/%d): %f" % (i + 1, N, auc) 91 | mean_auc += auc 92 | return mean_auc/N 93 | 94 | learner = sys.argv[1] 95 | print "Reading dataset..." 96 | train_data = pd.read_csv('train.csv') 97 | test_data = pd.read_csv('test.csv') 98 | submit=learner + str(SEED) + '.csv' 99 | all_data = np.vstack((train_data.ix[:,1:-1], test_data.ix[:,1:-1])) 100 | num_train = np.shape(train_data)[0] 101 | 102 | # Transform data 103 | print "Transforming data..." 104 | # Relabel the variable values to smallest possible so that I can use bincount 105 | # on them later. 
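# Illustrative example (made-up values): LabelEncoder maps each column's raw
# IDs onto 0..n_unique-1 in sorted order, e.g.
# fit_transform([4675, 117961, 4675, 280]) -> [1, 2, 1, 0],
# which keeps the values small enough for the np.bincount calls further down.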
106 | relabler = preprocessing.LabelEncoder() 107 | for col in range(len(all_data[0,:])): 108 | relabler.fit(all_data[:, col]) 109 | all_data[:, col] = relabler.transform(all_data[:, col]) 110 | ########################## 2nd order features ################################ 111 | dp = group_data(all_data, degree=2) 112 | for col in range(len(dp[0,:])): 113 | relabler.fit(dp[:, col]) 114 | dp[:, col] = relabler.transform(dp[:, col]) 115 | uniques = len(set(dp[:,col])) 116 | maximum = max(dp[:,col]) 117 | print col 118 | if maximum < 65534: 119 | count_map = np.bincount((dp[:, col]).astype('uint16')) 120 | for n,i in enumerate(dp[:, col]): 121 | if count_map[i] <= 1: 122 | dp[n, col] = uniques 123 | elif count_map[i] == 2: 124 | dp[n, col] = uniques+1 125 | else: 126 | for n,i in enumerate(dp[:, col]): 127 | if (dp[:, col] == i).sum() <= 1: 128 | dp[n, col] = uniques 129 | elif (dp[:, col] == i).sum() == 2: 130 | dp[n, col] = uniques+1 131 | print uniques # unique values 132 | uniques = len(set(dp[:,col])) 133 | print uniques 134 | relabler.fit(dp[:, col]) 135 | dp[:, col] = relabler.transform(dp[:, col]) 136 | ########################## 3rd order features ################################ 137 | dt = group_data(all_data, degree=3) 138 | for col in range(len(dt[0,:])): 139 | relabler.fit(dt[:, col]) 140 | dt[:, col] = relabler.transform(dt[:, col]) 141 | uniques = len(set(dt[:,col])) 142 | maximum = max(dt[:,col]) 143 | print col 144 | if maximum < 65534: 145 | count_map = np.bincount((dt[:, col]).astype('uint16')) 146 | for n,i in enumerate(dt[:, col]): 147 | if count_map[i] <= 1: 148 | dt[n, col] = uniques 149 | elif count_map[i] == 2: 150 | dt[n, col] = uniques+1 151 | else: 152 | for n,i in enumerate(dt[:, col]): 153 | if (dt[:, col] == i).sum() <= 1: 154 | dt[n, col] = uniques 155 | elif (dt[:, col] == i).sum() == 2: 156 | dt[n, col] = uniques+1 157 | print uniques 158 | uniques = len(set(dt[:,col])) 159 | print uniques 160 | relabler.fit(dt[:, col]) 161 | dt[:, col] = relabler.transform(dt[:, col]) 162 | ########################## 1st order features ################################ 163 | for col in range(len(all_data[0,:])): 164 | relabler.fit(all_data[:, col]) 165 | all_data[:, col] = relabler.transform(all_data[:, col]) 166 | uniques = len(set(all_data[:,col])) 167 | maximum = max(all_data[:,col]) 168 | print col 169 | if maximum < 65534: 170 | count_map = np.bincount((all_data[:, col]).astype('uint16')) 171 | for n,i in enumerate(all_data[:, col]): 172 | if count_map[i] <= 1: 173 | all_data[n, col] = uniques 174 | elif count_map[i] == 2: 175 | all_data[n, col] = uniques+1 176 | else: 177 | for n,i in enumerate(all_data[:, col]): 178 | if (all_data[:, col] == i).sum() <= 1: 179 | all_data[n, col] = uniques 180 | elif (all_data[:, col] == i).sum() == 2: 181 | all_data[n, col] = uniques+1 182 | print uniques 183 | uniques = len(set(all_data[:,col])) 184 | print uniques 185 | relabler.fit(all_data[:, col]) 186 | all_data[:, col] = relabler.transform(all_data[:, col]) 187 | 188 | # Collect the training features together 189 | y = array(train_data.ACTION) 190 | X = all_data[:num_train] 191 | X_2 = dp[:num_train] 192 | X_3 = dt[:num_train] 193 | 194 | # Collect the testing features together 195 | X_test = all_data[num_train:] 196 | X_test_2 = dp[num_train:] 197 | X_test_3 = dt[num_train:] 198 | 199 | X_train_all = np.hstack((X, X_2, X_3)) 200 | X_test_all = np.hstack((X_test, X_test_2, X_test_3)) 201 | num_features = X_train_all.shape[1] 202 | 203 | if learner == 'NB': 204 | model = 
naive_bayes.BernoulliNB(alpha=0.03) 205 | else: 206 | model = linear_model.LogisticRegression(class_weight='auto', penalty='l2') 207 | 208 | # Xts holds one hot encodings for each individual feature in memory 209 | # speeding up feature selection 210 | Xts = [OneHotEncoder(X_train_all[:,[i]])[0] for i in range(num_features)] 211 | 212 | print "Performing greedy feature selection..." 213 | score_hist = [] 214 | N = 10 215 | good_features = set([]) 216 | # Greedy feature selection loop 217 | while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]: 218 | scores = [] 219 | for f in range(len(Xts)): 220 | if f not in good_features: 221 | feats = list(good_features) + [f] 222 | Xt = sparse.hstack([Xts[j] for j in feats]).tocsr() 223 | score = cv_loop(Xt, y, model, N) 224 | scores.append((score, f)) 225 | print "Feature: %i Mean AUC: %f" % (f, score) 226 | good_features.add(sorted(scores)[-1][1]) 227 | score_hist.append(sorted(scores)[-1]) 228 | print "Current features: %s" % sorted(list(good_features)) 229 | 230 | # Remove last added feature from good_features 231 | good_features.remove(score_hist[-1][1]) 232 | good_features = sorted(list(good_features)) 233 | print "Selected features %s" % good_features 234 | gf = open("feats" + submit, 'w') 235 | print >>gf, good_features 236 | gf.close() 237 | print len(good_features), " features" 238 | 239 | print "Performing hyperparameter selection..." 240 | # Hyperparameter selection loop 241 | score_hist = [] 242 | Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr() 243 | if learner == 'NB': 244 | Cvals = [0.001, 0.003, 0.006, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1] 245 | else: 246 | Cvals = np.logspace(-4, 4, 15, base=2) # for logistic 247 | for C in Cvals: 248 | if learner == 'NB': 249 | model.alpha = C 250 | else: 251 | model.C = C 252 | score = cv_loop(Xt, y, model, N) 253 | score_hist.append((score,C)) 254 | print "C: %f Mean AUC: %f" %(C, score) 255 | bestC = sorted(score_hist)[-1][1] 256 | print "Best C value: %f" % (bestC) 257 | 258 | print "Performing One Hot Encoding on entire dataset..." 259 | Xt = np.vstack((X_train_all[:,good_features], X_test_all[:,good_features])) 260 | Xt, keymap = OneHotEncoder(Xt) 261 | X_train = Xt[:num_train] 262 | X_test = Xt[num_train:] 263 | 264 | if learner == 'NB': 265 | model.alpha = bestC 266 | else: 267 | model.C = bestC 268 | 269 | print "Training full model..." 270 | print "Making prediction and saving results..." 
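# Final pass: refit on all training rows using only the selected features and
# the best regularisation value found above, then write out class-1
# probabilities for both the test set (<learner><seed>.csv) and the training
# set (Train<learner><seed>.csv).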
271 | model.fit(X_train, y) 272 | preds = model.predict_proba(X_test)[:,1] 273 | create_test_submission(submit, preds) 274 | preds = model.predict_proba(X_train)[:,1] 275 | create_test_submission('Train'+submit, preds) 276 | -------------------------------------------------------------------------------- /MIT-LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Paul Duan, Benjamin Solecki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a 6 | copy 7 | of this software and associated documentation files (the "Software"), to 8 | deal 9 | in the Software without restriction, including without limitation the 10 | rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 12 | sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included 17 | in 18 | all copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 21 | OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 24 | THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 27 | FROM, 28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 29 | IN 30 | THE SOFTWARE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Amazon Employee Access Challenge 2 | ================================ 3 | 4 | This code was written by Paul Duan () and Benjamin Solecki (). 5 | It provides our winning solution to the Amazon Employee Access Challenge. 6 | Our code is currently not merged. You'll find Benjamin's code in the BSMan/ folder, which needs to be run separately. 7 | 8 | 9 | Usage: 10 | --------------- 11 | [python] classifier.py [-h] [-d] [-i ITER] [-f OUTPUTFILE] [-g] [-m] [-n] [-s] [-v] [-w] 12 | 13 | Parameters for the script. 14 | 15 | optional arguments: 16 | -h, --help show this help message and exit 17 | -d, --diagnostics Compute diagnostics. 18 | -i ITER, --iter ITER Number of iterations for averaging. 19 | -f OUTPUTFILE, --outputfile OUTPUTFILE 20 | Name of the file where predictions are saved. 21 | -g, --grid-search Use grid search to find best parameters. 22 | -m, --model-selection 23 | Use model selection. 24 | -n, --no-cache Use cache. 25 | -s, --stack Use stacking. 26 | -v, --verbose Show computation steps. 27 | -w, --fwls Use metafeatures. 28 | 29 | 30 | To directly generate predictions on the test set without computing CV 31 | metrics, simply run: 32 | 33 | python classifier.py -i0 -f[output_filename] 34 | 35 | This script will launch Paul's model, which incorporates some of Benjamin's features. 36 | Benjamin's model is in the BSMan folder and can be run this way: 37 | 38 | (in BSMan/) 39 | [python] logistic.py log 75 40 | [python] ensemble.py 41 | 42 | The output of our models is then combined by simple standardization then weighted averaging, using 2/3 Paul's model and 1/3 Benjamin's. 
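For reference, a minimal sketch of that blending step (the file names below are placeholders, not the actual submission names):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    paul = np.loadtxt("paul.csv", delimiter=",", usecols=[1], skiprows=1)
    ben = np.loadtxt("ben.csv", delimiter=",", usecols=[1], skiprows=1)
    standardized = StandardScaler().fit_transform(np.c_[paul, ben])
    blend = (2 * standardized[:, 0] + standardized[:, 1]) / 3

combine/combine.py is the ad-hoc script we used to explore how to merge the submissions.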
43 | 44 | 45 | Requirements: 46 | --------------- 47 | This code requires Python, numpy/scipy, scikit-learn, and pandas for 48 | some of the external code (this dependency will be removed in the 49 | future). 50 | It has been tested under Mac OS X with Python v.7.x, 51 | scikit-learn 0.13, numpy 0.17, and pandas 0.11. 52 | 53 | License: 54 | --------------- 55 | This content is released under the [MIT Licence](http://opensource.org/licenses/MIT). 56 | -------------------------------------------------------------------------------- /cache/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | -------------------------------------------------------------------------------- /cache/models/diagnostics/cv_preds/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything except this file 2 | * 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /cache/models/main/cv_preds/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything except this file 2 | * 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Amazon Access Challenge 4 | 5 | This is my part of the code that produced the winning solution to the 6 | Amazon Employee Access Challenge. See README.md for more details. 7 | 8 | Author: Paul Duan 9 | """ 10 | 11 | from __future__ import division 12 | 13 | import argparse 14 | import logging 15 | 16 | from sklearn import metrics, cross_validation, linear_model, ensemble 17 | from helpers import ml, diagnostics 18 | from helpers.data import load_data, save_results 19 | from helpers.feature_extraction import create_datasets 20 | 21 | logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s", 22 | filename="history.log", filemode='a', level=logging.DEBUG, 23 | datefmt='%m/%d/%y %H:%M:%S') 24 | formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s", 25 | datefmt='%m/%d/%y %H:%M:%S') 26 | console = logging.StreamHandler() 27 | console.setFormatter(formatter) 28 | console.setLevel(logging.INFO) 29 | logging.getLogger().addHandler(console) 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | def main(CONFIG): 35 | """ 36 | The final model is a combination of several base models, which are then 37 | combined using StackedClassifier defined in the helpers.ml module. 38 | 39 | The list of models and associated datasets is generated automatically 40 | from their identifying strings. The format is as follows: 41 | A:b_c where A is the initials of the algorithm to use, b is the base 42 | dataset, and c is the feature set and the variants to use. 
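    For instance, "RFC:greedy2_f" becomes a RandomForestClassifier trained on
    the greedy2 base dataset with the extracted-feature ("f") variant; the
    full nomenclature is documented in helpers/feature_extraction.py.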
43 | """ 44 | SEED = 42 45 | selected_models = [ 46 | "LR:tuples_sf", 47 | "LR:greedy_sfl", 48 | "LR:greedy2_sfl", 49 | "LR:greedy3_sf", 50 | "RFC:basic_b", 51 | "RFC:tuples_f", 52 | "RFC:tuples_fd", 53 | "RFC:greedy_f", 54 | "RFC:greedy2_f", 55 | "GBC:basic_f", 56 | "GBC:tuples_f", 57 | "LR:greedy_sbl", 58 | "GBC:greedy_c", 59 | "GBC:tuples_cf", 60 | #"RFC:effects_f", # experimental; added after the competition 61 | ] 62 | 63 | # Create the models on the fly 64 | models = [] 65 | for item in selected_models: 66 | model_id, dataset = item.split(':') 67 | model = {'LR': linear_model.LogisticRegression, 68 | 'GBC': ensemble.GradientBoostingClassifier, 69 | 'RFC': ensemble.RandomForestClassifier, 70 | 'ETC': ensemble.ExtraTreesClassifier}[model_id]() 71 | model.set_params(random_state=SEED) 72 | models.append((model, dataset)) 73 | 74 | datasets = [dataset for model, dataset in models] 75 | 76 | logger.info("loading data") 77 | y, X = load_data('train.csv') 78 | X_test = load_data('test.csv', return_labels=False) 79 | 80 | logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache)) 81 | create_datasets(X, X_test, y, datasets, CONFIG.use_cache) 82 | 83 | # Set params 84 | for model, feature_set in models: 85 | model.set_params(**ml.find_params(model, feature_set, y, 86 | grid_search=CONFIG.grid_search)) 87 | clf = ml.StackedClassifier( 88 | models, stack=CONFIG.stack, fwls=CONFIG.fwls, 89 | model_selection=CONFIG.model_selection, 90 | use_cached_models=CONFIG.use_cache) 91 | 92 | # Metrics 93 | logger.info("computing cv score") 94 | mean_auc = 0.0 95 | for i in range(CONFIG.iter): 96 | train, cv = cross_validation.train_test_split( 97 | range(len(y)), test_size=.20, random_state=1+i*SEED) 98 | cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose) 99 | 100 | fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds) 101 | roc_auc = metrics.auc(fpr, tpr) 102 | logger.info("AUC (fold %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc) 103 | mean_auc += roc_auc 104 | 105 | if CONFIG.diagnostics and i == 0: # only plot for first fold 106 | logger.info("plotting learning curve") 107 | diagnostics.learning_curve(clf, y, train, cv) 108 | diagnostics.plot_roc(fpr, tpr) 109 | if CONFIG.iter: 110 | logger.info("Mean AUC: %.5f", mean_auc/CONFIG.iter) 111 | 112 | # Create submissions 113 | if CONFIG.outputfile: 114 | logger.info("making test submissions (CV AUC: %.4f)", mean_auc) 115 | preds = clf.fit_predict(y, show_steps=CONFIG.verbose) 116 | save_results(preds, CONFIG.outputfile + ".csv") 117 | 118 | if __name__ == '__main__': 119 | PARSER = argparse.ArgumentParser(description="Parameters for the script.") 120 | PARSER.add_argument('-d', "--diagnostics", action="store_true", 121 | help="Compute diagnostics.") 122 | PARSER.add_argument('-i', "--iter", type=int, default=1, 123 | help="Number of iterations for averaging.") 124 | PARSER.add_argument("-f", "--outputfile", default="", 125 | help="Name of the file where predictions are saved.") 126 | PARSER.add_argument('-g', "--grid-search", action="store_true", 127 | help="Use grid search to find best parameters.") 128 | PARSER.add_argument('-m', "--model-selection", action="store_true", 129 | default=False, help="Use model selection.") 130 | PARSER.add_argument('-n', "--no-cache", action="store_false", default=True, 131 | help="Use cache.", dest="use_cache") 132 | PARSER.add_argument("-s", "--stack", action="store_true", 133 | help="Use stacking.") 134 | PARSER.add_argument('-v', "--verbose", action="store_true", 135 | help="Show computation 
steps.") 136 | PARSER.add_argument("-w", "--fwls", action="store_true", 137 | help="Use metafeatures.") 138 | PARSER.set_defaults(argument_default=False) 139 | CONFIG = PARSER.parse_args() 140 | 141 | CONFIG.stack = CONFIG.stack or CONFIG.fwls 142 | 143 | logger.debug('\n' + '='*50) 144 | main(CONFIG) 145 | -------------------------------------------------------------------------------- /combine/combine.py: -------------------------------------------------------------------------------- 1 | """combine.py 2 | 3 | This is an ad-hoc script we used to find how to merge our submissions. 4 | For this to work, the prediction vectors must be placed in the internal/ 5 | folder. 6 | 7 | Author: Paul Duan 8 | """ 9 | 10 | import numpy as np 11 | import math 12 | from sklearn import linear_model, cross_validation, preprocessing 13 | 14 | from ..helpers.data import load_data 15 | from ..helpers.ml import compute_auc, AUCRegressor 16 | 17 | 18 | def inverse_transform(X): 19 | def clamp(x): 20 | return min(max(x, .00000001), .99999999) 21 | return np.vectorize(lambda x: -math.log((1 - clamp(x))/clamp(x)))(X) 22 | 23 | 24 | def print_param(obj, params, prefix=''): 25 | for param in params: 26 | if hasattr(obj, param): 27 | paramvalue = getattr(obj, param) 28 | if "coef" in param: 29 | paramvalue /= np.sum(paramvalue) 30 | print prefix + param + ": " + str(paramvalue) 31 | 32 | 33 | mean_prediction = 0.0 34 | y = load_data('train.csv')[0] 35 | y = y[range(len(y) - 7770, len(y))] 36 | 37 | files = ["log75", "ens", "paul"] 38 | totransform = [] 39 | 40 | preds = [] 41 | for filename in files: 42 | with open("%s.csv" % filename) as f: 43 | pred = np.loadtxt(f, delimiter=',', usecols=[1], skiprows=1) 44 | if filename in totransform: 45 | pred = inverse_transform(pred) 46 | preds.append(pred) 47 | X = np.array(preds).T 48 | 49 | standardizer = preprocessing.StandardScaler() 50 | X = standardizer.fit_transform(X) 51 | 52 | print "============================================================" 53 | print '\t\t'.join(files) 54 | aucs = [] 55 | for filename in files: 56 | with open("%s.csv" % filename) as f: 57 | pred = np.loadtxt(f, delimiter=',', usecols=[1], skiprows=1) 58 | aucs.append("%.3f" % (compute_auc(y, pred) * 100)) 59 | print '\t\t'.join(aucs) 60 | print "------------------------------------------------------------" 61 | 62 | combiners = [ 63 | linear_model.LinearRegression(), 64 | linear_model.Ridge(20), 65 | AUCRegressor(), 66 | ] 67 | 68 | for combiner in combiners: 69 | mean_coefs = 0.0 70 | mean_auc = 0.0 71 | N = 10 72 | 73 | print "\n%s:" % combiner.__class__.__name__ 74 | if hasattr(combiner, 'predict_proba'): 75 | combiner.predict = lambda X: combiner.predict_proba(X)[:, 1] 76 | 77 | combiner.fit(X, y) 78 | print_param(combiner, ["alpha_", "coef_"], "(post) ") 79 | print "Train AUC: %.3f" % (compute_auc(y, combiner.predict(X)) * 100) 80 | 81 | if isinstance(combiner, AUCRegressor): 82 | continue 83 | 84 | kfold = cross_validation.KFold(len(y), 3, shuffle=True) 85 | for train, test in kfold: 86 | X_train = X[train] 87 | X_test = X[test] 88 | y_train = y[train] 89 | y_test = y[test] 90 | 91 | combiner.fit(X_train, y_train) 92 | prediction = combiner.predict(X_test) 93 | mean_auc += compute_auc(y_test, prediction)/len(kfold) 94 | 95 | if len(combiner.coef_) == 1: 96 | mean_coefs += combiner.coef_[0]/len(files) 97 | else: 98 | mean_coefs += combiner.coef_/len(files) 99 | 100 | print "Mean AUC: %.3f" % (mean_auc * 100) 101 | 102 | print "\n------------------------------------------------------------" 
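# Note on inverse_transform() above: it maps a probability p to its log-odds
# log(p / (1 - p)) (e.g. 0.9 -> ~2.197) after clamping p into
# [1e-8, 1 - 1e-8], so that hard 0/1 predictions do not blow up to +/- infinity.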
103 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyduan/amazonaccess/f8addfefcee80f0ca15e416954af3926f3007d16/data/.DS_Store -------------------------------------------------------------------------------- /external/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyduan/amazonaccess/f8addfefcee80f0ca15e416954af3926f3007d16/external/__init__.py -------------------------------------------------------------------------------- /external/ben.py: -------------------------------------------------------------------------------- 1 | """ Amazon Access Challenge Starter Code 2 | 3 | This was built using the code of Paul Duan as a starting 4 | point (thanks to Paul). 5 | 6 | It builds ensemble models using the original dataset and a handful of 7 | extracted features. 8 | 9 | Author: Benjami Solecki 10 | """ 11 | 12 | from __future__ import division 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from helpers.data import save_dataset 17 | 18 | 19 | def create_features(): 20 | print "loading data" 21 | X = pd.read_csv('data/train.csv') 22 | X = X.drop(['ROLE_CODE'], axis=1) 23 | X = X.drop(['ACTION'], axis=1) 24 | 25 | X_test = pd.read_csv('data/test.csv', index_col=0) 26 | X_test = X_test.drop(['ROLE_CODE'], axis=1) 27 | X_test['ACTION'] = 0 28 | X_test = X_test.drop(['ACTION'], axis=1) 29 | 30 | X_all = pd.concat([X_test, X], ignore_index=True) 31 | # I want to combine role_title as a subset of role_familia and 32 | X_all['ROLE_TITLE'] = X_all['ROLE_TITLE'] + (1000 * X_all['ROLE_FAMILY']) 33 | X_all['ROLE_ROLLUPS'] = X_all['ROLE_ROLLUP_1'] + ( 34 | 10000 * X_all['ROLE_ROLLUP_2']) 35 | X_all = X_all.drop(['ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_FAMILY'], 36 | axis=1) 37 | 38 | # Count/freq 39 | for col in X_all.columns: 40 | X_all['cnt'+col] = 0 41 | groups = X_all.groupby([col]) 42 | for name, group in groups: 43 | count = group[col].count() 44 | X_all['cnt'+col].ix[group.index] = count 45 | X_all['cnt'+col] = X_all['cnt'+col].apply(np.log) 46 | 47 | # Percent of dept that is this resource 48 | # And Counts of dept/resource occurancesa (tested, not used) 49 | for col in X_all.columns[1:6]: 50 | X_all['Duse'+col] = 0.0 51 | groups = X_all.groupby([col]) 52 | for name, group in groups: 53 | grps = group.groupby(['RESOURCE']) 54 | for rsrc, grp in grps: 55 | X_all['Duse'+col].ix[grp.index] = \ 56 | float(len(grp.index)) / float(len(group.index)) 57 | 58 | # Number of resources that a manager manages 59 | for col in X_all.columns[0:1]: 60 | #for col in X_all.columns[0:6]: 61 | if col == 'MGR_ID': 62 | continue 63 | X_all['Mdeps'+col] = 0 64 | groups = X_all.groupby(['MGR_ID']) 65 | for name, group in groups: 66 | X_all['Mdeps'+col].ix[group.index] = len(group[col].unique()) 67 | 68 | X_all = X_all.drop(X_all.columns[0:6], axis=1) 69 | 70 | # Now X is the train, X_test is test and X_all is both together 71 | X = X_all[:][X_all.index >= len(X_test.index)] 72 | X_test = X_all[:][X_all.index < len(X_test.index)] 73 | # X is the train set alone, X_all is all features 74 | X = X.as_matrix() 75 | X_test = X_test.as_matrix() 76 | 77 | save_dataset('bsfeats', X, X_test) 78 | -------------------------------------------------------------------------------- /external/greedy.py: -------------------------------------------------------------------------------- 1 | 
""" Greedy feature selection 2 | This file is a slightly modified version of Miroslaw's code. 3 | It generates a dataset containing all 3rd order combinations 4 | of the original columns, then performs greedy feature selection. 5 | 6 | Original author: Miroslaw Horbal 7 | Permission was granted by Miroslaw to publish this snippet as part of 8 | our code. 9 | """ 10 | 11 | from sklearn import metrics, cross_validation, linear_model 12 | from scipy import sparse 13 | from itertools import combinations 14 | from helpers import data 15 | 16 | import numpy as np 17 | import pandas as pd 18 | 19 | SEED = 333 20 | 21 | 22 | def group_data(data, degree=3, hash=hash): 23 | new_data = [] 24 | m, n = data.shape 25 | for indices in combinations(range(n), degree): 26 | new_data.append([hash(tuple(v)) for v in data[:, indices]]) 27 | return np.array(new_data).T 28 | 29 | 30 | def OneHotEncoder(data, keymap=None): 31 | """ 32 | OneHotEncoder takes data matrix with categorical columns and 33 | converts it to a sparse binary matrix. 34 | 35 | Returns sparse binary matrix and keymap mapping categories to indicies. 36 | If a keymap is supplied on input it will be used instead of creating one 37 | and any categories appearing in the data that are not in the keymap are 38 | ignored 39 | """ 40 | if keymap is None: 41 | keymap = [] 42 | for col in data.T: 43 | uniques = set(list(col)) 44 | keymap.append(dict((key, i) for i, key in enumerate(uniques))) 45 | total_pts = data.shape[0] 46 | outdat = [] 47 | for i, col in enumerate(data.T): 48 | km = keymap[i] 49 | num_labels = len(km) 50 | spmat = sparse.lil_matrix((total_pts, num_labels)) 51 | for j, val in enumerate(col): 52 | if val in km: 53 | spmat[j, km[val]] = 1 54 | outdat.append(spmat) 55 | outdat = sparse.hstack(outdat).tocsr() 56 | return outdat, keymap 57 | 58 | 59 | def cv_loop(X, y, model, N): 60 | mean_auc = 0. 61 | for i in range(N): 62 | X_train, X_cv, y_train, y_cv = cross_validation.train_test_split( 63 | X, y, test_size=.20, 64 | random_state=i*SEED) 65 | model.fit(X_train, y_train) 66 | preds = model.predict_proba(X_cv)[:, 1] 67 | auc = metrics.auc_score(y_cv, preds) 68 | print "AUC (fold %d/%d): %f" % (i + 1, N, auc) 69 | mean_auc += auc 70 | return mean_auc/N 71 | 72 | 73 | def create_features(train='data/train.csv', test='data/test.csv'): 74 | print "Reading dataset..." 75 | train_data = pd.read_csv(train) 76 | test_data = pd.read_csv(test) 77 | all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1])) 78 | 79 | num_train = np.shape(train_data)[0] 80 | 81 | # Transform data 82 | print "Transforming data..." 83 | dp = group_data(all_data, degree=2) 84 | dt = group_data(all_data, degree=3) 85 | 86 | y = np.array(train_data.ACTION) 87 | X = all_data[:num_train] 88 | X_2 = dp[:num_train] 89 | X_3 = dt[:num_train] 90 | 91 | X_test = all_data[num_train:] 92 | X_test_2 = dp[num_train:] 93 | X_test_3 = dt[num_train:] 94 | 95 | X_train_all = np.hstack((X, X_2, X_3)) 96 | X_test_all = np.hstack((X_test, X_test_2, X_test_3)) 97 | num_features = X_train_all.shape[1] 98 | 99 | model = linear_model.LogisticRegression() 100 | 101 | # Xts holds one hot encodings for each individual feature in memory 102 | # speeding up feature selection 103 | Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)] 104 | 105 | print "Performing greedy feature selection..." 
106 | score_hist = [] 107 | N = 10 108 | good_features_list = [ 109 | [0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55, 110 | 60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85], 111 | [0, 1, 7, 8, 9, 10, 36, 37, 38, 41, 42, 43, 47, 51, 53, 112 | 56, 60, 61, 63, 64, 66, 67, 69, 71, 75, 79, 85, 91], 113 | [0, 7, 9, 24, 36, 37, 41, 42, 47, 53, 61, 63, 64, 67, 69, 71, 75, 85], 114 | [0, 7, 9, 20, 36, 37, 38, 41, 42, 45, 47, 115 | 53, 60, 63, 64, 67, 69, 71, 81, 85, 86] 116 | ] 117 | 118 | # Greedy feature selection loop 119 | if not good_features_list: 120 | good_features = set([]) 121 | while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]: 122 | scores = [] 123 | for f in range(len(Xts)): 124 | if f not in good_features: 125 | feats = list(good_features) + [f] 126 | Xt = sparse.hstack([Xts[j] for j in feats]).tocsr() 127 | score = cv_loop(Xt, y, model, N) 128 | scores.append((score, f)) 129 | print "Feature: %i Mean AUC: %f" % (f, score) 130 | good_features.add(sorted(scores)[-1][1]) 131 | score_hist.append(sorted(scores)[-1]) 132 | print "Current features: %s" % sorted(list(good_features)) 133 | 134 | # Remove last added feature from good_features 135 | good_features.remove(score_hist[-1][1]) 136 | good_features = sorted(list(good_features)) 137 | 138 | for i, good_features in enumerate(good_features_list): 139 | suffix = str(i + 1) if i else '' 140 | Xt = np.vstack((X_train_all[:, good_features], 141 | X_test_all[:, good_features])) 142 | X_train = Xt[:num_train] 143 | X_test = Xt[num_train:] 144 | data.save_dataset("greedy%s" % suffix, X_train, X_test) 145 | -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyduan/amazonaccess/f8addfefcee80f0ca15e416954af3926f3007d16/helpers/__init__.py -------------------------------------------------------------------------------- /helpers/data.py: -------------------------------------------------------------------------------- 1 | """ml.py 2 | 3 | Useful I/O functions. 
4 | 5 | Author: Paul Duan 6 | """ 7 | 8 | import logging 9 | import numpy as np 10 | from scipy import sparse 11 | import cPickle as pickle 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def load_data(filename, return_labels=True): 17 | """Load data from CSV files and return them in numpy format.""" 18 | logging.debug("loading data from %s", filename) 19 | data = np.loadtxt(open("data/" + filename), delimiter=',', 20 | usecols=range(1, 10), skiprows=1, dtype=int) 21 | if return_labels: 22 | labels = np.loadtxt(open("data/" + filename), delimiter=',', 23 | usecols=[0], skiprows=1) 24 | return labels, data 25 | else: 26 | labels = np.zeros(data.shape[0]) 27 | return data 28 | 29 | 30 | def load_from_cache(filename, use_cache=True): 31 | """Attempt to load data from cache.""" 32 | data = None 33 | read_mode = 'rb' if '.pkl' in filename else 'r' 34 | if use_cache: 35 | try: 36 | with open("cache/%s" % filename, read_mode) as f: 37 | data = pickle.load(f) 38 | except IOError: 39 | pass 40 | 41 | return data 42 | 43 | 44 | def save_results(predictions, filename): 45 | """Save results in CSV format.""" 46 | logging.info("saving data to file %s", filename) 47 | with open("submissions/%s" % filename, 'w') as f: 48 | f.write("id,ACTION\n") 49 | for i, pred in enumerate(predictions): 50 | f.write("%d,%f\n" % (i + 1, pred)) 51 | 52 | 53 | def save_dataset(filename, X, X_test, features=None, features_test=None): 54 | """Save the training and test sets augmented with the given features.""" 55 | if features is not None: 56 | assert features.shape[1] == features_test.shape[1], "features mismatch" 57 | if sparse.issparse(X): 58 | features = sparse.lil_matrix(features) 59 | features_test = sparse.lil_matrix(features_test) 60 | X = sparse.hstack((X, features), 'csr') 61 | X_test = sparse.hstack((X_test, features_test), 'csr') 62 | else: 63 | X = np.hstack((X, features)) 64 | X_test = np. hstack((X_test, features_test)) 65 | 66 | logger.info("> saving %s to disk", filename) 67 | with open("cache/%s.pkl" % filename, 'wb') as f: 68 | pickle.dump((X, X_test), f, pickle.HIGHEST_PROTOCOL) 69 | 70 | 71 | def get_dataset(feature_set='basic', train=None, cv=None): 72 | """ 73 | Return the design matrices constructed with the specified feature set. 74 | If train is specified, split the training set according to train and 75 | cv (if cv is not given, subsample's complement will be used instead). 76 | If subsample is omitted, return both the full training and test sets. 77 | """ 78 | try: 79 | with open("cache/%s.pkl" % feature_set, 'rb') as f: 80 | if train is not None: 81 | X, _ = pickle.load(f) 82 | if cv is None: 83 | cv = [i for i in range(X.shape[0]) if i not in train] 84 | 85 | X_test = X[cv, :] 86 | X = X[train, :] 87 | else: 88 | X, X_test = pickle.load(f) 89 | except IOError: 90 | logging.warning("could not find feature set %s", feature_set) 91 | return False 92 | 93 | return X, X_test 94 | -------------------------------------------------------------------------------- /helpers/diagnostics.py: -------------------------------------------------------------------------------- 1 | """diagnostics.py 2 | 3 | Some methods to plot diagnostics. 
4 | 5 | Author: Paul Duan 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from sklearn.metrics import hinge_loss 10 | 11 | 12 | def plot_roc(fpr, tpr): 13 | """Plot ROC curve and display it.""" 14 | plt.clf() 15 | plt.plot(fpr, tpr) 16 | plt.plot([0, 1], [0, 1], 'k--') 17 | plt.xlim([0.0, 1.0]) 18 | plt.ylim([0.0, 1.0]) 19 | plt.xlabel('False Positive Rate') 20 | plt.ylabel('True Positive Rate') 21 | plt.title('ROC Curve') 22 | 23 | 24 | def learning_curve(classifier, y, train, cv, n=15): 25 | """Plot train and cv loss for increasing train sample sizes.""" 26 | chunk = int(len(y)/n) 27 | n_samples = [] 28 | train_losses = [] 29 | cv_losses = [] 30 | previous_cache_dir = classifier.cache_dir 31 | classifier.cache_dir = "diagnostics" 32 | 33 | for i in range(n): 34 | train_subset = train[:(i + 1)*chunk] 35 | preds_cv = classifier.fit_predict(y, train_subset, cv, 36 | show_steps=False) 37 | preds_train = classifier.fit_predict(y, train_subset, train_subset, 38 | show_steps=False) 39 | n_samples.append((i + 1)*chunk) 40 | cv_losses.append(hinge_loss(y[cv], preds_cv, neg_label=0)) 41 | train_losses.append(hinge_loss(y[train_subset], preds_train, 42 | neg_label=0)) 43 | 44 | classifier.cache_dir = previous_cache_dir 45 | plt.clf() 46 | plt.plot(n_samples, train_losses, 'r--', n_samples, cv_losses, 'b--') 47 | plt.ylim([min(train_losses) - .01, max(cv_losses) + .01]) 48 | 49 | plt.savefig('plots/learning_curve.png') 50 | plt.show() 51 | -------------------------------------------------------------------------------- /helpers/feature_extraction.py: -------------------------------------------------------------------------------- 1 | """feature_extraction.py 2 | 3 | Create the requested datasets. 4 | 5 | Author: Paul Duan 6 | """ 7 | 8 | from __future__ import division 9 | 10 | import logging 11 | import cPickle as pickle 12 | import numpy as np 13 | import math 14 | 15 | from scipy import sparse 16 | from sklearn import preprocessing 17 | 18 | from external import greedy, ben 19 | from data import save_dataset 20 | from ml import get_dataset 21 | 22 | logger = logging.getLogger(__name__) 23 | subformatter = logging.Formatter("[%(asctime)s] %(levelname)s\t> %(message)s") 24 | 25 | COLNAMES = ["resource", "manager", "role1", "role2", "department", 26 | "title", "family_desc", "family"] 27 | SELECTED_COLUMNS = [0, 1, 4, 5, 6, 7] 28 | 29 | EXTERNAL_DATASETS = { 30 | "greedy": greedy, 31 | "greedy2": greedy, 32 | "greedy3": greedy, 33 | "bsfeats": ben 34 | } 35 | 36 | 37 | def sparsify(X, X_test): 38 | """Return One-Hot encoded datasets.""" 39 | enc = OneHotEncoder() 40 | enc.fit(np.vstack((X, X_test))) 41 | return enc.transform(X), enc.transform(X_test) 42 | 43 | 44 | def create_datasets(X, X_test, y, datasets=[], use_cache=True): 45 | """ 46 | Generate datasets as needed with different sets of features 47 | and save them to disk. 48 | The datasets are created by combining a base feature set (combinations of 49 | the original variables) with extracted feature sets, with some additional 50 | variants. 51 | 52 | The nomenclature is as follows: 53 | Base datasets: 54 | - basic: the original columns, minus role1, role2, and role_code 55 | - tuples: all order 2 combinations of the original columns 56 | - triples: all order 3 combinations of the original columns 57 | - greedy[1,2,3]: three different datasets obtained by performing 58 | greedy feature selection with different seeds on the triples 59 | dataset 60 | - effects: experimental. 
Created to try out a suggestion by Gxav 61 | after the competition 62 | 63 | Feature sets and variants: 64 | (denoted by the letters after the underscore in the base dataset name): 65 | - s: the base dataset has been sparsified using One-Hot encoding 66 | - c: the rare features have been consolidated into one category 67 | - f: extracted features have been appended, with a different set for 68 | linear models than for tree-based models 69 | - b: Benjamin's extracted features. 70 | - d: interactions for the extracted feature set have been added 71 | - l: the extracted features have been log transformed 72 | """ 73 | if use_cache: 74 | # Check if all files exist. If not, generate the missing ones 75 | DATASETS = [] 76 | for dataset in datasets: 77 | try: 78 | with open("cache/%s.pkl" % dataset, 'rb'): 79 | pass 80 | except IOError: 81 | logger.warning("couldn't load dataset %s, will generate it", 82 | dataset) 83 | DATASETS.append(dataset.split('_')[0]) 84 | else: 85 | DATASETS = ["basic", "tuples", "triples", 86 | "greedy", "greedy2", "greedy3"] 87 | 88 | # Datasets that require external code to be generated 89 | for dataset, module in EXTERNAL_DATASETS.iteritems(): 90 | if not get_dataset(dataset): 91 | module.create_features() 92 | 93 | # Generate the missing datasets 94 | if len(DATASETS): 95 | bsfeats, bsfeats_test = get_dataset('bsfeats') 96 | 97 | basefeats, basefeats_test = create_features(X, X_test, 3) 98 | save_dataset("base_feats", basefeats, basefeats_test) 99 | 100 | lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0)) 101 | save_dataset("lrfeats", lrfeats, lrfeats_test) 102 | 103 | feats, feats_test = pre_process(*create_features(X, X_test, 1)) 104 | save_dataset("features", feats, feats_test) 105 | 106 | meta, meta_test = pre_process(*create_features(X, X_test, 2), 107 | normalize=False) 108 | save_dataset("metafeatures", meta, meta_test) 109 | 110 | X = X[:, SELECTED_COLUMNS] 111 | X_test = X_test[:, SELECTED_COLUMNS] 112 | save_dataset("basic", X, X_test) 113 | 114 | Xt = create_tuples(X) 115 | Xt_test = create_tuples(X_test) 116 | save_dataset("tuples", Xt, Xt_test) 117 | 118 | Xtr = create_tuples(X) 119 | Xtr_test = create_tuples(X_test) 120 | save_dataset("triples", Xtr, Xtr_test) 121 | 122 | Xe, Xe_test = create_effects(X, X_test, y) 123 | save_dataset("effects", Xe, Xe_test) 124 | 125 | feats_d, feats_d_test = pre_process(basefeats, basefeats_test, 126 | create_divs=True) 127 | bsfeats_d, bsfeats_d_test = pre_process(bsfeats, bsfeats_test, 128 | create_divs=True) 129 | feats_l, feats_l_test = pre_process(basefeats, basefeats_test, 130 | log_transform=True) 131 | lrfeats_l, lrfeats_l_test = pre_process(lrfeats, lrfeats_test, 132 | log_transform=True) 133 | bsfeats_l, bsfeats_l_test = pre_process(bsfeats, bsfeats_test, 134 | log_transform=True) 135 | 136 | for ds in DATASETS: 137 | Xg, Xg_test = get_dataset(ds) 138 | save_dataset(ds + '_b', Xg, Xg_test, bsfeats, bsfeats_test) 139 | save_dataset(ds + '_f', Xg, Xg_test, feats, feats_test) 140 | save_dataset(ds + '_fd', Xg, Xg_test, feats_d, feats_d_test) 141 | save_dataset(ds + '_bd', Xg, Xg_test, bsfeats_d, bsfeats_d_test) 142 | Xs, Xs_test = sparsify(Xg, Xg_test) 143 | save_dataset(ds + '_sf', Xs, Xs_test, lrfeats, lrfeats_test) 144 | save_dataset(ds + '_sfl', Xs, Xs_test, lrfeats_l, lrfeats_l_test) 145 | save_dataset(ds + '_sfd', Xs, Xs_test, feats_d, feats_d_test) 146 | save_dataset(ds + '_sb', Xs, Xs_test, bsfeats, bsfeats_test) 147 | save_dataset(ds + '_sbl', Xs, Xs_test, bsfeats_l, bsfeats_l_test) 148 
| save_dataset(ds + '_sbd', Xs, Xs_test, bsfeats_d, bsfeats_d_test) 149 | 150 | if issubclass(Xg.dtype.type, np.integer): 151 | consolidate(Xg, Xg_test) 152 | save_dataset(ds + '_c', Xg, Xg_test) 153 | save_dataset(ds + '_cf', Xg, Xg_test, feats, feats_test) 154 | save_dataset(ds + '_cb', Xg, Xg_test, bsfeats, bsfeats_test) 155 | Xs, Xs_test = sparsify(Xg, Xg_test) 156 | save_dataset(ds + '_sc', Xs, Xs_test) 157 | save_dataset(ds + '_scf', Xs, Xs_test, feats, feats_test) 158 | save_dataset(ds + '_scfl', Xs, Xs_test, feats_l, feats_l_test) 159 | save_dataset(ds + '_scb', Xs, Xs_test, bsfeats, bsfeats_test) 160 | save_dataset(ds + '_scbl', Xs, Xs_test, 161 | bsfeats_l, bsfeats_l_test) 162 | 163 | 164 | def create_effects(X_train, X_test, y): 165 | """ 166 | Create a dataset where the features are the effects of a 167 | logistic regression trained on sparsified data. 168 | This has been added post-deadline after talking with Gxav. 169 | """ 170 | from sklearn import linear_model, cross_validation 171 | from itertools import izip 172 | Xe_train = np.zeros(X_train.shape) 173 | Xe_test = np.zeros(X_test.shape) 174 | n_cols = Xe_train.shape[1] 175 | 176 | model = linear_model.LogisticRegression(C=2) 177 | X_train, X_test = sparsify(X_train, X_test) 178 | 179 | kfold = cross_validation.KFold(len(y), 5) 180 | for train, cv in kfold: 181 | model.fit(X_train[train], y[train]) 182 | colindices = X_test.nonzero()[1] 183 | for i, k in izip(cv, range(len(cv))): 184 | for j in range(n_cols): 185 | z = colindices[n_cols*k + j] 186 | Xe_train[i, j] = model.coef_[0, z] 187 | 188 | model.fit(X_train, y) 189 | colindices = X_test.nonzero()[1] 190 | for i in range(Xe_test.shape[0]): 191 | for j in range(n_cols): 192 | z = colindices[n_cols*i + j] 193 | Xe_test[i, j] = model.coef_[0, z] 194 | 195 | return Xe_train, Xe_test 196 | 197 | 198 | def create_features(X_train, X_test, feature_set=0): 199 | """ 200 | Extract features from the training and test set. 201 | Each feature set is defined as a list of lambda functions. 
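    Each lambda is called as feature(x, row, j), where row is a raw data row,
    j indexes the column being processed and x is the cross-tabulation entry
    dictionaries[j][row[j]] built by get_pivottable(). For example,
    lambda x, row, j: x[COLNAMES[0]].get(row[0], 0)
    counts how many rows share both this row's value in column j and its
    RESOURCE.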
202 | """ 203 | logger.info("performing feature extraction (feature_set=%d)", feature_set) 204 | features_train = [] 205 | features_test = [] 206 | dictionaries = get_pivottable(X_train, X_test) 207 | dictionaries_train = get_pivottable(X_train, X_test, use='train') 208 | dictionaries_test = get_pivottable(X_test, X_test, use='test') 209 | 210 | # 0: resource, 1: manager, 2: role1, 3: role2, 4: department, 211 | # 5: title, 6: family_desc, 7: family 212 | feature_lists = [ 213 | [ # 0: LR features 214 | lambda x, row, j: 215 | x[COLNAMES[0]].get(row[0], 0) if j > 0 and j < 7 else 0, 216 | lambda x, row, j: 217 | x[COLNAMES[1]].get(row[1], 0) if j > 1 and j < 7 else 0, 218 | lambda x, row, j: 219 | x[COLNAMES[2]].get(row[2], 0) if j > 2 and j < 7 else 0, 220 | lambda x, row, j: 221 | x[COLNAMES[3]].get(row[3], 0) if j > 3 and j < 7 else 0, 222 | lambda x, row, j: 223 | x[COLNAMES[4]].get(row[4], 0) if j > 4 and j < 7 else 0, 224 | lambda x, row, j: 225 | x[COLNAMES[5]].get(row[5], 0) if j > 5 and j < 7 else 0, 226 | lambda x, row, j: 227 | x[COLNAMES[6]].get(row[6], 0) if j > 6 and j < 7 else 0, 228 | lambda x, row, j: 229 | x[COLNAMES[7]].get(row[7], 0) if j > 7 and j < 7 else 0, 230 | 231 | lambda x, row, j: 232 | x[COLNAMES[0]].get(row[0], 0)**2 if j in range(7) else 0, 233 | lambda x, row, j: 234 | x[COLNAMES[j]].get(row[0], 0)/x['total'] 235 | if j > 0 and j < 7 else 0, 236 | 237 | lambda x, row, j: 238 | x[COLNAMES[j]].get(row[j], 0)/len(x[COLNAMES[j]].values()), 239 | 240 | lambda x, row, j: 241 | x[COLNAMES[j]].get(row[j], 0) / dictionaries[j]['total'], 242 | 243 | lambda x, row, j: 244 | math.log(x[COLNAMES[0]].get(row[0], 0)) if j in range(5) else 0, 245 | 246 | lambda x, row, j: 247 | int(row[j] not in dictionaries_train[j]), 248 | 249 | lambda x, row, j: 250 | int(row[j] not in dictionaries_test[j]), 251 | ], 252 | 253 | [ # 1: Tree features 254 | lambda x, row, j: 255 | x[COLNAMES[0]].get(row[0], 0), 256 | lambda x, row, j: 257 | x[COLNAMES[1]].get(row[1], 0), 258 | lambda x, row, j: 259 | x[COLNAMES[2]].get(row[2], 0), 260 | lambda x, row, j: 261 | x[COLNAMES[3]].get(row[3], 0), 262 | lambda x, row, j: 263 | x[COLNAMES[4]].get(row[4], 0), 264 | lambda x, row, j: 265 | x[COLNAMES[5]].get(row[5], 0), 266 | lambda x, row, j: 267 | x[COLNAMES[6]].get(row[6], 0), 268 | lambda x, row, j: 269 | x[COLNAMES[7]].get(row[7], 0), 270 | 271 | lambda x, row, j: 272 | x[COLNAMES[j]].get(row[0], 0)/x['total'] if j > 0 else 0, 273 | ], 274 | 275 | [ # 2: Metafeatures 276 | lambda x, row, j: 277 | dictionaries_train[j].get(row[j], {}).get('total', 0), 278 | lambda x, row, j: 279 | dictionaries_train[j].get(row[j], {}).get('total', 0) == 0, 280 | ], 281 | 282 | [ # 3: Base features 283 | lambda x, row, j: 284 | x['total'] if j == 0 else 0, 285 | 286 | lambda x, row, j: 287 | x[COLNAMES[0]].get(row[0], 0) if j > 0 else 0, 288 | lambda x, row, j: 289 | x[COLNAMES[1]].get(row[1], 0) if j > 1 else 0, 290 | lambda x, row, j: 291 | x[COLNAMES[2]].get(row[2], 0) if j > 2 else 0, 292 | lambda x, row, j: 293 | x[COLNAMES[3]].get(row[3], 0) if j > 3 else 0, 294 | lambda x, row, j: 295 | x[COLNAMES[4]].get(row[4], 0) if j > 4 else 0, 296 | lambda x, row, j: 297 | x[COLNAMES[5]].get(row[5], 0) if j > 5 else 0, 298 | lambda x, row, j: 299 | x[COLNAMES[6]].get(row[6], 0) if j > 6 else 0, 300 | lambda x, row, j: 301 | x[COLNAMES[7]].get(row[7], 0) if j > 7 else 0, 302 | 303 | lambda x, row, j: 304 | x[COLNAMES[0]].get(row[0], 0)**2 if j in range(8) else 0, 305 | ], 306 | ] 307 | 308 | feature_generator = 
feature_lists[feature_set] 309 | 310 | # create feature vectors 311 | logger.debug("creating feature vectors") 312 | features_train = [] 313 | for row in X_train: 314 | features_train.append([]) 315 | for j in range(len(COLNAMES)): 316 | for feature in feature_generator: 317 | feature_row = feature(dictionaries[j][row[j]], row, j) 318 | features_train[-1].append(feature_row) 319 | features_train = np.array(features_train) 320 | 321 | features_test = [] 322 | for row in X_test: 323 | features_test.append([]) 324 | for j in range(len(COLNAMES)): 325 | for feature in feature_generator: 326 | feature_row = feature(dictionaries[j][row[j]], row, j) 327 | features_test[-1].append(feature_row) 328 | features_test = np.array(features_test) 329 | 330 | return features_train, features_test 331 | 332 | 333 | def pre_process(features_train, features_test, 334 | create_divs=False, log_transform=False, normalize=True): 335 | """ 336 | Take lists of feature columns as input, pre-process them (eventually 337 | performing some transformation), then return nicely formatted numpy arrays. 338 | """ 339 | logger.info("performing preprocessing") 340 | 341 | features_train = list(features_train.T) 342 | features_test = list(features_test.T) 343 | features_train = [list(feature) for feature in features_train] 344 | features_test = [list(feature) for feature in features_test] 345 | 346 | # remove constant features 347 | for i in range(len(features_train) - 1, -1, -1): 348 | if np.var(features_train[i]) + np.var(features_test[i]) == 0: 349 | features_train.pop(i) 350 | features_test.pop(i) 351 | n_features = len(features_train) 352 | 353 | # create some polynomial features 354 | if create_divs: 355 | for i in range(n_features): 356 | for j in range(1): 357 | features_train.append([round(a/(b + 1), 3) for a, b in zip( 358 | features_train[i], features_train[j])]) 359 | features_test.append([round(a/(b + 1), 3) for a, b in zip( 360 | features_test[i], features_test[j])]) 361 | 362 | features_train.append([round(a/(b + 1), 3) for a, b in zip( 363 | features_train[j], features_train[i])]) 364 | features_test.append([round(a/(b + 1), 3) for a, b in zip( 365 | features_test[j], features_test[i])]) 366 | 367 | features_train.append([a*b for a, b in zip( 368 | features_train[j], features_train[i])]) 369 | features_test.append([a*b for a, b in zip( 370 | features_test[j], features_test[i])]) 371 | 372 | if log_transform: 373 | tmp_train = [] 374 | tmp_test = [] 375 | for i in range(n_features): 376 | tmp_train.append([math.log(a + 1) if (a + 1) > 0 else 0 377 | for a in features_train[i]]) 378 | tmp_test.append([math.log(a + 1) if (a + 1) > 0 else 0 379 | for a in features_test[i]]) 380 | 381 | tmp_train.append([a**2 for a in features_train[i]]) 382 | tmp_test.append([a**2 for a in features_test[i]]) 383 | tmp_train.append([a**3 for a in features_train[i]]) 384 | tmp_test.append([a**3 for a in features_test[i]]) 385 | features_train = tmp_train 386 | features_test = tmp_test 387 | 388 | logger.info("created %d features", len(features_train)) 389 | features_train = np.array(features_train).T 390 | features_test = np.array(features_test).T 391 | 392 | # normalize the new features 393 | if normalize: 394 | normalizer = preprocessing.StandardScaler() 395 | normalizer.fit(features_train) 396 | features_train = normalizer.transform(features_train) 397 | features_test = normalizer.transform(features_test) 398 | 399 | return features_train, features_test 400 | 401 | 402 | def get_pivottable(X_train, X_test, use='all'): 403 | """ 404 
| Returns a list of dictionaries, one per feature in the 405 | basic data, containing cross-tabulated counts 406 | for each column and each value of the feature. 407 | """ 408 | dictionaries = [] 409 | if use == 'all': 410 | X = np.vstack((X_train, X_test)) 411 | filename = "pivottable" 412 | elif use == 'train': 413 | X = X_train 414 | filename = "pivottable_train" 415 | else: 416 | X = X_test 417 | filename = "pivottable_test" 418 | 419 | for i in range(len(COLNAMES)): 420 | dictionaries.append({'total': 0}) 421 | 422 | try: 423 | with open("cache/%s.pkl" % filename, 'rb') as f: 424 | logger.debug("loading cross-tabulated data from cache") 425 | dictionaries = pickle.load(f) 426 | except IOError: 427 | logger.debug("no cache found, cross-tabulating data") 428 | for i, row in enumerate(X): 429 | for j in range(len(COLNAMES)): 430 | dictionaries[j]['total'] += 1 431 | if row[j] not in dictionaries[j]: 432 | dictionaries[j][row[j]] = {'total': 1} 433 | for k, key in enumerate(COLNAMES): 434 | dictionaries[j][row[j]][key] = {row[k]: 1} 435 | else: 436 | dictionaries[j][row[j]]['total'] += 1 437 | for k, key in enumerate(COLNAMES): 438 | if row[k] not in dictionaries[j][row[j]][key]: 439 | dictionaries[j][row[j]][key][row[k]] = 1 440 | else: 441 | dictionaries[j][row[j]][key][row[k]] += 1 442 | with open("cache/%s.pkl" % filename, 'wb') as f: 443 | pickle.dump(dictionaries, f, pickle.HIGHEST_PROTOCOL) 444 | 445 | return dictionaries 446 | 447 | 448 | def create_tuples(X): 449 | logger.debug("creating feature tuples") 450 | cols = [] 451 | for i in range(X.shape[1]): 452 | for j in range(i, X.shape[1]): 453 | cols.append(X[:, i] + X[:, j]*3571) 454 | return np.hstack((X, np.vstack(cols).T)) 455 | 456 | 457 | def create_triples(X): 458 | logger.debug("creating feature triples") 459 | cols = [] 460 | for i in range(X.shape[1]): 461 | for j in range(i, X.shape[1]): 462 | for k in range(j, X.shape[1]): 463 | cols.append(X[:, i]*3461 + X[:, j]*5483 + X[:, k]) 464 | return np.hstack((X, np.vstack(cols).T)) 465 | 466 | 467 | def consolidate(X_train, X_test): 468 | """ 469 | Transform in-place the given dataset by consolidating 470 | rare features into a single category. 471 | """ 472 | X = np.vstack((X_train, X_test)) 473 | relabeler = preprocessing.LabelEncoder() 474 | 475 | for j in range(X.shape[1]): 476 | relabeler.fit(X[:, j]) 477 | X[:, j] = relabeler.transform(X[:, j]) 478 | X_train[:, j] = relabeler.transform(X_train[:, j]) 479 | X_test[:, j] = relabeler.transform(X_test[:, j]) 480 | 481 | raw_counts = np.bincount(X[:, j]) 482 | indices = np.nonzero(raw_counts)[0] 483 | counts = dict((x, raw_counts[x]) for x in indices) 484 | max_value = np.max(X[:, j]) 485 | 486 | for i in range(X_train.shape[0]): 487 | if counts[X_train[i, j]] <= 1: 488 | X_train[i, j] = max_value + 1 489 | 490 | for i in range(X_test.shape[0]): 491 | if counts[X_test[i, j]] <= 1: 492 | X_test[i, j] = max_value + 1 493 | 494 | 495 | class OneHotEncoder(): 496 | """ 497 | OneHotEncoder takes data matrix with categorical columns and 498 | converts it to a sparse binary matrix. 
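
    A minimal usage sketch (the variable names below are illustrative, not
    part of this module):

        encoder = OneHotEncoder()
        encoder.fit(X_train)                          # one value -> column keymap per feature
        X_train_sparse = encoder.transform(X_train)   # scipy.sparse CSR matrix of 0/1 indicators
        X_test_sparse = encoder.transform(X_test)     # values unseen during fit map to no column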
499 | """ 500 | def __init__(self): 501 | self.keymap = None 502 | 503 | def fit(self, X): 504 | self.keymap = [] 505 | for col in X.T: 506 | uniques = set(list(col)) 507 | self.keymap.append(dict((key, i) for i, key in enumerate(uniques))) 508 | 509 | def transform(self, X): 510 | if self.keymap is None: 511 | self.fit(X) 512 | 513 | outdat = [] 514 | for i, col in enumerate(X.T): 515 | km = self.keymap[i] 516 | num_labels = len(km) 517 | spmat = sparse.lil_matrix((X.shape[0], num_labels)) 518 | for j, val in enumerate(col): 519 | if val in km: 520 | spmat[j, km[val]] = 1 521 | outdat.append(spmat) 522 | outdat = sparse.hstack(outdat).tocsr() 523 | return outdat 524 | -------------------------------------------------------------------------------- /helpers/ml.py: -------------------------------------------------------------------------------- 1 | """ml.py 2 | 3 | This is the file that does the heavy lifting. 4 | It contains the ML algorithms themselves: 5 | - AUCRegressor: a custom class that optimizes AUC directly 6 | - MLR: a linear regression with non-negativity constraints 7 | - StackedClassifier: a custom class that combines several models 8 | 9 | And some related functions: 10 | - find_params: sets the hyperparameters for a given model 11 | 12 | Author: Paul Duan 13 | """ 14 | 15 | from __future__ import division 16 | 17 | import cPickle as pickle 18 | import itertools 19 | import json 20 | import logging 21 | import multiprocessing 22 | import scipy as sp 23 | import numpy as np 24 | 25 | from functools import partial 26 | from operator import itemgetter 27 | 28 | from sklearn.metrics import roc_curve, auc 29 | from sklearn.grid_search import GridSearchCV 30 | from sklearn import cross_validation, linear_model 31 | 32 | from data import load_from_cache, get_dataset 33 | from utils import stringify, compute_auc 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | N_TREES = 500 38 | 39 | INITIAL_PARAMS = { 40 | 'LogisticRegression': {'C': 2, 'penalty': 'l2', 'class_weight': 'auto'}, 41 | 'RandomForestClassifier': { 42 | 'n_estimators': N_TREES, 'n_jobs': 4, 43 | 'min_samples_leaf': 2, 'bootstrap': False, 44 | 'max_depth': 30, 'min_samples_split': 5, 'max_features': .1 45 | }, 46 | 'ExtraTreesClassifier': { 47 | 'n_estimators': N_TREES, 'n_jobs': 3, 'min_samples_leaf': 2, 48 | 'max_depth': 30, 'min_samples_split': 5, 'max_features': .1, 49 | 'bootstrap': False, 50 | }, 51 | 'GradientBoostingClassifier': { 52 | 'n_estimators': N_TREES, 'learning_rate': .08, 'max_features': 7, 53 | 'min_samples_leaf': 1, 'min_samples_split': 3, 'max_depth': 5, 54 | }, 55 | } 56 | 57 | PARAM_GRID = { 58 | 'LogisticRegression': {'C': [1.5, 2, 2.5, 3, 3.5, 5, 5.5], 59 | 'class_weight': ['auto']}, 60 | 'RandomForestClassifier': { 61 | 'n_jobs': [1], 'max_depth': [15, 20, 25, 30, 35, None], 62 | 'min_samples_split': [1, 3, 5, 7], 63 | 'max_features': [3, 8, 11, 15], 64 | }, 65 | 'ExtraTreesClassifier': {'min_samples_leaf': [2, 3], 66 | 'n_jobs': [1], 67 | 'min_samples_split': [1, 2, 5], 68 | 'bootstrap': [False], 69 | 'max_depth': [15, 20, 25, 30], 70 | 'max_features': [1, 3, 5, 11]}, 71 | 'GradientBoostingClassifier': {'max_features': [4, 5, 6, 7], 72 | 'learning_rate': [.05, .08, .1], 73 | 'max_depth': [8, 10, 13]}, 74 | } 75 | 76 | 77 | class AUCRegressor(object): 78 | def __init__(self): 79 | self.coef_ = 0 80 | 81 | def _auc_loss(self, coef, X, y): 82 | fpr, tpr, _ = roc_curve(y, sp.dot(X, coef)) 83 | return -auc(fpr, tpr) 84 | 85 | def fit(self, X, y): 86 | lr = linear_model.LinearRegression() 87 | 
auc_partial = partial(self._auc_loss, X=X, y=y) 88 | initial_coef = lr.fit(X, y).coef_ 89 | self.coef_ = sp.optimize.fmin(auc_partial, initial_coef) 90 | 91 | def predict(self, X): 92 | return sp.dot(X, self.coef_) 93 | 94 | def score(self, X, y): 95 | fpr, tpr, _ = roc_curve(y, sp.dot(X, self.coef_)) 96 | return auc(fpr, tpr) 97 | 98 | 99 | class MLR(object): 100 | def __init__(self): 101 | self.coef_ = 0 102 | 103 | def fit(self, X, y): 104 | self.coef_ = sp.optimize.nnls(X, y)[0] 105 | self.coef_ = np.array(map(lambda x: x/sum(self.coef_), self.coef_)) 106 | 107 | def predict(self, X): 108 | predictions = np.array(map(sum, self.coef_ * X)) 109 | return predictions 110 | 111 | def score(self, X, y): 112 | fpr, tpr, _ = roc_curve(y, sp.dot(X, self.coef_)) 113 | return auc(fpr, tpr) 114 | 115 | 116 | class StackedClassifier(object): 117 | """ 118 | Implement stacking to combine several models. 119 | The base (stage 0) models can be either combined through 120 | simple averaging (fastest), or combined using a stage 1 generalizer 121 | (requires computing CV predictions on the train set). 122 | 123 | See http://ijcai.org/Past%20Proceedings/IJCAI-97-VOL2/PDF/011.pdf: 124 | "Stacked generalization: when does it work?", Ting and Witten, 1997 125 | 126 | For speed and convenience, both fitting and prediction are done 127 | in the same method fit_predict; this is done in order to enable 128 | one to compute metrics on the predictions after training each model without 129 | having to wait for all the models to be trained. 130 | 131 | Options: 132 | ------------------------------ 133 | - models: a list of (model, dataset) tuples that represent stage 0 models 134 | - generalizer: an Estimator object. Must implement fit and predict 135 | - model_selection: boolean. Whether to use brute force search to find the 136 | optimal subset of models that produce the best AUC. 137 | """ 138 | def __init__(self, models, generalizer=None, model_selection=True, 139 | stack=False, fwls=False, use_cached_models=True): 140 | self.cache_dir = "main" 141 | self.models = models 142 | self.model_selection = model_selection 143 | self.stack = stack 144 | self.fwls = fwls 145 | self.generalizer = linear_model.RidgeCV( 146 | alphas=np.linspace(0, 200), cv=100) 147 | self.use_cached_models = use_cached_models 148 | 149 | def _combine_preds(self, X_train, X_cv, y, train=None, predict=None, 150 | stack=False, fwls=False): 151 | """ 152 | Combine preds, returning in order: 153 | - mean_preds: the simple average of all model predictions 154 | - stack_preds: the predictions of the stage 1 generalizer 155 | - fwls_preds: same as stack_preds, but optionally using more 156 | complex blending schemes (meta-features, different 157 | generalizers, etc.) 
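
        Purely illustrative example: if three base models assign a sample the
        probabilities 0.2, 0.4 and 0.9, then mean_preds holds their simple
        average 0.5 for that sample, while stack_preds is the output of the
        generalizer fitted on the stage 0 cross-validation predictions
        (X_train) and applied to those same three columns of X_cv.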
158 |         """
159 |         mean_preds = np.mean(X_cv, axis=1)
160 |         stack_preds = None
161 |         fwls_preds = None
162 | 
163 |         if stack:
164 |             self.generalizer.fit(X_train, y)
165 |             stack_preds = self.generalizer.predict(X_cv)
166 | 
167 |         if self.fwls:
168 |             meta, meta_cv = get_dataset('metafeatures', train, predict)
169 |             fwls_train = np.hstack((X_train, meta))
170 |             fwls_cv = np.hstack((X_cv, meta_cv))  # cv rows go with the cv metafeatures
171 |             self.generalizer.fit(fwls_train, y)
172 |             fwls_preds = self.generalizer.predict(fwls_cv)
173 | 
174 |         return mean_preds, stack_preds, fwls_preds
175 | 
176 |     def _find_best_subset(self, y, predictions_list):
177 |         """Finds the combination of models that produces the best AUC."""
178 |         best_subset_indices = range(len(predictions_list))
179 | 
180 |         pool = multiprocessing.Pool(processes=4)
181 |         partial_compute_subset_auc = partial(compute_subset_auc,
182 |                                              pred_set=predictions_list, y=y)
183 |         best_auc = 0
184 |         best_n = 0
185 |         best_indices = []
186 | 
187 |         if len(predictions_list) == 1:
188 |             return [0]  # a single model: keep it (index 0), nothing to select
189 | 
190 |         for n in range(int(len(predictions_list)/2), len(predictions_list)):
191 |             cb = itertools.combinations(range(len(predictions_list)), n)
192 |             combination_results = pool.map(partial_compute_subset_auc, cb)
193 |             best_subset_auc, best_subset_indices = max(
194 |                 combination_results, key=itemgetter(0))
195 |             print "- best subset auc (%d models): %.4f > %s" % (
196 |                 n, best_subset_auc, list(best_subset_indices))
197 |             if best_subset_auc > best_auc:
198 |                 best_auc = best_subset_auc
199 |                 best_n = n
200 |                 best_indices = list(best_subset_indices)
201 |         pool.terminate()
202 | 
203 |         logger.info("best auc: %.4f", best_auc)
204 |         logger.info("best n: %d", best_n)
205 |         logger.info("best indices: %s", best_indices)
206 |         for i, (model, feature_set) in enumerate(self.models):
207 |             if i in best_subset_indices:
208 |                 logger.info("> model: %s (%s)", model.__class__.__name__,
209 |                             feature_set)
210 | 
211 |         return best_subset_indices
212 | 
213 |     def _get_model_preds(self, model, X_train, X_predict, y_train, cache_file):
214 |         """
215 |         Return the model predictions on the prediction set,
216 |         using cache if possible.
217 |         """
218 |         model_output = load_from_cache(
219 |             "models/%s/%s.pkl" % (self.cache_dir, cache_file),
220 |             self.use_cached_models)
221 | 
222 |         model_params, model_preds = model_output \
223 |             if model_output is not None else (None, None)
224 | 
225 |         if model_preds is None or model_params != model.get_params():
226 |             model.fit(X_train, y_train)
227 |             model_preds = model.predict_proba(X_predict)[:, 1]
228 |             with open("cache/models/%s/%s.pkl" % (
229 |                     self.cache_dir, cache_file), 'wb') as f:
230 |                 pickle.dump((model.get_params(), model_preds), f)
231 | 
232 |         return model_preds
233 | 
234 |     def _get_model_cv_preds(self, model, X_train, y_train, cache_file):
235 |         """
236 |         Return cross-validation predictions on the training set, using cache
237 |         if possible.
238 |         This is used if stacking is enabled (i.e. a second model is used to
239 |         combine the stage 0 predictions).
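
        For reference, the scheme is: split the training set into 4 stratified
        folds, fit the model on three folds, predict the held-out fold, then
        re-order the concatenated out-of-fold predictions with the recorded
        indices so that row i is the prediction for training sample i (e.g. if
        indexes_cv were [2, 0, 1], argsort gives [1, 2, 0] and restores the
        original row order).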
240 |         """
241 |         stack_preds = load_from_cache(
242 |             "models/%s/cv_preds/%s.pkl" % (self.cache_dir, cache_file),
243 |             self.use_cached_models)
244 | 
245 |         if stack_preds is None:
246 |             kfold = cross_validation.StratifiedKFold(y_train, 4)
247 |             stack_preds = []
248 |             indexes_cv = []
249 |             for stage0, stack in kfold:
250 |                 model.fit(X_train[stage0], y_train[stage0])
251 |                 stack_preds.extend(list(model.predict_proba(
252 |                     X_train[stack])[:, 1]))
253 |                 indexes_cv.extend(list(stack))
254 |             stack_preds = np.array(stack_preds)[sp.argsort(indexes_cv)]
255 | 
256 |             with open("cache/models/%s/cv_preds/%s.pkl" % (
257 |                     self.cache_dir, cache_file), 'wb') as f:
258 |                 pickle.dump(stack_preds, f, pickle.HIGHEST_PROTOCOL)
259 | 
260 |         return stack_preds
261 | 
262 |     def fit_predict(self, y, train=None, predict=None, show_steps=True):
263 |         """
264 |         Fit each model on the appropriate dataset, then return the average
265 |         of their individual predictions. If train is specified, use a subset
266 |         of the training set to train the models, then predict the outcome of
267 |         either the remaining samples or (if given) those specified in predict.
268 |         If train is omitted, train the models on the full training set, then
269 |         predict the outcome of the full test set.
270 | 
271 |         Options:
272 |         ------------------------------
273 |         - y: numpy array. The full vector of the ground truths.
274 |         - train: list. The indices of the elements to be used for training.
275 |             If None, take the entire training set.
276 |         - predict: list. The indices of the elements to be predicted.
277 |         - show_steps: boolean. Whether to compute metrics after each stage
278 |             of the computation.
279 |         """
280 |         y_train = y[train] if train is not None else y
281 |         if train is not None and predict is None:
282 |             predict = [i for i in range(len(y)) if i not in train]
283 | 
284 |         stage0_train = []
285 |         stage0_predict = []
286 |         for model, feature_set in self.models:
287 |             X_train, X_predict = get_dataset(feature_set, train, predict)
288 | 
289 |             identifier = train[0] if train is not None else -1
290 |             cache_file = stringify(model, feature_set) + str(identifier)
291 | 
292 |             model_preds = self._get_model_preds(
293 |                 model, X_train, X_predict, y_train, cache_file)
294 |             stage0_predict.append(model_preds)
295 | 
296 |             # if stacking, compute cross-validated predictions on the train set
297 |             if self.stack:
298 |                 model_cv_preds = self._get_model_cv_preds(
299 |                     model, X_train, y_train, cache_file)
300 |                 stage0_train.append(model_cv_preds)
301 | 
302 |             # verbose mode: compute metrics after every model computation
303 |             if show_steps:
304 |                 if train is not None:
305 |                     mean_preds, stack_preds, fwls_preds = self._combine_preds(
306 |                         np.array(stage0_train).T, np.array(stage0_predict).T,
307 |                         y_train, train, predict,
308 |                         stack=self.stack, fwls=self.fwls)
309 | 
310 |                     model_auc = compute_auc(y[predict], stage0_predict[-1])
311 |                     mean_auc = compute_auc(y[predict], mean_preds)
312 |                     stack_auc = compute_auc(y[predict], stack_preds) \
313 |                         if self.stack else 0
314 |                     fwls_auc = compute_auc(y[predict], fwls_preds) \
315 |                         if self.fwls else 0
316 | 
317 |                     logger.info(
318 |                         "> AUC: %.4f (%.4f, %.4f, %.4f) [%s]", model_auc,
319 |                         mean_auc, stack_auc, fwls_auc,
320 |                         stringify(model, feature_set))
321 |                 else:
322 |                     logger.info("> used model %s:\n%s", stringify(
323 |                         model, feature_set), model.get_params())
324 | 
325 |         if self.model_selection and predict is not None:
326 |             best_subset = self._find_best_subset(y[predict], stage0_predict)
327 |             stage0_train = [pred for i,
pred in enumerate(stage0_train) 328 | if i in best_subset] 329 | stage0_predict = [pred for i, pred in enumerate(stage0_predict) 330 | if i in best_subset] 331 | 332 | mean_preds, stack_preds, fwls_preds = self._combine_preds( 333 | np.array(stage0_train).T, np.array(stage0_predict).T, 334 | y_train, stack=self.stack, fwls=self.fwls) 335 | 336 | if self.stack: 337 | selected_preds = stack_preds if not self.fwls else fwls_preds 338 | else: 339 | selected_preds = mean_preds 340 | 341 | return selected_preds 342 | 343 | 344 | def compute_subset_auc(indices, pred_set, y): 345 | subset = [vect for i, vect in enumerate(pred_set) if i in indices] 346 | mean_preds = sp.mean(subset, axis=0) 347 | mean_auc = compute_auc(y, mean_preds) 348 | 349 | return mean_auc, indices 350 | 351 | 352 | def find_params(model, feature_set, y, subsample=None, grid_search=False): 353 | """ 354 | Return parameter set for the model, either predefined 355 | or found through grid search. 356 | """ 357 | model_name = model.__class__.__name__ 358 | params = INITIAL_PARAMS.get(model_name, {}) 359 | y = y if subsample is None else y[subsample] 360 | 361 | try: 362 | with open('saved_params.json') as f: 363 | saved_params = json.load(f) 364 | except IOError: 365 | saved_params = {} 366 | 367 | if (grid_search and model_name in PARAM_GRID and stringify( 368 | model, feature_set) not in saved_params): 369 | X, _ = get_dataset(feature_set, subsample, [0]) 370 | clf = GridSearchCV(model, PARAM_GRID[model_name], cv=10, n_jobs=6, 371 | scoring="roc_auc") 372 | clf.fit(X, y) 373 | logger.info("found params (%s > %.4f): %s", 374 | stringify(model, feature_set), 375 | clf.best_score_, clf.best_params_) 376 | params.update(clf.best_params_) 377 | saved_params[stringify(model, feature_set)] = params 378 | with open('saved_params.json', 'w') as f: 379 | json.dump(saved_params, f, indent=4, separators=(',', ': '), 380 | ensure_ascii=True, sort_keys=True) 381 | else: 382 | params.update(saved_params.get(stringify(model, feature_set), {})) 383 | if grid_search: 384 | logger.info("using params %s: %s", stringify(model, feature_set), 385 | params) 386 | 387 | return params 388 | -------------------------------------------------------------------------------- /helpers/utils.py: -------------------------------------------------------------------------------- 1 | """utils.py 2 | 3 | Some useful functions. 4 | Author: Paul Duan 5 | """ 6 | 7 | from re import sub 8 | from sklearn.metrics import roc_curve, auc 9 | 10 | 11 | def stringify(model, feature_set): 12 | """Given a model and a feature set, return a short string that will serve 13 | as identifier for this combination. 
14 | Ex: (LogisticRegression(), "basic_s") -> "LR:basic_s" 15 | """ 16 | return "%s:%s" % (sub("[a-z]", '', model.__class__.__name__), feature_set) 17 | 18 | 19 | def compute_auc(y, y_pred): 20 | fpr, tpr, _ = roc_curve(y, y_pred) 21 | return auc(fpr, tpr) 22 | -------------------------------------------------------------------------------- /history.log: -------------------------------------------------------------------------------- 1 | # All actions will be logged here 2 | -------------------------------------------------------------------------------- /plots/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore all except this file 2 | * 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /saved_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "ETC:basic_b": { 3 | "bootstrap": true, 4 | "max_depth": null, 5 | "max_features": "sqrt", 6 | "min_samples_leaf": 2, 7 | "min_samples_split": 8 8 | }, 9 | "ETC:basic_f": { 10 | "bootstrap": false, 11 | "max_depth": 25, 12 | "max_features": 11, 13 | "min_samples_leaf": 3, 14 | "min_samples_split": 1 15 | }, 16 | "ETC:basic_fd": { 17 | "bootstrap": false, 18 | "max_depth": 30, 19 | "max_features": 11, 20 | "min_samples_leaf": 3, 21 | "min_samples_split": 1 22 | }, 23 | "ETC:bsfeats": { 24 | "bootstrap": true, 25 | "max_depth": null, 26 | "max_features": "sqrt", 27 | "min_samples_leaf": 2, 28 | "min_samples_split": 8 29 | }, 30 | "ETC:greedy": { 31 | "bootstrap": false, 32 | "max_depth": 30, 33 | "max_features": 11, 34 | "min_samples_leaf": 3, 35 | "min_samples_split": 1 36 | }, 37 | "ETC:greedy2": { 38 | "bootstrap": false, 39 | "max_depth": 25, 40 | "max_features": 11, 41 | "min_samples_leaf": 3, 42 | "min_samples_split": 1 43 | }, 44 | "ETC:greedy2_f": { 45 | "bootstrap": false, 46 | "max_depth": 25, 47 | "max_features": 11, 48 | "min_samples_leaf": 3, 49 | "min_samples_split": 1 50 | }, 51 | "ETC:greedy2_fd": { 52 | "bootstrap": false, 53 | "max_depth": 30, 54 | "max_features": 11, 55 | "min_samples_leaf": 2, 56 | "min_samples_split": 5 57 | }, 58 | "ETC:greedy3": { 59 | "bootstrap": false, 60 | "max_depth": 30, 61 | "max_features": 11, 62 | "min_samples_leaf": 2, 63 | "min_samples_split": 5 64 | }, 65 | "ETC:greedy3_f": { 66 | "bootstrap": false, 67 | "max_depth": 20, 68 | "max_features": 11, 69 | "min_samples_leaf": 3, 70 | "min_samples_split": 1 71 | }, 72 | "ETC:greedy3_fd": { 73 | "bootstrap": false, 74 | "max_depth": 30, 75 | "max_features": 11, 76 | "min_samples_leaf": 3, 77 | "min_samples_split": 1 78 | }, 79 | "ETC:greedy_f": { 80 | "bootstrap": false, 81 | "max_depth": 30, 82 | "max_features": 11, 83 | "min_samples_leaf": 3, 84 | "min_samples_split": 1 85 | }, 86 | "ETC:greedy_fd": { 87 | "bootstrap": false, 88 | "max_depth": 25, 89 | "max_features": 11, 90 | "min_samples_leaf": 3, 91 | "min_samples_split": 1 92 | }, 93 | "ETC:tuples_f": { 94 | "bootstrap": false, 95 | "max_depth": 25, 96 | "max_features": 11, 97 | "min_samples_leaf": 3, 98 | "min_samples_split": 1 99 | }, 100 | "ETC:tuples_fd": { 101 | "bootstrap": false, 102 | "max_depth": 20, 103 | "max_features": 11, 104 | "min_samples_leaf": 3, 105 | "min_samples_split": 1 106 | }, 107 | "GBC:basic_b": { 108 | "learning_rate": 0.005, 109 | "max_depth": 20, 110 | "min_samples_split": 9 111 | }, 112 | "GBC:basic_f": { 113 | "learning_rate": 0.05, 114 | "max_depth": 10, 115 | "max_features": 5, 116 | "min_samples_leaf": 1, 117 | 
"min_samples_split": 3 118 | }, 119 | "GBC:bsfeats": { 120 | "learning_rate": 0.005, 121 | "max_depth": 20, 122 | "min_samples_split": 9 123 | }, 124 | "GBC:greedy": { 125 | "learning_rate": 0.08, 126 | "max_depth": 10, 127 | "max_features": 6, 128 | "min_samples_leaf": 1, 129 | "min_samples_split": 3 130 | }, 131 | "GBC:greedy2": { 132 | "learning_rate": 0.08, 133 | "max_depth": 10, 134 | "max_features": 6, 135 | "min_samples_leaf": 1, 136 | "min_samples_split": 3 137 | }, 138 | "GBC:greedy2_f": { 139 | "learning_rate": 0.08, 140 | "max_depth": 10, 141 | "max_features": 3, 142 | "min_samples_leaf": 1, 143 | "min_samples_split": 3 144 | }, 145 | "GBC:greedy2_fd": { 146 | "learning_rate": 0.08, 147 | "max_depth": 10, 148 | "max_features": 7, 149 | "min_samples_leaf": 1, 150 | "min_samples_split": 3 151 | }, 152 | "GBC:greedy3": { 153 | "learning_rate": 0.08, 154 | "max_depth": 10, 155 | "max_features": 6, 156 | "min_samples_leaf": 1, 157 | "min_samples_split": 3 158 | }, 159 | "GBC:greedy3_f": { 160 | "learning_rate": 0.08, 161 | "max_depth": 10, 162 | "max_features": 3, 163 | "min_samples_leaf": 1, 164 | "min_samples_split": 3 165 | }, 166 | "GBC:greedy3_fd": { 167 | "learning_rate": 0.08, 168 | "max_depth": 10, 169 | "max_features": 7, 170 | "min_samples_leaf": 1, 171 | "min_samples_split": 3 172 | }, 173 | "GBC:greedy_f": { 174 | "learning_rate": 0.08, 175 | "max_depth": 13, 176 | "max_features": 4, 177 | "min_samples_leaf": 1, 178 | "min_samples_split": 3 179 | }, 180 | "GBC:greedy_fd": { 181 | "learning_rate": 0.08, 182 | "max_depth": 10, 183 | "max_features": 7, 184 | "min_samples_leaf": 1, 185 | "min_samples_split": 3 186 | }, 187 | "GBC:tuples_f": { 188 | "learning_rate": 0.05, 189 | "max_depth": 10, 190 | "max_features": 4, 191 | "min_samples_leaf": 1, 192 | "min_samples_split": 3 193 | }, 194 | "GBC:tuples_fd": { 195 | "learning_rate": 0.08, 196 | "max_depth": 10, 197 | "max_features": 6, 198 | "min_samples_leaf": 1, 199 | "min_samples_split": 3 200 | }, 201 | "LR:basic_sf": { 202 | "C": 3.5, 203 | "class_weight": "auto", 204 | "penalty": "l2" 205 | }, 206 | "LR:basic_sfd": { 207 | "C": 5.5, 208 | "class_weight": "auto", 209 | "penalty": "l2" 210 | }, 211 | "LR:basic_sfl": { 212 | "C": 3.5, 213 | "class_weight": "auto", 214 | "penalty": "l2" 215 | }, 216 | "LR:consolidated_s": { 217 | "C": 1.5, 218 | "class_weight": "auto", 219 | "penalty": "l2" 220 | }, 221 | "LR:consolidated_sf": { 222 | "C": 5, 223 | "class_weight": "auto", 224 | "penalty": "l2" 225 | }, 226 | "LR:greedy2_sbl": { 227 | "C": 3.5, 228 | "class_weight": "auto", 229 | "penalty": "l2" 230 | }, 231 | "LR:greedy2_sf": { 232 | "C": 5.5, 233 | "class_weight": "auto", 234 | "penalty": "l2" 235 | }, 236 | "LR:greedy2_sfd": { 237 | "C": 5.5, 238 | "class_weight": "auto", 239 | "penalty": "l2" 240 | }, 241 | "LR:greedy2_sfl": { 242 | "C": 5, 243 | "class_weight": "auto", 244 | "penalty": "l2" 245 | }, 246 | "LR:greedy3_sbl": { 247 | "C": 5.5, 248 | "class_weight": "auto", 249 | "penalty": "l2" 250 | }, 251 | "LR:greedy3_sf": { 252 | "C": 5, 253 | "class_weight": "auto", 254 | "penalty": "l2" 255 | }, 256 | "LR:greedy3_sfd": { 257 | "C": 5.5, 258 | "class_weight": "auto", 259 | "penalty": "l2" 260 | }, 261 | "LR:greedy3_sfl": { 262 | "C": 5.5, 263 | "class_weight": "auto", 264 | "penalty": "l2" 265 | }, 266 | "LR:greedy_sbl": { 267 | "C": 5.5, 268 | "class_weight": "auto", 269 | "penalty": "l2" 270 | }, 271 | "LR:greedy_sf": { 272 | "C": 3.5, 273 | "class_weight": "auto", 274 | "penalty": "l2" 275 | }, 276 | "LR:greedy_sfl": 
{ 277 | "C": 5, 278 | "class_weight": "auto", 279 | "penalty": "l2" 280 | }, 281 | "LR:triples_sbl": { 282 | "C": 3, 283 | "class_weight": "auto", 284 | "penalty": "l2" 285 | }, 286 | "LR:tuples_sbl": { 287 | "C": 3, 288 | "class_weight": "auto", 289 | "penalty": "l2" 290 | }, 291 | "LR:tuples_sf": { 292 | "C": 2, 293 | "class_weight": "auto", 294 | "penalty": "l2" 295 | }, 296 | "LR:tuples_sfd": { 297 | "C": 2.5, 298 | "class_weight": "auto", 299 | "penalty": "l2" 300 | }, 301 | "LR:tuples_sfl": { 302 | "C": 2.5, 303 | "class_weight": "auto", 304 | "penalty": "l2" 305 | }, 306 | "RFC:basic_b": { 307 | "bootstrap": true, 308 | "max_depth": null, 309 | "max_features": "sqrt", 310 | "min_samples_leaf": 2, 311 | "min_samples_split": 8 312 | }, 313 | "RFC:basic_f": { 314 | "bootstrap": false, 315 | "max_depth": 15, 316 | "max_features": 3, 317 | "min_samples_leaf": 2, 318 | "min_samples_split": 7 319 | }, 320 | "RFC:basic_fd": { 321 | "bootstrap": false, 322 | "max_depth": 30, 323 | "max_features": 11, 324 | "min_samples_leaf": 2, 325 | "min_samples_split": 7 326 | }, 327 | "RFC:bsfeats": { 328 | "bootstrap": true, 329 | "max_depth": null, 330 | "max_features": "sqrt", 331 | "min_samples_leaf": 2, 332 | "min_samples_split": 8 333 | }, 334 | "RFC:effects_f": { 335 | "bootstrap": false, 336 | "max_depth": 15, 337 | "max_features": 15, 338 | "min_samples_leaf": 2, 339 | "min_samples_split": 5, 340 | "n_estimators": 500, 341 | "n_jobs": 1 342 | }, 343 | "RFC:effects_b": { 344 | "bootstrap": false, 345 | "max_depth": 15, 346 | "max_features": 15, 347 | "min_samples_leaf": 2, 348 | "min_samples_split": 5, 349 | "n_estimators": 500, 350 | "n_jobs": 1 351 | }, 352 | "RFC:greedy": { 353 | "bootstrap": false, 354 | "max_depth": null, 355 | "max_features": 11, 356 | "min_samples_leaf": 2, 357 | "min_samples_split": 7 358 | }, 359 | "RFC:greedy2": { 360 | "bootstrap": false, 361 | "max_depth": 30, 362 | "max_features": 8, 363 | "min_samples_leaf": 2, 364 | "min_samples_split": 8 365 | }, 366 | "RFC:greedy2_f": { 367 | "bootstrap": false, 368 | "max_depth": 25, 369 | "max_features": 3, 370 | "min_samples_leaf": 2, 371 | "min_samples_split": 8 372 | }, 373 | "RFC:greedy2_fd": { 374 | "bootstrap": false, 375 | "max_depth": 25, 376 | "max_features": 11, 377 | "min_samples_leaf": 2, 378 | "min_samples_split": 7 379 | }, 380 | "RFC:greedy3": { 381 | "bootstrap": false, 382 | "max_depth": null, 383 | "max_features": 11, 384 | "min_samples_leaf": 2, 385 | "min_samples_split": 8 386 | }, 387 | "RFC:greedy3_f": { 388 | "bootstrap": false, 389 | "max_depth": 25, 390 | "max_features": 8, 391 | "min_samples_leaf": 2, 392 | "min_samples_split": 8 393 | }, 394 | "RFC:greedy3_fd": { 395 | "bootstrap": false, 396 | "max_depth": 15, 397 | "max_features": 11, 398 | "min_samples_leaf": 2, 399 | "min_samples_split": 8 400 | }, 401 | "RFC:greedy_f": { 402 | "bootstrap": false, 403 | "max_depth": 25, 404 | "max_features": 3, 405 | "min_samples_leaf": 2, 406 | "min_samples_split": 7 407 | }, 408 | "RFC:greedy_fd": { 409 | "bootstrap": false, 410 | "max_depth": 15, 411 | "max_features": 8, 412 | "min_samples_leaf": 2, 413 | "min_samples_split": 7 414 | }, 415 | "RFC:tuples_f": { 416 | "bootstrap": false, 417 | "max_depth": 25, 418 | "max_features": 3, 419 | "min_samples_leaf": 2, 420 | "min_samples_split": 7 421 | }, 422 | "RFC:tuples_fd": { 423 | "bootstrap": false, 424 | "max_depth": null, 425 | "max_features": 11, 426 | "min_samples_leaf": 2, 427 | "min_samples_split": 7 428 | }, 429 | "SGDC:basic_sf": { 430 | "alpha": 0.0003, 
431 | "l1_ratio": 0.1, 432 | "loss": "log", 433 | "penalty": "l2" 434 | }, 435 | "SGDC:basic_sfl": { 436 | "alpha": 0.0003, 437 | "l1_ratio": 0.1, 438 | "loss": "log", 439 | "penalty": "l2" 440 | }, 441 | "SGDC:greedy2_sf": { 442 | "alpha": 0.0001, 443 | "l1_ratio": 0.1, 444 | "loss": "log", 445 | "penalty": "l2" 446 | }, 447 | "SGDC:greedy2_sfd": { 448 | "alpha": 0.0003, 449 | "l1_ratio": 0.1, 450 | "loss": "log", 451 | "penalty": "l2" 452 | }, 453 | "SGDC:greedy2_sfl": { 454 | "alpha": 0.0003, 455 | "l1_ratio": 0.1, 456 | "loss": "log", 457 | "penalty": "l2" 458 | }, 459 | "SGDC:greedy3_sf": { 460 | "alpha": 0.0001, 461 | "l1_ratio": 0.1, 462 | "loss": "log", 463 | "penalty": "l2" 464 | }, 465 | "SGDC:greedy3_sfd": { 466 | "alpha": 0.0003, 467 | "l1_ratio": 0.1, 468 | "loss": "log", 469 | "penalty": "l2" 470 | }, 471 | "SGDC:greedy3_sfl": { 472 | "alpha": 0.0003, 473 | "l1_ratio": 0.1, 474 | "loss": "log", 475 | "penalty": "l2" 476 | }, 477 | "SGDC:greedy_sf": { 478 | "alpha": 8e-05, 479 | "l1_ratio": 0.1, 480 | "loss": "log", 481 | "penalty": "l2" 482 | }, 483 | "SGDC:greedy_sfl": { 484 | "alpha": 0.0003, 485 | "l1_ratio": 0.1, 486 | "loss": "log", 487 | "penalty": "l2" 488 | }, 489 | "SGDC:tuples_sf": { 490 | "alpha": 0.0003, 491 | "l1_ratio": 0.1, 492 | "loss": "log", 493 | "penalty": "l2" 494 | }, 495 | "SGDC:tuples_sfd": { 496 | "alpha": 0.0003, 497 | "l1_ratio": 0.1, 498 | "loss": "log", 499 | "penalty": "l2" 500 | }, 501 | "SGDC:tuples_sfl": { 502 | "alpha": 0.0003, 503 | "l1_ratio": 0.1, 504 | "loss": "log", 505 | "penalty": "l2" 506 | } 507 | } 508 | -------------------------------------------------------------------------------- /submissions/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore all files except this one 2 | * 3 | !.gitignore 4 | --------------------------------------------------------------------------------