├── .gitignore ├── BSMan ├── __init__.py ├── ensemble.py └── logistic.py ├── MIT-LICENSE ├── README.md ├── cache ├── .gitignore └── models │ ├── diagnostics │ └── cv_preds │ │ └── .gitignore │ └── main │ └── cv_preds │ └── .gitignore ├── classifier.py ├── combine └── combine.py ├── data ├── .DS_Store ├── test.csv └── train.csv ├── external ├── __init__.py ├── ben.py └── greedy.py ├── helpers ├── __init__.py ├── data.py ├── diagnostics.py ├── feature_extraction.py ├── ml.py └── utils.py ├── history.log ├── plots └── .gitignore ├── saved_params.json └── submissions └── .gitignore /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.pkl 3 | *.log 4 | .DS_Store 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Packages 10 | *.egg 11 | *.egg-info 12 | dist 13 | build 14 | eggs 15 | parts 16 | bin 17 | var 18 | sdist 19 | develop-eggs 20 | .installed.cfg 21 | lib 22 | lib64 23 | 24 | # Installer logs 25 | pip-log.txt 26 | 27 | # Unit test / coverage reports 28 | .coverage 29 | .tox 30 | nosetests.xml 31 | 32 | # Translations 33 | *.mo 34 | 35 | # Mr Developer 36 | .mr.developer.cfg 37 | .project 38 | .pydevproject 39 | -------------------------------------------------------------------------------- /BSMan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyduan/amazonaccess/f8addfefcee80f0ca15e416954af3926f3007d16/BSMan/__init__.py -------------------------------------------------------------------------------- /BSMan/ensemble.py: -------------------------------------------------------------------------------- 1 | """ Amazon Access Challenge Starter Code 2 | 3 | This was built using the code of Paul Duan as a starting 4 | point (thanks to Paul). 5 | 6 | It builds ensemble models using the original dataset and a handful of 7 | extracted features. 8 | 9 | Author: Benjamin Solecki 10 | """ 11 | 12 | from __future__ import division 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier) 17 | from sklearn import (metrics, cross_validation, linear_model, preprocessing) 18 | 19 | SEED = 42 # always use a seed for randomized procedures 20 | 21 | def save_results(predictions, filename): 22 | """Given a vector of predictions, save results in CSV format.""" 23 | with open(filename, 'w') as f: 24 | f.write("id,ACTION\n") 25 | for i, pred in enumerate(predictions): 26 | f.write("%d,%f\n" % (i + 1, pred)) 27 | 28 | 29 | """ 30 | Fit models and make predictions. 31 | We'll use one-hot encoding to transform our categorical features 32 | into binary features. 33 | y and X will be numpy array objects. 
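In outline, the ensembling below: a random forest, an extra-trees model and a
gradient boosting model are each fit on count/frequency features derived from
the raw columns; their held-out probabilities are clipped away from 0 and 1,
converted to log-odds, and used to fit two small logistic regressions (one on
each half of a train/CV split). The summed coefficients of those regressions
then weight the test-time log-odds, i.e. the final prediction is
coefRF * logit(predsRF) + coefXT * logit(predsXT) + coefGB * logit(predsGB).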
34 | """ 35 | # === load data in memory === # 36 | print "loading data" 37 | X = pd.read_csv('data/train.csv') 38 | X = X.drop(['ROLE_CODE'], axis=1) 39 | y = X['ACTION'] 40 | X = X.drop(['ACTION'], axis=1) 41 | X_test = pd.read_csv('data/test.csv', index_col=0) 42 | X_test = X_test.drop(['ROLE_CODE'], axis=1) 43 | X_test['ACTION'] = 0 44 | y_test = X_test['ACTION'] 45 | X_test = X_test.drop(['ACTION'], axis=1) 46 | 47 | modelRF =RandomForestClassifier(n_estimators=1999, max_features='sqrt', max_depth=None, min_samples_split=9, compute_importances=True, random_state=SEED)#8803 48 | modelXT =ExtraTreesClassifier(n_estimators=1999, max_features='sqrt', max_depth=None, min_samples_split=8, compute_importances=True, random_state=SEED) #8903 49 | modelGB =GradientBoostingClassifier(n_estimators=50, learning_rate=0.20, max_depth=20, min_samples_split=9, random_state=SEED) #8749 50 | # 599: 20/90/08 51 | #1999: 24/95/06 52 | 53 | X_all = pd.concat([X_test,X], ignore_index=True) 54 | 55 | # I want to combine role_title as a subset of role_familia and see if same results 56 | X_all['ROLE_TITLE'] = X_all['ROLE_TITLE'] + (1000 * X_all['ROLE_FAMILY']) 57 | X_all['ROLE_ROLLUPS'] = X_all['ROLE_ROLLUP_1'] + (10000 * X_all['ROLE_ROLLUP_2']) 58 | X_all = X_all.drop(['ROLE_ROLLUP_1','ROLE_ROLLUP_2','ROLE_FAMILY'], axis=1) 59 | 60 | # Count/freq 61 | print "Counts" 62 | for col in X_all.columns: 63 | X_all['cnt'+col] = 0 64 | groups = X_all.groupby([col]) 65 | for name, group in groups: 66 | count = group[col].count() 67 | X_all['cnt'+col].ix[group.index] = count 68 | X_all['cnt'+col] = X_all['cnt'+col].apply(np.log) # could check if this is neccesary, I think probably not 69 | 70 | # Percent of dept that is this resource 71 | for col in X_all.columns[1:6]: 72 | X_all['Duse'+col] = 0.0 73 | groups = X_all.groupby([col]) 74 | for name, group in groups: 75 | grps = group.groupby(['RESOURCE']) 76 | for rsrc, grp in grps: 77 | X_all['Duse'+col].ix[grp.index] = float(len(grp.index)) / float(len(group.index) ) 78 | 79 | # Number of resources that a manager manages 80 | for col in X_all.columns[0:1]: 81 | if col == 'MGR_ID': 82 | continue 83 | print col 84 | X_all['Mdeps'+col] = 0 85 | groups = X_all.groupby(['MGR_ID']) 86 | for name, group in groups: 87 | X_all['Mdeps'+col].ix[group.index] = len(group[col].unique()) 88 | 89 | 90 | X = X_all[:][X_all.index>=len(X_test.index)] 91 | X_test = X_all[:][X_all.index0.9999999]=0.9999999 105 | preds[preds<0.0000001]=0.0000001 106 | preds = -np.log((1-preds)/preds) 107 | modelEN1 = linear_model.LogisticRegression() 108 | modelEN1.fit(preds, y_train) 109 | print modelEN1.coef_ 110 | 111 | modelRF.fit(X_train, y_train) 112 | modelXT.fit(X_train, y_train) 113 | modelGB.fit(X_train, y_train) 114 | predsRF = modelRF.predict_proba(X_cv)[:, 1] 115 | predsXT = modelXT.predict_proba(X_cv)[:, 1] 116 | predsGB = modelGB.predict_proba(X_cv)[:, 1] 117 | preds = np.hstack((predsRF, predsXT, predsGB)).reshape(3,len(predsGB)).transpose() 118 | preds[preds>0.9999999]=0.9999999 119 | preds[preds<0.0000001]=0.0000001 120 | preds = -np.log((1-preds)/preds) 121 | modelEN2 = linear_model.LogisticRegression() 122 | modelEN2.fit(preds, y_cv) 123 | print modelEN2.coef_ 124 | 125 | coefRF = modelEN1.coef_[0][0] + modelEN2.coef_[0][0] 126 | coefXT = modelEN1.coef_[0][1] + modelEN2.coef_[0][1] 127 | coefGB = modelEN1.coef_[0][2] + modelEN2.coef_[0][2] 128 | 129 | # === Predictions === # 130 | # When making predictions, retrain the model on the whole training set 131 | modelRF.fit(X, y) 132 | 
modelXT.fit(X, y) 133 | modelGB.fit(X, y) 134 | 135 | ### Combine here 136 | predsRF = modelRF.predict_proba(X_test)[:, 1] 137 | predsXT = modelXT.predict_proba(X_test)[:, 1] 138 | predsGB = modelGB.predict_proba(X_test)[:, 1] 139 | predsRF[predsRF>0.9999999]=0.9999999 140 | predsXT[predsXT>0.9999999]=0.9999999 141 | predsGB[predsGB>0.9999999]=0.9999999 142 | predsRF[predsRF<0.0000001]=0.0000001 143 | predsXT[predsXT<0.0000001]=0.0000001 144 | predsGB[predsGB<0.0000001]=0.0000001 145 | predsRF = -np.log((1-predsRF)/predsRF) 146 | predsXT = -np.log((1-predsXT)/predsXT) 147 | predsGB = -np.log((1-predsGB)/predsGB) 148 | preds = coefRF * predsRF + coefXT * predsXT + coefGB * predsGB 149 | 150 | filename = raw_input("Enter name for submission file: ") 151 | save_results(preds, "submissions/en" + filename + ".csv") 152 | -------------------------------------------------------------------------------- /BSMan/logistic.py: -------------------------------------------------------------------------------- 1 | """ 2 | This program is based on code submitted by Miroslaw Horbal to the Kaggle 3 | forums, which was itself based on an earlier submission from Paul Doan. 4 | My thanks to both. 5 | 6 | Author: Benjamin Solecki 7 | """ 8 | 9 | from numpy import array, hstack 10 | from sklearn import metrics, cross_validation, linear_model 11 | from sklearn import naive_bayes 12 | from sklearn import preprocessing 13 | from scipy import sparse 14 | from itertools import combinations 15 | 16 | from sets import Set 17 | import numpy as np 18 | import pandas as pd 19 | import sys 20 | 21 | #SEED = 55 22 | SEED = int(sys.argv[2]) 23 | 24 | def group_data(data, degree=3, hash=hash): 25 | """ 26 | numpy.array -> numpy.array 27 | 28 | Groups all columns of data into all combinations of triples 29 | """ 30 | new_data = [] 31 | m,n = data.shape 32 | for indicies in combinations(range(n), degree): 33 | if 5 in indicies and 7 in indicies: 34 | print "feature Xd" 35 | elif 2 in indicies and 3 in indicies: 36 | print "feature Xd" 37 | else: 38 | new_data.append([hash(tuple(v)) for v in data[:,indicies]]) 39 | return array(new_data).T 40 | 41 | def OneHotEncoder(data, keymap=None): 42 | """ 43 | OneHotEncoder takes data matrix with categorical columns and 44 | converts it to a sparse binary matrix. 45 | 46 | Returns sparse binary matrix and keymap mapping categories to indicies. 
47 | If a keymap is supplied on input it will be used instead of creating one 48 | and any categories appearing in the data that are not in the keymap are 49 | ignored 50 | """ 51 | if keymap is None: 52 | keymap = [] 53 | for col in data.T: 54 | uniques = set(list(col)) 55 | keymap.append(dict((key, i) for i, key in enumerate(uniques))) 56 | total_pts = data.shape[0] 57 | outdat = [] 58 | for i, col in enumerate(data.T): 59 | km = keymap[i] 60 | num_labels = len(km) 61 | spmat = sparse.lil_matrix((total_pts, num_labels)) 62 | for j, val in enumerate(col): 63 | if val in km: 64 | spmat[j, km[val]] = 1 65 | outdat.append(spmat) 66 | outdat = sparse.hstack(outdat).tocsr() 67 | return outdat, keymap 68 | 69 | def create_test_submission(filename, prediction): 70 | content = ['id,ACTION'] 71 | for i, p in enumerate(prediction): 72 | content.append('%i,%f' %(i+1,p)) 73 | f = open(filename, 'w') 74 | f.write('\n'.join(content)) 75 | f.close() 76 | print 'Saved' 77 | 78 | # This loop essentially from Paul's starter code 79 | # I (Ben) increased the size of train at the expense of test, because 80 | # when train is small many features will not be found in train. 81 | def cv_loop(X, y, model, N): 82 | mean_auc = 0. 83 | for i in range(N): 84 | X_train, X_cv, y_train, y_cv = cross_validation.train_test_split( 85 | X, y, test_size=1.0/float(N), 86 | random_state = i*SEED) 87 | model.fit(X_train, y_train) 88 | preds = model.predict_proba(X_cv)[:,1] 89 | auc = metrics.auc_score(y_cv, preds) 90 | #print "AUC (fold %d/%d): %f" % (i + 1, N, auc) 91 | mean_auc += auc 92 | return mean_auc/N 93 | 94 | learner = sys.argv[1] 95 | print "Reading dataset..." 96 | train_data = pd.read_csv('train.csv') 97 | test_data = pd.read_csv('test.csv') 98 | submit=learner + str(SEED) + '.csv' 99 | all_data = np.vstack((train_data.ix[:,1:-1], test_data.ix[:,1:-1])) 100 | num_train = np.shape(train_data)[0] 101 | 102 | # Transform data 103 | print "Transforming data..." 104 | # Relabel the variable values to smallest possible so that I can use bincount 105 | # on them later. 
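# Illustrative example (made-up values): LabelEncoder maps each column's raw
# IDs onto 0..n_unique-1 in sorted order, e.g.
# fit_transform([4675, 117961, 4675, 280]) -> [1, 2, 1, 0],
# which keeps the values small enough for the np.bincount calls further down.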
106 | relabler = preprocessing.LabelEncoder() 107 | for col in range(len(all_data[0,:])): 108 | relabler.fit(all_data[:, col]) 109 | all_data[:, col] = relabler.transform(all_data[:, col]) 110 | ########################## 2nd order features ################################ 111 | dp = group_data(all_data, degree=2) 112 | for col in range(len(dp[0,:])): 113 | relabler.fit(dp[:, col]) 114 | dp[:, col] = relabler.transform(dp[:, col]) 115 | uniques = len(set(dp[:,col])) 116 | maximum = max(dp[:,col]) 117 | print col 118 | if maximum < 65534: 119 | count_map = np.bincount((dp[:, col]).astype('uint16')) 120 | for n,i in enumerate(dp[:, col]): 121 | if count_map[i] <= 1: 122 | dp[n, col] = uniques 123 | elif count_map[i] == 2: 124 | dp[n, col] = uniques+1 125 | else: 126 | for n,i in enumerate(dp[:, col]): 127 | if (dp[:, col] == i).sum() <= 1: 128 | dp[n, col] = uniques 129 | elif (dp[:, col] == i).sum() == 2: 130 | dp[n, col] = uniques+1 131 | print uniques # unique values 132 | uniques = len(set(dp[:,col])) 133 | print uniques 134 | relabler.fit(dp[:, col]) 135 | dp[:, col] = relabler.transform(dp[:, col]) 136 | ########################## 3rd order features ################################ 137 | dt = group_data(all_data, degree=3) 138 | for col in range(len(dt[0,:])): 139 | relabler.fit(dt[:, col]) 140 | dt[:, col] = relabler.transform(dt[:, col]) 141 | uniques = len(set(dt[:,col])) 142 | maximum = max(dt[:,col]) 143 | print col 144 | if maximum < 65534: 145 | count_map = np.bincount((dt[:, col]).astype('uint16')) 146 | for n,i in enumerate(dt[:, col]): 147 | if count_map[i] <= 1: 148 | dt[n, col] = uniques 149 | elif count_map[i] == 2: 150 | dt[n, col] = uniques+1 151 | else: 152 | for n,i in enumerate(dt[:, col]): 153 | if (dt[:, col] == i).sum() <= 1: 154 | dt[n, col] = uniques 155 | elif (dt[:, col] == i).sum() == 2: 156 | dt[n, col] = uniques+1 157 | print uniques 158 | uniques = len(set(dt[:,col])) 159 | print uniques 160 | relabler.fit(dt[:, col]) 161 | dt[:, col] = relabler.transform(dt[:, col]) 162 | ########################## 1st order features ################################ 163 | for col in range(len(all_data[0,:])): 164 | relabler.fit(all_data[:, col]) 165 | all_data[:, col] = relabler.transform(all_data[:, col]) 166 | uniques = len(set(all_data[:,col])) 167 | maximum = max(all_data[:,col]) 168 | print col 169 | if maximum < 65534: 170 | count_map = np.bincount((all_data[:, col]).astype('uint16')) 171 | for n,i in enumerate(all_data[:, col]): 172 | if count_map[i] <= 1: 173 | all_data[n, col] = uniques 174 | elif count_map[i] == 2: 175 | all_data[n, col] = uniques+1 176 | else: 177 | for n,i in enumerate(all_data[:, col]): 178 | if (all_data[:, col] == i).sum() <= 1: 179 | all_data[n, col] = uniques 180 | elif (all_data[:, col] == i).sum() == 2: 181 | all_data[n, col] = uniques+1 182 | print uniques 183 | uniques = len(set(all_data[:,col])) 184 | print uniques 185 | relabler.fit(all_data[:, col]) 186 | all_data[:, col] = relabler.transform(all_data[:, col]) 187 | 188 | # Collect the training features together 189 | y = array(train_data.ACTION) 190 | X = all_data[:num_train] 191 | X_2 = dp[:num_train] 192 | X_3 = dt[:num_train] 193 | 194 | # Collect the testing features together 195 | X_test = all_data[num_train:] 196 | X_test_2 = dp[num_train:] 197 | X_test_3 = dt[num_train:] 198 | 199 | X_train_all = np.hstack((X, X_2, X_3)) 200 | X_test_all = np.hstack((X_test, X_test_2, X_test_3)) 201 | num_features = X_train_all.shape[1] 202 | 203 | if learner == 'NB': 204 | model = 
naive_bayes.BernoulliNB(alpha=0.03) 205 | else: 206 | model = linear_model.LogisticRegression(class_weight='auto', penalty='l2') 207 | 208 | # Xts holds one hot encodings for each individual feature in memory 209 | # speeding up feature selection 210 | Xts = [OneHotEncoder(X_train_all[:,[i]])[0] for i in range(num_features)] 211 | 212 | print "Performing greedy feature selection..." 213 | score_hist = [] 214 | N = 10 215 | good_features = set([]) 216 | # Greedy feature selection loop 217 | while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]: 218 | scores = [] 219 | for f in range(len(Xts)): 220 | if f not in good_features: 221 | feats = list(good_features) + [f] 222 | Xt = sparse.hstack([Xts[j] for j in feats]).tocsr() 223 | score = cv_loop(Xt, y, model, N) 224 | scores.append((score, f)) 225 | print "Feature: %i Mean AUC: %f" % (f, score) 226 | good_features.add(sorted(scores)[-1][1]) 227 | score_hist.append(sorted(scores)[-1]) 228 | print "Current features: %s" % sorted(list(good_features)) 229 | 230 | # Remove last added feature from good_features 231 | good_features.remove(score_hist[-1][1]) 232 | good_features = sorted(list(good_features)) 233 | print "Selected features %s" % good_features 234 | gf = open("feats" + submit, 'w') 235 | print >>gf, good_features 236 | gf.close() 237 | print len(good_features), " features" 238 | 239 | print "Performing hyperparameter selection..." 240 | # Hyperparameter selection loop 241 | score_hist = [] 242 | Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr() 243 | if learner == 'NB': 244 | Cvals = [0.001, 0.003, 0.006, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1] 245 | else: 246 | Cvals = np.logspace(-4, 4, 15, base=2) # for logistic 247 | for C in Cvals: 248 | if learner == 'NB': 249 | model.alpha = C 250 | else: 251 | model.C = C 252 | score = cv_loop(Xt, y, model, N) 253 | score_hist.append((score,C)) 254 | print "C: %f Mean AUC: %f" %(C, score) 255 | bestC = sorted(score_hist)[-1][1] 256 | print "Best C value: %f" % (bestC) 257 | 258 | print "Performing One Hot Encoding on entire dataset..." 259 | Xt = np.vstack((X_train_all[:,good_features], X_test_all[:,good_features])) 260 | Xt, keymap = OneHotEncoder(Xt) 261 | X_train = Xt[:num_train] 262 | X_test = Xt[num_train:] 263 | 264 | if learner == 'NB': 265 | model.alpha = bestC 266 | else: 267 | model.C = bestC 268 | 269 | print "Training full model..." 270 | print "Making prediction and saving results..." 
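# Final pass: refit on all training rows using only the selected features and
# the best regularisation value found above, then write out class-1
# probabilities for both the test set (<learner><seed>.csv) and the training
# set (Train<learner><seed>.csv).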
271 | model.fit(X_train, y) 272 | preds = model.predict_proba(X_test)[:,1] 273 | create_test_submission(submit, preds) 274 | preds = model.predict_proba(X_train)[:,1] 275 | create_test_submission('Train'+submit, preds) 276 | -------------------------------------------------------------------------------- /MIT-LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Paul Duan, Benjamin Solecki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a 6 | copy 7 | of this software and associated documentation files (the "Software"), to 8 | deal 9 | in the Software without restriction, including without limitation the 10 | rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 12 | sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included 17 | in 18 | all copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 21 | OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 24 | THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 27 | FROM, 28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 29 | IN 30 | THE SOFTWARE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Amazon Employee Access Challenge 2 | ================================ 3 | 4 | This code was written by Paul Duan () and Benjamin Solecki (). 5 | It provides our winning solution to the Amazon Employee Access Challenge. 6 | Our code is currently not merged. You'll find Benjamin's code in the BSMan/ folder, which needs to be run separately. 7 | 8 | 9 | Usage: 10 | --------------- 11 | [python] classifier.py [-h] [-d] [-i ITER] [-f OUTPUTFILE] [-g] [-m] [-n] [-s] [-v] [-w] 12 | 13 | Parameters for the script. 14 | 15 | optional arguments: 16 | -h, --help show this help message and exit 17 | -d, --diagnostics Compute diagnostics. 18 | -i ITER, --iter ITER Number of iterations for averaging. 19 | -f OUTPUTFILE, --outputfile OUTPUTFILE 20 | Name of the file where predictions are saved. 21 | -g, --grid-search Use grid search to find best parameters. 22 | -m, --model-selection 23 | Use model selection. 24 | -n, --no-cache Use cache. 25 | -s, --stack Use stacking. 26 | -v, --verbose Show computation steps. 27 | -w, --fwls Use metafeatures. 28 | 29 | 30 | To directly generate predictions on the test set without computing CV 31 | metrics, simply run: 32 | 33 | python classifier.py -i0 -f[output_filename] 34 | 35 | This script will launch Paul's model, which incorporates some of Benjamin's features. 36 | Benjamin's model is in the BSMan folder and can be run this way: 37 | 38 | (in BSMan/) 39 | [python] logistic.py log 75 40 | [python] ensemble.py 41 | 42 | The output of our models is then combined by simple standardization then weighted averaging, using 2/3 Paul's model and 1/3 Benjamin's. 
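For reference, a minimal sketch of that blending step (the file names below are placeholders, not the actual submission names):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    paul = np.loadtxt("paul.csv", delimiter=",", usecols=[1], skiprows=1)
    ben = np.loadtxt("ben.csv", delimiter=",", usecols=[1], skiprows=1)
    standardized = StandardScaler().fit_transform(np.c_[paul, ben])
    blend = (2 * standardized[:, 0] + standardized[:, 1]) / 3

combine/combine.py is the ad-hoc script we used to explore how to merge the submissions.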
43 | 44 | 45 | Requirements: 46 | --------------- 47 | This code requires Python, numpy/scipy, scikit-learn, and pandas for 48 | some of the external code (this dependency will be removed in the 49 | future). 50 | It has been tested under Mac OS X with Python v.7.x, 51 | scikit-learn 0.13, numpy 0.17, and pandas 0.11. 52 | 53 | License: 54 | --------------- 55 | This content is released under the [MIT Licence](http://opensource.org/licenses/MIT). 56 | -------------------------------------------------------------------------------- /cache/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | -------------------------------------------------------------------------------- /cache/models/diagnostics/cv_preds/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything except this file 2 | * 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /cache/models/main/cv_preds/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything except this file 2 | * 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Amazon Access Challenge 4 | 5 | This is my part of the code that produced the winning solution to the 6 | Amazon Employee Access Challenge. See README.md for more details. 7 | 8 | Author: Paul Duan 9 | """ 10 | 11 | from __future__ import division 12 | 13 | import argparse 14 | import logging 15 | 16 | from sklearn import metrics, cross_validation, linear_model, ensemble 17 | from helpers import ml, diagnostics 18 | from helpers.data import load_data, save_results 19 | from helpers.feature_extraction import create_datasets 20 | 21 | logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s", 22 | filename="history.log", filemode='a', level=logging.DEBUG, 23 | datefmt='%m/%d/%y %H:%M:%S') 24 | formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s", 25 | datefmt='%m/%d/%y %H:%M:%S') 26 | console = logging.StreamHandler() 27 | console.setFormatter(formatter) 28 | console.setLevel(logging.INFO) 29 | logging.getLogger().addHandler(console) 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | def main(CONFIG): 35 | """ 36 | The final model is a combination of several base models, which are then 37 | combined using StackedClassifier defined in the helpers.ml module. 38 | 39 | The list of models and associated datasets is generated automatically 40 | from their identifying strings. The format is as follows: 41 | A:b_c where A is the initials of the algorithm to use, b is the base 42 | dataset, and c is the feature set and the variants to use. 
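    For instance, "RFC:greedy2_f" becomes a RandomForestClassifier trained on
    the greedy2 base dataset with the extracted-feature ("f") variant; the
    full nomenclature is documented in helpers/feature_extraction.py.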
43 | """ 44 | SEED = 42 45 | selected_models = [ 46 | "LR:tuples_sf", 47 | "LR:greedy_sfl", 48 | "LR:greedy2_sfl", 49 | "LR:greedy3_sf", 50 | "RFC:basic_b", 51 | "RFC:tuples_f", 52 | "RFC:tuples_fd", 53 | "RFC:greedy_f", 54 | "RFC:greedy2_f", 55 | "GBC:basic_f", 56 | "GBC:tuples_f", 57 | "LR:greedy_sbl", 58 | "GBC:greedy_c", 59 | "GBC:tuples_cf", 60 | #"RFC:effects_f", # experimental; added after the competition 61 | ] 62 | 63 | # Create the models on the fly 64 | models = [] 65 | for item in selected_models: 66 | model_id, dataset = item.split(':') 67 | model = {'LR': linear_model.LogisticRegression, 68 | 'GBC': ensemble.GradientBoostingClassifier, 69 | 'RFC': ensemble.RandomForestClassifier, 70 | 'ETC': ensemble.ExtraTreesClassifier}[model_id]() 71 | model.set_params(random_state=SEED) 72 | models.append((model, dataset)) 73 | 74 | datasets = [dataset for model, dataset in models] 75 | 76 | logger.info("loading data") 77 | y, X = load_data('train.csv') 78 | X_test = load_data('test.csv', return_labels=False) 79 | 80 | logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache)) 81 | create_datasets(X, X_test, y, datasets, CONFIG.use_cache) 82 | 83 | # Set params 84 | for model, feature_set in models: 85 | model.set_params(**ml.find_params(model, feature_set, y, 86 | grid_search=CONFIG.grid_search)) 87 | clf = ml.StackedClassifier( 88 | models, stack=CONFIG.stack, fwls=CONFIG.fwls, 89 | model_selection=CONFIG.model_selection, 90 | use_cached_models=CONFIG.use_cache) 91 | 92 | # Metrics 93 | logger.info("computing cv score") 94 | mean_auc = 0.0 95 | for i in range(CONFIG.iter): 96 | train, cv = cross_validation.train_test_split( 97 | range(len(y)), test_size=.20, random_state=1+i*SEED) 98 | cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose) 99 | 100 | fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds) 101 | roc_auc = metrics.auc(fpr, tpr) 102 | logger.info("AUC (fold %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc) 103 | mean_auc += roc_auc 104 | 105 | if CONFIG.diagnostics and i == 0: # only plot for first fold 106 | logger.info("plotting learning curve") 107 | diagnostics.learning_curve(clf, y, train, cv) 108 | diagnostics.plot_roc(fpr, tpr) 109 | if CONFIG.iter: 110 | logger.info("Mean AUC: %.5f", mean_auc/CONFIG.iter) 111 | 112 | # Create submissions 113 | if CONFIG.outputfile: 114 | logger.info("making test submissions (CV AUC: %.4f)", mean_auc) 115 | preds = clf.fit_predict(y, show_steps=CONFIG.verbose) 116 | save_results(preds, CONFIG.outputfile + ".csv") 117 | 118 | if __name__ == '__main__': 119 | PARSER = argparse.ArgumentParser(description="Parameters for the script.") 120 | PARSER.add_argument('-d', "--diagnostics", action="store_true", 121 | help="Compute diagnostics.") 122 | PARSER.add_argument('-i', "--iter", type=int, default=1, 123 | help="Number of iterations for averaging.") 124 | PARSER.add_argument("-f", "--outputfile", default="", 125 | help="Name of the file where predictions are saved.") 126 | PARSER.add_argument('-g', "--grid-search", action="store_true", 127 | help="Use grid search to find best parameters.") 128 | PARSER.add_argument('-m', "--model-selection", action="store_true", 129 | default=False, help="Use model selection.") 130 | PARSER.add_argument('-n', "--no-cache", action="store_false", default=True, 131 | help="Use cache.", dest="use_cache") 132 | PARSER.add_argument("-s", "--stack", action="store_true", 133 | help="Use stacking.") 134 | PARSER.add_argument('-v', "--verbose", action="store_true", 135 | help="Show computation 
steps.") 136 | PARSER.add_argument("-w", "--fwls", action="store_true", 137 | help="Use metafeatures.") 138 | PARSER.set_defaults(argument_default=False) 139 | CONFIG = PARSER.parse_args() 140 | 141 | CONFIG.stack = CONFIG.stack or CONFIG.fwls 142 | 143 | logger.debug('\n' + '='*50) 144 | main(CONFIG) 145 | -------------------------------------------------------------------------------- /combine/combine.py: -------------------------------------------------------------------------------- 1 | """combine.py 2 | 3 | This is an ad-hoc script we used to find how to merge our submissions. 4 | For this to work, the prediction vectors must be placed in the internal/ 5 | folder. 6 | 7 | Author: Paul Duan 8 | """ 9 | 10 | import numpy as np 11 | import math 12 | from sklearn import linear_model, cross_validation, preprocessing 13 | 14 | from ..helpers.data import load_data 15 | from ..helpers.ml import compute_auc, AUCRegressor 16 | 17 | 18 | def inverse_transform(X): 19 | def clamp(x): 20 | return min(max(x, .00000001), .99999999) 21 | return np.vectorize(lambda x: -math.log((1 - clamp(x))/clamp(x)))(X) 22 | 23 | 24 | def print_param(obj, params, prefix=''): 25 | for param in params: 26 | if hasattr(obj, param): 27 | paramvalue = getattr(obj, param) 28 | if "coef" in param: 29 | paramvalue /= np.sum(paramvalue) 30 | print prefix + param + ": " + str(paramvalue) 31 | 32 | 33 | mean_prediction = 0.0 34 | y = load_data('train.csv')[0] 35 | y = y[range(len(y) - 7770, len(y))] 36 | 37 | files = ["log75", "ens", "paul"] 38 | totransform = [] 39 | 40 | preds = [] 41 | for filename in files: 42 | with open("%s.csv" % filename) as f: 43 | pred = np.loadtxt(f, delimiter=',', usecols=[1], skiprows=1) 44 | if filename in totransform: 45 | pred = inverse_transform(pred) 46 | preds.append(pred) 47 | X = np.array(preds).T 48 | 49 | standardizer = preprocessing.StandardScaler() 50 | X = standardizer.fit_transform(X) 51 | 52 | print "============================================================" 53 | print '\t\t'.join(files) 54 | aucs = [] 55 | for filename in files: 56 | with open("%s.csv" % filename) as f: 57 | pred = np.loadtxt(f, delimiter=',', usecols=[1], skiprows=1) 58 | aucs.append("%.3f" % (compute_auc(y, pred) * 100)) 59 | print '\t\t'.join(aucs) 60 | print "------------------------------------------------------------" 61 | 62 | combiners = [ 63 | linear_model.LinearRegression(), 64 | linear_model.Ridge(20), 65 | AUCRegressor(), 66 | ] 67 | 68 | for combiner in combiners: 69 | mean_coefs = 0.0 70 | mean_auc = 0.0 71 | N = 10 72 | 73 | print "\n%s:" % combiner.__class__.__name__ 74 | if hasattr(combiner, 'predict_proba'): 75 | combiner.predict = lambda X: combiner.predict_proba(X)[:, 1] 76 | 77 | combiner.fit(X, y) 78 | print_param(combiner, ["alpha_", "coef_"], "(post) ") 79 | print "Train AUC: %.3f" % (compute_auc(y, combiner.predict(X)) * 100) 80 | 81 | if isinstance(combiner, AUCRegressor): 82 | continue 83 | 84 | kfold = cross_validation.KFold(len(y), 3, shuffle=True) 85 | for train, test in kfold: 86 | X_train = X[train] 87 | X_test = X[test] 88 | y_train = y[train] 89 | y_test = y[test] 90 | 91 | combiner.fit(X_train, y_train) 92 | prediction = combiner.predict(X_test) 93 | mean_auc += compute_auc(y_test, prediction)/len(kfold) 94 | 95 | if len(combiner.coef_) == 1: 96 | mean_coefs += combiner.coef_[0]/len(files) 97 | else: 98 | mean_coefs += combiner.coef_/len(files) 99 | 100 | print "Mean AUC: %.3f" % (mean_auc * 100) 101 | 102 | print "\n------------------------------------------------------------" 
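# Note on inverse_transform() above: it maps a probability p to its log-odds
# log(p / (1 - p)) (e.g. 0.9 -> ~2.197) after clamping p into
# [1e-8, 1 - 1e-8], so that hard 0/1 predictions do not blow up to +/- infinity.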
103 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyduan/amazonaccess/f8addfefcee80f0ca15e416954af3926f3007d16/data/.DS_Store -------------------------------------------------------------------------------- /external/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyduan/amazonaccess/f8addfefcee80f0ca15e416954af3926f3007d16/external/__init__.py -------------------------------------------------------------------------------- /external/ben.py: -------------------------------------------------------------------------------- 1 | """ Amazon Access Challenge Starter Code 2 | 3 | This was built using the code of Paul Duan as a starting 4 | point (thanks to Paul). 5 | 6 | It builds ensemble models using the original dataset and a handful of 7 | extracted features. 8 | 9 | Author: Benjami Solecki 10 | """ 11 | 12 | from __future__ import division 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from helpers.data import save_dataset 17 | 18 | 19 | def create_features(): 20 | print "loading data" 21 | X = pd.read_csv('data/train.csv') 22 | X = X.drop(['ROLE_CODE'], axis=1) 23 | X = X.drop(['ACTION'], axis=1) 24 | 25 | X_test = pd.read_csv('data/test.csv', index_col=0) 26 | X_test = X_test.drop(['ROLE_CODE'], axis=1) 27 | X_test['ACTION'] = 0 28 | X_test = X_test.drop(['ACTION'], axis=1) 29 | 30 | X_all = pd.concat([X_test, X], ignore_index=True) 31 | # I want to combine role_title as a subset of role_familia and 32 | X_all['ROLE_TITLE'] = X_all['ROLE_TITLE'] + (1000 * X_all['ROLE_FAMILY']) 33 | X_all['ROLE_ROLLUPS'] = X_all['ROLE_ROLLUP_1'] + ( 34 | 10000 * X_all['ROLE_ROLLUP_2']) 35 | X_all = X_all.drop(['ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_FAMILY'], 36 | axis=1) 37 | 38 | # Count/freq 39 | for col in X_all.columns: 40 | X_all['cnt'+col] = 0 41 | groups = X_all.groupby([col]) 42 | for name, group in groups: 43 | count = group[col].count() 44 | X_all['cnt'+col].ix[group.index] = count 45 | X_all['cnt'+col] = X_all['cnt'+col].apply(np.log) 46 | 47 | # Percent of dept that is this resource 48 | # And Counts of dept/resource occurancesa (tested, not used) 49 | for col in X_all.columns[1:6]: 50 | X_all['Duse'+col] = 0.0 51 | groups = X_all.groupby([col]) 52 | for name, group in groups: 53 | grps = group.groupby(['RESOURCE']) 54 | for rsrc, grp in grps: 55 | X_all['Duse'+col].ix[grp.index] = \ 56 | float(len(grp.index)) / float(len(group.index)) 57 | 58 | # Number of resources that a manager manages 59 | for col in X_all.columns[0:1]: 60 | #for col in X_all.columns[0:6]: 61 | if col == 'MGR_ID': 62 | continue 63 | X_all['Mdeps'+col] = 0 64 | groups = X_all.groupby(['MGR_ID']) 65 | for name, group in groups: 66 | X_all['Mdeps'+col].ix[group.index] = len(group[col].unique()) 67 | 68 | X_all = X_all.drop(X_all.columns[0:6], axis=1) 69 | 70 | # Now X is the train, X_test is test and X_all is both together 71 | X = X_all[:][X_all.index >= len(X_test.index)] 72 | X_test = X_all[:][X_all.index < len(X_test.index)] 73 | # X is the train set alone, X_all is all features 74 | X = X.as_matrix() 75 | X_test = X_test.as_matrix() 76 | 77 | save_dataset('bsfeats', X, X_test) 78 | -------------------------------------------------------------------------------- /external/greedy.py: -------------------------------------------------------------------------------- 1 | 
""" Greedy feature selection 2 | This file is a slightly modified version of Miroslaw's code. 3 | It generates a dataset containing all 3rd order combinations 4 | of the original columns, then performs greedy feature selection. 5 | 6 | Original author: Miroslaw Horbal 7 | Permission was granted by Miroslaw to publish this snippet as part of 8 | our code. 9 | """ 10 | 11 | from sklearn import metrics, cross_validation, linear_model 12 | from scipy import sparse 13 | from itertools import combinations 14 | from helpers import data 15 | 16 | import numpy as np 17 | import pandas as pd 18 | 19 | SEED = 333 20 | 21 | 22 | def group_data(data, degree=3, hash=hash): 23 | new_data = [] 24 | m, n = data.shape 25 | for indices in combinations(range(n), degree): 26 | new_data.append([hash(tuple(v)) for v in data[:, indices]]) 27 | return np.array(new_data).T 28 | 29 | 30 | def OneHotEncoder(data, keymap=None): 31 | """ 32 | OneHotEncoder takes data matrix with categorical columns and 33 | converts it to a sparse binary matrix. 34 | 35 | Returns sparse binary matrix and keymap mapping categories to indicies. 36 | If a keymap is supplied on input it will be used instead of creating one 37 | and any categories appearing in the data that are not in the keymap are 38 | ignored 39 | """ 40 | if keymap is None: 41 | keymap = [] 42 | for col in data.T: 43 | uniques = set(list(col)) 44 | keymap.append(dict((key, i) for i, key in enumerate(uniques))) 45 | total_pts = data.shape[0] 46 | outdat = [] 47 | for i, col in enumerate(data.T): 48 | km = keymap[i] 49 | num_labels = len(km) 50 | spmat = sparse.lil_matrix((total_pts, num_labels)) 51 | for j, val in enumerate(col): 52 | if val in km: 53 | spmat[j, km[val]] = 1 54 | outdat.append(spmat) 55 | outdat = sparse.hstack(outdat).tocsr() 56 | return outdat, keymap 57 | 58 | 59 | def cv_loop(X, y, model, N): 60 | mean_auc = 0. 61 | for i in range(N): 62 | X_train, X_cv, y_train, y_cv = cross_validation.train_test_split( 63 | X, y, test_size=.20, 64 | random_state=i*SEED) 65 | model.fit(X_train, y_train) 66 | preds = model.predict_proba(X_cv)[:, 1] 67 | auc = metrics.auc_score(y_cv, preds) 68 | print "AUC (fold %d/%d): %f" % (i + 1, N, auc) 69 | mean_auc += auc 70 | return mean_auc/N 71 | 72 | 73 | def create_features(train='data/train.csv', test='data/test.csv'): 74 | print "Reading dataset..." 75 | train_data = pd.read_csv(train) 76 | test_data = pd.read_csv(test) 77 | all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1])) 78 | 79 | num_train = np.shape(train_data)[0] 80 | 81 | # Transform data 82 | print "Transforming data..." 83 | dp = group_data(all_data, degree=2) 84 | dt = group_data(all_data, degree=3) 85 | 86 | y = np.array(train_data.ACTION) 87 | X = all_data[:num_train] 88 | X_2 = dp[:num_train] 89 | X_3 = dt[:num_train] 90 | 91 | X_test = all_data[num_train:] 92 | X_test_2 = dp[num_train:] 93 | X_test_3 = dt[num_train:] 94 | 95 | X_train_all = np.hstack((X, X_2, X_3)) 96 | X_test_all = np.hstack((X_test, X_test_2, X_test_3)) 97 | num_features = X_train_all.shape[1] 98 | 99 | model = linear_model.LogisticRegression() 100 | 101 | # Xts holds one hot encodings for each individual feature in memory 102 | # speeding up feature selection 103 | Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)] 104 | 105 | print "Performing greedy feature selection..." 
106 | score_hist = [] 107 | N = 10 108 | good_features_list = [ 109 | [0, 8, 9, 10, 19, 34, 36, 37, 38, 41, 42, 43, 47, 53, 55, 110 | 60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85], 111 | [0, 1, 7, 8, 9, 10, 36, 37, 38, 41, 42, 43, 47, 51, 53, 112 | 56, 60, 61, 63, 64, 66, 67, 69, 71, 75, 79, 85, 91], 113 | [0, 7, 9, 24, 36, 37, 41, 42, 47, 53, 61, 63, 64, 67, 69, 71, 75, 85], 114 | [0, 7, 9, 20, 36, 37, 38, 41, 42, 45, 47, 115 | 53, 60, 63, 64, 67, 69, 71, 81, 85, 86] 116 | ] 117 | 118 | # Greedy feature selection loop 119 | if not good_features_list: 120 | good_features = set([]) 121 | while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]: 122 | scores = [] 123 | for f in range(len(Xts)): 124 | if f not in good_features: 125 | feats = list(good_features) + [f] 126 | Xt = sparse.hstack([Xts[j] for j in feats]).tocsr() 127 | score = cv_loop(Xt, y, model, N) 128 | scores.append((score, f)) 129 | print "Feature: %i Mean AUC: %f" % (f, score) 130 | good_features.add(sorted(scores)[-1][1]) 131 | score_hist.append(sorted(scores)[-1]) 132 | print "Current features: %s" % sorted(list(good_features)) 133 | 134 | # Remove last added feature from good_features 135 | good_features.remove(score_hist[-1][1]) 136 | good_features = sorted(list(good_features)) 137 | 138 | for i, good_features in enumerate(good_features_list): 139 | suffix = str(i + 1) if i else '' 140 | Xt = np.vstack((X_train_all[:, good_features], 141 | X_test_all[:, good_features])) 142 | X_train = Xt[:num_train] 143 | X_test = Xt[num_train:] 144 | data.save_dataset("greedy%s" % suffix, X_train, X_test) 145 | -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyduan/amazonaccess/f8addfefcee80f0ca15e416954af3926f3007d16/helpers/__init__.py -------------------------------------------------------------------------------- /helpers/data.py: -------------------------------------------------------------------------------- 1 | """ml.py 2 | 3 | Useful I/O functions. 
4 | 5 | Author: Paul Duan 6 | """ 7 | 8 | import logging 9 | import numpy as np 10 | from scipy import sparse 11 | import cPickle as pickle 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def load_data(filename, return_labels=True): 17 | """Load data from CSV files and return them in numpy format.""" 18 | logging.debug("loading data from %s", filename) 19 | data = np.loadtxt(open("data/" + filename), delimiter=',', 20 | usecols=range(1, 10), skiprows=1, dtype=int) 21 | if return_labels: 22 | labels = np.loadtxt(open("data/" + filename), delimiter=',', 23 | usecols=[0], skiprows=1) 24 | return labels, data 25 | else: 26 | labels = np.zeros(data.shape[0]) 27 | return data 28 | 29 | 30 | def load_from_cache(filename, use_cache=True): 31 | """Attempt to load data from cache.""" 32 | data = None 33 | read_mode = 'rb' if '.pkl' in filename else 'r' 34 | if use_cache: 35 | try: 36 | with open("cache/%s" % filename, read_mode) as f: 37 | data = pickle.load(f) 38 | except IOError: 39 | pass 40 | 41 | return data 42 | 43 | 44 | def save_results(predictions, filename): 45 | """Save results in CSV format.""" 46 | logging.info("saving data to file %s", filename) 47 | with open("submissions/%s" % filename, 'w') as f: 48 | f.write("id,ACTION\n") 49 | for i, pred in enumerate(predictions): 50 | f.write("%d,%f\n" % (i + 1, pred)) 51 | 52 | 53 | def save_dataset(filename, X, X_test, features=None, features_test=None): 54 | """Save the training and test sets augmented with the given features.""" 55 | if features is not None: 56 | assert features.shape[1] == features_test.shape[1], "features mismatch" 57 | if sparse.issparse(X): 58 | features = sparse.lil_matrix(features) 59 | features_test = sparse.lil_matrix(features_test) 60 | X = sparse.hstack((X, features), 'csr') 61 | X_test = sparse.hstack((X_test, features_test), 'csr') 62 | else: 63 | X = np.hstack((X, features)) 64 | X_test = np. hstack((X_test, features_test)) 65 | 66 | logger.info("> saving %s to disk", filename) 67 | with open("cache/%s.pkl" % filename, 'wb') as f: 68 | pickle.dump((X, X_test), f, pickle.HIGHEST_PROTOCOL) 69 | 70 | 71 | def get_dataset(feature_set='basic', train=None, cv=None): 72 | """ 73 | Return the design matrices constructed with the specified feature set. 74 | If train is specified, split the training set according to train and 75 | cv (if cv is not given, subsample's complement will be used instead). 76 | If subsample is omitted, return both the full training and test sets. 77 | """ 78 | try: 79 | with open("cache/%s.pkl" % feature_set, 'rb') as f: 80 | if train is not None: 81 | X, _ = pickle.load(f) 82 | if cv is None: 83 | cv = [i for i in range(X.shape[0]) if i not in train] 84 | 85 | X_test = X[cv, :] 86 | X = X[train, :] 87 | else: 88 | X, X_test = pickle.load(f) 89 | except IOError: 90 | logging.warning("could not find feature set %s", feature_set) 91 | return False 92 | 93 | return X, X_test 94 | -------------------------------------------------------------------------------- /helpers/diagnostics.py: -------------------------------------------------------------------------------- 1 | """diagnostics.py 2 | 3 | Some methods to plot diagnostics. 
4 | 5 | Author: Paul Duan 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from sklearn.metrics import hinge_loss 10 | 11 | 12 | def plot_roc(fpr, tpr): 13 | """Plot ROC curve and display it.""" 14 | plt.clf() 15 | plt.plot(fpr, tpr) 16 | plt.plot([0, 1], [0, 1], 'k--') 17 | plt.xlim([0.0, 1.0]) 18 | plt.ylim([0.0, 1.0]) 19 | plt.xlabel('False Positive Rate') 20 | plt.ylabel('True Positive Rate') 21 | plt.title('ROC Curve') 22 | 23 | 24 | def learning_curve(classifier, y, train, cv, n=15): 25 | """Plot train and cv loss for increasing train sample sizes.""" 26 | chunk = int(len(y)/n) 27 | n_samples = [] 28 | train_losses = [] 29 | cv_losses = [] 30 | previous_cache_dir = classifier.cache_dir 31 | classifier.cache_dir = "diagnostics" 32 | 33 | for i in range(n): 34 | train_subset = train[:(i + 1)*chunk] 35 | preds_cv = classifier.fit_predict(y, train_subset, cv, 36 | show_steps=False) 37 | preds_train = classifier.fit_predict(y, train_subset, train_subset, 38 | show_steps=False) 39 | n_samples.append((i + 1)*chunk) 40 | cv_losses.append(hinge_loss(y[cv], preds_cv, neg_label=0)) 41 | train_losses.append(hinge_loss(y[train_subset], preds_train, 42 | neg_label=0)) 43 | 44 | classifier.cache_dir = previous_cache_dir 45 | plt.clf() 46 | plt.plot(n_samples, train_losses, 'r--', n_samples, cv_losses, 'b--') 47 | plt.ylim([min(train_losses) - .01, max(cv_losses) + .01]) 48 | 49 | plt.savefig('plots/learning_curve.png') 50 | plt.show() 51 | -------------------------------------------------------------------------------- /helpers/feature_extraction.py: -------------------------------------------------------------------------------- 1 | """feature_extraction.py 2 | 3 | Create the requested datasets. 4 | 5 | Author: Paul Duan 6 | """ 7 | 8 | from __future__ import division 9 | 10 | import logging 11 | import cPickle as pickle 12 | import numpy as np 13 | import math 14 | 15 | from scipy import sparse 16 | from sklearn import preprocessing 17 | 18 | from external import greedy, ben 19 | from data import save_dataset 20 | from ml import get_dataset 21 | 22 | logger = logging.getLogger(__name__) 23 | subformatter = logging.Formatter("[%(asctime)s] %(levelname)s\t> %(message)s") 24 | 25 | COLNAMES = ["resource", "manager", "role1", "role2", "department", 26 | "title", "family_desc", "family"] 27 | SELECTED_COLUMNS = [0, 1, 4, 5, 6, 7] 28 | 29 | EXTERNAL_DATASETS = { 30 | "greedy": greedy, 31 | "greedy2": greedy, 32 | "greedy3": greedy, 33 | "bsfeats": ben 34 | } 35 | 36 | 37 | def sparsify(X, X_test): 38 | """Return One-Hot encoded datasets.""" 39 | enc = OneHotEncoder() 40 | enc.fit(np.vstack((X, X_test))) 41 | return enc.transform(X), enc.transform(X_test) 42 | 43 | 44 | def create_datasets(X, X_test, y, datasets=[], use_cache=True): 45 | """ 46 | Generate datasets as needed with different sets of features 47 | and save them to disk. 48 | The datasets are created by combining a base feature set (combinations of 49 | the original variables) with extracted feature sets, with some additional 50 | variants. 51 | 52 | The nomenclature is as follows: 53 | Base datasets: 54 | - basic: the original columns, minus role1, role2, and role_code 55 | - tuples: all order 2 combinations of the original columns 56 | - triples: all order 3 combinations of the original columns 57 | - greedy[1,2,3]: three different datasets obtained by performing 58 | greedy feature selection with different seeds on the triples 59 | dataset 60 | - effects: experimental. 
Created to try out a suggestion by Gxav 61 | after the competition 62 | 63 | Feature sets and variants: 64 | (denoted by the letters after the underscore in the base dataset name): 65 | - s: the base dataset has been sparsified using One-Hot encoding 66 | - c: the rare features have been consolidated into one category 67 | - f: extracted features have been appended, with a different set for 68 | linear models than for tree-based models 69 | - b: Benjamin's extracted features. 70 | - d: interactions for the extracted feature set have been added 71 | - l: the extracted features have been log transformed 72 | """ 73 | if use_cache: 74 | # Check if all files exist. If not, generate the missing ones 75 | DATASETS = [] 76 | for dataset in datasets: 77 | try: 78 | with open("cache/%s.pkl" % dataset, 'rb'): 79 | pass 80 | except IOError: 81 | logger.warning("couldn't load dataset %s, will generate it", 82 | dataset) 83 | DATASETS.append(dataset.split('_')[0]) 84 | else: 85 | DATASETS = ["basic", "tuples", "triples", 86 | "greedy", "greedy2", "greedy3"] 87 | 88 | # Datasets that require external code to be generated 89 | for dataset, module in EXTERNAL_DATASETS.iteritems(): 90 | if not get_dataset(dataset): 91 | module.create_features() 92 | 93 | # Generate the missing datasets 94 | if len(DATASETS): 95 | bsfeats, bsfeats_test = get_dataset('bsfeats') 96 | 97 | basefeats, basefeats_test = create_features(X, X_test, 3) 98 | save_dataset("base_feats", basefeats, basefeats_test) 99 | 100 | lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0)) 101 | save_dataset("lrfeats", lrfeats, lrfeats_test) 102 | 103 | feats, feats_test = pre_process(*create_features(X, X_test, 1)) 104 | save_dataset("features", feats, feats_test) 105 | 106 | meta, meta_test = pre_process(*create_features(X, X_test, 2), 107 | normalize=False) 108 | save_dataset("metafeatures", meta, meta_test) 109 | 110 | X = X[:, SELECTED_COLUMNS] 111 | X_test = X_test[:, SELECTED_COLUMNS] 112 | save_dataset("basic", X, X_test) 113 | 114 | Xt = create_tuples(X) 115 | Xt_test = create_tuples(X_test) 116 | save_dataset("tuples", Xt, Xt_test) 117 | 118 | Xtr = create_tuples(X) 119 | Xtr_test = create_tuples(X_test) 120 | save_dataset("triples", Xtr, Xtr_test) 121 | 122 | Xe, Xe_test = create_effects(X, X_test, y) 123 | save_dataset("effects", Xe, Xe_test) 124 | 125 | feats_d, feats_d_test = pre_process(basefeats, basefeats_test, 126 | create_divs=True) 127 | bsfeats_d, bsfeats_d_test = pre_process(bsfeats, bsfeats_test, 128 | create_divs=True) 129 | feats_l, feats_l_test = pre_process(basefeats, basefeats_test, 130 | log_transform=True) 131 | lrfeats_l, lrfeats_l_test = pre_process(lrfeats, lrfeats_test, 132 | log_transform=True) 133 | bsfeats_l, bsfeats_l_test = pre_process(bsfeats, bsfeats_test, 134 | log_transform=True) 135 | 136 | for ds in DATASETS: 137 | Xg, Xg_test = get_dataset(ds) 138 | save_dataset(ds + '_b', Xg, Xg_test, bsfeats, bsfeats_test) 139 | save_dataset(ds + '_f', Xg, Xg_test, feats, feats_test) 140 | save_dataset(ds + '_fd', Xg, Xg_test, feats_d, feats_d_test) 141 | save_dataset(ds + '_bd', Xg, Xg_test, bsfeats_d, bsfeats_d_test) 142 | Xs, Xs_test = sparsify(Xg, Xg_test) 143 | save_dataset(ds + '_sf', Xs, Xs_test, lrfeats, lrfeats_test) 144 | save_dataset(ds + '_sfl', Xs, Xs_test, lrfeats_l, lrfeats_l_test) 145 | save_dataset(ds + '_sfd', Xs, Xs_test, feats_d, feats_d_test) 146 | save_dataset(ds + '_sb', Xs, Xs_test, bsfeats, bsfeats_test) 147 | save_dataset(ds + '_sbl', Xs, Xs_test, bsfeats_l, bsfeats_l_test) 148 
| save_dataset(ds + '_sbd', Xs, Xs_test, bsfeats_d, bsfeats_d_test) 149 | 150 | if issubclass(Xg.dtype.type, np.integer): 151 | consolidate(Xg, Xg_test) 152 | save_dataset(ds + '_c', Xg, Xg_test) 153 | save_dataset(ds + '_cf', Xg, Xg_test, feats, feats_test) 154 | save_dataset(ds + '_cb', Xg, Xg_test, bsfeats, bsfeats_test) 155 | Xs, Xs_test = sparsify(Xg, Xg_test) 156 | save_dataset(ds + '_sc', Xs, Xs_test) 157 | save_dataset(ds + '_scf', Xs, Xs_test, feats, feats_test) 158 | save_dataset(ds + '_scfl', Xs, Xs_test, feats_l, feats_l_test) 159 | save_dataset(ds + '_scb', Xs, Xs_test, bsfeats, bsfeats_test) 160 | save_dataset(ds + '_scbl', Xs, Xs_test, 161 | bsfeats_l, bsfeats_l_test) 162 | 163 | 164 | def create_effects(X_train, X_test, y): 165 | """ 166 | Create a dataset where the features are the effects of a 167 | logistic regression trained on sparsified data. 168 | This has been added post-deadline after talking with Gxav. 169 | """ 170 | from sklearn import linear_model, cross_validation 171 | from itertools import izip 172 | Xe_train = np.zeros(X_train.shape) 173 | Xe_test = np.zeros(X_test.shape) 174 | n_cols = Xe_train.shape[1] 175 | 176 | model = linear_model.LogisticRegression(C=2) 177 | X_train, X_test = sparsify(X_train, X_test) 178 | 179 | kfold = cross_validation.KFold(len(y), 5) 180 | for train, cv in kfold: 181 | model.fit(X_train[train], y[train]) 182 | colindices = X_test.nonzero()[1] 183 | for i, k in izip(cv, range(len(cv))): 184 | for j in range(n_cols): 185 | z = colindices[n_cols*k + j] 186 | Xe_train[i, j] = model.coef_[0, z] 187 | 188 | model.fit(X_train, y) 189 | colindices = X_test.nonzero()[1] 190 | for i in range(Xe_test.shape[0]): 191 | for j in range(n_cols): 192 | z = colindices[n_cols*i + j] 193 | Xe_test[i, j] = model.coef_[0, z] 194 | 195 | return Xe_train, Xe_test 196 | 197 | 198 | def create_features(X_train, X_test, feature_set=0): 199 | """ 200 | Extract features from the training and test set. 201 | Each feature set is defined as a list of lambda functions. 
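    Each lambda is called as feature(x, row, j), where row is a raw data row,
    j indexes the column being processed and x is the cross-tabulation entry
    dictionaries[j][row[j]] built by get_pivottable(). For example,
    lambda x, row, j: x[COLNAMES[0]].get(row[0], 0)
    counts how many rows share both this row's value in column j and its
    RESOURCE.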
202 | """ 203 | logger.info("performing feature extraction (feature_set=%d)", feature_set) 204 | features_train = [] 205 | features_test = [] 206 | dictionaries = get_pivottable(X_train, X_test) 207 | dictionaries_train = get_pivottable(X_train, X_test, use='train') 208 | dictionaries_test = get_pivottable(X_test, X_test, use='test') 209 | 210 | # 0: resource, 1: manager, 2: role1, 3: role2, 4: department, 211 | # 5: title, 6: family_desc, 7: family 212 | feature_lists = [ 213 | [ # 0: LR features 214 | lambda x, row, j: 215 | x[COLNAMES[0]].get(row[0], 0) if j > 0 and j < 7 else 0, 216 | lambda x, row, j: 217 | x[COLNAMES[1]].get(row[1], 0) if j > 1 and j < 7 else 0, 218 | lambda x, row, j: 219 | x[COLNAMES[2]].get(row[2], 0) if j > 2 and j < 7 else 0, 220 | lambda x, row, j: 221 | x[COLNAMES[3]].get(row[3], 0) if j > 3 and j < 7 else 0, 222 | lambda x, row, j: 223 | x[COLNAMES[4]].get(row[4], 0) if j > 4 and j < 7 else 0, 224 | lambda x, row, j: 225 | x[COLNAMES[5]].get(row[5], 0) if j > 5 and j < 7 else 0, 226 | lambda x, row, j: 227 | x[COLNAMES[6]].get(row[6], 0) if j > 6 and j < 7 else 0, 228 | lambda x, row, j: 229 | x[COLNAMES[7]].get(row[7], 0) if j > 7 and j < 7 else 0, 230 | 231 | lambda x, row, j: 232 | x[COLNAMES[0]].get(row[0], 0)**2 if j in range(7) else 0, 233 | lambda x, row, j: 234 | x[COLNAMES[j]].get(row[0], 0)/x['total'] 235 | if j > 0 and j < 7 else 0, 236 | 237 | lambda x, row, j: 238 | x[COLNAMES[j]].get(row[j], 0)/len(x[COLNAMES[j]].values()), 239 | 240 | lambda x, row, j: 241 | x[COLNAMES[j]].get(row[j], 0) / dictionaries[j]['total'], 242 | 243 | lambda x, row, j: 244 | math.log(x[COLNAMES[0]].get(row[0], 0)) if j in range(5) else 0, 245 | 246 | lambda x, row, j: 247 | int(row[j] not in dictionaries_train[j]), 248 | 249 | lambda x, row, j: 250 | int(row[j] not in dictionaries_test[j]), 251 | ], 252 | 253 | [ # 1: Tree features 254 | lambda x, row, j: 255 | x[COLNAMES[0]].get(row[0], 0), 256 | lambda x, row, j: 257 | x[COLNAMES[1]].get(row[1], 0), 258 | lambda x, row, j: 259 | x[COLNAMES[2]].get(row[2], 0), 260 | lambda x, row, j: 261 | x[COLNAMES[3]].get(row[3], 0), 262 | lambda x, row, j: 263 | x[COLNAMES[4]].get(row[4], 0), 264 | lambda x, row, j: 265 | x[COLNAMES[5]].get(row[5], 0), 266 | lambda x, row, j: 267 | x[COLNAMES[6]].get(row[6], 0), 268 | lambda x, row, j: 269 | x[COLNAMES[7]].get(row[7], 0), 270 | 271 | lambda x, row, j: 272 | x[COLNAMES[j]].get(row[0], 0)/x['total'] if j > 0 else 0, 273 | ], 274 | 275 | [ # 2: Metafeatures 276 | lambda x, row, j: 277 | dictionaries_train[j].get(row[j], {}).get('total', 0), 278 | lambda x, row, j: 279 | dictionaries_train[j].get(row[j], {}).get('total', 0) == 0, 280 | ], 281 | 282 | [ # 3: Base features 283 | lambda x, row, j: 284 | x['total'] if j == 0 else 0, 285 | 286 | lambda x, row, j: 287 | x[COLNAMES[0]].get(row[0], 0) if j > 0 else 0, 288 | lambda x, row, j: 289 | x[COLNAMES[1]].get(row[1], 0) if j > 1 else 0, 290 | lambda x, row, j: 291 | x[COLNAMES[2]].get(row[2], 0) if j > 2 else 0, 292 | lambda x, row, j: 293 | x[COLNAMES[3]].get(row[3], 0) if j > 3 else 0, 294 | lambda x, row, j: 295 | x[COLNAMES[4]].get(row[4], 0) if j > 4 else 0, 296 | lambda x, row, j: 297 | x[COLNAMES[5]].get(row[5], 0) if j > 5 else 0, 298 | lambda x, row, j: 299 | x[COLNAMES[6]].get(row[6], 0) if j > 6 else 0, 300 | lambda x, row, j: 301 | x[COLNAMES[7]].get(row[7], 0) if j > 7 else 0, 302 | 303 | lambda x, row, j: 304 | x[COLNAMES[0]].get(row[0], 0)**2 if j in range(8) else 0, 305 | ], 306 | ] 307 | 308 | feature_generator = 
feature_lists[feature_set] 309 | 310 | # create feature vectors 311 | logger.debug("creating feature vectors") 312 | features_train = [] 313 | for row in X_train: 314 | features_train.append([]) 315 | for j in range(len(COLNAMES)): 316 | for feature in feature_generator: 317 | feature_row = feature(dictionaries[j][row[j]], row, j) 318 | features_train[-1].append(feature_row) 319 | features_train = np.array(features_train) 320 | 321 | features_test = [] 322 | for row in X_test: 323 | features_test.append([]) 324 | for j in range(len(COLNAMES)): 325 | for feature in feature_generator: 326 | feature_row = feature(dictionaries[j][row[j]], row, j) 327 | features_test[-1].append(feature_row) 328 | features_test = np.array(features_test) 329 | 330 | return features_train, features_test 331 | 332 | 333 | def pre_process(features_train, features_test, 334 | create_divs=False, log_transform=False, normalize=True): 335 | """ 336 | Take lists of feature columns as input, pre-process them (eventually 337 | performing some transformation), then return nicely formatted numpy arrays. 338 | """ 339 | logger.info("performing preprocessing") 340 | 341 | features_train = list(features_train.T) 342 | features_test = list(features_test.T) 343 | features_train = [list(feature) for feature in features_train] 344 | features_test = [list(feature) for feature in features_test] 345 | 346 | # remove constant features 347 | for i in range(len(features_train) - 1, -1, -1): 348 | if np.var(features_train[i]) + np.var(features_test[i]) == 0: 349 | features_train.pop(i) 350 | features_test.pop(i) 351 | n_features = len(features_train) 352 | 353 | # create some polynomial features 354 | if create_divs: 355 | for i in range(n_features): 356 | for j in range(1): 357 | features_train.append([round(a/(b + 1), 3) for a, b in zip( 358 | features_train[i], features_train[j])]) 359 | features_test.append([round(a/(b + 1), 3) for a, b in zip( 360 | features_test[i], features_test[j])]) 361 | 362 | features_train.append([round(a/(b + 1), 3) for a, b in zip( 363 | features_train[j], features_train[i])]) 364 | features_test.append([round(a/(b + 1), 3) for a, b in zip( 365 | features_test[j], features_test[i])]) 366 | 367 | features_train.append([a*b for a, b in zip( 368 | features_train[j], features_train[i])]) 369 | features_test.append([a*b for a, b in zip( 370 | features_test[j], features_test[i])]) 371 | 372 | if log_transform: 373 | tmp_train = [] 374 | tmp_test = [] 375 | for i in range(n_features): 376 | tmp_train.append([math.log(a + 1) if (a + 1) > 0 else 0 377 | for a in features_train[i]]) 378 | tmp_test.append([math.log(a + 1) if (a + 1) > 0 else 0 379 | for a in features_test[i]]) 380 | 381 | tmp_train.append([a**2 for a in features_train[i]]) 382 | tmp_test.append([a**2 for a in features_test[i]]) 383 | tmp_train.append([a**3 for a in features_train[i]]) 384 | tmp_test.append([a**3 for a in features_test[i]]) 385 | features_train = tmp_train 386 | features_test = tmp_test 387 | 388 | logger.info("created %d features", len(features_train)) 389 | features_train = np.array(features_train).T 390 | features_test = np.array(features_test).T 391 | 392 | # normalize the new features 393 | if normalize: 394 | normalizer = preprocessing.StandardScaler() 395 | normalizer.fit(features_train) 396 | features_train = normalizer.transform(features_train) 397 | features_test = normalizer.transform(features_test) 398 | 399 | return features_train, features_test 400 | 401 | 402 | def get_pivottable(X_train, X_test, use='all'): 403 | """ 404 
| Returns a list of dictionaries, one per feature in the 405 | basic data, containing cross-tabulated counts 406 | for each column and each value of the feature. 407 | """ 408 | dictionaries = [] 409 | if use == 'all': 410 | X = np.vstack((X_train, X_test)) 411 | filename = "pivottable" 412 | elif use == 'train': 413 | X = X_train 414 | filename = "pivottable_train" 415 | else: 416 | X = X_test 417 | filename = "pivottable_test" 418 | 419 | for i in range(len(COLNAMES)): 420 | dictionaries.append({'total': 0}) 421 | 422 | try: 423 | with open("cache/%s.pkl" % filename, 'rb') as f: 424 | logger.debug("loading cross-tabulated data from cache") 425 | dictionaries = pickle.load(f) 426 | except IOError: 427 | logger.debug("no cache found, cross-tabulating data") 428 | for i, row in enumerate(X): 429 | for j in range(len(COLNAMES)): 430 | dictionaries[j]['total'] += 1 431 | if row[j] not in dictionaries[j]: 432 | dictionaries[j][row[j]] = {'total': 1} 433 | for k, key in enumerate(COLNAMES): 434 | dictionaries[j][row[j]][key] = {row[k]: 1} 435 | else: 436 | dictionaries[j][row[j]]['total'] += 1 437 | for k, key in enumerate(COLNAMES): 438 | if row[k] not in dictionaries[j][row[j]][key]: 439 | dictionaries[j][row[j]][key][row[k]] = 1 440 | else: 441 | dictionaries[j][row[j]][key][row[k]] += 1 442 | with open("cache/%s.pkl" % filename, 'wb') as f: 443 | pickle.dump(dictionaries, f, pickle.HIGHEST_PROTOCOL) 444 | 445 | return dictionaries 446 | 447 | 448 | def create_tuples(X): 449 | logger.debug("creating feature tuples") 450 | cols = [] 451 | for i in range(X.shape[1]): 452 | for j in range(i, X.shape[1]): 453 | cols.append(X[:, i] + X[:, j]*3571) 454 | return np.hstack((X, np.vstack(cols).T)) 455 | 456 | 457 | def create_triples(X): 458 | logger.debug("creating feature triples") 459 | cols = [] 460 | for i in range(X.shape[1]): 461 | for j in range(i, X.shape[1]): 462 | for k in range(j, X.shape[1]): 463 | cols.append(X[:, i]*3461 + X[:, j]*5483 + X[:, k]) 464 | return np.hstack((X, np.vstack(cols).T)) 465 | 466 | 467 | def consolidate(X_train, X_test): 468 | """ 469 | Transform in-place the given dataset by consolidating 470 | rare features into a single category. 471 | """ 472 | X = np.vstack((X_train, X_test)) 473 | relabeler = preprocessing.LabelEncoder() 474 | 475 | for j in range(X.shape[1]): 476 | relabeler.fit(X[:, j]) 477 | X[:, j] = relabeler.transform(X[:, j]) 478 | X_train[:, j] = relabeler.transform(X_train[:, j]) 479 | X_test[:, j] = relabeler.transform(X_test[:, j]) 480 | 481 | raw_counts = np.bincount(X[:, j]) 482 | indices = np.nonzero(raw_counts)[0] 483 | counts = dict((x, raw_counts[x]) for x in indices) 484 | max_value = np.max(X[:, j]) 485 | 486 | for i in range(X_train.shape[0]): 487 | if counts[X_train[i, j]] <= 1: 488 | X_train[i, j] = max_value + 1 489 | 490 | for i in range(X_test.shape[0]): 491 | if counts[X_test[i, j]] <= 1: 492 | X_test[i, j] = max_value + 1 493 | 494 | 495 | class OneHotEncoder(): 496 | """ 497 | OneHotEncoder takes data matrix with categorical columns and 498 | converts it to a sparse binary matrix. 
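
    A minimal usage sketch (the variable names below are illustrative, not
    part of this module):

        encoder = OneHotEncoder()
        encoder.fit(X_train)                          # one value -> column keymap per feature
        X_train_sparse = encoder.transform(X_train)   # scipy.sparse CSR matrix of 0/1 indicators
        X_test_sparse = encoder.transform(X_test)     # values unseen during fit map to no column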
499 | """ 500 | def __init__(self): 501 | self.keymap = None 502 | 503 | def fit(self, X): 504 | self.keymap = [] 505 | for col in X.T: 506 | uniques = set(list(col)) 507 | self.keymap.append(dict((key, i) for i, key in enumerate(uniques))) 508 | 509 | def transform(self, X): 510 | if self.keymap is None: 511 | self.fit(X) 512 | 513 | outdat = [] 514 | for i, col in enumerate(X.T): 515 | km = self.keymap[i] 516 | num_labels = len(km) 517 | spmat = sparse.lil_matrix((X.shape[0], num_labels)) 518 | for j, val in enumerate(col): 519 | if val in km: 520 | spmat[j, km[val]] = 1 521 | outdat.append(spmat) 522 | outdat = sparse.hstack(outdat).tocsr() 523 | return outdat 524 | -------------------------------------------------------------------------------- /helpers/ml.py: -------------------------------------------------------------------------------- 1 | """ml.py 2 | 3 | This is the file that does the heavy lifting. 4 | It contains the ML algorithms themselves: 5 | - AUCRegressor: a custom class that optimizes AUC directly 6 | - MLR: a linear regression with non-negativity constraints 7 | - StackedClassifier: a custom class that combines several models 8 | 9 | And some related functions: 10 | - find_params: sets the hyperparameters for a given model 11 | 12 | Author: Paul Duan 13 | """ 14 | 15 | from __future__ import division 16 | 17 | import cPickle as pickle 18 | import itertools 19 | import json 20 | import logging 21 | import multiprocessing 22 | import scipy as sp 23 | import numpy as np 24 | 25 | from functools import partial 26 | from operator import itemgetter 27 | 28 | from sklearn.metrics import roc_curve, auc 29 | from sklearn.grid_search import GridSearchCV 30 | from sklearn import cross_validation, linear_model 31 | 32 | from data import load_from_cache, get_dataset 33 | from utils import stringify, compute_auc 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | N_TREES = 500 38 | 39 | INITIAL_PARAMS = { 40 | 'LogisticRegression': {'C': 2, 'penalty': 'l2', 'class_weight': 'auto'}, 41 | 'RandomForestClassifier': { 42 | 'n_estimators': N_TREES, 'n_jobs': 4, 43 | 'min_samples_leaf': 2, 'bootstrap': False, 44 | 'max_depth': 30, 'min_samples_split': 5, 'max_features': .1 45 | }, 46 | 'ExtraTreesClassifier': { 47 | 'n_estimators': N_TREES, 'n_jobs': 3, 'min_samples_leaf': 2, 48 | 'max_depth': 30, 'min_samples_split': 5, 'max_features': .1, 49 | 'bootstrap': False, 50 | }, 51 | 'GradientBoostingClassifier': { 52 | 'n_estimators': N_TREES, 'learning_rate': .08, 'max_features': 7, 53 | 'min_samples_leaf': 1, 'min_samples_split': 3, 'max_depth': 5, 54 | }, 55 | } 56 | 57 | PARAM_GRID = { 58 | 'LogisticRegression': {'C': [1.5, 2, 2.5, 3, 3.5, 5, 5.5], 59 | 'class_weight': ['auto']}, 60 | 'RandomForestClassifier': { 61 | 'n_jobs': [1], 'max_depth': [15, 20, 25, 30, 35, None], 62 | 'min_samples_split': [1, 3, 5, 7], 63 | 'max_features': [3, 8, 11, 15], 64 | }, 65 | 'ExtraTreesClassifier': {'min_samples_leaf': [2, 3], 66 | 'n_jobs': [1], 67 | 'min_samples_split': [1, 2, 5], 68 | 'bootstrap': [False], 69 | 'max_depth': [15, 20, 25, 30], 70 | 'max_features': [1, 3, 5, 11]}, 71 | 'GradientBoostingClassifier': {'max_features': [4, 5, 6, 7], 72 | 'learning_rate': [.05, .08, .1], 73 | 'max_depth': [8, 10, 13]}, 74 | } 75 | 76 | 77 | class AUCRegressor(object): 78 | def __init__(self): 79 | self.coef_ = 0 80 | 81 | def _auc_loss(self, coef, X, y): 82 | fpr, tpr, _ = roc_curve(y, sp.dot(X, coef)) 83 | return -auc(fpr, tpr) 84 | 85 | def fit(self, X, y): 86 | lr = linear_model.LinearRegression() 87 | 
auc_partial = partial(self._auc_loss, X=X, y=y) 88 | initial_coef = lr.fit(X, y).coef_ 89 | self.coef_ = sp.optimize.fmin(auc_partial, initial_coef) 90 | 91 | def predict(self, X): 92 | return sp.dot(X, self.coef_) 93 | 94 | def score(self, X, y): 95 | fpr, tpr, _ = roc_curve(y, sp.dot(X, self.coef_)) 96 | return auc(fpr, tpr) 97 | 98 | 99 | class MLR(object): 100 | def __init__(self): 101 | self.coef_ = 0 102 | 103 | def fit(self, X, y): 104 | self.coef_ = sp.optimize.nnls(X, y)[0] 105 | self.coef_ = np.array(map(lambda x: x/sum(self.coef_), self.coef_)) 106 | 107 | def predict(self, X): 108 | predictions = np.array(map(sum, self.coef_ * X)) 109 | return predictions 110 | 111 | def score(self, X, y): 112 | fpr, tpr, _ = roc_curve(y, sp.dot(X, self.coef_)) 113 | return auc(fpr, tpr) 114 | 115 | 116 | class StackedClassifier(object): 117 | """ 118 | Implement stacking to combine several models. 119 | The base (stage 0) models can be either combined through 120 | simple averaging (fastest), or combined using a stage 1 generalizer 121 | (requires computing CV predictions on the train set). 122 | 123 | See http://ijcai.org/Past%20Proceedings/IJCAI-97-VOL2/PDF/011.pdf: 124 | "Stacked generalization: when does it work?", Ting and Witten, 1997 125 | 126 | For speed and convenience, both fitting and prediction are done 127 | in the same method fit_predict; this is done in order to enable 128 | one to compute metrics on the predictions after training each model without 129 | having to wait for all the models to be trained. 130 | 131 | Options: 132 | ------------------------------ 133 | - models: a list of (model, dataset) tuples that represent stage 0 models 134 | - generalizer: an Estimator object. Must implement fit and predict 135 | - model_selection: boolean. Whether to use brute force search to find the 136 | optimal subset of models that produce the best AUC. 137 | """ 138 | def __init__(self, models, generalizer=None, model_selection=True, 139 | stack=False, fwls=False, use_cached_models=True): 140 | self.cache_dir = "main" 141 | self.models = models 142 | self.model_selection = model_selection 143 | self.stack = stack 144 | self.fwls = fwls 145 | self.generalizer = linear_model.RidgeCV( 146 | alphas=np.linspace(0, 200), cv=100) 147 | self.use_cached_models = use_cached_models 148 | 149 | def _combine_preds(self, X_train, X_cv, y, train=None, predict=None, 150 | stack=False, fwls=False): 151 | """ 152 | Combine preds, returning in order: 153 | - mean_preds: the simple average of all model predictions 154 | - stack_preds: the predictions of the stage 1 generalizer 155 | - fwls_preds: same as stack_preds, but optionally using more 156 | complex blending schemes (meta-features, different 157 | generalizers, etc.) 
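
        Purely illustrative example: if three base models assign a sample the
        probabilities 0.2, 0.4 and 0.9, then mean_preds holds their simple
        average 0.5 for that sample, while stack_preds is the output of the
        generalizer fitted on the stage 0 cross-validation predictions
        (X_train) and applied to those same three columns of X_cv.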
158 |         """
159 |         mean_preds = np.mean(X_cv, axis=1)
160 |         stack_preds = None
161 |         fwls_preds = None
162 | 
163 |         if stack:
164 |             self.generalizer.fit(X_train, y)
165 |             stack_preds = self.generalizer.predict(X_cv)
166 | 
167 |         if self.fwls:
168 |             meta, meta_cv = get_dataset('metafeatures', train, predict)
169 |             fwls_train = np.hstack((X_train, meta))
170 |             fwls_cv = np.hstack((X_cv, meta_cv))  # cv rows go with the cv metafeatures
171 |             self.generalizer.fit(fwls_train, y)
172 |             fwls_preds = self.generalizer.predict(fwls_cv)
173 | 
174 |         return mean_preds, stack_preds, fwls_preds
175 | 
176 |     def _find_best_subset(self, y, predictions_list):
177 |         """Finds the combination of models that produces the best AUC."""
178 |         best_subset_indices = range(len(predictions_list))
179 | 
180 |         pool = multiprocessing.Pool(processes=4)
181 |         partial_compute_subset_auc = partial(compute_subset_auc,
182 |                                              pred_set=predictions_list, y=y)
183 |         best_auc = 0
184 |         best_n = 0
185 |         best_indices = []
186 | 
187 |         if len(predictions_list) == 1:
188 |             return [0]  # a single model: keep it (index 0), nothing to select
189 | 
190 |         for n in range(int(len(predictions_list)/2), len(predictions_list)):
191 |             cb = itertools.combinations(range(len(predictions_list)), n)
192 |             combination_results = pool.map(partial_compute_subset_auc, cb)
193 |             best_subset_auc, best_subset_indices = max(
194 |                 combination_results, key=itemgetter(0))
195 |             print "- best subset auc (%d models): %.4f > %s" % (
196 |                 n, best_subset_auc, list(best_subset_indices))
197 |             if best_subset_auc > best_auc:
198 |                 best_auc = best_subset_auc
199 |                 best_n = n
200 |                 best_indices = list(best_subset_indices)
201 |         pool.terminate()
202 | 
203 |         logger.info("best auc: %.4f", best_auc)
204 |         logger.info("best n: %d", best_n)
205 |         logger.info("best indices: %s", best_indices)
206 |         for i, (model, feature_set) in enumerate(self.models):
207 |             if i in best_subset_indices:
208 |                 logger.info("> model: %s (%s)", model.__class__.__name__,
209 |                             feature_set)
210 | 
211 |         return best_subset_indices
212 | 
213 |     def _get_model_preds(self, model, X_train, X_predict, y_train, cache_file):
214 |         """
215 |         Return the model predictions on the prediction set,
216 |         using cache if possible.
217 |         """
218 |         model_output = load_from_cache(
219 |             "models/%s/%s.pkl" % (self.cache_dir, cache_file),
220 |             self.use_cached_models)
221 | 
222 |         model_params, model_preds = model_output \
223 |             if model_output is not None else (None, None)
224 | 
225 |         if model_preds is None or model_params != model.get_params():
226 |             model.fit(X_train, y_train)
227 |             model_preds = model.predict_proba(X_predict)[:, 1]
228 |             with open("cache/models/%s/%s.pkl" % (
229 |                     self.cache_dir, cache_file), 'wb') as f:
230 |                 pickle.dump((model.get_params(), model_preds), f)
231 | 
232 |         return model_preds
233 | 
234 |     def _get_model_cv_preds(self, model, X_train, y_train, cache_file):
235 |         """
236 |         Return cross-validation predictions on the training set, using cache
237 |         if possible.
238 |         This is used if stacking is enabled (i.e. a second model is used to
239 |         combine the stage 0 predictions).
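
        For reference, the scheme is: split the training set into 4 stratified
        folds, fit the model on three folds, predict the held-out fold, then
        re-order the concatenated out-of-fold predictions with the recorded
        indices so that row i is the prediction for training sample i (e.g. if
        indexes_cv were [2, 0, 1], argsort gives [1, 2, 0] and restores the
        original row order).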
240 |         """
241 |         stack_preds = load_from_cache(
242 |             "models/%s/cv_preds/%s.pkl" % (self.cache_dir, cache_file),
243 |             self.use_cached_models)
244 | 
245 |         if stack_preds is None:
246 |             kfold = cross_validation.StratifiedKFold(y_train, 4)
247 |             stack_preds = []
248 |             indexes_cv = []
249 |             for stage0, stack in kfold:
250 |                 model.fit(X_train[stage0], y_train[stage0])
251 |                 stack_preds.extend(list(model.predict_proba(
252 |                     X_train[stack])[:, 1]))
253 |                 indexes_cv.extend(list(stack))
254 |             stack_preds = np.array(stack_preds)[sp.argsort(indexes_cv)]
255 | 
256 |             with open("cache/models/%s/cv_preds/%s.pkl" % (
257 |                     self.cache_dir, cache_file), 'wb') as f:
258 |                 pickle.dump(stack_preds, f, pickle.HIGHEST_PROTOCOL)
259 | 
260 |         return stack_preds
261 | 
262 |     def fit_predict(self, y, train=None, predict=None, show_steps=True):
263 |         """
264 |         Fit each model on the appropriate dataset, then return the average
265 |         of their individual predictions. If train is specified, use a subset
266 |         of the training set to train the models, then predict the outcome of
267 |         either the remaining samples or (if given) those specified in predict.
268 |         If train is omitted, train the models on the full training set, then
269 |         predict the outcome of the full test set.
270 | 
271 |         Options:
272 |         ------------------------------
273 |         - y: numpy array. The full vector of the ground truths.
274 |         - train: list. The indices of the elements to be used for training.
275 |             If None, take the entire training set.
276 |         - predict: list. The indices of the elements to be predicted.
277 |         - show_steps: boolean. Whether to compute metrics after each stage
278 |             of the computation.
279 |         """
280 |         y_train = y[train] if train is not None else y
281 |         if train is not None and predict is None:
282 |             predict = [i for i in range(len(y)) if i not in train]
283 | 
284 |         stage0_train = []
285 |         stage0_predict = []
286 |         for model, feature_set in self.models:
287 |             X_train, X_predict = get_dataset(feature_set, train, predict)
288 | 
289 |             identifier = train[0] if train is not None else -1
290 |             cache_file = stringify(model, feature_set) + str(identifier)
291 | 
292 |             model_preds = self._get_model_preds(
293 |                 model, X_train, X_predict, y_train, cache_file)
294 |             stage0_predict.append(model_preds)
295 | 
296 |             # if stacking, compute cross-validated predictions on the train set
297 |             if self.stack:
298 |                 model_cv_preds = self._get_model_cv_preds(
299 |                     model, X_train, y_train, cache_file)
300 |                 stage0_train.append(model_cv_preds)
301 | 
302 |             # verbose mode: compute metrics after every model computation
303 |             if show_steps:
304 |                 if train is not None:
305 |                     mean_preds, stack_preds, fwls_preds = self._combine_preds(
306 |                         np.array(stage0_train).T, np.array(stage0_predict).T,
307 |                         y_train, train, predict,
308 |                         stack=self.stack, fwls=self.fwls)
309 | 
310 |                     model_auc = compute_auc(y[predict], stage0_predict[-1])
311 |                     mean_auc = compute_auc(y[predict], mean_preds)
312 |                     stack_auc = compute_auc(y[predict], stack_preds) \
313 |                         if self.stack else 0
314 |                     fwls_auc = compute_auc(y[predict], fwls_preds) \
315 |                         if self.fwls else 0
316 | 
317 |                     logger.info(
318 |                         "> AUC: %.4f (%.4f, %.4f, %.4f) [%s]", model_auc,
319 |                         mean_auc, stack_auc, fwls_auc,
320 |                         stringify(model, feature_set))
321 |                 else:
322 |                     logger.info("> used model %s:\n%s", stringify(
323 |                         model, feature_set), model.get_params())
324 | 
325 |         if self.model_selection and predict is not None:
326 |             best_subset = self._find_best_subset(y[predict], stage0_predict)
327 |             stage0_train = [pred for i,
pred in enumerate(stage0_train) 328 | if i in best_subset] 329 | stage0_predict = [pred for i, pred in enumerate(stage0_predict) 330 | if i in best_subset] 331 | 332 | mean_preds, stack_preds, fwls_preds = self._combine_preds( 333 | np.array(stage0_train).T, np.array(stage0_predict).T, 334 | y_train, stack=self.stack, fwls=self.fwls) 335 | 336 | if self.stack: 337 | selected_preds = stack_preds if not self.fwls else fwls_preds 338 | else: 339 | selected_preds = mean_preds 340 | 341 | return selected_preds 342 | 343 | 344 | def compute_subset_auc(indices, pred_set, y): 345 | subset = [vect for i, vect in enumerate(pred_set) if i in indices] 346 | mean_preds = sp.mean(subset, axis=0) 347 | mean_auc = compute_auc(y, mean_preds) 348 | 349 | return mean_auc, indices 350 | 351 | 352 | def find_params(model, feature_set, y, subsample=None, grid_search=False): 353 | """ 354 | Return parameter set for the model, either predefined 355 | or found through grid search. 356 | """ 357 | model_name = model.__class__.__name__ 358 | params = INITIAL_PARAMS.get(model_name, {}) 359 | y = y if subsample is None else y[subsample] 360 | 361 | try: 362 | with open('saved_params.json') as f: 363 | saved_params = json.load(f) 364 | except IOError: 365 | saved_params = {} 366 | 367 | if (grid_search and model_name in PARAM_GRID and stringify( 368 | model, feature_set) not in saved_params): 369 | X, _ = get_dataset(feature_set, subsample, [0]) 370 | clf = GridSearchCV(model, PARAM_GRID[model_name], cv=10, n_jobs=6, 371 | scoring="roc_auc") 372 | clf.fit(X, y) 373 | logger.info("found params (%s > %.4f): %s", 374 | stringify(model, feature_set), 375 | clf.best_score_, clf.best_params_) 376 | params.update(clf.best_params_) 377 | saved_params[stringify(model, feature_set)] = params 378 | with open('saved_params.json', 'w') as f: 379 | json.dump(saved_params, f, indent=4, separators=(',', ': '), 380 | ensure_ascii=True, sort_keys=True) 381 | else: 382 | params.update(saved_params.get(stringify(model, feature_set), {})) 383 | if grid_search: 384 | logger.info("using params %s: %s", stringify(model, feature_set), 385 | params) 386 | 387 | return params 388 | -------------------------------------------------------------------------------- /helpers/utils.py: -------------------------------------------------------------------------------- 1 | """utils.py 2 | 3 | Some useful functions. 4 | Author: Paul Duan 5 | """ 6 | 7 | from re import sub 8 | from sklearn.metrics import roc_curve, auc 9 | 10 | 11 | def stringify(model, feature_set): 12 | """Given a model and a feature set, return a short string that will serve 13 | as identifier for this combination. 
14 | Ex: (LogisticRegression(), "basic_s") -> "LR:basic_s" 15 | """ 16 | return "%s:%s" % (sub("[a-z]", '', model.__class__.__name__), feature_set) 17 | 18 | 19 | def compute_auc(y, y_pred): 20 | fpr, tpr, _ = roc_curve(y, y_pred) 21 | return auc(fpr, tpr) 22 | -------------------------------------------------------------------------------- /history.log: -------------------------------------------------------------------------------- 1 | # All actions will be logged here 2 | -------------------------------------------------------------------------------- /plots/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore all except this file 2 | * 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /saved_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "ETC:basic_b": { 3 | "bootstrap": true, 4 | "max_depth": null, 5 | "max_features": "sqrt", 6 | "min_samples_leaf": 2, 7 | "min_samples_split": 8 8 | }, 9 | "ETC:basic_f": { 10 | "bootstrap": false, 11 | "max_depth": 25, 12 | "max_features": 11, 13 | "min_samples_leaf": 3, 14 | "min_samples_split": 1 15 | }, 16 | "ETC:basic_fd": { 17 | "bootstrap": false, 18 | "max_depth": 30, 19 | "max_features": 11, 20 | "min_samples_leaf": 3, 21 | "min_samples_split": 1 22 | }, 23 | "ETC:bsfeats": { 24 | "bootstrap": true, 25 | "max_depth": null, 26 | "max_features": "sqrt", 27 | "min_samples_leaf": 2, 28 | "min_samples_split": 8 29 | }, 30 | "ETC:greedy": { 31 | "bootstrap": false, 32 | "max_depth": 30, 33 | "max_features": 11, 34 | "min_samples_leaf": 3, 35 | "min_samples_split": 1 36 | }, 37 | "ETC:greedy2": { 38 | "bootstrap": false, 39 | "max_depth": 25, 40 | "max_features": 11, 41 | "min_samples_leaf": 3, 42 | "min_samples_split": 1 43 | }, 44 | "ETC:greedy2_f": { 45 | "bootstrap": false, 46 | "max_depth": 25, 47 | "max_features": 11, 48 | "min_samples_leaf": 3, 49 | "min_samples_split": 1 50 | }, 51 | "ETC:greedy2_fd": { 52 | "bootstrap": false, 53 | "max_depth": 30, 54 | "max_features": 11, 55 | "min_samples_leaf": 2, 56 | "min_samples_split": 5 57 | }, 58 | "ETC:greedy3": { 59 | "bootstrap": false, 60 | "max_depth": 30, 61 | "max_features": 11, 62 | "min_samples_leaf": 2, 63 | "min_samples_split": 5 64 | }, 65 | "ETC:greedy3_f": { 66 | "bootstrap": false, 67 | "max_depth": 20, 68 | "max_features": 11, 69 | "min_samples_leaf": 3, 70 | "min_samples_split": 1 71 | }, 72 | "ETC:greedy3_fd": { 73 | "bootstrap": false, 74 | "max_depth": 30, 75 | "max_features": 11, 76 | "min_samples_leaf": 3, 77 | "min_samples_split": 1 78 | }, 79 | "ETC:greedy_f": { 80 | "bootstrap": false, 81 | "max_depth": 30, 82 | "max_features": 11, 83 | "min_samples_leaf": 3, 84 | "min_samples_split": 1 85 | }, 86 | "ETC:greedy_fd": { 87 | "bootstrap": false, 88 | "max_depth": 25, 89 | "max_features": 11, 90 | "min_samples_leaf": 3, 91 | "min_samples_split": 1 92 | }, 93 | "ETC:tuples_f": { 94 | "bootstrap": false, 95 | "max_depth": 25, 96 | "max_features": 11, 97 | "min_samples_leaf": 3, 98 | "min_samples_split": 1 99 | }, 100 | "ETC:tuples_fd": { 101 | "bootstrap": false, 102 | "max_depth": 20, 103 | "max_features": 11, 104 | "min_samples_leaf": 3, 105 | "min_samples_split": 1 106 | }, 107 | "GBC:basic_b": { 108 | "learning_rate": 0.005, 109 | "max_depth": 20, 110 | "min_samples_split": 9 111 | }, 112 | "GBC:basic_f": { 113 | "learning_rate": 0.05, 114 | "max_depth": 10, 115 | "max_features": 5, 116 | "min_samples_leaf": 1, 117 | 
"min_samples_split": 3 118 | }, 119 | "GBC:bsfeats": { 120 | "learning_rate": 0.005, 121 | "max_depth": 20, 122 | "min_samples_split": 9 123 | }, 124 | "GBC:greedy": { 125 | "learning_rate": 0.08, 126 | "max_depth": 10, 127 | "max_features": 6, 128 | "min_samples_leaf": 1, 129 | "min_samples_split": 3 130 | }, 131 | "GBC:greedy2": { 132 | "learning_rate": 0.08, 133 | "max_depth": 10, 134 | "max_features": 6, 135 | "min_samples_leaf": 1, 136 | "min_samples_split": 3 137 | }, 138 | "GBC:greedy2_f": { 139 | "learning_rate": 0.08, 140 | "max_depth": 10, 141 | "max_features": 3, 142 | "min_samples_leaf": 1, 143 | "min_samples_split": 3 144 | }, 145 | "GBC:greedy2_fd": { 146 | "learning_rate": 0.08, 147 | "max_depth": 10, 148 | "max_features": 7, 149 | "min_samples_leaf": 1, 150 | "min_samples_split": 3 151 | }, 152 | "GBC:greedy3": { 153 | "learning_rate": 0.08, 154 | "max_depth": 10, 155 | "max_features": 6, 156 | "min_samples_leaf": 1, 157 | "min_samples_split": 3 158 | }, 159 | "GBC:greedy3_f": { 160 | "learning_rate": 0.08, 161 | "max_depth": 10, 162 | "max_features": 3, 163 | "min_samples_leaf": 1, 164 | "min_samples_split": 3 165 | }, 166 | "GBC:greedy3_fd": { 167 | "learning_rate": 0.08, 168 | "max_depth": 10, 169 | "max_features": 7, 170 | "min_samples_leaf": 1, 171 | "min_samples_split": 3 172 | }, 173 | "GBC:greedy_f": { 174 | "learning_rate": 0.08, 175 | "max_depth": 13, 176 | "max_features": 4, 177 | "min_samples_leaf": 1, 178 | "min_samples_split": 3 179 | }, 180 | "GBC:greedy_fd": { 181 | "learning_rate": 0.08, 182 | "max_depth": 10, 183 | "max_features": 7, 184 | "min_samples_leaf": 1, 185 | "min_samples_split": 3 186 | }, 187 | "GBC:tuples_f": { 188 | "learning_rate": 0.05, 189 | "max_depth": 10, 190 | "max_features": 4, 191 | "min_samples_leaf": 1, 192 | "min_samples_split": 3 193 | }, 194 | "GBC:tuples_fd": { 195 | "learning_rate": 0.08, 196 | "max_depth": 10, 197 | "max_features": 6, 198 | "min_samples_leaf": 1, 199 | "min_samples_split": 3 200 | }, 201 | "LR:basic_sf": { 202 | "C": 3.5, 203 | "class_weight": "auto", 204 | "penalty": "l2" 205 | }, 206 | "LR:basic_sfd": { 207 | "C": 5.5, 208 | "class_weight": "auto", 209 | "penalty": "l2" 210 | }, 211 | "LR:basic_sfl": { 212 | "C": 3.5, 213 | "class_weight": "auto", 214 | "penalty": "l2" 215 | }, 216 | "LR:consolidated_s": { 217 | "C": 1.5, 218 | "class_weight": "auto", 219 | "penalty": "l2" 220 | }, 221 | "LR:consolidated_sf": { 222 | "C": 5, 223 | "class_weight": "auto", 224 | "penalty": "l2" 225 | }, 226 | "LR:greedy2_sbl": { 227 | "C": 3.5, 228 | "class_weight": "auto", 229 | "penalty": "l2" 230 | }, 231 | "LR:greedy2_sf": { 232 | "C": 5.5, 233 | "class_weight": "auto", 234 | "penalty": "l2" 235 | }, 236 | "LR:greedy2_sfd": { 237 | "C": 5.5, 238 | "class_weight": "auto", 239 | "penalty": "l2" 240 | }, 241 | "LR:greedy2_sfl": { 242 | "C": 5, 243 | "class_weight": "auto", 244 | "penalty": "l2" 245 | }, 246 | "LR:greedy3_sbl": { 247 | "C": 5.5, 248 | "class_weight": "auto", 249 | "penalty": "l2" 250 | }, 251 | "LR:greedy3_sf": { 252 | "C": 5, 253 | "class_weight": "auto", 254 | "penalty": "l2" 255 | }, 256 | "LR:greedy3_sfd": { 257 | "C": 5.5, 258 | "class_weight": "auto", 259 | "penalty": "l2" 260 | }, 261 | "LR:greedy3_sfl": { 262 | "C": 5.5, 263 | "class_weight": "auto", 264 | "penalty": "l2" 265 | }, 266 | "LR:greedy_sbl": { 267 | "C": 5.5, 268 | "class_weight": "auto", 269 | "penalty": "l2" 270 | }, 271 | "LR:greedy_sf": { 272 | "C": 3.5, 273 | "class_weight": "auto", 274 | "penalty": "l2" 275 | }, 276 | "LR:greedy_sfl": 
{ 277 | "C": 5, 278 | "class_weight": "auto", 279 | "penalty": "l2" 280 | }, 281 | "LR:triples_sbl": { 282 | "C": 3, 283 | "class_weight": "auto", 284 | "penalty": "l2" 285 | }, 286 | "LR:tuples_sbl": { 287 | "C": 3, 288 | "class_weight": "auto", 289 | "penalty": "l2" 290 | }, 291 | "LR:tuples_sf": { 292 | "C": 2, 293 | "class_weight": "auto", 294 | "penalty": "l2" 295 | }, 296 | "LR:tuples_sfd": { 297 | "C": 2.5, 298 | "class_weight": "auto", 299 | "penalty": "l2" 300 | }, 301 | "LR:tuples_sfl": { 302 | "C": 2.5, 303 | "class_weight": "auto", 304 | "penalty": "l2" 305 | }, 306 | "RFC:basic_b": { 307 | "bootstrap": true, 308 | "max_depth": null, 309 | "max_features": "sqrt", 310 | "min_samples_leaf": 2, 311 | "min_samples_split": 8 312 | }, 313 | "RFC:basic_f": { 314 | "bootstrap": false, 315 | "max_depth": 15, 316 | "max_features": 3, 317 | "min_samples_leaf": 2, 318 | "min_samples_split": 7 319 | }, 320 | "RFC:basic_fd": { 321 | "bootstrap": false, 322 | "max_depth": 30, 323 | "max_features": 11, 324 | "min_samples_leaf": 2, 325 | "min_samples_split": 7 326 | }, 327 | "RFC:bsfeats": { 328 | "bootstrap": true, 329 | "max_depth": null, 330 | "max_features": "sqrt", 331 | "min_samples_leaf": 2, 332 | "min_samples_split": 8 333 | }, 334 | "RFC:effects_f": { 335 | "bootstrap": false, 336 | "max_depth": 15, 337 | "max_features": 15, 338 | "min_samples_leaf": 2, 339 | "min_samples_split": 5, 340 | "n_estimators": 500, 341 | "n_jobs": 1 342 | }, 343 | "RFC:effects_b": { 344 | "bootstrap": false, 345 | "max_depth": 15, 346 | "max_features": 15, 347 | "min_samples_leaf": 2, 348 | "min_samples_split": 5, 349 | "n_estimators": 500, 350 | "n_jobs": 1 351 | }, 352 | "RFC:greedy": { 353 | "bootstrap": false, 354 | "max_depth": null, 355 | "max_features": 11, 356 | "min_samples_leaf": 2, 357 | "min_samples_split": 7 358 | }, 359 | "RFC:greedy2": { 360 | "bootstrap": false, 361 | "max_depth": 30, 362 | "max_features": 8, 363 | "min_samples_leaf": 2, 364 | "min_samples_split": 8 365 | }, 366 | "RFC:greedy2_f": { 367 | "bootstrap": false, 368 | "max_depth": 25, 369 | "max_features": 3, 370 | "min_samples_leaf": 2, 371 | "min_samples_split": 8 372 | }, 373 | "RFC:greedy2_fd": { 374 | "bootstrap": false, 375 | "max_depth": 25, 376 | "max_features": 11, 377 | "min_samples_leaf": 2, 378 | "min_samples_split": 7 379 | }, 380 | "RFC:greedy3": { 381 | "bootstrap": false, 382 | "max_depth": null, 383 | "max_features": 11, 384 | "min_samples_leaf": 2, 385 | "min_samples_split": 8 386 | }, 387 | "RFC:greedy3_f": { 388 | "bootstrap": false, 389 | "max_depth": 25, 390 | "max_features": 8, 391 | "min_samples_leaf": 2, 392 | "min_samples_split": 8 393 | }, 394 | "RFC:greedy3_fd": { 395 | "bootstrap": false, 396 | "max_depth": 15, 397 | "max_features": 11, 398 | "min_samples_leaf": 2, 399 | "min_samples_split": 8 400 | }, 401 | "RFC:greedy_f": { 402 | "bootstrap": false, 403 | "max_depth": 25, 404 | "max_features": 3, 405 | "min_samples_leaf": 2, 406 | "min_samples_split": 7 407 | }, 408 | "RFC:greedy_fd": { 409 | "bootstrap": false, 410 | "max_depth": 15, 411 | "max_features": 8, 412 | "min_samples_leaf": 2, 413 | "min_samples_split": 7 414 | }, 415 | "RFC:tuples_f": { 416 | "bootstrap": false, 417 | "max_depth": 25, 418 | "max_features": 3, 419 | "min_samples_leaf": 2, 420 | "min_samples_split": 7 421 | }, 422 | "RFC:tuples_fd": { 423 | "bootstrap": false, 424 | "max_depth": null, 425 | "max_features": 11, 426 | "min_samples_leaf": 2, 427 | "min_samples_split": 7 428 | }, 429 | "SGDC:basic_sf": { 430 | "alpha": 0.0003, 
431 | "l1_ratio": 0.1, 432 | "loss": "log", 433 | "penalty": "l2" 434 | }, 435 | "SGDC:basic_sfl": { 436 | "alpha": 0.0003, 437 | "l1_ratio": 0.1, 438 | "loss": "log", 439 | "penalty": "l2" 440 | }, 441 | "SGDC:greedy2_sf": { 442 | "alpha": 0.0001, 443 | "l1_ratio": 0.1, 444 | "loss": "log", 445 | "penalty": "l2" 446 | }, 447 | "SGDC:greedy2_sfd": { 448 | "alpha": 0.0003, 449 | "l1_ratio": 0.1, 450 | "loss": "log", 451 | "penalty": "l2" 452 | }, 453 | "SGDC:greedy2_sfl": { 454 | "alpha": 0.0003, 455 | "l1_ratio": 0.1, 456 | "loss": "log", 457 | "penalty": "l2" 458 | }, 459 | "SGDC:greedy3_sf": { 460 | "alpha": 0.0001, 461 | "l1_ratio": 0.1, 462 | "loss": "log", 463 | "penalty": "l2" 464 | }, 465 | "SGDC:greedy3_sfd": { 466 | "alpha": 0.0003, 467 | "l1_ratio": 0.1, 468 | "loss": "log", 469 | "penalty": "l2" 470 | }, 471 | "SGDC:greedy3_sfl": { 472 | "alpha": 0.0003, 473 | "l1_ratio": 0.1, 474 | "loss": "log", 475 | "penalty": "l2" 476 | }, 477 | "SGDC:greedy_sf": { 478 | "alpha": 8e-05, 479 | "l1_ratio": 0.1, 480 | "loss": "log", 481 | "penalty": "l2" 482 | }, 483 | "SGDC:greedy_sfl": { 484 | "alpha": 0.0003, 485 | "l1_ratio": 0.1, 486 | "loss": "log", 487 | "penalty": "l2" 488 | }, 489 | "SGDC:tuples_sf": { 490 | "alpha": 0.0003, 491 | "l1_ratio": 0.1, 492 | "loss": "log", 493 | "penalty": "l2" 494 | }, 495 | "SGDC:tuples_sfd": { 496 | "alpha": 0.0003, 497 | "l1_ratio": 0.1, 498 | "loss": "log", 499 | "penalty": "l2" 500 | }, 501 | "SGDC:tuples_sfl": { 502 | "alpha": 0.0003, 503 | "l1_ratio": 0.1, 504 | "loss": "log", 505 | "penalty": "l2" 506 | } 507 | } 508 | -------------------------------------------------------------------------------- /submissions/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore all files except this one 2 | * 3 | !.gitignore 4 | --------------------------------------------------------------------------------