├── Avazu-CTR ├── finalModel.py └── readme.md ├── BNP ├── bayesian_encode_fivelevel_withint.py └── readme.md ├── Datasets-Indian_Premier_League ├── IPL_Exploration.ipynb ├── IPL_Win_Prediction.ipynb └── readme.md ├── Expedia ├── DataPrep │ ├── getBookings.py │ ├── getClicks.py │ ├── getLeakFree.py │ ├── getLeakRows_test.py │ ├── getLeakRows_val.py │ ├── readme.md │ ├── splitDevVal.py │ ├── splitDevVal_Bookings.py │ ├── splitDevVal_Clicks.py │ └── splitNonLeak_Dist.py └── readme.md ├── GhoulsGoblinsGhost ├── kaggle_simple_exploration_notebook.ipynb └── readme.md ├── LibertyMutual ├── featureSelection.py ├── finalModel.py ├── predict.py ├── prepareData.py └── readme.md ├── MMM15 ├── finalModel.py ├── prepareData.py ├── readme.md └── seed_model.py ├── OutBrain ├── ftrl.py └── readme.md ├── README.md ├── SantanderReco ├── keras_starter_kaggle.py ├── multilabel_classification.py ├── readme.md └── santander_exploartion.ipynb ├── SpookyAuthor ├── readme.md └── simple_fe_notebook_spooky_author.ipynb ├── Titanic ├── Titanic_Exploration.ipynb └── readme.md ├── TransferLearningStackExchange ├── frequent_words_model.py ├── readme.md └── simple_exploration_notebook.ipynb ├── TwoSigmaConnect_RentHop ├── SimpleExplorationNotebook.ipynb ├── XGBStarterInPython.ipynb └── readme.md ├── TwoSigmaFinancialModeling ├── OverfittingCheck.ipynb ├── SimpleExplorationNotebook.ipynb ├── UnivariateAnalysis.ipynb └── readme.md └── Walmart_TripType ├── NeuralNets ├── config_v2.py ├── neural_net.py ├── prepData.py └── readme.md ├── XGB ├── config_v5.py ├── prepData.py ├── readme.md └── xgb_model.py └── readme.md /Avazu-CTR/finalModel.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Nov 21 18:20:23 2014 5 | 6 | Used interactions using this post 7 | http://www.kaggle.com/c/avazu-ctr-prediction/forums/t/11331/feature-vector-dimension 8 | """ 9 | 10 | from datetime import datetime 11 | from csv import DictReader 12 | from math import exp, log, sqrt 13 | 14 | 15 | # TL; DR, the main training process starts on line: 250, 16 | # you may want to start reading the code from there 17 | 18 | 19 | ############################################################################## 20 | # parameters ################################################################# 21 | ############################################################################## 22 | 23 | # A, paths 24 | data_path = "Path to data" 25 | train = data_path+'train.csv' # path to training file 26 | test = data_path+'test.csv' # path to testing file 27 | submission = 'submission.csv' # path of to be outputted submission file 28 | 29 | # B, model 30 | alpha = .08 # learning rate 31 | beta = 1. # smoothing parameter for adaptive learning rate 32 | L1 = 3. # L1 regularization, larger value means more regularized 33 | L2 = 1. 
# L2 regularization, larger value means more regularized 34 | 35 | # C, feature/hash trick 36 | D = 2 ** 25 # number of weights to use 37 | interaction = False # whether to enable poly2 feature interactions 38 | 39 | # D, training/validation 40 | epoch = 1 # learn training data for N passes 41 | holdafter = None # data after date N (exclusive) are used as validation 42 | holdout = None # use every N training instance for holdout validation 43 | 44 | 45 | ############################################################################## 46 | # class, function, generator definitions ##################################### 47 | ############################################################################## 48 | 49 | class ftrl_proximal(object): 50 | ''' Our main algorithm: Follow the regularized leader - proximal 51 | 52 | In short, 53 | this is an adaptive-learning-rate sparse logistic-regression with 54 | efficient L1-L2-regularization 55 | 56 | Reference: 57 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 58 | ''' 59 | 60 | def __init__(self, alpha, beta, L1, L2, D, interaction): 61 | # parameters 62 | self.alpha = alpha 63 | self.beta = beta 64 | self.L1 = L1 65 | self.L2 = L2 66 | 67 | # feature related parameters 68 | self.D = D 69 | self.interaction = interaction 70 | 71 | # model 72 | # n: squared sum of past gradients 73 | # z: weights 74 | # w: lazy weights 75 | self.n = [0.] * D 76 | self.z = [0.] * D 77 | self.w = {} 78 | 79 | def _indices(self, x): 80 | ''' A helper generator that yields the indices in x 81 | 82 | The purpose of this generator is to make the following 83 | code a bit cleaner when doing feature interaction. 84 | ''' 85 | 86 | # first yield index of the bias term 87 | yield 0 88 | 89 | # then yield the normal indices 90 | for index in x: 91 | yield index 92 | 93 | # now yield interactions (if applicable) 94 | if self.interaction: 95 | D = self.D 96 | L = len(x) 97 | 98 | x = sorted(x) 99 | for i in xrange(L): 100 | for j in xrange(i+1, L): 101 | # one-hot encode interactions with hash trick 102 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 103 | 104 | def predict(self, x): 105 | ''' Get probability estimation on x 106 | 107 | INPUT: 108 | x: features 109 | 110 | OUTPUT: 111 | probability of p(y = 1 | x; w) 112 | ''' 113 | 114 | # parameters 115 | alpha = self.alpha 116 | beta = self.beta 117 | L1 = self.L1 118 | L2 = self.L2 119 | 120 | # model 121 | n = self.n 122 | z = self.z 123 | w = {} 124 | 125 | # wTx is the inner product of w and x 126 | wTx = 0. 127 | for i in self._indices(x): 128 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 129 | 130 | # build w on the fly using z and n, hence the name - lazy weights 131 | # we are doing this at prediction instead of update time is because 132 | # this allows us for not storing the complete w 133 | if sign * z[i] <= L1: 134 | # w[i] vanishes due to L1 regularization 135 | w[i] = 0. 136 | else: 137 | # apply prediction time L1, L2 regularization to z and get w 138 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 139 | 140 | wTx += w[i] 141 | 142 | # cache the current w for update stage 143 | self.w = w 144 | 145 | # bounded sigmoid function, this is the probability estimation 146 | return 1. / (1. 
+ exp(-max(min(wTx, 35.), -35.))) 147 | 148 | def update(self, x, p, y): 149 | ''' Update model using x, p, y 150 | 151 | INPUT: 152 | x: feature, a list of indices 153 | p: click probability prediction of our model 154 | y: answer 155 | 156 | MODIFIES: 157 | self.n: increase by squared gradient 158 | self.z: weights 159 | ''' 160 | 161 | # parameter 162 | alpha = self.alpha 163 | 164 | # model 165 | n = self.n 166 | z = self.z 167 | w = self.w 168 | 169 | # gradient under logloss 170 | g = p - y 171 | 172 | # update z and n 173 | for i in self._indices(x): 174 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 175 | z[i] += g - sigma * w[i] 176 | n[i] += g * g 177 | 178 | 179 | def logloss(p, y): 180 | ''' FUNCTION: Bounded logloss 181 | 182 | INPUT: 183 | p: our prediction 184 | y: real answer 185 | 186 | OUTPUT: 187 | logarithmic loss of p given y 188 | ''' 189 | 190 | p = max(min(p, 1. - 10e-15), 10e-15) 191 | return -log(p) if y == 1. else -log(1. - p) 192 | 193 | 194 | def data(path, D): 195 | ''' GENERATOR: Apply hash-trick to the original csv row 196 | and for simplicity, we one-hot-encode everything 197 | 198 | INPUT: 199 | path: path to training or testing file 200 | D: the max index that we can hash to 201 | 202 | YIELDS: 203 | ID: id of the instance, mainly useless 204 | x: a list of hashed and one-hot-encoded 'indices' 205 | we only need the index since all values are either 0 or 1 206 | y: y = 1 if we have a click, else we have y = 0 207 | ''' 208 | 209 | for t, row in enumerate(DictReader(open(path))): 210 | # process id 211 | ID = row['id'] 212 | del row['id'] 213 | 214 | # process clicks 215 | y = 0. 216 | if 'click' in row: 217 | if row['click'] == '1': 218 | y = 1. 219 | del row['click'] 220 | 221 | # extract date 222 | date = int(row['hour'][4:6]) 223 | 224 | # turn hour really into hour, it was originally YYMMDDHH 225 | row['hour'] = row['hour'][6:] 226 | 227 | # creating two way feature interactions for some variables based on the feature explanations (not an ideal method though!) 
228 | row['C1_bannerpos'] = row['C1']+ "_" + row['banner_pos'] 229 | row['site_app_category'] = row['site_category'] + '_' +row['app_category'] 230 | row['site_domin_app_category'] = row['site_domain'] + '_' + row['app_category'] 231 | row['app_domain_site_category'] = row['app_domain'] + '_' + row['site_category'] 232 | row['banner_pos_site_id'] = row['banner_pos'] + '_' + row['site_id'] 233 | row['banner_pos_app_id'] = row['banner_pos'] + '_' + row['app_id'] 234 | row['banner_pos_device_model'] = row['banner_pos'] + '_' + row['device_model'] 235 | row['banner_pos_device_conn_type'] = row['banner_pos'] + '_' + row['device_conn_type'] 236 | row['site_id_device_model'] = row['site_id'] + '_' + row['device_model'] 237 | row['app_id_device_model'] = row['app_id'] + '_' + row['device_model'] 238 | row['site_id_app_id'] = row['site_id'] + '_' + row['app_id'] 239 | row['site_id_device_conn_type'] = row['site_id'] + '_' +row['device_conn_type'] 240 | row['app_id_device_conn_type'] = row['app_id'] + '_' +row['device_conn_type'] 241 | row['C14_C17'] = row['C14'] + '_' + row['C17'] 242 | row['C14_C20'] = row['C14'] + '_' + row['C20'] 243 | row['C15_C16'] = row['C15'] + '_' + row['C16'] 244 | row['C15_C18'] = row['C15'] + '_' + row['C18'] 245 | row['C17_C20'] = row['C17'] + '_' + row['C20'] 246 | row['C16_C18'] = row['C16'] + '_' + row['C18'] 247 | row['C19_C21'] = row['C19'] + '_' + row['C21'] 248 | row['C20_C21'] = row['C20'] + '_' + row['C21'] 249 | row['C20_site_id'] = row['C20'] + '_' + row['site_id'] 250 | row['C17_site_id'] = row['C17'] + '_' + row['site_id'] 251 | row['C20_app_id'] = row['C20'] + '_' + row['app_id'] 252 | row['C17_app_id'] = row['C17'] + '_' + row['app_id'] 253 | 254 | 255 | # build x 256 | x = [] 257 | for key in row: 258 | value = row[key] 259 | 260 | # one-hot encode everything with hash trick 261 | index = abs(hash(key + '_' + value)) % D 262 | x.append(index) 263 | 264 | yield t, date, ID, x, y 265 | 266 | 267 | ############################################################################## 268 | # start training ############################################################# 269 | ############################################################################## 270 | 271 | start = datetime.now() 272 | 273 | # initialize ourselves a learner 274 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 275 | 276 | # start training 277 | for e in xrange(epoch): 278 | loss = 0. 
279 | count = 0 280 | 281 | for t, date, ID, x, y in data(train, D): # data is a generator 282 | # t: just a instance counter 283 | # date: you know what this is 284 | # ID: id provided in original data 285 | # x: features 286 | # y: label (click) 287 | 288 | # step 1, get prediction from learner 289 | p = learner.predict(x) 290 | 291 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 292 | # step 2-1, calculate validation loss 293 | # we do not train with the validation data so that our 294 | # validation loss is an accurate estimation 295 | # 296 | # holdafter: train instances from day 1 to day N 297 | # validate with instances from day N + 1 and after 298 | # 299 | # holdout: validate with every N instance, train with others 300 | loss += logloss(p, y) 301 | count += 1 302 | else: 303 | # step 2-2, update learner with label (click) information 304 | learner.update(x, p, y) 305 | 306 | #print('Epoch %d finished, validation logloss: %f, elapsed time: %s' % ( 307 | # e, loss/count, str(datetime.now() - start))) 308 | 309 | 310 | ############################################################################## 311 | # start testing, and build Kaggle's submission file ########################## 312 | ############################################################################## 313 | 314 | with open(submission, 'w') as outfile: 315 | outfile.write('id,click\n') 316 | for t, date, ID, x, y in data(test, D): 317 | p = learner.predict(x) 318 | outfile.write('%s,%s\n' % (ID, str(p))) 319 | -------------------------------------------------------------------------------- /Avazu-CTR/readme.md: -------------------------------------------------------------------------------- 1 | This folder consists of the codes written for the [Avazu - Click Through Rate](https://www.kaggle.com/c/avazu-ctr-prediction) Kaggle competition. 2 | 3 | The approach followed here is Follow The Regularized Leader - Proximal (FTRL). It is an online learning algorithm and the algorithm can be read from [Google paper](http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf) and this [paper](http://people.csail.mit.edu/romer/papers/TISTRespPredAds.pdf) 4 | 5 | Thanks to Tintgru for the [base code of the algorithm](https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory) implemented in python 6 | 7 | Also from this [paper](http://quinonero.net/Publications/predicting-clicks-facebook.pdf), it is shown that feature interactions improved the performance in the similar problems. This idea is added to the base code to get better result. 
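As a rough, hedged illustration of that idea (a sketch, not code taken from `finalModel.py`), the snippet below shows how two categorical fields can be concatenated into a two-way interaction and hashed into a fixed-size index space; the field names and the hash-space size `D` here are illustrative assumptions.

```python
# Minimal sketch of hashed two-way feature interactions; field names and D are examples.
import hashlib

D = 2 ** 24  # size of the hashed feature space

def stable_hash(s):
    # md5 instead of the built-in hash() so indices are reproducible across runs
    return int(hashlib.md5(s.encode("utf-8")).hexdigest(), 16) % D

def hashed_indices(row, interaction_pairs):
    """One-hot indices for the raw fields plus the selected two-way interactions."""
    indices = [stable_hash(key + "_" + value) for key, value in row.items()]
    for a, b in interaction_pairs:
        indices.append(stable_hash(a + "_" + b + "_" + row[a] + "_" + row[b]))
    return indices

row = {"banner_pos": "1", "site_category": "28905ebd", "device_conn_type": "0"}
print(hashed_indices(row, [("banner_pos", "site_category")]))
```

The script in this folder takes the same route inside its `data()` generator: the chosen interactions are added as extra string columns, and every key-value pair is then one-hot encoded with `abs(hash(key + '_' + value)) % D`.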
8 | -------------------------------------------------------------------------------- /BNP/bayesian_encode_fivelevel_withint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | import operator 5 | from sklearn import preprocessing 6 | from sklearn.cross_validation import KFold 7 | from sklearn import ensemble 8 | from sklearn.metrics import roc_auc_score,log_loss 9 | import xgboost as xgb 10 | 11 | 12 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 13 | grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count') 14 | grouped_df.columns = [var_name, "var_count"] 15 | 16 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 17 | merged_df.fillna(-1, inplace=True) 18 | return list(merged_df["var_count"]) 19 | 20 | def create_feature_map(features): 21 | outfile = open('xgb.fmap', 'w') 22 | for i, feat in enumerate(features): 23 | outfile.write('{0}\t{1}\tq\n'.format(i,feat)) 24 | outfile.close() 25 | 26 | 27 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="target", min_cutoff=5): 28 | grouped_df = target_df.groupby(var_name, as_index=False)["target"].agg(["mean", "count"]) 29 | grouped_df.columns = ["target_mean", "count_var"] 30 | grouped_df.reset_index(level=var_name, inplace=True) 31 | grouped_df["count_var"][grouped_df["count_var"]<min_cutoff] = 0 32 | grouped_df["count_var"][grouped_df["count_var"]>=min_cutoff] = 1 33 | grouped_df["target_mean"] = grouped_df["target_mean"] * grouped_df["count_var"] 34 | 35 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 36 | merged_df.fillna(-1, inplace=True) 37 | return list(merged_df["target_mean"]) 38 | 39 | 40 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None): 41 | params = {} 42 | params["objective"] = "binary:logistic" 43 | params['eval_metric'] = 'logloss' 44 | params["eta"] = 0.02 45 | params["min_child_weight"] = 1 46 | params["subsample"] = 0.85 47 | params["colsample_bytree"] = 0.3 48 | params["silent"] = 1 49 | params["max_depth"] = 10 50 | params["seed"] = 232345 51 | #params["gamma"] = 0.5 52 | num_rounds = 600 53 | 54 | plst = list(params.items()) 55 | xgtrain = xgb.DMatrix(train_X, label=train_y) 56 | 57 | if test_y is not None: 58 | xgtest = xgb.DMatrix(test_X, label=test_y) 59 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 60 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=1000) 61 | else: 62 | xgtest = xgb.DMatrix(test_X) 63 | model = xgb.train(plst, xgtrain, num_rounds) 64 | 65 | if feature_names: 66 | create_feature_map(feature_names) 67 | importance = model.get_fscore(fmap='xgb.fmap') 68 | importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True) 69 | imp_df = pd.DataFrame(importance, columns=['feature','fscore']) 70 | imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum() 71 | imp_df.to_csv("imp_feat.txt", index=False) 72 | 73 | pred_test_y = model.predict(xgtest) 74 | 75 | if test_y is not None: 76 | loss = log_loss(test_y, pred_test_y) 77 | print loss 78 | 79 | return pred_test_y, loss 80 | else: 81 | return pred_test_y 82 | 83 | def prepData(var4_col="v52"): 84 | import datetime 85 | start_time = datetime.datetime.now() 86 | print "Start time : ", start_time 87 | 88 | print "Reading files.." 89 | train = pd.read_csv('../Data/train.csv') 90 | test = pd.read_csv('../Data/test.csv') 91 | print train.shape, test.shape 92 | 93 | print "Filling NA.."
94 | train = train.fillna(-1) 95 | test = test.fillna(-1) 96 | 97 | print "Label encoding.." 98 | cat_columns = ["v129", "v72", "v62", "v38"] 99 | for f in train.columns: 100 | if train[f].dtype=='object': 101 | print(f), len(np.unique(train[f].values)) 102 | #if f != 'v22': 103 | cat_columns.append(f) 104 | lbl = preprocessing.LabelEncoder() 105 | lbl.fit(list(train[f].values) + list(test[f].values)) 106 | train[f] = lbl.transform(list(train[f].values)) 107 | test[f] = lbl.transform(list(test[f].values)) 108 | new_train = pd.concat([ train[['v1',f]], test[['v1',f]] ]) 109 | train["CountVar_"+str(f)] = getCountVar(train[['v1',f]], new_train[['v1', f]], f) 110 | test["CountVar_"+str(f)] = getCountVar(test[['v1',f]], new_train[['v1',f]], f) 111 | cat_columns_copy = cat_columns[:] 112 | 113 | 114 | print "Encoding train...." 115 | for f in cat_columns: 116 | print f 117 | val_list = np.zeros(train.shape[0]) 118 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 119 | for fold_index in xrange(1,6): 120 | dev_index = np.where(folds_array != fold_index)[0] 121 | val_index = np.where(folds_array == fold_index)[0] 122 | new_train = train[["v1", f, "target"]] 123 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 124 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=0 )) 125 | val_list[val_index] = enc_list 126 | train["DVEncode_"+str(f)] = val_list 127 | 128 | 129 | print "Encoding test.." 130 | for f in cat_columns: 131 | print f 132 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=0) 133 | 134 | 135 | 136 | print "Two way encoding.." 137 | other_cols=[] 138 | new_var_list = [] 139 | cat_columns = [col for col in cat_columns_copy] 140 | for ind, var1 in enumerate(cat_columns): 141 | rem_cols = cat_columns[ind+1:] 142 | #if var1 in "v30": 143 | # break 144 | for var2 in rem_cols: 145 | print var1, var2 146 | new_var = var1+"_"+var2 147 | 148 | train[new_var] = train[var1].astype("str") +"_" + train[var2].astype("str") 149 | test[new_var] = test[var1].astype("str") + "_" + test[var2].astype("str") 150 | #print train[new_var][:10] 151 | #print test[new_var][:10] 152 | 153 | lbl = preprocessing.LabelEncoder() 154 | lbl.fit(list(train[new_var].values) + list(test[new_var].values)) 155 | train[new_var] = lbl.transform(list(train[new_var].values)) 156 | test[new_var] = lbl.transform(list(test[new_var].values)) 157 | 158 | new_train = pd.concat([ train[['v1',new_var]], test[['v1',new_var]] ]) 159 | test["Count_"+new_var] = getCountVar(test[['v1',new_var]], new_train[['v1', new_var]], new_var) 160 | train["Count_"+new_var] = getCountVar(train[['v1',new_var]], new_train[['v1', new_var]], new_var) 161 | new_var_list.append(new_var) 162 | 163 | print "Train.." 164 | for f in new_var_list: 165 | print f 166 | val_list = np.zeros(train.shape[0]) 167 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 168 | for fold_index in xrange(1,6): 169 | dev_index = np.where(folds_array != fold_index)[0] 170 | val_index = np.where(folds_array == fold_index)[0] 171 | new_train = train[["v1", f, "target"]] 172 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 173 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=0 ) ) 174 | val_list[val_index] = enc_list 175 | train["DVEncode_"+str(f)] = val_list 176 | 177 | print "Test.." 
178 | for f in new_var_list: 179 | print f 180 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=0) 181 | train = train.drop(new_var_list, axis=1) 182 | test = test.drop(new_var_list, axis=1) 183 | 184 | 185 | 186 | 187 | 188 | print "Three way encoding.." 189 | other_cols=[] 190 | new_var_list = [] 191 | var3 = "v22" 192 | cat_columns = [col for col in cat_columns_copy if col!= var3] 193 | for ind, var1 in enumerate(cat_columns): 194 | rem_cols = cat_columns[ind+1:] 195 | #if var1 in "v30": 196 | # break 197 | for var2 in rem_cols: 198 | print var1, var2 199 | new_var = var1+"_"+var2+"_"+var3 200 | 201 | train[new_var] = train[var1].astype("str") +"_" + train[var2].astype("str") +"_" + train[var3].astype("str") 202 | test[new_var] = test[var1].astype("str") + "_" + test[var2].astype("str") + "_" + test[var3].astype("str") 203 | #print train[new_var][:10] 204 | #print test[new_var][:10] 205 | 206 | lbl = preprocessing.LabelEncoder() 207 | lbl.fit(list(train[new_var].values) + list(test[new_var].values)) 208 | train[new_var] = lbl.transform(list(train[new_var].values)) 209 | test[new_var] = lbl.transform(list(test[new_var].values)) 210 | 211 | new_train = pd.concat([ train[['v1',new_var]], test[['v1',new_var]] ]) 212 | test["Count_"+new_var] = getCountVar(test[['v1',new_var]], new_train[['v1', new_var]], new_var) 213 | train["Count_"+new_var] = getCountVar(train[['v1',new_var]], new_train[['v1', new_var]], new_var) 214 | new_var_list.append(new_var) 215 | 216 | print "Train.." 217 | for f in new_var_list: 218 | print f 219 | val_list = np.zeros(train.shape[0]) 220 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 221 | for fold_index in xrange(1,6): 222 | dev_index = np.where(folds_array != fold_index)[0] 223 | val_index = np.where(folds_array == fold_index)[0] 224 | new_train = train[["v1", f, "target"]] 225 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 226 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=0) ) 227 | val_list[val_index] = enc_list 228 | train["DVEncode_"+str(f)] = val_list 229 | 230 | print "Test.." 231 | for f in new_var_list: 232 | print f 233 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=0) 234 | train = train.drop(new_var_list, axis=1) 235 | test = test.drop(new_var_list, axis=1) 236 | 237 | 238 | 239 | 240 | 241 | 242 | print "Four way encoding.." 
243 | other_cols=[] 244 | new_var_list = [] 245 | for var4_col in ["v52", "v66", "v24", "v56", "v125", "v30"]: 246 | var1_cols = ["v22"] 247 | var4 = var4_col 248 | other_cols.append(var4) 249 | cat_columns = [col for col in cat_columns_copy if col not in var1_cols if col not in other_cols] 250 | for var1 in var1_cols: 251 | for ind, var2 in enumerate(cat_columns): 252 | rem_cols = cat_columns[ind+1:] 253 | #if var1 in "v30": 254 | # break 255 | for var3 in rem_cols: 256 | print var1, var4, var2, var3 257 | new_var = var1+"_"+var4+"_"+var2+"_"+var3 258 | 259 | train[new_var] = train[var1].astype("str") +"_" + train[var2].astype("str") + "_"+ train[var3].astype("str") + "_" +train[var4].astype("str") 260 | test[new_var] = test[var1].astype("str") + "_" + test[var2].astype("str") + "_" +test[var3].astype("str") + "_" + test[var4].astype("str") 261 | #print train[new_var][:10] 262 | #print test[new_var][:10] 263 | 264 | lbl = preprocessing.LabelEncoder() 265 | lbl.fit(list(train[new_var].values) + list(test[new_var].values)) 266 | train[new_var] = lbl.transform(list(train[new_var].values)) 267 | test[new_var] = lbl.transform(list(test[new_var].values)) 268 | 269 | new_train = pd.concat([ train[['v1',new_var]], test[['v1',new_var]] ]) 270 | test["Count_"+new_var] = getCountVar(test[['v1',new_var]], new_train[['v1', new_var]], new_var) 271 | train["Count_"+new_var] = getCountVar(train[['v1',new_var]], new_train[['v1', new_var]], new_var) 272 | new_var_list.append(new_var) 273 | 274 | print "Train.." 275 | for f in new_var_list: 276 | print f 277 | val_list = np.zeros(train.shape[0]) 278 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 279 | for fold_index in xrange(1,6): 280 | dev_index = np.where(folds_array != fold_index)[0] 281 | val_index = np.where(folds_array == fold_index)[0] 282 | new_train = train[["v1", f, "target"]] 283 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 284 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=2) ) 285 | val_list[val_index] = enc_list 286 | train["DVEncode_"+str(f)] = val_list 287 | 288 | print "Test.." 289 | for f in new_var_list: 290 | print f 291 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=2) 292 | train = train.drop(new_var_list, axis=1) 293 | test = test.drop(new_var_list, axis=1) 294 | 295 | 296 | print "Five way encoding.." 
297 | new_var_list = [] 298 | for var4_col, var5_col in [["v52", "v66"], ["v24", "v56"], ["v125", "v30"], ["v52","v56"], ["v71", "v91"], ["v112","v113"]]: 299 | var1_cols = ["v22"] 300 | var4 = var4_col 301 | var5 = var5_col 302 | cat_columns = [col for col in cat_columns_copy if col not in var1_cols if col != var4 if col!=var5] 303 | for var1 in var1_cols: 304 | for ind, var2 in enumerate(cat_columns): 305 | rem_cols = cat_columns[ind+1:] 306 | #if var1 in "v30": 307 | # break 308 | for var3 in rem_cols: 309 | print var1, var4, var5, var2, var3 310 | new_var = var1+"_"+var4+"_"+var5+"_"+var2+"_"+var3 311 | 312 | train[new_var] = train[var1].astype("str") + "_"+ train[var2].astype("str") + "_"+train[var3].astype("str") + "_"+ train[var4].astype("str") + "_"+ train[var5].astype("str") 313 | test[new_var] = test[var1].astype("str") + "_"+ test[var2].astype("str") + "_"+ test[var3].astype("str") + "_"+ test[var4].astype("str") + "_"+ test[var5].astype("str") 314 | 315 | lbl = preprocessing.LabelEncoder() 316 | lbl.fit(list(train[new_var].values) + list(test[new_var].values)) 317 | train[new_var] = lbl.transform(list(train[new_var].values)) 318 | test[new_var] = lbl.transform(list(test[new_var].values)) 319 | 320 | new_train = pd.concat([ train[['v1',new_var]], test[['v1',new_var]] ]) 321 | test["Count_"+new_var] = getCountVar(test[['v1',new_var]], new_train[['v1', new_var]], new_var) 322 | train["Count_"+new_var] = getCountVar(train[['v1',new_var]], new_train[['v1', new_var]], new_var) 323 | new_var_list.append(new_var) 324 | 325 | print "Train.." 326 | for f in new_var_list: 327 | print f 328 | val_list = np.zeros(train.shape[0]) 329 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 330 | for fold_index in xrange(1,6): 331 | dev_index = np.where(folds_array != fold_index)[0] 332 | val_index = np.where(folds_array == fold_index)[0] 333 | new_train = train[["v1", f, "target"]] 334 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 335 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=2) ) 336 | val_list[val_index] = enc_list 337 | train["DVEncode_"+str(f)] = val_list 338 | 339 | print "Test.." 
340 | for f in new_var_list: 341 | print f 342 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=2) 343 | 344 | 345 | 346 | 347 | 348 | 349 | train = train.drop(new_var_list, axis=1) 350 | test = test.drop(new_var_list, axis=1) 351 | train.to_csv("train_5levelenc_withint.csv", index=False) 352 | test.to_csv("test_5levelenc_withint.csv", index=False) 353 | 354 | end_time = datetime.datetime.now() 355 | print "End time : ",end_time 356 | 357 | print end_time - start_time 358 | 359 | def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0): 360 | clf = ensemble.ExtraTreesClassifier( 361 | n_estimators = n_est_val, 362 | max_depth = depth_val, 363 | min_samples_split = split_val, 364 | min_samples_leaf = leaf_val, 365 | max_features = feat_val, 366 | criterion='entropy', 367 | n_jobs = jobs_val, 368 | random_state = random_state_val) 369 | clf.fit(train_X, train_y) 370 | pred_train_y = clf.predict_proba(train_X)[:,1] 371 | pred_test_y = clf.predict_proba(test_X)[:,1] 372 | 373 | if validation: 374 | train_loss = log_loss(train_y, pred_train_y) 375 | loss = log_loss(test_y, pred_test_y) 376 | print "Train, Test loss : ", train_loss, loss 377 | return pred_test_y, loss 378 | else: 379 | return pred_test_y 380 | 381 | 382 | def prepModel(var4_col="v52"): 383 | print "Reading files.." 384 | train = pd.read_csv('./train_5levelenc_withint.csv') 385 | test = pd.read_csv('./test_5levelenc_withint.csv') 386 | print train.shape, test.shape 387 | 388 | print "Getting DV and ID.." 389 | train_y = train.target.values 390 | train_ID = train.ID.values 391 | test_ID = test.ID.values 392 | train = train.drop(['ID', "target"], axis=1) 393 | test = test.drop(['ID'], axis=1) 394 | 395 | print "Filling NA.." 396 | train = train.fillna(-1) 397 | test = test.fillna(-1) 398 | 399 | feat_names = list(train.columns) 400 | print "Converting to array.." 401 | train = np.array(train) 402 | test = np.array(test) 403 | print train.shape, test.shape 404 | 405 | assert train.shape[1] == test.shape[1] 406 | print "Cross validating.." 407 | cv_scores = [] 408 | train_preds = np.zeros(train.shape[0]) 409 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 410 | for fold_index in xrange(1,6): 411 | dev_index = np.where(folds_array != fold_index)[0] 412 | val_index = np.where(folds_array == fold_index)[0] 413 | dev_X, val_X = train[dev_index,:], train[val_index,:] 414 | dev_y, val_y = train_y[dev_index], train_y[val_index] 415 | 416 | #preds, loss = runXGB(dev_X, dev_y, val_X, val_y, feature_names=feat_names) 417 | #for feat in [60, 100, 150]: 418 | preds, loss = runET(dev_X, dev_y, val_X, val_y, validation=1, n_est_val=500, depth_val=40, split_val=4, leaf_val=2, feat_val=180, jobs_val=4, random_state_val=98751) 419 | #print feat, loss 420 | cv_scores.append(loss) 421 | print cv_scores 422 | train_preds[val_index] = preds 423 | print cv_scores, np.mean(cv_scores) 424 | 425 | out_df = pd.DataFrame({"ID":train_ID}) 426 | out_df["et1_srk_5levelenc_withint"] = train_preds 427 | out_df.to_csv("prval_et1_srk_5levelenc_withint.csv", index=False) 428 | 429 | print "Final model.." 
430 | preds = runET(train, train_y, test, validation=0, n_est_val=500, depth_val=40, split_val=4, leaf_val=2, feat_val=180, jobs_val=4, random_state_val=98751) 431 | out_df = pd.DataFrame({"ID":test_ID}) 432 | out_df["et1_srk_5levelenc_withint"] = preds 433 | out_df.to_csv("prfull_et1_srk_5levelenc_withint.csv", index=False) 434 | 435 | 436 | 437 | def prepModelXGB(var4_col="v52"): 438 | print "Reading files.." 439 | train = pd.read_csv('./train_5levelenc_withint.csv') 440 | print train.shape 441 | 442 | print "Getting DV and ID.." 443 | train_y = train.target.values 444 | train_ID = train.ID.values 445 | train = train.drop(['ID', "target"], axis=1) 446 | 447 | print "Filling NA.." 448 | train = train.fillna(-1) 449 | 450 | feat_names = list(train.columns) 451 | print "Converting to array.." 452 | train = np.array(train) 453 | print train.shape 454 | 455 | 456 | print "Cross validating.." 457 | cv_scores = [] 458 | train_preds = np.zeros(train.shape[0]) 459 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 460 | for fold_index in xrange(1,6): 461 | dev_index = np.where(folds_array != fold_index)[0] 462 | val_index = np.where(folds_array == fold_index)[0] 463 | dev_X, val_X = train[dev_index,:], train[val_index,:] 464 | dev_y, val_y = train_y[dev_index], train_y[val_index] 465 | 466 | preds, loss = runXGB(dev_X, dev_y, val_X, val_y, feature_names=feat_names) 467 | #for feat in [60, 100, 150]: 468 | #preds, loss = runET(dev_X, dev_y, val_X, val_y, validation=1, n_est_val=600, depth_val=40, split_val=4, leaf_val=2, feat_val=180, jobs_val=4, random_state_val=8111) 469 | #print feat, loss 470 | cv_scores.append(loss) 471 | print cv_scores 472 | train_preds[val_index] = preds 473 | print cv_scores, np.mean(cv_scores) 474 | 475 | out_df = pd.DataFrame({"ID":train_ID}) 476 | out_df["xg1_srk_5levelenc_withint"] = train_preds 477 | out_df.to_csv("prval_xg1_srk_5levelenc_withint.csv", index=False) 478 | 479 | import gc 480 | del dev_X 481 | del val_X 482 | gc.collect() 483 | 484 | 485 | print "Final model.." 
486 | test = pd.read_csv('./test_5levelenc_withint.csv') 487 | print train.shape, test.shape 488 | test_ID = test.ID.values 489 | test = test.drop(['ID'], axis=1) 490 | test = test.fillna(-1) 491 | test = np.array(test) 492 | print train.shape, test.shape 493 | 494 | assert train.shape[1] == test.shape[1] 495 | #preds = runET(train, train_y, test, validation=0, n_est_val=600, depth_val=40, split_val=4, leaf_val=2, feat_val=180, jobs_val=4, random_state_val=8111) 496 | preds = runXGB(train, train_y, test, feature_names=feat_names) 497 | out_df = pd.DataFrame({"ID":test_ID}) 498 | out_df["xg1_srk_5levelenc_withint"] = preds 499 | out_df.to_csv("prfull_xg1_srk_5levelenc_withint.csv", index=False) 500 | 501 | 502 | if __name__ == "__main__": 503 | #for var4_col_name in ["v52", "v66", "v24", "v56", "v125", "v30"]: 504 | for var4_col_name in ["v30"]: 505 | try: 506 | prepData(var4_col_name) 507 | prepModelXGB(var4_col_name) 508 | prepModel(var4_col_name) 509 | except Exception,e: 510 | print e 511 | pass 512 | -------------------------------------------------------------------------------- /BNP/readme.md: -------------------------------------------------------------------------------- 1 | Code for Kaggle - BNP competition where we finished in 12th position 2 | -------------------------------------------------------------------------------- /Datasets-Indian_Premier_League/readme.md: -------------------------------------------------------------------------------- 1 | Ipython notebooks created for exploring the [Indian Premier League dataset](https://www.kaggle.com/manasgarg/ipl) present in Kaggle is present in this folder. 2 | 3 | File explanations: 4 | 5 | 1. IPL_Exploration.ipynb - Notebook which has some exploratory analysis on the IPL data 6 | 2. IPL_Win_Prediction.ipynb - Notebook to predict the win probability of the given team at the end of each over 7 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getBookings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to get the bookings from the train file 3 | __author__ : SRK 4 | """ 5 | import csv 6 | 7 | train_file_handle = open("../../Data/train.csv") 8 | train_out_file_handle = open("../../Data/train_bookings.csv","w") 9 | 10 | reader = csv.reader(train_file_handle) 11 | writer = csv.writer(train_out_file_handle) 12 | 13 | header = reader.next() 14 | writer.writerow(["id"] + header) 15 | 16 | is_booking_index = header.index("is_booking") 17 | print "Booking index is : ", is_booking_index 18 | 19 | total_count = 0 20 | count = 0 21 | for row in reader: 22 | if row[is_booking_index] == "1": 23 | writer.writerow([total_count] + row) 24 | count += 1 25 | total_count += 1 26 | if total_count % 100000 == 0: 27 | print total_count, count 28 | 29 | print "Total count : ", total_count 30 | print "Booking count : ", count 31 | 32 | train_file_handle.close() 33 | train_out_file_handle.close() 34 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getClicks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to get the bookings from the train file 3 | __author__ : SRK 4 | """ 5 | import csv 6 | 7 | train_file_handle = open("../../Data/train.csv") 8 | train_out_file_handle = open("../../Data/train_clicks.csv","w") 9 | 10 | reader = csv.reader(train_file_handle) 11 | writer = csv.writer(train_out_file_handle) 12 | 13 | header = reader.next() 14 | 
writer.writerow(["id"] +header) 15 | 16 | is_booking_index = header.index("is_booking") 17 | print "Booking index is : ", is_booking_index 18 | 19 | total_count = 0 20 | count = 0 21 | for row in reader: 22 | if row[is_booking_index] == "0": 23 | writer.writerow([total_count]+row) 24 | count += 1 25 | total_count += 1 26 | if total_count % 100000 == 0: 27 | print total_count, count 28 | 29 | print "Total count : ", total_count 30 | print "Booking count : ", count 31 | 32 | train_file_handle.close() 33 | train_out_file_handle.close() 34 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getLeakFree.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datetime import datetime 3 | 4 | with open("../../Data/val.csv") as train_file: 5 | with open("../../Data/val_leak_preds.csv") as leak_file: 6 | reader = csv.reader(train_file) 7 | leak_reader = csv.DictReader(leak_file) 8 | 9 | out_file = open("../../Data/val_woleak.csv","w") 10 | out_writer = csv.writer(out_file) 11 | out_file2 = open("../../Data/val_withleak.csv","w") 12 | out_writer2 = csv.writer(out_file2) 13 | 14 | header = reader.next() 15 | out_writer.writerow(header) 16 | 17 | leak_count = 0 18 | for index, row in enumerate(reader): 19 | leak_row = leak_reader.next() 20 | if leak_row["hotel_cluster"] == "": 21 | out_writer.writerow(row) 22 | else: 23 | out_writer2.writerow(row) 24 | leak_count +=1 25 | print "Leak count is : ", leak_count 26 | 27 | out_file.close() 28 | 29 | 30 | with open("../../Data/test.csv") as train_file: 31 | with open("../../Data/test_leak_preds.csv") as leak_file: 32 | reader = csv.reader(train_file) 33 | leak_reader = csv.DictReader(leak_file) 34 | 35 | out_file = open("../../Data/test_woleak.csv","w") 36 | out_writer = csv.writer(out_file) 37 | out_file2 = open("../../Data/test_withleak.csv","w") 38 | out_writer2 = csv.writer(out_file2) 39 | 40 | header = reader.next() 41 | out_writer.writerow(header) 42 | 43 | leak_count = 0 44 | for index, row in enumerate(reader): 45 | leak_row = leak_reader.next() 46 | if leak_row["hotel_cluster"] == "": 47 | out_writer.writerow(row) 48 | else: 49 | out_writer2.writerow(row) 50 | leak_count +=1 51 | print "Leak count is : ", leak_count 52 | 53 | out_file.close() 54 | 55 | 56 | # get only the bookings from the validation sample # 57 | train_file_handle = open("../../Data/val_woleak.csv") 58 | train_out_file_handle = open("../../Data/val_bookings_woleak.csv","w") 59 | 60 | reader = csv.reader(train_file_handle) 61 | writer = csv.writer(train_out_file_handle) 62 | 63 | header = reader.next() 64 | writer.writerow(header) 65 | 66 | is_booking_index = header.index("is_booking") 67 | print "Booking index is : ", is_booking_index 68 | 69 | total_count = 0 70 | count = 0 71 | for row in reader: 72 | if row[is_booking_index] == "1": 73 | writer.writerow(row) 74 | count += 1 75 | total_count += 1 76 | if total_count % 100000 == 0: 77 | print total_count, count 78 | 79 | print "Total count : ", total_count 80 | print "Booking count : ", count 81 | 82 | train_file_handle.close() 83 | train_out_file_handle.close() 84 | 85 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getLeakRows_test.py: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | from collections import defaultdict 3 | from datetime import datetime 4 | 5 | start = datetime.now() 6 | 7 | def get_top5(d): 8 
| return sorted(d, key=d.get, reverse=True)[:5] 9 | 10 | destination_clusters = defaultdict(lambda: defaultdict(int)) 11 | destination_clusters2 = defaultdict(lambda: defaultdict(int)) 12 | destination_clusters3 = defaultdict(lambda: defaultdict(int)) 13 | destination_clusters4 = defaultdict(lambda: defaultdict(int)) 14 | 15 | print "Reading the train.." 16 | for i, row in enumerate(DictReader(open("../../Data/train.csv"))): 17 | key = row["user_location_country"] + "_" + row["user_location_region"] + "_" + row["user_location_city"] + "_" + row["hotel_market"] + "_"+ row["orig_destination_distance"] 18 | #key2 = row["user_id"] + "_" + row["srch_destination_id"] 19 | #key3 = row["srch_destination_id"] + "_" + row["hotel_market"] 20 | #key4 = row["hotel_market"] 21 | destination_clusters[key][row["hotel_cluster"]] += 1 22 | #destination_clusters2[key2][row["hotel_cluster"]] += 1 23 | #destination_clusters3[key3][row["hotel_cluster"]] += 1 24 | #destination_clusters4[key4][row["hotel_cluster"]] += 1 25 | if i % 1000000 == 0: 26 | print("%s\t%s"%(i, datetime.now() - start)) 27 | 28 | most_frequent = defaultdict(str) 29 | most_frequent2 = defaultdict(str) 30 | most_frequent3 = defaultdict(str) 31 | most_frequent4 = defaultdict(str) 32 | 33 | print "Getting top 5 list.." 34 | for k in destination_clusters: 35 | top5_list = get_top5(destination_clusters[k]) 36 | most_frequent[k] = top5_list[:] 37 | del destination_clusters 38 | import gc 39 | gc.collect() 40 | 41 | #for k in destination_clusters2: 42 | # top5_list = get_top5(destination_clusters2[k]) 43 | # most_frequent2[k] = top5_list[:] 44 | #del destination_clusters2 45 | #gc.collect() 46 | # 47 | #for k in destination_clusters3: 48 | # top5_list = get_top5(destination_clusters3[k]) 49 | # most_frequent3[k] = top5_list[:] 50 | #del destination_clusters3 51 | #gc.collect() 52 | # 53 | #for k in destination_clusters4: 54 | # top5_list = get_top5(destination_clusters4[k]) 55 | # most_frequent4[k] = top5_list[:] 56 | #del destination_clusters4 57 | #gc.collect() 58 | 59 | 60 | 61 | 62 | 63 | 64 | print "Predicting on test.." 
65 | with open("../../Data/test_leak_preds.csv", "w") as outfile: 66 | outfile.write("id,hotel_cluster\n") 67 | for i, row in enumerate(DictReader(open("../../Data/test.csv"))): 68 | key = row["user_location_country"] + "_" + row["user_location_region"] + "_" + row["user_location_city"] + "_" + row["hotel_market"] + "_"+ row["orig_destination_distance"] 69 | #key2 = row["user_id"] + "_" + row["srch_destination_id"] 70 | #key3 = row["srch_destination_id"] + "_" + row["hotel_market"] 71 | #key4 = row["hotel_market"] 72 | 73 | if row["orig_destination_distance"] == "": 74 | top5_list = [] 75 | else: 76 | top5_list = most_frequent[key][:] 77 | if isinstance(top5_list, str): 78 | top5_list = [] 79 | 80 | 81 | #if len(top5_list) < 5: 82 | # temp_top5_list = most_frequent2.get(key2,[]) 83 | # for v in temp_top5_list: 84 | # if v not in top5_list: 85 | # top5_list.append(v) 86 | # if len(top5_list) == 5: 87 | # break 88 | 89 | #if len(top5_list) < 5: 90 | # temp_top5_list = most_frequent3[key3] 91 | # for v in temp_top5_list: 92 | # if v not in top5_list: 93 | # top5_list.append(v) 94 | # if len(top5_list) == 5: 95 | # break 96 | 97 | #if len(top5_list) < 5: 98 | # temp_top5_list = most_frequent4[key4] 99 | # for v in temp_top5_list: 100 | # if v not in top5_list: 101 | # top5_list.append(v) 102 | # if len(top5_list) == 5: 103 | # break 104 | 105 | top5_clusters = " ".join(top5_list) 106 | 107 | outfile.write("%d,%s\n"%(i,top5_clusters)) 108 | if i % 1000000 == 0: 109 | print("%s\t%s"%(i, datetime.now() - start)) 110 | del most_frequent 111 | del most_frequent2 112 | del most_frequent3 113 | del most_frequent4 114 | gc.collect() 115 | 116 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getLeakRows_val.py: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | from collections import defaultdict 3 | from datetime import datetime 4 | 5 | start = datetime.now() 6 | 7 | def get_top5(d): 8 | return sorted(d, key=d.get, reverse=True)[:5] 9 | 10 | destination_clusters = defaultdict(lambda: defaultdict(int)) 11 | destination_clusters2 = defaultdict(lambda: defaultdict(int)) 12 | destination_clusters3 = defaultdict(lambda: defaultdict(int)) 13 | destination_clusters4 = defaultdict(lambda: defaultdict(int)) 14 | 15 | print "Reading the train.." 16 | for i, row in enumerate(DictReader(open("../../Data/dev.csv"))): 17 | key = row["user_location_country"] + "_" + row["user_location_region"] + "_" + row["user_location_city"] + "_" + row["hotel_market"] + "_"+ row["orig_destination_distance"] 18 | #key2 = row["user_id"] + "_" + row["srch_destination_id"] 19 | #key3 = row["srch_destination_id"] + "_" + row["hotel_market"] 20 | #key4 = row["hotel_market"] 21 | destination_clusters[key][row["hotel_cluster"]] += 1 22 | #destination_clusters2[key2][row["hotel_cluster"]] += 1 23 | #destination_clusters3[key3][row["hotel_cluster"]] += 1 24 | #destination_clusters4[key4][row["hotel_cluster"]] += 1 25 | if i % 1000000 == 0: 26 | print("%s\t%s"%(i, datetime.now() - start)) 27 | 28 | most_frequent = defaultdict(str) 29 | most_frequent2 = defaultdict(str) 30 | most_frequent3 = defaultdict(str) 31 | most_frequent4 = defaultdict(str) 32 | 33 | print "Getting top 5 list.." 
34 | for k in destination_clusters: 35 | top5_list = get_top5(destination_clusters[k]) 36 | most_frequent[k] = top5_list[:] 37 | del destination_clusters 38 | import gc 39 | gc.collect() 40 | 41 | #for k in destination_clusters2: 42 | # top5_list = get_top5(destination_clusters2[k]) 43 | # most_frequent2[k] = top5_list[:] 44 | #del destination_clusters2 45 | #gc.collect() 46 | # 47 | #for k in destination_clusters3: 48 | # top5_list = get_top5(destination_clusters3[k]) 49 | # most_frequent3[k] = top5_list[:] 50 | #del destination_clusters3 51 | #gc.collect() 52 | # 53 | #for k in destination_clusters4: 54 | # top5_list = get_top5(destination_clusters4[k]) 55 | # most_frequent4[k] = top5_list[:] 56 | #del destination_clusters4 57 | #gc.collect() 58 | 59 | 60 | 61 | 62 | 63 | 64 | print "Predicting on test.." 65 | with open("../../Data/val_leak_preds.csv", "w") as outfile: 66 | outfile.write("id,hotel_cluster\n") 67 | for i, row in enumerate(DictReader(open("../../Data/val.csv"))): 68 | key = row["user_location_country"] + "_" + row["user_location_region"] + "_" + row["user_location_city"] + "_" + row["hotel_market"] + "_"+ row["orig_destination_distance"] 69 | #key2 = row["user_id"] + "_" + row["srch_destination_id"] 70 | #key3 = row["srch_destination_id"] + "_" + row["hotel_market"] 71 | #key4 = row["hotel_market"] 72 | 73 | if row["orig_destination_distance"] == "": 74 | top5_list = [] 75 | else: 76 | top5_list = most_frequent[key][:] 77 | if isinstance(top5_list, str): 78 | top5_list = [] 79 | 80 | 81 | #if len(top5_list) < 5: 82 | # temp_top5_list = most_frequent2.get(key2,[]) 83 | # for v in temp_top5_list: 84 | # if v not in top5_list: 85 | # top5_list.append(v) 86 | # if len(top5_list) == 5: 87 | # break 88 | 89 | #if len(top5_list) < 5: 90 | # temp_top5_list = most_frequent3[key3] 91 | # for v in temp_top5_list: 92 | # if v not in top5_list: 93 | # top5_list.append(v) 94 | # if len(top5_list) == 5: 95 | # break 96 | 97 | #if len(top5_list) < 5: 98 | # temp_top5_list = most_frequent4[key4] 99 | # for v in temp_top5_list: 100 | # if v not in top5_list: 101 | # top5_list.append(v) 102 | # if len(top5_list) == 5: 103 | # break 104 | 105 | top5_clusters = " ".join(top5_list) 106 | 107 | outfile.write("%d,%s\n"%(i,top5_clusters)) 108 | if i % 1000000 == 0: 109 | print("%s\t%s"%(i, datetime.now() - start)) 110 | del most_frequent 111 | del most_frequent2 112 | del most_frequent3 113 | del most_frequent4 114 | gc.collect() 115 | 116 | sys.exit() 117 | 118 | ### Code to get the mapk value ### 119 | print "Getting Eval Metric" 120 | import pandas as pd 121 | import numpy as np 122 | from ml_metrics import mapk 123 | 124 | preds_df = pd.read_csv("val_leak_preds.csv") 125 | preds = np.array( preds_df["hotel_cluster"].apply(lambda x: str(x).split(" ")) ) 126 | #preds = [pred for pred in preds] 127 | print preds[:10] 128 | found_count= 0 129 | total_count = 0 130 | item_count = 0 131 | for pred in preds: 132 | if pred != ['nan']: 133 | found_count+=1 134 | item_count += len(pred) 135 | total_count+=1 136 | print "Item, Found and total : ", item_count,found_count, total_count 137 | 138 | 139 | actuals = np.array( pd.read_csv("../../Data/val.csv", usecols = ["hotel_cluster"])).astype('str') 140 | actuals = actuals.reshape(len(actuals),1) 141 | #actuals = [list(actual) for actual in actuals] 142 | print actuals[:10] 143 | 144 | print mapk(actuals, preds, k=5) 145 | 146 | -------------------------------------------------------------------------------- /Expedia/DataPrep/readme.md: 
-------------------------------------------------------------------------------- 1 | 1. splitDevVal.py - Code to split the train data into dev and val samples based on time 2 | 2. getBookings.py - Code to get the bookings from the train file 3 | 3. getClicks.py - Code to get the clicks from train file 4 | 4. splitDevVal_Bookings.py - Code to split the bookings into dev and val sample based on time 5 | 5. splitDevVal_Clicks.py - Code to split the clicks into dev and val sample based on time 6 | 6. getLeakRows_val.py - Code to get the leaky rows of validation sample and save it in csv file 7 | 7. getLeakRows_test.py - Code to get the leaky rows of test sample and save it in csv file 8 | 8. getLeakFree.py - Code to get the leak free rows for both test and val sample 9 | -------------------------------------------------------------------------------- /Expedia/DataPrep/splitDevVal.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to split the train data into two samples - dev and val. Last four months of 2014 is used as val sample 3 | __author__ : SRK 4 | """ 5 | import csv 6 | from datetime import datetime 7 | 8 | with open("../../Data/train.csv") as train_file: 9 | dev_file = open("../../Data/dev.csv","w") 10 | val_file = open("../../Data/val.csv","w") 11 | 12 | dev_writer = csv.writer(dev_file) 13 | val_writer = csv.writer(val_file) 14 | 15 | reader = csv.reader(train_file) 16 | header = reader.next() 17 | dev_writer.writerow(["id"] + header) 18 | val_writer.writerow(["id"] + header) 19 | date_index = header.index("date_time") 20 | 21 | dev_counter = 0 22 | val_counter = 0 23 | total_counter = 0 24 | for row in reader: 25 | #print row 26 | date_val = datetime.strptime(row[date_index], "%Y-%m-%d %H:%M:%S") 27 | if date_val.year == 2014 and date_val.month >= 9: 28 | val_writer.writerow([total_counter]+row) 29 | val_counter += 1 30 | else: 31 | dev_writer.writerow([total_counter]+row) 32 | dev_counter += 1 33 | total_counter += 1 34 | if total_counter % 1000000 == 0: 35 | print total_counter, dev_counter, val_counter 36 | 37 | dev_file.close() 38 | val_file.close() 39 | 40 | -------------------------------------------------------------------------------- /Expedia/DataPrep/splitDevVal_Bookings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to split the train bookings data into two samples - dev and val. 
Last four months of 2014 is used as val sample 3 | __author__ : SRK 4 | """ 5 | 6 | import csv 7 | from datetime import datetime 8 | 9 | with open("../../Data/train_bookings.csv") as train_file: 10 | dev_file = open("../../Data/dev_bookings.csv","w") 11 | val_file = open("../../Data/val_bookings.csv","w") 12 | 13 | dev_writer = csv.writer(dev_file) 14 | val_writer = csv.writer(val_file) 15 | 16 | reader = csv.reader(train_file) 17 | header = reader.next() 18 | dev_writer.writerow(header) 19 | val_writer.writerow(header) 20 | date_index = header.index("date_time") 21 | 22 | dev_counter = 0 23 | val_counter = 0 24 | total_counter = 0 25 | for row in reader: 26 | #print row 27 | date_val = datetime.strptime(row[date_index], "%Y-%m-%d %H:%M:%S") 28 | if date_val.year == 2014 and date_val.month >= 9: 29 | val_writer.writerow(row) 30 | val_counter += 1 31 | else: 32 | dev_writer.writerow(row) 33 | dev_counter += 1 34 | total_counter += 1 35 | if total_counter % 1000000 == 0: 36 | print total_counter, dev_counter, val_counter 37 | 38 | dev_file.close() 39 | val_file.close() 40 | 41 | -------------------------------------------------------------------------------- /Expedia/DataPrep/splitDevVal_Clicks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to split the train bookings data into two samples - dev and val. Last four months of 2014 is used as val sample 3 | __author__ : SRK 4 | """ 5 | 6 | import csv 7 | from datetime import datetime 8 | 9 | with open("../../Data/train_clicks.csv") as train_file: 10 | dev_file = open("../../Data/dev_clicks.csv","w") 11 | val_file = open("../../Data/val_clicks.csv","w") 12 | 13 | dev_writer = csv.writer(dev_file) 14 | val_writer = csv.writer(val_file) 15 | 16 | reader = csv.reader(train_file) 17 | header = reader.next() 18 | dev_writer.writerow(header) 19 | val_writer.writerow(header) 20 | date_index = header.index("date_time") 21 | 22 | dev_counter = 0 23 | val_counter = 0 24 | total_counter = 0 25 | for row in reader: 26 | #print row 27 | date_val = datetime.strptime(row[date_index], "%Y-%m-%d %H:%M:%S") 28 | if date_val.year == 2014 and date_val.month >= 9: 29 | val_writer.writerow(row) 30 | val_counter += 1 31 | else: 32 | dev_writer.writerow(row) 33 | dev_counter += 1 34 | total_counter += 1 35 | if total_counter % 1000000 == 0: 36 | print total_counter, dev_counter, val_counter 37 | 38 | dev_file.close() 39 | val_file.close() 40 | 41 | -------------------------------------------------------------------------------- /Expedia/DataPrep/splitNonLeak_Dist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | 4 | from datetime import datetime 5 | 6 | with open("../../Data/val_bookings_woleak.csv") as train_file: 7 | reader = csv.reader(train_file) 8 | #leak_reader = csv.DictReader(leak_file) 9 | 10 | out_file = open("../../Data/val_bookings_woleak_wodist.csv","w") 11 | out_writer = csv.writer(out_file) 12 | out_file2 = open("../../Data/val_bookings_woleak_dist.csv","w") 13 | out_writer2 = csv.writer(out_file2) 14 | 15 | header = reader.next() 16 | dist_index = header.index("orig_destination_distance") 17 | out_writer.writerow(header) 18 | out_writer2.writerow(header) 19 | 20 | leak_count = 0 21 | for index, row in enumerate(reader): 22 | if row[dist_index] == "": 23 | out_writer.writerow(row) 24 | else: 25 | out_writer2.writerow(row) 26 | leak_count +=1 27 | print "With Dist count is : ", leak_count 28 | print index 29 | 30 | out_file.close() 
31 | 32 | -------------------------------------------------------------------------------- /Expedia/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the [Kaggle Expedia Competition](https://www.kaggle.com/c/expedia-hotel-recommendations) 2 | -------------------------------------------------------------------------------- /GhoulsGoblinsGhost/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Kaggle competition - [Ghouls, Goblins and Ghosts](https://www.kaggle.com/c/ghouls-goblins-and-ghosts-boo/) are present in this folder 2 | 3 | 1. [Kaggle kernel for exploration](https://www.kaggle.com/sudalairajkumar/ghouls-goblins-and-ghosts-boo/simple-exploration-notebook) 4 | -------------------------------------------------------------------------------- /LibertyMutual/featureSelection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jul 22 17:17:40 2014 4 | 5 | @author: Sudalai Rajkumar S 6 | 7 | This code is for selecting the features to run the final model 8 | Feature sets 2 and 3 in the final model are selected based on this code 9 | Feature selection uses stepwise forward feature selection algorithm that maximizes weighted gini coefficient 10 | """ 11 | from __future__ import division 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDClassifier, LinearRegression 15 | from sklearn.cross_validation import cross_val_score, KFold 16 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 17 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 18 | from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score 19 | 20 | import pandas as pd 21 | import numpy as np 22 | 23 | def weighted_gini(act,pred,weight): 24 | df = pd.DataFrame({"act":act,"pred":pred,"weight":weight}) 25 | df = df.sort('pred',ascending=False) 26 | df["random"] = (df.weight / df.weight.sum()).cumsum() 27 | total_pos = (df.act * df.weight).sum() 28 | df["cum_pos_found"] = (df.act * df.weight).cumsum() 29 | df["lorentz"] = df.cum_pos_found / total_pos 30 | #n = df.shape[0] 31 | #df["gini"] = (df.lorentz - df.random) * df.weight 32 | #return df.gini.sum() 33 | gini = sum(df.lorentz[1:].values * (df.random[:-1])) - sum(df.lorentz[:-1].values * (df.random[1:])) 34 | return gini 35 | 36 | def normalized_weighted_gini(act,pred,weight): 37 | return weighted_gini(act,pred,weight) / weighted_gini(act,act,weight) 38 | 39 | data_path = "Path to data" 40 | 41 | tr = np.load(data_path+"train.npy") 42 | ts = np.load(data_path+"test.npy") 43 | train_y = np.load(data_path+"train_y.npy") 44 | 45 | 46 | ### Feature selection using stepwise fashion based on cross validation ### 47 | # This code will select one variable at a time from the given input variables using greedy approach which maximizes weighted gini metric # 48 | print "Cross Validating.." 
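# (Hedged note, not part of the original script) The loop below is one pass of
# greedy forward selection: each candidate column index i is tried on top of the
# already-selected set (the commented-out tr[:, [...]] lines show later passes),
# scored with 5-fold cross-validated Ridge regression using the normalized
# weighted gini defined above, and the best-scoring column of the pass is kept
# in selected_index. Repeating such passes yields the hard-coded index lists
# used as feature sets 2 and 3 in finalModel.py.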
49 | wt_gini = 0 50 | kf = KFold(tr.shape[0], n_folds=5) 51 | for i in xrange(tr.shape[1]): 52 | cv_gini_list=[] 53 | for dev_index, val_index in kf: 54 | tr_new = tr[:,[i]] 55 | #tr_new = tr[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378,i]] 56 | #tr_new = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3, i]] 57 | X_dev, X_val = tr_new[dev_index,:], tr_new[val_index,:] 58 | y_dev, y_val = train_y[dev_index], train_y[val_index] 59 | wt_dev, wt_val = tr[dev_index,1], tr[val_index,1] 60 | clf = Ridge() 61 | clf.fit(X_dev, y_dev) 62 | preds = clf.predict(X_val) 63 | 64 | cv_gini_list.append(normalized_weighted_gini(y_val,preds,wt_val)) 65 | print cv_gini_list 66 | print np.mean(cv_gini_list) 67 | if np.mean(cv_gini_list) > wt_gini: 68 | wt_gini = np.mean(cv_gini_list) 69 | selected_index = i 70 | if i % 50 == 0: 71 | print "Processed : ",i 72 | print wt_gini 73 | print selected_index 74 | -------------------------------------------------------------------------------- /LibertyMutual/finalModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Jul 28 17:17:40 2014 4 | 5 | @author: Sudalai Rajkumar S 6 | """ 7 | from __future__ import division 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDClassifier, LinearRegression 11 | from sklearn.cross_validation import cross_val_score, KFold 12 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 13 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 14 | from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score 15 | 16 | import pandas as pd 17 | import numpy as np 18 | 19 | def weighted_gini(act,pred,weight): 20 | df = pd.DataFrame({"act":act,"pred":pred,"weight":weight}) 21 | df = df.sort('pred',ascending=False) 22 | df["random"] = (df.weight / df.weight.sum()).cumsum() 23 | total_pos = (df.act * df.weight).sum() 24 | df["cum_pos_found"] = (df.act * df.weight).cumsum() 25 | df["lorentz"] = df.cum_pos_found / total_pos 26 | #n = df.shape[0] 27 | #df["gini"] = (df.lorentz - df.random) * df.weight 28 | #return df.gini.sum() 29 | gini = sum(df.lorentz[1:].values * (df.random[:-1])) - sum(df.lorentz[:-1].values * (df.random[1:])) 30 | return gini 31 | 32 | def normalized_weighted_gini(act,pred,weight): 33 | return weighted_gini(act,pred,weight) / weighted_gini(act,act,weight) 34 | 35 | data_path = "Path to data" 36 | 37 | tr = np.load(data_path+"train.npy") 38 | ts = np.load(data_path+"test.npy") 39 | train_y = np.load(data_path+"train_y.npy") 40 | sample = pd.read_csv(data_path+'sampleSubmission.csv') 41 | 42 | ### Three training sets are created based on different feature selection methodologies ### 43 | ### Set1 - Run univariate regression to get the top 30 features ### 44 | feature_selector = SelectKBest(score_func=f_regression, k=30) 45 | feature_selector.fit(tr, train_y) 46 | tr1 = feature_selector.transform(tr) 47 | ts1 = feature_selector.transform(ts) 48 | 49 | ### Set 2 & 3 - Features selected based on stepwise cross validation ( (tr2,ts2) and (tr3,ts3) )### 50 | tr2 = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 
331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 51 | ts2 = ts[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 52 | 53 | tr3 = tr[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378]] 54 | ts3 = ts[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378]] 55 | 56 | ### Running ridge regression using all three train samples then make predictions on test set separately ### 57 | # Model 1 # 58 | clf = Ridge() 59 | clf.fit(tr1, train_y) 60 | preds1 = clf.predict(ts1) 61 | 62 | # Model 2 # 63 | clf = Ridge() 64 | clf.fit(tr2, train_y) 65 | preds2 = clf.predict(ts2) 66 | 67 | # Model 3# 68 | clf = Ridge() 69 | clf.fit(tr3, train_y) 70 | preds3 = clf.predict(ts3) 71 | 72 | ### Ensembling the models together ### 73 | preds = (0.2*preds1) + (0.32*preds2)+ (0.48*preds3) 74 | 75 | ### Writing the outputs to out file ### 76 | sample['target'] = preds 77 | sample.to_csv('submission.csv', index = False) 78 | -------------------------------------------------------------------------------- /LibertyMutual/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jul 22 17:17:40 2014 4 | 5 | @author: Sudalai Rajkumar S 6 | """ 7 | from __future__ import division 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDClassifier, LinearRegression 11 | from sklearn.cross_validation import cross_val_score, KFold 12 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 13 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 14 | from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score 15 | 16 | import pandas as pd 17 | import numpy as np 18 | 19 | def weighted_gini(act,pred,weight): 20 | df = pd.DataFrame({"act":act,"pred":pred,"weight":weight}) 21 | df = df.sort('pred',ascending=False) 22 | df["random"] = (df.weight / df.weight.sum()).cumsum() 23 | total_pos = (df.act * df.weight).sum() 24 | df["cum_pos_found"] = (df.act * df.weight).cumsum() 25 | df["lorentz"] = df.cum_pos_found / total_pos 26 | #n = df.shape[0] 27 | #df["gini"] = (df.lorentz - df.random) * df.weight 28 | #return df.gini.sum() 29 | gini = sum(df.lorentz[1:].values * (df.random[:-1])) - sum(df.lorentz[:-1].values * (df.random[1:])) 30 | return gini 31 | 32 | def normalized_weighted_gini(act,pred,weight): 33 | return weighted_gini(act,pred,weight) / weighted_gini(act,act,weight) 34 | 35 | data_path = "C:/Sudalai/Others/Comp/Kaggle/LibertyMutual/Data/" 36 | 37 | #train = pd.read_csv(data_path+'train.csv') 38 | #test = pd.read_csv(data_path+'test.csv') 39 | 40 | #train_var1 = pd.get_dummies(train['var1']) 41 | #test_var1 = pd.get_dummies(test['var1']) 42 | 43 | #train_var2 = pd.get_dummies(train['var2']) 44 | #test_var2 = pd.get_dummies(test['var2']) 45 | 46 | #train_var3 = pd.get_dummies(train['var3']) 47 | #test_var3 = pd.get_dummies(test['var3']) 48 | 49 | #train_var4 = pd.get_dummies(train['var4']) 50 | #test_var4 = pd.get_dummies(test['var4']) 51 | 52 | #train_var5 = pd.get_dummies(train['var5']) 53 | #test_var5 = pd.get_dummies(test['var5']) 54 | 55 | #train_var6 = pd.get_dummies(train['var6']) 56 | #test_var6 = pd.get_dummies(test['var6']) 57 | 58 | #train_var7 = pd.get_dummies(train['var7']) 59 | 
#test_var7 = pd.get_dummies(test['var7']) 60 | 61 | #train_var8 = pd.get_dummies(train['var8']) 62 | #test_var8 = pd.get_dummies(test['var8']) 63 | 64 | #train_var9 = pd.get_dummies(train['var9']) 65 | #test_var9 = pd.get_dummies(test['var9']) 66 | 67 | #train = np.hstack([train.iloc[:,11:19], train.iloc[:,20:], train_var1, train_var2, train_var3, train_var4, train_var5, train_var6, train_var7, train_var8, train_var9]) 68 | #test = np.hstack([test.iloc[:,10:18], test.iloc[:,19:], test_var1, test_var2, test_var3, test_var4, test_var5, test_var6, test_var7, test_var8, test_var9]) 69 | 70 | #train = np.nan_to_num(np.array(train)).astype('float64') 71 | #test = np.nan_to_num(np.array(test)).astype('float64') 72 | #print train.shape 73 | #print test.shape 74 | 75 | #np.save("train.npy", train) 76 | #np.save("test.npy", test) 77 | #print ts.shape 78 | 79 | #np.save("train_y.npy", train['target'].values) 80 | 81 | #sys.exit() 82 | tr = np.load(data_path+"train.npy") 83 | ts = np.load(data_path+"test.npy") 84 | train_y = np.load(data_path+"train_y.npy") 85 | sample = pd.read_csv(data_path+'sampleSubmission.csv') 86 | 87 | print tr.shape 88 | #print ts.shape 89 | 90 | #tr = train[['var11', 'var12', 'var13', 'var14', 'var15', 'var16', 'var17']] 91 | #ts = test[['var11', 'var12', 'var13', 'var14', 'var15', 'var16', 'var17']] 92 | 93 | #tr = tr.iloc[:,2:] 94 | #ts = ts.iloc[:,2:] 95 | 96 | #for k in xrange(2,30): 97 | #feature_selector = SelectKBest(score_func=f_regression, k=30) 98 | #feature_selector.fit(tr, train_y) 99 | #tr1 = feature_selector.transform(tr) 100 | #ts1 = feature_selector.transform(ts) 101 | 102 | #tr = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 103 | #ts = ts[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 104 | 105 | tr2 = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 106 | ts2 = ts[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 107 | 108 | tr3 = tr[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378]] 109 | ts3 = ts[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378]] 110 | 111 | #print tr3.shape 112 | #print ts.shape 113 | 114 | #train_y_cat = train_y[:] 115 | #train_y_cat[train_y_cat>0] = 1 116 | 117 | #tr = np.nan_to_num(np.array(tr)) 118 | #ts = np.nan_to_num(np.array(ts)) 119 | 120 | """ 121 | print "Cross Validating.." 
122 | #clf = Ridge() 123 | #train_y[train_y>0]=1 124 | wt_gini = 0 125 | #whole_cv_list = [] 126 | kf = KFold(tr.shape[0], n_folds=5) 127 | #for i in xrange(tr.shape[1]): 128 | for i in xrange(1): 129 | cv_gini_list=[] 130 | for dev_index, val_index in kf: 131 | #tr_new = tr[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378,i]] 132 | #tr_new = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 133 | #X_dev, X_val = np.array([tr[dev_index,i]]).T, np.array([tr[val_index,i]]).T 134 | #X_dev, X_val = tr_new[dev_index,:], tr_new[val_index,:] 135 | #X_dev, X_val = tr1[dev_index,:], tr1[val_index,:] 136 | y_dev, y_val = train_y[dev_index], train_y[val_index] 137 | wt_dev, wt_val = tr[dev_index,1], tr[val_index,1] 138 | #print X_dev.shape 139 | #for i in xrange(1): 140 | #clf = Ridge() 141 | #clf.fit(X_dev, y_dev) 142 | #preds1 = clf.predict(X_val) 143 | 144 | X_dev, X_val = tr2[dev_index,:], tr2[val_index,:] 145 | clf = Ridge() 146 | clf.fit(X_dev, y_dev) 147 | preds2 = clf.predict(X_val) 148 | 149 | X_dev, X_val = tr3[dev_index,:], tr3[val_index,:] 150 | clf = Ridge() 151 | clf.fit(X_dev, y_dev) 152 | preds3 = clf.predict(X_val) 153 | 154 | preds = (0.4*preds2)+ (0.6*preds3) 155 | cv_gini_list.append(normalized_weighted_gini(y_val,preds,wt_val)) 156 | #clf = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_split = 1000, min_samples_leaf=20, random_state=0) 157 | #cv_scores = cross_val_score(clf, tr[:,[3,5,6,7,40,53,161,251,312,335,338,372,378, i]], train_y, cv=5, scoring = "mean_squared_error") 158 | #cv_scores = cross_val_score(clf, tr, train_y, cv=5, scoring = "roc_auc") 159 | #print c_value 160 | #print cv_scores 161 | #print np.mean(cv_scores) 162 | #if abs(np.mean(cv_scores)) < min_rms: 163 | # min_rms = abs(np.mean(cv_scores)) 164 | # selected_index = i 165 | print cv_gini_list 166 | print np.mean(cv_gini_list) 167 | #whole_cv_list.append(np.mean(cv_gini_list)) 168 | if np.mean(cv_gini_list) > wt_gini: 169 | wt_gini = np.mean(cv_gini_list) 170 | selected_index = i 171 | if i % 50 == 0: 172 | print "Processed : ",i 173 | print wt_gini 174 | print selected_index 175 | """ 176 | 177 | """ 178 | kf = KFold(tr.shape[0], n_folds=5) 179 | f1_cv_list = [] 180 | roc_cv_list = [] 181 | for dev_index, val_index in kf: 182 | X_dev, X_val = tr[dev_index,:], tr[val_index,:] 183 | y_dev, y_val = train_y[dev_index], train_y[val_index] 184 | y_dev_cat, y_val_cat = train_y_cat[dev_index], train_y_cat[val_index] 185 | #y_dev_cat = y_dev[:] 186 | #y_val_cat = y_val[:] 187 | #y_dev_cat[y_dev_cat>0]=1 188 | #y_val_cat[y_val_cat>0]=1 189 | #clf = Ridge() 190 | #clf = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_split = 1000, min_samples_leaf=20, random_state=0) 191 | clf = LogisticRegression(penalty='l2', class_weight='auto') 192 | #clf = SGDClassifier(loss='log', alpha=0.00001, n_iter=50) 193 | clf.fit(X_dev, y_dev_cat) 194 | pred_y_val = clf.predict_proba(X_val)[:,1] 195 | #f1_err = f1_score(y_val_cat, pred_y_val) 196 | #f1_cv_list.append(f1_err) 197 | #print "f1",f1_err 198 | roc_err = roc_auc_score(y_val_cat, pred_y_val) 199 | roc_cv_list.append(roc_err) 200 | print "roc", roc_err 201 | print roc_cv_list 202 | print np.mean(roc_cv_list) 203 | print f1_cv_list 204 | print np.mean(f1_cv_list) 205 | """ 206 | 207 | #clf = Ridge() 208 | #clf.fit(tr1, train_y) 209 | #preds1 = 
clf.predict(ts1) 210 | 211 | clf = Ridge() 212 | clf.fit(tr2, train_y) 213 | preds2 = clf.predict(ts2) 214 | 215 | clf = Ridge() 216 | clf.fit(tr3, train_y) 217 | preds3 = clf.predict(ts3) 218 | 219 | preds = (0.4*preds2)+ (0.6*preds3) 220 | 221 | ##preds[preds<0] = 0 222 | sample['target'] = preds 223 | sample.to_csv('submission23.csv', index = False) 224 | -------------------------------------------------------------------------------- /LibertyMutual/prepareData.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jul 22 17:17:40 2014 5 | 6 | @author: Sudalai Rajkumar S 7 | """ 8 | from __future__ import division 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDClassifier, LinearRegression 12 | from sklearn.cross_validation import cross_val_score, KFold 13 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 14 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 15 | from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score 16 | 17 | import pandas as pd 18 | import numpy as np 19 | 20 | ### Reading the Input files ### 21 | data_path = "Path to data" 22 | train = pd.read_csv(data_path+'train.csv') 23 | test = pd.read_csv(data_path+'test.csv') 24 | 25 | ### creating dummy variables from categorical variables ### 26 | train_var1 = pd.get_dummies(train['var1']) 27 | test_var1 = pd.get_dummies(test['var1']) 28 | 29 | train_var2 = pd.get_dummies(train['var2']) 30 | test_var2 = pd.get_dummies(test['var2']) 31 | 32 | train_var3 = pd.get_dummies(train['var3']) 33 | test_var3 = pd.get_dummies(test['var3']) 34 | 35 | train_var4 = pd.get_dummies(train['var4']) 36 | test_var4 = pd.get_dummies(test['var4']) 37 | 38 | train_var5 = pd.get_dummies(train['var5']) 39 | test_var5 = pd.get_dummies(test['var5']) 40 | 41 | train_var6 = pd.get_dummies(train['var6']) 42 | test_var6 = pd.get_dummies(test['var6']) 43 | 44 | train_var7 = pd.get_dummies(train['var7']) 45 | test_var7 = pd.get_dummies(test['var7']) 46 | 47 | train_var8 = pd.get_dummies(train['var8']) 48 | test_var8 = pd.get_dummies(test['var8']) 49 | 50 | train_var9 = pd.get_dummies(train['var9']) 51 | test_var9 = pd.get_dummies(test['var9']) 52 | 53 | ### Stacking the dummy variables together with the numerical variables ### 54 | train = np.hstack([train.iloc[:,11:19], train.iloc[:,20:], train_var1, train_var2, train_var3, train_var4, train_var5, train_var6, train_var7, train_var8, train_var9]) 55 | test = np.hstack([test.iloc[:,10:18], test.iloc[:,19:], test_var1, test_var2, test_var3, test_var4, test_var5, test_var6, test_var7, test_var8, test_var9]) 56 | 57 | ### Replacing the missing values with zero ### 58 | train = np.nan_to_num(np.array(train)).astype('float64') 59 | test = np.nan_to_num(np.array(test)).astype('float64') 60 | 61 | ### Saving the outputs as .npy file ### 62 | np.save("train.npy", train) 63 | np.save("test.npy", test) 64 | np.save("train_y.npy", train['target'].values) 65 | -------------------------------------------------------------------------------- /LibertyMutual/readme.md: -------------------------------------------------------------------------------- 1 | This folder has the codes which I have used in the Kaggle - [Liberty Mutual Competition](http://www.kaggle.com/c/liberty-mutual-fire-peril) 2 | 3 | prepareData.py - File used to do data preprocessing and then to create features from the given raw 
file 4 | 5 | finalModel.py - Module used to train the final model and to make predictions 6 | -------------------------------------------------------------------------------- /MMM15/finalModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 02 15:28:42 2015 4 | 5 | @author: Sudalai Rajkumar S 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 12 | from sklearn.metrics import roc_auc_score 13 | from sklearn.cross_validation import cross_val_score 14 | 15 | def runLogistic(X, y, test_X, C_val = 1, penalty_val='l1'): 16 | clf = LogisticRegression(C = C_val, penalty=penalty_val, random_state=0) 17 | clf.fit(X, y) 18 | scores = clf.predict_proba(test_X)[:,1] 19 | return scores 20 | 21 | def runRF(X, y, test_X, estimator_val=200, max_depth_val=5, min_samples_val = 10): 22 | clf = RandomForestClassifier(n_estimators=estimator_val, max_depth = max_depth_val, min_samples_split= min_samples_val, random_state=0) 23 | clf.fit(X, y) 24 | scores = clf.predict_proba(test_X)[:,1] 25 | return scores 26 | 27 | 28 | if __name__ == "__main__": 29 | data_path = "/home/sudalai/Others/Kaggle/MMM15/Data/" 30 | train_file = data_path + "train_v4.csv" 31 | test_file = data_path + "test_v4.csv" 32 | sub_file = "sub8.csv" 33 | 34 | train_data = pd.read_csv(train_file) 35 | test_data = pd.read_csv(test_file) 36 | 37 | X = train_data.iloc[:,:-1] 38 | y = train_data['DV'].astype('int') 39 | test_X = test_data.iloc[:,1:] 40 | id_val = test_data['id'] 41 | 42 | scores = runLogistic(X, y, test_X, C_val=1, penalty_val='l1') 43 | #scores = runRF(X, y, test_X, estimator_val=200, max_depth_val=6, min_samples_val = 200) 44 | 45 | #print X.shape 46 | #print y.shape 47 | 48 | sub_file_handle = open(sub_file, 'w') 49 | sub_file_handle.write('id,pred\n') 50 | for i in xrange(len(scores)): 51 | sub_file_handle.write(str(id_val[i])+','+ str(scores[i]) +'\n') 52 | sub_file_handle.close() 53 | -------------------------------------------------------------------------------- /MMM15/prepareData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 02 12:12:45 2015 4 | 5 | @author: Sudalai Rajkumar S 6 | """ 7 | 8 | import sys 9 | import csv 10 | import numpy as np 11 | import pandas as pd 12 | 13 | def getSeasonStats(season_file): 14 | season_file_handle = open(season_file, 'r') 15 | reader = csv.DictReader(season_file_handle) 16 | out_dict = {} 17 | 18 | for row in reader: 19 | season_dict = out_dict.get(row['season'], {}) 20 | wteam_dict = season_dict.get(row['wteam'], {}) 21 | lteam_dict = season_dict.get(row['lteam'], {}) 22 | 23 | wteam_dict['NoOfWins'] = wteam_dict.get('NoOfWins',0) + 1 24 | wteam_dict['NoOfGames'] = wteam_dict.get('NoOfGames',0) + 1 25 | wteam_dict['TotalScore'] = wteam_dict.get('TotalScore',0) + int(row['wscore']) 26 | wteam_dict['NumOT'] = wteam_dict.get('NumOT',0) + int(row['numot']) 27 | if row['wloc'] == "H": 28 | wteam_dict['NoOfHomeWins'] = wteam_dict.get('NoOfHomeWins',0) + 1 29 | elif row['wloc'] == 'N': 30 | wteam_dict['NoOfNeutralWins'] = wteam_dict.get('NoOfNeutralWins',0) + 1 31 | elif row['wloc'] == 'A': 32 | wteam_dict['NoOfAwayWins'] = wteam_dict.get('NoOfAwayWins',0) + 1 33 | else: 34 | print row['wloc'] 35 | sys.exit() 36 | wteam_dict['fgm'] = wteam_dict.get('fgm',0) 
+ int(row['wfgm']) 37 | wteam_dict['fga'] = wteam_dict.get('fga',0) + int(row['wfga']) 38 | wteam_dict['fgm3'] = wteam_dict.get('fgm3',0) + int(row['wfgm3']) 39 | wteam_dict['fga3'] = wteam_dict.get('fga3',0) + int(row['wfga3']) 40 | wteam_dict['ftm'] = wteam_dict.get('ftm',0) + int(row['wftm']) 41 | wteam_dict['fta'] = wteam_dict.get('fta',0) + int(row['wfta']) 42 | wteam_dict['or'] = wteam_dict.get('or',0) + int(row['wor']) 43 | wteam_dict['dr'] = wteam_dict.get('dr',0) + int(row['wdr']) 44 | wteam_dict['ast'] = wteam_dict.get('ast',0) + int(row['wast']) 45 | wteam_dict['to'] = wteam_dict.get('to',0) + int(row['wto']) 46 | wteam_dict['pf'] = wteam_dict.get('pf',0) + int(row['wpf']) 47 | wteam_dict['stl'] = wteam_dict.get('stl',0) + int(row['wstl']) 48 | wteam_dict['blk'] = wteam_dict.get('blk',0) + int(row['wblk']) 49 | 50 | 51 | 52 | 53 | lteam_dict['NoOfLoss'] = lteam_dict.get('NoOfLoss',0) + 1 54 | lteam_dict['NoOfGames'] = lteam_dict.get('NoOfGames',0) + 1 55 | lteam_dict['TotalScore'] = lteam_dict.get('TotalScore',0) + int(row['lscore']) 56 | lteam_dict['NumOT'] = lteam_dict.get('NumOT',0) + int(row['numot']) 57 | if row['wloc'] == "H": 58 | lteam_dict['NoOfAwayLoss'] = lteam_dict.get('NoOfAwayLoss',0) + 1 59 | elif row['wloc'] == 'N': 60 | lteam_dict['NoOfNeutralLoss'] = lteam_dict.get('NoOfNeutralLoss',0) + 1 61 | elif row['wloc'] == 'A': 62 | lteam_dict['NoOfHomeLoss'] = lteam_dict.get('NoOfHomeLoss',0) + 1 63 | else: 64 | print row['wloc'] 65 | sys.exit() 66 | lteam_dict['fgm'] = lteam_dict.get('fgm',0) + int(row['lfgm']) 67 | lteam_dict['fga'] = lteam_dict.get('fga',0) + int(row['lfga']) 68 | lteam_dict['fgm3'] = lteam_dict.get('fgm3',0) + int(row['lfgm3']) 69 | lteam_dict['fga3'] = lteam_dict.get('fga3',0) + int(row['lfga3']) 70 | lteam_dict['ftm'] = lteam_dict.get('ftm',0) + int(row['lftm']) 71 | lteam_dict['fta'] = lteam_dict.get('fta',0) + int(row['lfta']) 72 | lteam_dict['or'] = lteam_dict.get('or',0) + int(row['lor']) 73 | lteam_dict['dr'] = lteam_dict.get('dr',0) + int(row['ldr']) 74 | lteam_dict['ast'] = lteam_dict.get('ast',0) + int(row['last']) 75 | lteam_dict['to'] = lteam_dict.get('to',0) + int(row['lto']) 76 | lteam_dict['pf'] = lteam_dict.get('pf',0) + int(row['lpf']) 77 | lteam_dict['stl'] = lteam_dict.get('stl',0) + int(row['lstl']) 78 | lteam_dict['blk'] = lteam_dict.get('blk',0) + int(row['lblk']) 79 | lteam_dict['blk'] = lteam_dict.get('blk',0) + int(row['lblk']) 80 | 81 | 82 | season_dict[row['wteam']] = wteam_dict 83 | season_dict[row['lteam']] = lteam_dict 84 | out_dict[row['season']] = season_dict 85 | 86 | #print out_dict['1985']['1228'] 87 | #print out_dict['1985']['1328'] 88 | #print out_dict['1985'].keys() 89 | #print len(out_dict['1985'].keys()) 90 | return out_dict 91 | 92 | def prepareTrainData(tourney_results_file, season_dict, seeds_dict): 93 | tourney_file_handle = open(tourney_results_file, 'r') 94 | reader = csv.DictReader(tourney_file_handle) 95 | header_list = ['WinPercentage','LossPercentage','AverageScore','HomeWinPerc','AwayWinPerc','NeutralWinPerc','HomeLossPerc','AwayLossPerc','NeutralLossPerc','NumOTPerc', 'NoOfGames', 'OppWinPercentage','OppLossPercentage','OppAverageScore','OppHomeWinPerc','OppAwayWinPerc','OppNeutralWinPerc','OppHomeLossPerc','OppAwayLossPerc','OppNeutralLossPerc', 'OppNumOTPerc', 'OppNoOfGames', 'fgm','fga','fgm3','fga3','ftm','fta','or','dr','to','pf','ast','stl','blk', 'ofgm','ofga','ofgm3','ofga3','oftm','ofta','oor','odr','oto','opf','oast','ostl','oblk', 'DV'] 96 | 97 | out_list = [] 98 | for row in reader: 99 
| season = row['season'] 100 | if season == '2011': 101 | break 102 | 103 | # Get the winning and losing team # 104 | wteam = row['wteam'] 105 | lteam = row['lteam'] 106 | # Get the stats of both teams # 107 | season_wteam_dict = season_dict[season][wteam] 108 | season_lteam_dict = season_dict[season][lteam] 109 | 110 | # Get the seeds of both teams # 111 | #seed_wteam = seeds_dict[season][wteam] 112 | #seed_lteam = seeds_dict[season][lteam] 113 | # Get win percentage for both teams # 114 | win_perc_wteam = season_wteam_dict.get('NoOfWins',0) / float(season_wteam_dict['NoOfGames']) 115 | win_perc_lteam = season_lteam_dict.get('NoOfWins',0) / float(season_lteam_dict['NoOfGames']) 116 | # Get Loss percentage for both teams # 117 | loss_perc_wteam = season_wteam_dict.get('NoOfLoss',0) / float(season_wteam_dict['NoOfGames']) 118 | loss_perc_lteam = season_lteam_dict.get('NoOfLoss',0) / float(season_lteam_dict['NoOfGames']) 119 | # Get the average score for both teams # 120 | avg_score_wteam = season_wteam_dict.get('TotalScore',0) / float(season_wteam_dict['NoOfGames']) 121 | avg_score_lteam = season_lteam_dict.get('TotalScore',0) / float(season_lteam_dict['NoOfGames']) 122 | # Get the home win percentage for both teams # 123 | home_win_perc_wteam = season_wteam_dict.get('NoOfHomeWins',0) / float(season_wteam_dict['NoOfGames']) 124 | home_win_perc_lteam = season_lteam_dict.get('NoOfHomeWins',0) / float(season_lteam_dict['NoOfGames']) 125 | # Get the Away win percentage for both teams # 126 | away_win_perc_wteam = season_wteam_dict.get('NoOfAwayWins',0) / float(season_wteam_dict['NoOfGames']) 127 | away_win_perc_lteam = season_lteam_dict.get('NoOfAwayWins',0) / float(season_lteam_dict['NoOfGames']) 128 | # Get the neutral win percentage # 129 | neutral_win_perc_wteam = season_wteam_dict.get('NoOfNeutralWins',0) / float(season_wteam_dict['NoOfGames']) 130 | neutral_win_perc_lteam = season_lteam_dict.get('NoOfNeutralWins',0) / float(season_lteam_dict['NoOfGames']) 131 | # Get the home loss percentage for both teams # 132 | home_loss_perc_wteam = season_wteam_dict.get('NoOfHomeLoss',0) / float(season_wteam_dict['NoOfGames']) 133 | home_loss_perc_lteam = season_lteam_dict.get('NoOfHomeLoss',0) / float(season_lteam_dict['NoOfGames']) 134 | # Get the Away loss percentage for both teams # 135 | away_loss_perc_wteam = season_wteam_dict.get('NoOfAwayLoss',0) / float(season_wteam_dict['NoOfGames']) 136 | away_loss_perc_lteam = season_lteam_dict.get('NoOfAwayLoss',0) / float(season_lteam_dict['NoOfGames']) 137 | # Get the neutral loss percentage # 138 | neutral_loss_perc_wteam = season_wteam_dict.get('NoOfNeutralLoss',0) / float(season_wteam_dict['NoOfGames']) 139 | neutral_loss_perc_lteam = season_lteam_dict.get('NoOfNeutralLoss',0) / float(season_lteam_dict['NoOfGames']) 140 | # Get the number of overtime matches for both teams # 141 | ot_wteam = season_wteam_dict['NumOT'] 142 | ot_lteam = season_lteam_dict['NumOT'] 143 | # Get the number of matches for both teams # 144 | num_games_wteam = season_wteam_dict['NoOfGames'] 145 | num_games_lteam = season_lteam_dict['NoOfGames'] 146 | 147 | fgm_wteam = season_wteam_dict.get('fgm',0) / float(season_wteam_dict['NoOfGames']) 148 | fga_wteam = season_wteam_dict.get('fga',0) / float(season_wteam_dict['NoOfGames']) 149 | fgm3_wteam = season_wteam_dict.get('fgm3',0) / float(season_wteam_dict['NoOfGames']) 150 | fga3_wteam = season_wteam_dict.get('fga3',0) / float(season_wteam_dict['NoOfGames']) 151 | ftm_wteam = season_wteam_dict.get('ftm',0) / 
float(season_wteam_dict['NoOfGames']) 152 | fta_wteam = season_wteam_dict.get('fta',0) / float(season_wteam_dict['NoOfGames']) 153 | or_wteam = season_wteam_dict.get('or',0) / float(season_wteam_dict['NoOfGames']) 154 | dr_wteam = season_wteam_dict.get('dr',0) / float(season_wteam_dict['NoOfGames']) 155 | to_wteam = season_wteam_dict.get('to',0) / float(season_wteam_dict['NoOfGames']) 156 | pf_wteam = season_wteam_dict.get('pf',0) / float(season_wteam_dict['NoOfGames']) 157 | ast_wteam = season_wteam_dict.get('ast',0) / float(season_wteam_dict['NoOfGames']) 158 | stl_wteam = season_wteam_dict.get('stl',0) / float(season_wteam_dict['NoOfGames']) 159 | blk_wteam = season_wteam_dict.get('blk',0) / float(season_wteam_dict['NoOfGames']) 160 | 161 | fgm_lteam = season_lteam_dict.get('fgm',0) / float(season_lteam_dict['NoOfGames']) 162 | fga_lteam = season_lteam_dict.get('fga',0) / float(season_lteam_dict['NoOfGames']) 163 | fgm3_lteam = season_lteam_dict.get('fgm3',0) / float(season_lteam_dict['NoOfGames']) 164 | fga3_lteam = season_lteam_dict.get('fga3',0) / float(season_lteam_dict['NoOfGames']) 165 | ftm_lteam = season_lteam_dict.get('ftm',0) / float(season_lteam_dict['NoOfGames']) 166 | fta_lteam = season_lteam_dict.get('fta',0) / float(season_lteam_dict['NoOfGames']) 167 | or_lteam = season_lteam_dict.get('or',0) / float(season_lteam_dict['NoOfGames']) 168 | dr_lteam = season_lteam_dict.get('dr',0) / float(season_lteam_dict['NoOfGames']) 169 | to_lteam = season_lteam_dict.get('to',0) / float(season_lteam_dict['NoOfGames']) 170 | pf_lteam = season_lteam_dict.get('pf',0) / float(season_lteam_dict['NoOfGames']) 171 | ast_lteam = season_lteam_dict.get('ast',0) / float(season_lteam_dict['NoOfGames']) 172 | stl_lteam = season_lteam_dict.get('stl',0) / float(season_lteam_dict['NoOfGames']) 173 | blk_lteam = season_lteam_dict.get('blk',0) / float(season_lteam_dict['NoOfGames']) 174 | 175 | 176 | 177 | # Appending the features to out list # 178 | out_list.append([ win_perc_wteam, loss_perc_wteam, avg_score_wteam, home_win_perc_wteam, away_win_perc_wteam, neutral_win_perc_wteam, home_loss_perc_wteam, away_loss_perc_wteam, neutral_loss_perc_wteam, ot_wteam, num_games_wteam, win_perc_lteam, loss_perc_lteam, avg_score_lteam, home_win_perc_lteam, away_win_perc_lteam, neutral_win_perc_lteam, home_loss_perc_lteam, away_loss_perc_lteam, neutral_loss_perc_lteam, ot_lteam, num_games_lteam, fgm_wteam,fga_wteam,fgm3_wteam,fga3_wteam,ftm_wteam,fta_wteam,or_wteam,dr_wteam,to_wteam,pf_wteam,ast_wteam,stl_wteam,blk_wteam, fgm_lteam,fga_lteam,fgm3_lteam,fga3_lteam,ftm_lteam,fta_lteam,or_lteam,dr_lteam,to_lteam,pf_lteam,ast_lteam,stl_lteam,blk_lteam, 1]) 179 | out_list.append([win_perc_lteam, loss_perc_lteam, avg_score_lteam, home_win_perc_lteam, away_win_perc_lteam, neutral_win_perc_lteam, home_loss_perc_lteam, away_loss_perc_lteam, neutral_loss_perc_lteam, ot_lteam, num_games_lteam, win_perc_wteam, loss_perc_wteam, avg_score_wteam, home_win_perc_wteam, away_win_perc_wteam, neutral_win_perc_wteam, home_loss_perc_wteam, away_loss_perc_wteam, neutral_loss_perc_wteam, ot_wteam, num_games_wteam, fgm_lteam,fga_lteam,fgm3_lteam,fga3_lteam,ftm_lteam,fta_lteam,or_lteam,dr_lteam,to_lteam,pf_lteam,ast_lteam,stl_lteam,blk_lteam, fgm_wteam,fga_wteam,fgm3_wteam,fga3_wteam,ftm_wteam,fta_wteam,or_wteam,dr_wteam,to_wteam,pf_wteam,ast_wteam,stl_wteam,blk_wteam, 0]) 180 | 181 | out_df = pd.DataFrame(np.array(out_list)) 182 | out_df.columns = header_list 183 | return out_df 184 | 185 | def prepareTestData(fixture_file, season_dict, 
seeds_dict): 186 | fixture_file_handle = open(fixture_file, 'r') 187 | reader = csv.DictReader(fixture_file_handle) 188 | header_list = ['id', 'WinPercentage','LossPercentage','AverageScore','HomeWinPerc','AwayWinPerc','NeutralWinPerc','HomeLossPerc','AwayLossPerc','NeutralLossPerc','NumOTPerc', 'NoOfGames', 'OppWinPercentage','OppLossPercentage','OppAverageScore','OppHomeWinPerc','OppAwayWinPerc','OppNeutralWinPerc','OppHomeLossPerc','OppAwayLossPerc','OppNeutralLossPerc', 'OppNumOTPerc', 'OppNoOfGames', 'fgm','fga','fgm3','fga3','ftm','fta','or','dr','to','pf','ast','stl','blk', 'ofgm','ofga','ofgm3','ofga3','oftm','ofta','oor','odr','oto','opf','oast','ostl','oblk'] 189 | 190 | out_list = [] 191 | out_list = [] 192 | for row in reader: 193 | id_val = row['id'] 194 | season = id_val.split("_")[0] 195 | 196 | # Get the winning and losing team # 197 | wteam = id_val.split("_")[1] 198 | lteam = id_val.split("_")[2] 199 | # Get the stats of both teams # 200 | season_wteam_dict = season_dict[season][wteam] 201 | season_lteam_dict = season_dict[season][lteam] 202 | 203 | # Get the seeds of both teams # 204 | #seed_wteam = seeds_dict[season][wteam] 205 | #seed_lteam = seeds_dict[season][lteam] 206 | # Get win percentage for both teams # 207 | win_perc_wteam = season_wteam_dict.get('NoOfWins',0) / float(season_wteam_dict['NoOfGames']) 208 | win_perc_lteam = season_lteam_dict.get('NoOfWins',0) / float(season_lteam_dict['NoOfGames']) 209 | # Get Loss percentage for both teams # 210 | loss_perc_wteam = season_wteam_dict.get('NoOfLoss',0) / float(season_wteam_dict['NoOfGames']) 211 | loss_perc_lteam = season_lteam_dict.get('NoOfLoss',0) / float(season_lteam_dict['NoOfGames']) 212 | # Get the average score for both teams # 213 | avg_score_wteam = season_wteam_dict.get('TotalScore',0) / float(season_wteam_dict['NoOfGames']) 214 | avg_score_lteam = season_lteam_dict.get('TotalScore',0) / float(season_lteam_dict['NoOfGames']) 215 | # Get the home win percentage for both teams # 216 | home_win_perc_wteam = season_wteam_dict.get('NoOfHomeWins',0) / float(season_wteam_dict['NoOfGames']) 217 | home_win_perc_lteam = season_lteam_dict.get('NoOfHomeWins',0) / float(season_lteam_dict['NoOfGames']) 218 | # Get the Away win percentage for both teams # 219 | away_win_perc_wteam = season_wteam_dict.get('NoOfAwayWins',0) / float(season_wteam_dict['NoOfGames']) 220 | away_win_perc_lteam = season_lteam_dict.get('NoOfAwayWins',0) / float(season_lteam_dict['NoOfGames']) 221 | # Get the neutral win percentage # 222 | neutral_win_perc_wteam = season_wteam_dict.get('NoOfNeutralWins',0) / float(season_wteam_dict['NoOfGames']) 223 | neutral_win_perc_lteam = season_lteam_dict.get('NoOfNeutralWins',0) / float(season_lteam_dict['NoOfGames']) 224 | # Get the home loss percentage for both teams # 225 | home_loss_perc_wteam = season_wteam_dict.get('NoOfHomeLoss',0) / float(season_wteam_dict['NoOfGames']) 226 | home_loss_perc_lteam = season_lteam_dict.get('NoOfHomeLoss',0) / float(season_lteam_dict['NoOfGames']) 227 | # Get the Away loss percentage for both teams # 228 | away_loss_perc_wteam = season_wteam_dict.get('NoOfAwayLoss',0) / float(season_wteam_dict['NoOfGames']) 229 | away_loss_perc_lteam = season_lteam_dict.get('NoOfAwayLoss',0) / float(season_lteam_dict['NoOfGames']) 230 | # Get the neutral loss percentage # 231 | neutral_loss_perc_wteam = season_wteam_dict.get('NoOfNeutralLoss',0) / float(season_wteam_dict['NoOfGames']) 232 | neutral_loss_perc_lteam = season_lteam_dict.get('NoOfNeutralLoss',0) / 
float(season_lteam_dict['NoOfGames']) 233 | # Get the number of overtime matches for both teams # 234 | ot_wteam = season_wteam_dict['NumOT'] 235 | ot_lteam = season_lteam_dict['NumOT'] 236 | # Get the number of matches for both teams # 237 | num_games_wteam = season_wteam_dict['NoOfGames'] 238 | num_games_lteam = season_lteam_dict['NoOfGames'] 239 | 240 | fgm_wteam = season_wteam_dict.get('fgm',0) / float(season_wteam_dict['NoOfGames']) 241 | fga_wteam = season_wteam_dict.get('fga',0) / float(season_wteam_dict['NoOfGames']) 242 | fgm3_wteam = season_wteam_dict.get('fgm3',0) / float(season_wteam_dict['NoOfGames']) 243 | fga3_wteam = season_wteam_dict.get('fga3',0) / float(season_wteam_dict['NoOfGames']) 244 | ftm_wteam = season_wteam_dict.get('ftm',0) / float(season_wteam_dict['NoOfGames']) 245 | fta_wteam = season_wteam_dict.get('fta',0) / float(season_wteam_dict['NoOfGames']) 246 | or_wteam = season_wteam_dict.get('or',0) / float(season_wteam_dict['NoOfGames']) 247 | dr_wteam = season_wteam_dict.get('dr',0) / float(season_wteam_dict['NoOfGames']) 248 | to_wteam = season_wteam_dict.get('to',0) / float(season_wteam_dict['NoOfGames']) 249 | pf_wteam = season_wteam_dict.get('pf',0) / float(season_wteam_dict['NoOfGames']) 250 | ast_wteam = season_wteam_dict.get('ast',0) / float(season_wteam_dict['NoOfGames']) 251 | stl_wteam = season_wteam_dict.get('stl',0) / float(season_wteam_dict['NoOfGames']) 252 | blk_wteam = season_wteam_dict.get('blk',0) / float(season_wteam_dict['NoOfGames']) 253 | 254 | fgm_lteam = season_lteam_dict.get('fgm',0) / float(season_lteam_dict['NoOfGames']) 255 | fga_lteam = season_lteam_dict.get('fga',0) / float(season_lteam_dict['NoOfGames']) 256 | fgm3_lteam = season_lteam_dict.get('fgm3',0) / float(season_lteam_dict['NoOfGames']) 257 | fga3_lteam = season_lteam_dict.get('fga3',0) / float(season_lteam_dict['NoOfGames']) 258 | ftm_lteam = season_lteam_dict.get('ftm',0) / float(season_lteam_dict['NoOfGames']) 259 | fta_lteam = season_lteam_dict.get('fta',0) / float(season_lteam_dict['NoOfGames']) 260 | or_lteam = season_lteam_dict.get('or',0) / float(season_lteam_dict['NoOfGames']) 261 | dr_lteam = season_lteam_dict.get('dr',0) / float(season_lteam_dict['NoOfGames']) 262 | to_lteam = season_lteam_dict.get('to',0) / float(season_lteam_dict['NoOfGames']) 263 | pf_lteam = season_lteam_dict.get('pf',0) / float(season_lteam_dict['NoOfGames']) 264 | ast_lteam = season_lteam_dict.get('ast',0) / float(season_lteam_dict['NoOfGames']) 265 | stl_lteam = season_lteam_dict.get('stl',0) / float(season_lteam_dict['NoOfGames']) 266 | blk_lteam = season_lteam_dict.get('blk',0) / float(season_lteam_dict['NoOfGames']) 267 | 268 | 269 | # Appending the features to out list # 270 | out_list.append([id_val, win_perc_wteam, loss_perc_wteam, avg_score_wteam, home_win_perc_wteam, away_win_perc_wteam, neutral_win_perc_wteam, home_loss_perc_wteam, away_loss_perc_wteam, neutral_loss_perc_wteam, ot_wteam, num_games_wteam, win_perc_lteam, loss_perc_lteam, avg_score_lteam, home_win_perc_lteam, away_win_perc_lteam, neutral_win_perc_lteam, home_loss_perc_lteam, away_loss_perc_lteam, neutral_loss_perc_lteam, ot_lteam, num_games_lteam, fgm_wteam,fga_wteam,fgm3_wteam,fga3_wteam,ftm_wteam,fta_wteam,or_wteam,dr_wteam,to_wteam,pf_wteam,ast_wteam,stl_wteam,blk_wteam, fgm_lteam,fga_lteam,fgm3_lteam,fga3_lteam,ftm_lteam,fta_lteam,or_lteam,dr_lteam,to_lteam,pf_lteam,ast_lteam,stl_lteam,blk_lteam]) 271 | 272 | out_df = pd.DataFrame(np.array(out_list)) 273 | out_df.columns = header_list 274 | return out_df 275 | 276 
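# --- Editor's sketch (added): prepareTrainData and prepareTestData above repeat
# the same per-game averages for each team. A small shared helper would keep the
# two in sync; the names below are illustrative and nothing in the original
# script calls this function.
def per_game_stats(team_dict, keys=('fgm', 'fga', 'fgm3', 'fga3', 'ftm', 'fta',
                                    'or', 'dr', 'to', 'pf', 'ast', 'stl', 'blk')):
    games = float(team_dict['NoOfGames'])
    return [team_dict.get(k, 0) / games for k in keys]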
| 277 | 278 | def getSeedStats(seeds_file): 279 | seeds_file_handle = open(seeds_file, 'r') 280 | reader = csv.DictReader(seeds_file_handle) 281 | out_dict = {} 282 | 283 | for row in reader: 284 | season_dict = out_dict.get(row['season'], {}) 285 | season_dict[row['team']] = int(row['seed'][1:3]) 286 | out_dict[row['season']] = season_dict 287 | 288 | return out_dict 289 | 290 | 291 | if __name__ == "__main__": 292 | data_path = "/home/sudalai/Others/Kaggle/MMM15/Data/" 293 | regular_seasons_file = data_path + "regular_season_detailed_results.csv" 294 | tourney_seeds_file= data_path + "tourney_seeds.csv" 295 | tourney_results_file = data_path + "tourney_detailed_results.csv" 296 | test_fixture_file = data_path + "sample_submission.csv" 297 | train_file = data_path + "train_v4.csv" 298 | test_file = data_path + "test_v4.csv" 299 | 300 | season_dict = getSeasonStats(regular_seasons_file) 301 | seeds_dict = getSeedStats(tourney_seeds_file) 302 | 303 | for year in season_dict.keys(): 304 | for team in seeds_dict[year]: 305 | season_dict[year][team] 306 | 307 | train_df = prepareTrainData(tourney_results_file, season_dict, seeds_dict) 308 | train_df.to_csv(train_file, index=False) 309 | 310 | test_df = prepareTestData(test_fixture_file, season_dict, seeds_dict) 311 | test_df.to_csv(test_file, index=False) 312 | -------------------------------------------------------------------------------- /MMM15/readme.md: -------------------------------------------------------------------------------- 1 | This folder has the codes for [Kaggle Competition](http://www.kaggle.com/competitions) - [March Machine Learning Mania 2015](http://www.kaggle.com/c/march-machine-learning-mania-2015) 2 | -------------------------------------------------------------------------------- /MMM15/seed_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 02 10:17:40 2015 4 | 5 | @author: Sudalai Rajkumar S 6 | 7 | Module to produce the seed based bench mark given in the competition 8 | """ 9 | 10 | import csv 11 | import numpy as np 12 | import pandas as pd 13 | 14 | def getSeedStats(seeds_file): 15 | seeds_file_handle = open(seeds_file, 'r') 16 | reader = csv.DictReader(seeds_file_handle) 17 | out_dict = {} 18 | 19 | for row in reader: 20 | season_dict = out_dict.get(row['season'], {}) 21 | season_dict[row['team']] = int(row['seed'][1:3]) 22 | out_dict[row['season']] = season_dict 23 | 24 | return out_dict 25 | 26 | if __name__ == "__main__": 27 | data_path = "/home/sudalai/Others/Kaggle/MMM15/Data/" 28 | test_fixture_file = data_path + "sample_submission.csv" 29 | tourney_seeds_file= data_path + "tourney_seeds.csv" 30 | sub_file = open("sub_seedmodel.csv","w") 31 | sub_file.write('id,pred\n') 32 | 33 | seeds_dict = getSeedStats(tourney_seeds_file) 34 | 35 | test_file_handle = open(test_fixture_file,'r') 36 | reader = csv.DictReader(test_file_handle) 37 | for row in reader: 38 | id_val = row['id'] 39 | season = id_val.split("_")[0] 40 | fteam = id_val.split("_")[1] 41 | steam = id_val.split("_")[2] 42 | 43 | fteam_seed = seeds_dict[season][fteam] 44 | steam_seed = seeds_dict[season][steam] 45 | 46 | pred_val = 0.5 + ((steam_seed - fteam_seed)*0.03) 47 | 48 | sub_file.write(str(id_val) + "," + str(pred_val) + "\n") 49 | sub_file.close() 50 | -------------------------------------------------------------------------------- /OutBrain/ftrl.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | """ 3 | Thanks to tinrtgu for the wonderful base script 4 | Use pypy for faster computations.! 5 | """ 6 | import csv 7 | from datetime import datetime 8 | from csv import DictReader 9 | from math import exp, log, sqrt 10 | 11 | 12 | ############################################################################## 13 | # parameters ################################################################# 14 | ############################################################################## 15 | 16 | # A, paths 17 | data_path = "../input/" 18 | train = data_path+'clicks_train.csv' # path to training file 19 | test = data_path+'clicks_test.csv' # path to testing file 20 | submission = 'sub_proba.csv' # path of to be outputted submission file 21 | 22 | # B, model 23 | alpha = .1 # learning rate 24 | beta = 0. # smoothing parameter for adaptive learning rate 25 | L1 = 0. # L1 regularization, larger value means more regularized 26 | L2 = 0. # L2 regularization, larger value means more regularized 27 | 28 | # C, feature/hash trick 29 | D = 2 ** 20 # number of weights to use 30 | interaction = False # whether to enable poly2 feature interactions 31 | 32 | # D, training/validation 33 | epoch = 1 # learn training data for N passes 34 | holdafter = None # data after date N (exclusive) are used as validation 35 | holdout = None # use every N training instance for holdout validation 36 | 37 | 38 | ############################################################################## 39 | # class, function, generator definitions ##################################### 40 | ############################################################################## 41 | 42 | class ftrl_proximal(object): 43 | ''' Our main algorithm: Follow the regularized leader - proximal 44 | 45 | In short, 46 | this is an adaptive-learning-rate sparse logistic-regression with 47 | efficient L1-L2-regularization 48 | 49 | Reference: 50 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 51 | ''' 52 | 53 | def __init__(self, alpha, beta, L1, L2, D, interaction): 54 | # parameters 55 | self.alpha = alpha 56 | self.beta = beta 57 | self.L1 = L1 58 | self.L2 = L2 59 | 60 | # feature related parameters 61 | self.D = D 62 | self.interaction = interaction 63 | 64 | # model 65 | # n: squared sum of past gradients 66 | # z: weights 67 | # w: lazy weights 68 | self.n = [0.] * D 69 | self.z = [0.] * D 70 | self.w = {} 71 | 72 | def _indices(self, x): 73 | ''' A helper generator that yields the indices in x 74 | 75 | The purpose of this generator is to make the following 76 | code a bit cleaner when doing feature interaction. 77 | ''' 78 | 79 | # first yield index of the bias term 80 | yield 0 81 | 82 | # then yield the normal indices 83 | for index in x: 84 | yield index 85 | 86 | # now yield interactions (if applicable) 87 | if self.interaction: 88 | D = self.D 89 | L = len(x) 90 | 91 | x = sorted(x) 92 | for i in xrange(L): 93 | for j in xrange(i+1, L): 94 | # one-hot encode interactions with hash trick 95 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 96 | 97 | def predict(self, x): 98 | ''' Get probability estimation on x 99 | 100 | INPUT: 101 | x: features 102 | 103 | OUTPUT: 104 | probability of p(y = 1 | x; w) 105 | ''' 106 | 107 | # parameters 108 | alpha = self.alpha 109 | beta = self.beta 110 | L1 = self.L1 111 | L2 = self.L2 112 | 113 | # model 114 | n = self.n 115 | z = self.z 116 | w = {} 117 | 118 | # wTx is the inner product of w and x 119 | wTx = 0. 120 | for i in self._indices(x): 121 | sign = -1. 
if z[i] < 0 else 1. # get sign of z[i] 122 | 123 | # build w on the fly using z and n, hence the name - lazy weights 124 | # we are doing this at prediction instead of update time is because 125 | # this allows us for not storing the complete w 126 | if sign * z[i] <= L1: 127 | # w[i] vanishes due to L1 regularization 128 | w[i] = 0. 129 | else: 130 | # apply prediction time L1, L2 regularization to z and get w 131 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 132 | 133 | wTx += w[i] 134 | 135 | # cache the current w for update stage 136 | self.w = w 137 | 138 | # bounded sigmoid function, this is the probability estimation 139 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 140 | 141 | def update(self, x, p, y): 142 | ''' Update model using x, p, y 143 | 144 | INPUT: 145 | x: feature, a list of indices 146 | p: click probability prediction of our model 147 | y: answer 148 | 149 | MODIFIES: 150 | self.n: increase by squared gradient 151 | self.z: weights 152 | ''' 153 | 154 | # parameter 155 | alpha = self.alpha 156 | 157 | # model 158 | n = self.n 159 | z = self.z 160 | w = self.w 161 | 162 | # gradient under logloss 163 | g = p - y 164 | 165 | # update z and n 166 | for i in self._indices(x): 167 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 168 | z[i] += g - sigma * w[i] 169 | n[i] += g * g 170 | 171 | 172 | def logloss(p, y): 173 | ''' FUNCTION: Bounded logloss 174 | 175 | INPUT: 176 | p: our prediction 177 | y: real answer 178 | 179 | OUTPUT: 180 | logarithmic loss of p given y 181 | ''' 182 | 183 | p = max(min(p, 1. - 10e-15), 10e-15) 184 | return -log(p) if y == 1. else -log(1. - p) 185 | 186 | 187 | def data(path, D): 188 | ''' GENERATOR: Apply hash-trick to the original csv row 189 | and for simplicity, we one-hot-encode everything 190 | 191 | INPUT: 192 | path: path to training or testing file 193 | D: the max index that we can hash to 194 | 195 | YIELDS: 196 | ID: id of the instance, mainly useless 197 | x: a list of hashed and one-hot-encoded 'indices' 198 | we only need the index since all values are either 0 or 1 199 | y: y = 1 if we have a click, else we have y = 0 200 | ''' 201 | 202 | for t, row in enumerate(DictReader(open(path))): 203 | # process id 204 | disp_id = int(row['display_id']) 205 | ad_id = int(row['ad_id']) 206 | 207 | # process clicks 208 | y = 0. 209 | if 'clicked' in row: 210 | if row['clicked'] == '1': 211 | y = 1. 
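# --- Editor's note (added): the block just below is the hashing trick. Every
# 'field_value' string is hashed into one of D = 2**20 buckets and only the
# bucket index is kept, so each row becomes a short list of active one-hot
# indices no matter how many distinct categories exist. Collisions are accepted
# by design; they bound memory at the cost of a little accuracy. Python 3
# randomizes str hashes per process (PYTHONHASHSEED), so hashed indices are only
# reproducible across runs under Python 2 / pypy as assumed here, or with the
# hash seed fixed.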
212 | del row['clicked'] 213 | 214 | x = [] 215 | for key in row: 216 | x.append(abs(hash(key + '_' + row[key])) % D) 217 | 218 | row = prcont_dict.get(ad_id, []) 219 | # build x 220 | ad_doc_id = -1 221 | for ind, val in enumerate(row): 222 | if ind==0: 223 | ad_doc_id = int(val) 224 | x.append(abs(hash(prcont_header[ind] + '_' + val)) % D) 225 | 226 | row = event_dict.get(disp_id, []) 227 | ## build x 228 | disp_doc_id = -1 229 | for ind, val in enumerate(row): 230 | if ind==0: 231 | uuid_val = val 232 | if ind==1: 233 | disp_doc_id = int(val) 234 | x.append(abs(hash(event_header[ind] + '_' + val)) % D) 235 | 236 | if (ad_doc_id in leak_uuid_dict) and (uuid_val in leak_uuid_dict[ad_doc_id]): 237 | x.append(abs(hash('leakage_row_found_1'))%D) 238 | else: 239 | x.append(abs(hash('leakage_row_not_found'))%D) 240 | 241 | yield t, disp_id, ad_id, x, y 242 | 243 | 244 | ############################################################################## 245 | # start training ############################################################# 246 | ############################################################################## 247 | 248 | start = datetime.now() 249 | 250 | # initialize ourselves a learner 251 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 252 | 253 | print("Content..") 254 | with open(data_path + "promoted_content.csv") as infile: 255 | prcont = csv.reader(infile) 256 | #prcont_header = (prcont.next())[1:] 257 | prcont_header = next(prcont)[1:] 258 | prcont_dict = {} 259 | for ind,row in enumerate(prcont): 260 | prcont_dict[int(row[0])] = row[1:] 261 | if ind%100000 == 0: 262 | print(ind) 263 | print(len(prcont_dict)) 264 | del prcont 265 | 266 | print("Events..") 267 | with open(data_path + "events.csv") as infile: 268 | events = csv.reader(infile) 269 | #events.next() 270 | next(events) 271 | event_header = ['uuid', 'document_id', 'platform', 'geo_location', 'loc_country', 'loc_state', 'loc_dma'] 272 | event_dict = {} 273 | for ind,row in enumerate(events): 274 | tlist = row[1:3] + row[4:6] 275 | loc = row[5].split('>') 276 | if len(loc) == 3: 277 | tlist.extend(loc[:]) 278 | elif len(loc) == 2: 279 | tlist.extend( loc[:]+['']) 280 | elif len(loc) == 1: 281 | tlist.extend( loc[:]+['','']) 282 | else: 283 | tlist.append(['','','']) 284 | event_dict[int(row[0])] = tlist[:] 285 | if ind%100000 == 0: 286 | print("Events : ", ind) 287 | print(len(event_dict)) 288 | del events 289 | 290 | print("Leakage file..") 291 | leak_uuid_dict= {} 292 | """ 293 | with open(data_path+"leak_uuid_doc.csv") as infile: 294 | doc = csv.reader(infile) 295 | doc.next() 296 | leak_uuid_dict = {} 297 | for ind, row in enumerate(doc): 298 | doc_id = int(row[0]) 299 | leak_uuid_dict[doc_id] = set(row[1].split(' ')) 300 | if ind%100000==0: 301 | print("Leakage file : ", ind) 302 | print(len(leak_uuid_dict)) 303 | del doc 304 | """ 305 | 306 | # start training 307 | for e in range(epoch): 308 | loss = 0. 
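# --- Editor's note (added): `loss` and `count` below accumulate the bounded
# logloss over held-out rows only (the holdafter/holdout branch further down),
# so loss/count is a running estimate of validation logloss. In this version
# holdafter and holdout are both None and loss is never printed, so every row is
# used for training; to monitor validation one could, for example, set
# holdout = 100 and print loss/count alongside the progress message (an editor
# suggestion, not part of the original).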
309 | count = 0 310 | date = 0 311 | 312 | for t, disp_id, ad_id, x, y in data(train, D): # data is a generator 313 | # t: just a instance counter 314 | # date: you know what this is 315 | # ID: id provided in original data 316 | # x: features 317 | # y: label (click) 318 | 319 | # step 1, get prediction from learner 320 | p = learner.predict(x) 321 | 322 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 323 | # step 2-1, calculate validation loss 324 | # we do not train with the validation data so that our 325 | # validation loss is an accurate estimation 326 | # 327 | # holdafter: train instances from day 1 to day N 328 | # validate with instances from day N + 1 and after 329 | # 330 | # holdout: validate with every N instance, train with others 331 | loss += logloss(p, y) 332 | count += 1 333 | else: 334 | # step 2-2, update learner with label (click) information 335 | learner.update(x, p, y) 336 | 337 | if t%1000000 == 0: 338 | print("Processed : ", t, datetime.now()) 339 | 340 | 341 | 342 | ############################################################################## 343 | # start testing, and build Kaggle's submission file ########################## 344 | ############################################################################## 345 | 346 | with open(submission, 'w') as outfile: 347 | outfile.write('display_id,ad_id,clicked\n') 348 | for t, disp_id, ad_id, x, y in data(test, D): 349 | p = learner.predict(x) 350 | outfile.write('%s,%s,%s\n' % (disp_id, ad_id, str(p))) 351 | if t%1000000 == 0: 352 | print("Processed : ", t, datetime.now()) 353 | -------------------------------------------------------------------------------- /OutBrain/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Kaggle competition - [Outbrain Click Prediction](https://www.kaggle.com/c/outbrain-click-prediction) 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle 2 | This repository contains the codes that has been written for Kaggle competitions 3 | 4 | Kaggle Profile : http://www.kaggle.com/sudalairajkumar 5 | -------------------------------------------------------------------------------- /SantanderReco/keras_starter_kaggle.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | 6 | from keras.models import Sequential 7 | from keras.layers.core import Dense, Activation, Merge, Reshape, Dropout 8 | from keras.layers.embeddings import Embedding 9 | from keras.optimizers import SGD 10 | from keras.layers.normalization import BatchNormalization 11 | import cPickle as pkl 12 | 13 | np.random.seed(12345) 14 | 15 | # mapping dict to map the categories to numerical values # 16 | mapping_dict = { 17 | 'ind_empleado' : {'N':0, -99:1, 'B':2, 'F':3, 'A':4, 'S':5}, 18 | 'sexo' : {'V':0, 'H':1, -99:2}, 19 | 'ind_nuevo' : {0.0:0, 1.0:1, -99.0:2}, 20 | 'indrel' : {1.0:0, 99.0:1, -99.0:2}, 21 | 'indrel_1mes' : {-99:0, 1.0:1, 1:1, 2.0:2, 2:2, 3.0:3, 3:3, 4.0:4, 4:4, 'P':5}, 22 | 'tiprel_1mes' : {-99:0, 'I':1, 'A':2, 'P':3, 'R':4, 'N':5}, 23 | 'indresi' : {-99:0, 'S':1, 'N':2}, 24 | 'indext' : {-99:0, 'S':1, 'N':2}, 25 | 'conyuemp' : {-99:0, 'S':1, 'N':2}, 26 | 'indfall' : {-99:0, 'S':1, 'N':2}, 27 | 'tipodom' : {-99.0:0, 1.0:1}, 28 | 'ind_actividad_cliente' : {0.0:0, 1.0:1, 
-99.0:2}, 29 | 'segmento' : {'02 - PARTICULARES':0, '03 - UNIVERSITARIO':1, '01 - TOP':2, -99:2}, 30 | 'pais_residencia' : {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62, 'JP': 82, 'JM': 116, 'BR': 17, 'BY': 64, 'BZ': 113, 'RU': 43, 'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73, 'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96, 'GH': 88, 'OM': 100, 'HR': 67, 'HU': 106, 'HK': 34, 'HN': 22, 'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20, 'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75, 'EC': 19, 'AL': 25, 'VN': 90, 'ET': 54, 'ZW': 114, 'ES': 0, 'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118, 'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38, 'FI': 23, 'NI': 33, 'NL': 7, 'NO': 46, 'NG': 83, 'NZ': 93, 'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4, 'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32, 'CU': 72, 'KE': 65, 'KH': 95, 'SV': 53, 'SK': 69, 'KR': 87, 'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66, 'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80, 'MK': 105, -99: 1, 'LB': 81, 'TW': 29, 'TR': 70, 'TN': 85, 'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37, 'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63, 'AT': 6, 'IN': 31, 'IE': 5, 'QA': 58, 'MZ': 27}, 31 | 'canal_entrada' : {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161, 'KHS': 162, 'KHK': 10, 'KHL': 0, 'KHM': 12, 'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9, 'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159, 'KAC': 57, 'KAB': 28, 'KAA': 39, 'KAG': 26, 'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51, 'KAJ': 41, 'KAI': 35, 'KAH': 31, 'KAO': 94, 'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70, 'KAR': 32, 'KAQ': 37, 'KAP': 46, 'KAW': 76, 'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7, 'KAY': 54, 'KBJ': 133, 'KBH': 90, 'KBN': 122, 'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131, 'KBF': 102, 'KBG': 17, 'KBD': 109, 'KBE': 119, 'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101, 'KBS': 118, 'KBP': 121, 'KBQ': 62, 'KBV': 100, 'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85, 'KCG': 59, 'KCF': 105, 'KCA': 73, 'KCC': 29, 'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104, 'KCN': 81, 'KCI': 65, 'KCH': 84, 'KCK': 52, 'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106, 'KCQ': 154, 'KCP': 129, 'KCS': 77, 'KCR': 153, 'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130, 'KDN': 151, 'KDO': 60, 'KDH': 14, 'KDI': 150, 'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126, 'KDA': 63, 'KDB': 117, 'KDC': 75, 'KDX': 69, 'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79, 'KDV': 91, 'KDW': 132, 'KDP': 103, 'KDQ': 80, 'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96, 'KEN': 137, 'KEM': 155, 'KEL': 125, 'KEK': 145, 'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136, 'KEF': 128, 'KEE': 152, 'KED': 143, 'KEC': 66, 'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93, 'KEW': 98, 'KEV': 87, 'KEU': 72, 'KES': 68, 'KEQ': 138, -99: 6, 'KFV': 48, 'KFT': 92, 'KFU': 36, 'KFR': 144, 'KFS': 38, 'KFP': 40, 'KFF': 45, 'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146, 'KFC': 4, 'KFA': 3, 'KFN': 42, 'KFL': 34, 'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140, 'KFI': 134, '007': 71, '004': 83, 'KGU': 149, 'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24, 'KGC': 18, 'KGN': 11}, 32 | 'nomprov' : {'ZARAGOZA': 2, 'BURGOS': 11, 'GRANADA': 46, 'MADRID': 18, 'CIUDAD REAL': 1, 'GIRONA': 7, 'TARRAGONA': 50, 'LEON': 4, 'SORIA': 20, 'SANTA CRUZ DE TENERIFE': 48, 'CEUTA': 52, 'HUESCA': 12, 'VALLADOLID': 24, 'LERIDA': 17, 'ZAMORA': 8, 'CUENCA': 31, 'RIOJA, LA': 34, 'TERUEL': 27, 'PONTEVEDRA': 25, 'MELILLA': 49, 'CORDOBA': 44, 'SEVILLA': 21, -99: 
39, 'ALICANTE': 19, 'CASTELLON': 33, 'OURENSE': 29, 'VALENCIA': 26, 'CORU\xc3\x91A, A': 28, 'HUELVA': 45, 'ALBACETE': 35, 'JAEN': 30, 'CADIZ': 38, 'BADAJOZ': 36, 'TOLEDO': 3, 'AVILA': 14, 'BARCELONA': 9, 'SEGOVIA': 15, 'NAVARRA': 13, 'MALAGA': 0, 'SALAMANCA': 10, 'PALENCIA': 42, 'ALMERIA': 40, 'MURCIA': 37, 'GUADALAJARA': 41, 'ASTURIAS': 47, 'BALEARS, ILLES': 23, 'ALAVA': 51, 'LUGO': 16, 'CANTABRIA': 22, 'CACERES': 6, 'PALMAS, LAS': 43, 'GIPUZKOA': 5, 'BIZKAIA': 32, 'CORUNA, A':28} 33 | } 34 | 35 | dtype_list = {'ind_cco_fin_ult1': 'float16', 'ind_deme_fin_ult1': 'float16', 'ind_aval_fin_ult1': 'float16', 'ind_valo_fin_ult1': 'float16', 'ind_reca_fin_ult1': 'float16', 'ind_ctju_fin_ult1': 'float16', 'ind_cder_fin_ult1': 'float16', 'ind_plan_fin_ult1': 'float16', 'ind_fond_fin_ult1': 'float16', 'ind_hip_fin_ult1': 'float16', 'ind_pres_fin_ult1': 'float16', 'ind_nomina_ult1': 'float16', 'ind_cno_fin_ult1': 'float16', 'ncodpers': 'int64', 'ind_ctpp_fin_ult1': 'float16', 'ind_ahor_fin_ult1': 'float16', 'ind_dela_fin_ult1': 'float16', 'ind_ecue_fin_ult1': 'float16', 'ind_nom_pens_ult1': 'float16', 'ind_recibo_ult1': 'float16', 'ind_deco_fin_ult1': 'float16', 'ind_tjcr_fin_ult1': 'float16', 'ind_ctop_fin_ult1': 'float16', 'ind_viv_fin_ult1': 'float16', 'ind_ctma_fin_ult1': 'float16'} 36 | 37 | # categorical columns to use # 38 | cols_to_use = mapping_dict.keys() 39 | print(cols_to_use) 40 | 41 | # target columns to predict # 42 | target_cols = ['ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1','ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1','ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1','ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1','ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1'] 43 | print(target_cols) 44 | 45 | # one hot encode fit for all the categorical variables # 46 | ohes = [] 47 | feat_count = 0 48 | for col in cols_to_use: 49 | ohe = preprocessing.OneHotEncoder() 50 | ohe.fit(np.array(mapping_dict[col].values()).reshape(-1,1)) 51 | feat_count += ohe.n_values_[0] 52 | print(col, feat_count) 53 | ohes.append(ohe) 54 | 55 | 56 | def batch_generator(file_name, batch_size, shuffle, train_input=True): 57 | while (True): 58 | if train_input: 59 | chunked_df = pd.read_csv(file_name, usecols=['ncodpers']+cols_to_use+target_cols, chunksize=batch_size) 60 | else: 61 | chunked_df = pd.read_csv(file_name, usecols=['ncodpers']+cols_to_use, chunksize=batch_size) 62 | 63 | nrows = 0 64 | for chunk_df in chunked_df: 65 | chunk_X = chunk_df[cols_to_use] 66 | chunk_X = chunk_X.fillna(-99) 67 | for col_ind, col in enumerate(cols_to_use): 68 | chunk_X[col] = chunk_X[col].apply(lambda x: mapping_dict[col][x]) 69 | ohe = ohes[col_ind] 70 | temp_X = ohe.transform( np.array(chunk_X[col]).reshape(-1,1) ) 71 | if col_ind == 0: 72 | X = temp_X.todense().copy() 73 | else: 74 | X = np.hstack((X, temp_X.todense())) 75 | 76 | if train_input: 77 | y = np.array(chunk_df[target_cols].fillna(0)) 78 | 79 | if shuffle: 80 | shuffle_index = np.random.shuffle(np.arange(X.shape[0])) 81 | X = X[shuffle_index,:] 82 | if train_input: 83 | y = y[shuffle_index,:] 84 | 85 | 86 | if train_input: 87 | yield X, y 88 | else: 89 | yield X 90 | 91 | nrows += batch_size 92 | if train_input and nrows >= train_size: 93 | break 94 | 95 | 96 | def keras_embedding_model(): 97 | # keras model architecture # 98 | 
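# Architecture note: the layers defined below form a small multi-label network -- a single
# 50-unit ReLU hidden layer on top of the one-hot encoded categorical features, and a
# sigmoid output with one unit per entry in target_cols; trained with binary_crossentropy,
# each of the 24 products is scored as an independent probability rather than a softmax.
# The calls here use the Keras 1.x API; on Keras 2.x the rough equivalents (assumed, not
# part of the original script) are init= -> kernel_initializer=, nb_epoch= -> epochs=,
# samples_per_epoch= -> steps_per_epoch= (counted in batches), and predict_generator's
# val_samples= -> steps= (number of batches rather than samples).
# Also note that in batch_generator above, np.random.shuffle returns None, so the
# shuffle=True branch does not yield a usable index; the script only ever calls the
# generator with shuffle=False, so that branch is never exercised.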
final_model = Sequential() 99 | final_model.add(Dense(50, input_dim=feat_count, init='he_uniform')) 100 | final_model.add(Activation('relu')) 101 | final_model.add(Dense(len(target_cols), init='zero')) 102 | final_model.add(Activation('sigmoid')) 103 | final_model.compile(loss='binary_crossentropy', optimizer='adam') 104 | return final_model 105 | 106 | if __name__ == "__main__": 107 | train = "../input/train_ver2.csv" 108 | test = "../input/test_ver2.csv" 109 | #train_size = 13647309 110 | train_size = 1000000 111 | test_size = 929615 112 | print("Initialize the model..") 113 | model = keras_embedding_model() 114 | print("Model fit..") 115 | fit= model.fit_generator( 116 | generator = batch_generator(train, 500, False), 117 | nb_epoch = 1, 118 | samples_per_epoch = train_size 119 | ) 120 | preds = model.predict_generator(generator=batch_generator(test, 10000, False, False), val_samples=test_size) 121 | print("Predictions : ", preds.shape) 122 | 123 | last_instance_df = pd.read_csv(train, usecols=['ncodpers']+target_cols, dtype=dtype_list) 124 | last_instance_df = last_instance_df.drop_duplicates('ncodpers', keep='last') 125 | last_instance_df = last_instance_df.fillna(0).astype('int') 126 | cust_dict = {} 127 | target_cols = np.array(target_cols) 128 | for ind, row in last_instance_df.iterrows(): 129 | cust = row['ncodpers'] 130 | used_products = set(target_cols[np.array(row[1:])==1]) 131 | cust_dict[cust] = used_products 132 | del last_instance_df 133 | 134 | target_cols = np.array(target_cols) 135 | preds = np.argsort(preds, axis=1) 136 | preds = np.fliplr(preds) 137 | test_id = np.array(pd.read_csv(test, usecols=['ncodpers'])['ncodpers']) 138 | final_preds = [] 139 | for ind, pred in enumerate(preds): 140 | cust = test_id[ind] 141 | top_products = target_cols[pred] 142 | used_products = cust_dict.get(cust,[]) 143 | new_top_products = [] 144 | for product in top_products: 145 | if product not in used_products: 146 | new_top_products.append(product) 147 | if len(new_top_products) == 7: 148 | break 149 | final_preds.append(" ".join(new_top_products)) 150 | out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds}) 151 | out_df.to_csv('sub_keras.csv', index=False) 152 | 153 | 154 | -------------------------------------------------------------------------------- /SantanderReco/multilabel_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import cPickle as pkl 4 | from sklearn import preprocessing, ensemble 5 | 6 | # columns to be used as features # 7 | #feature_cols = ["ind_empleado","pais_residencia","sexo","age","ind_nuevo","antiguedad","indrel","ult_fec_cli_1t","indrel_1mes","tiprel_1mes","indresi","indext","conyuemp","canal_entrada","indfall","tipodom","cod_prov","nomprov","ind_actividad_cliente","renta","segmento"] 8 | feature_cols = ["ind_empleado","pais_residencia","sexo","age", "ind_nuevo", "antiguedad", "nomprov", "segmento", "ind_actividad_cliente", "indresi"] 9 | feature_cols = ["ind_empleado","pais_residencia","sexo","age"] 10 | feature_cols = ["ind_empleado","pais_residencia"] 11 | 12 | target_cols = 
['ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1','ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1','ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1','ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1','ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1'] 13 | 14 | dtype_list = {'ind_cco_fin_ult1': 'float16', 'ind_deme_fin_ult1': 'float16', 'ind_aval_fin_ult1': 'float16', 'ind_valo_fin_ult1': 'float16', 'ind_reca_fin_ult1': 'float16', 'ind_ctju_fin_ult1': 'float16', 'ind_cder_fin_ult1': 'float16', 'ind_plan_fin_ult1': 'float16', 'ind_fond_fin_ult1': 'float16', 'ind_hip_fin_ult1': 'float16', 'ind_pres_fin_ult1': 'float16', 'ind_nomina_ult1': 'float16', 'ind_cno_fin_ult1': 'float16', 'ncodpers': 'int64', 'ind_ctpp_fin_ult1': 'float16', 'ind_ahor_fin_ult1': 'float16', 'ind_dela_fin_ult1': 'float16', 'ind_ecue_fin_ult1': 'float16', 'ind_nom_pens_ult1': 'float16', 'ind_recibo_ult1': 'float16', 'ind_deco_fin_ult1': 'float16', 'ind_tjcr_fin_ult1': 'float16', 'ind_ctop_fin_ult1': 'float16', 'ind_viv_fin_ult1': 'float16', 'ind_ctma_fin_ult1': 'float16'} 15 | 16 | if __name__ == "__main__": 17 | data_path = "../input/" 18 | train_file = data_path + "train_ver2.csv" 19 | test_file = data_path + "test_ver2.csv" 20 | train_size = 13647309 21 | nrows = 1000000 # change this value to read more rows from train 22 | 23 | start_index = train_size - nrows 24 | for ind, col in enumerate(feature_cols): 25 | print(col) 26 | train = pd.read_csv(train_file, usecols=[col]) 27 | test = pd.read_csv(test_file, usecols=[col]) 28 | train.fillna(-99, inplace=True) 29 | test.fillna(-99, inplace=True) 30 | if train[col].dtype == "object": 31 | le = preprocessing.LabelEncoder() 32 | le.fit(list(train[col].values) + list(test[col].values)) 33 | temp_train_X = le.transform(list(train[col].values)).reshape(-1,1)[start_index:,:] 34 | temp_test_X = le.transform(list(test[col].values)).reshape(-1,1) 35 | else: 36 | temp_train_X = np.array(train[col]).reshape(-1,1)[start_index:,:] 37 | temp_test_X = np.array(test[col]).reshape(-1,1) 38 | if ind == 0: 39 | train_X = temp_train_X.copy() 40 | test_X = temp_test_X.copy() 41 | else: 42 | train_X = np.hstack([train_X, temp_train_X]) 43 | test_X = np.hstack([test_X, temp_test_X]) 44 | print(train_X.shape, test_X.shape) 45 | del train 46 | del test 47 | 48 | train_y = pd.read_csv(train_file, usecols=['ncodpers']+target_cols, dtype=dtype_list) 49 | last_instance_df = train_y.drop_duplicates('ncodpers', keep='last') 50 | train_y = np.array(train_y.fillna(0)).astype('int')[start_index:,1:] 51 | print(train_X.shape, train_y.shape) 52 | print(test_X.shape) 53 | 54 | print("Running Model..") 55 | model = ensemble.RandomForestClassifier(n_estimators=5, max_depth=10, min_samples_leaf=10, n_jobs=4, random_state=2016) 56 | model.fit(train_X, train_y) 57 | del train_X, train_y 58 | print("Predicting..") 59 | preds = np.array(model.predict_proba(test_X))[:,:,1].T 60 | del test_X 61 | #print preds.shape 62 | 63 | print("Getting last instance dict..") 64 | last_instance_df = last_instance_df.fillna(0).astype('int') 65 | cust_dict = {} 66 | target_cols = np.array(target_cols) 67 | for ind, row in last_instance_df.iterrows(): 68 | cust = row['ncodpers'] 69 | used_products = set(target_cols[np.array(row[1:])==1]) 70 | cust_dict[cust] = used_products 71 | del last_instance_df 72 | print row 
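# cust_dict maps each customer id (ncodpers) to the set of products flagged 1 in that
# customer's most recent training row, e.g. {<ncodpers>: {'ind_cco_fin_ult1', ...}}
# (illustrative values); it is used further down to drop products a customer already
# owns before picking the top-7 recommendations. The bare print statements just above
# and below are Python 2 style debug output for the last processed row and customer.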
73 | print cust, cust_dict[cust] 74 | 75 | print("Creating submission..") 76 | preds = np.argsort(preds, axis=1) 77 | preds = np.fliplr(preds) 78 | #print preds.shape 79 | test_id = np.array(pd.read_csv(test_file, usecols=['ncodpers'])['ncodpers']) 80 | final_preds = [] 81 | for ind, pred in enumerate(preds): 82 | cust = test_id[ind] 83 | top_products = target_cols[pred] 84 | used_products = cust_dict.get(cust,[]) 85 | print cust, used_products 86 | new_top_products = [] 87 | for product in top_products: 88 | if product not in used_products: 89 | new_top_products.append(product) 90 | if len(new_top_products) == 7: 91 | break 92 | final_preds.append(" ".join(new_top_products)) 93 | out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds}) 94 | out_df.to_csv('sub_rf.csv', index=False) 95 | 96 | 97 | -------------------------------------------------------------------------------- /SantanderReco/readme.md: -------------------------------------------------------------------------------- 1 | Codes for Kaggle competition - [Santander Product recommendation](https://www.kaggle.com/c/santander-product-recommendation) is present in this folder. 2 | -------------------------------------------------------------------------------- /SpookyAuthor/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Kaggle - Spooky Author challenge 2 | -------------------------------------------------------------------------------- /Titanic/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the titanic competition 2 | -------------------------------------------------------------------------------- /TransferLearningStackExchange/frequent_words_model.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | import operator 4 | from collections import defaultdict 5 | 6 | stop_words = set(['a', "a's", 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', "c'mon", "c's", 'came', 'can', "can't", 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', "couldn't", 'course', 'currently', 'd', 'definitely', 'described', 'despite', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', "don't", 'done', 'down', 'downwards', 'during', 'e', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'f', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 
'from', 'further', 'furthermore', 'g', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'h', 'had', "hadn't", 'happens', 'hardly', 'has', "hasn't", 'have', "haven't", 'having', 'he', "he's", 'hello', 'help', 'hence', 'her', 'here', "here's", 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', 'j', 'just', 'k', 'keep', 'keeps', 'kept', 'know', 'knows', 'known', 'l', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'm', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'n', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'o', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'p', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'q', 'que', 'quite', 'qv', 'r', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 't', "t's", 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', "there's", 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', "they'd", "they'll", "they're", "they've", 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'uucp', 'v', 'value', 'various', 'very', 'via', 'viz', 'vs', 'w', 'want', 'wants', 'was', "wasn't", 'way', 'we', "we'd", "we'll", "we're", "we've", 'welcome', 'well', 'went', 'were', "weren't", 'what', "what's", 'whatever', 'when', 'whence', 'whenever', 'where', "where's", 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', "who's", 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 
"won't", 'wonder', 'would', 'would', "wouldn't", 'x', 'y', 'yes', 'yet', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 'z', 'zero', '']) 7 | def f1_score(tp, fp, fn): 8 | p = (tp*1.) / (tp+fp) 9 | r = (tp*1.) / (tp+fn) 10 | f1 = (2*p*r)/(p+r) 11 | return f1 12 | 13 | def clean_html(raw_html): 14 | cleanr = re.compile('<.*?>') 15 | cleantext = re.sub(cleanr, '', raw_html) 16 | return cleantext 17 | 18 | def get_words(text): 19 | word_split = re.compile('[^a-zA-Z0-9_\\+\\-/]') 20 | return [word.strip().lower() for word in word_split.split(text)] 21 | 22 | data_path = "../input/" 23 | in_file = open(data_path+"test.csv") 24 | out_file = open("sub_freq.csv", "w") 25 | reader = csv.DictReader(in_file) 26 | writer = csv.writer(out_file) 27 | writer.writerow(['id','tags']) 28 | for ind, row in enumerate(reader): 29 | text = clean_html(row["title"]) + " " + clean_html(row['content']) 30 | frequency_dict = defaultdict(int) 31 | for word in get_words(text): 32 | if word not in stop_words: 33 | frequency_dict[word] += 1 34 | pred_tags = set(sorted(frequency_dict, key=frequency_dict.get, reverse=True)[:3]) 35 | writer.writerow([row['id'], " ".join(pred_tags)]) 36 | if ind%50000 == 0: 37 | print("Processed : ", ind) 38 | 39 | 40 | in_file.close() 41 | out_file.close() 42 | -------------------------------------------------------------------------------- /TransferLearningStackExchange/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Kaggle competition - Transfer Learning on Stack Exchange Tags is present here. 2 | -------------------------------------------------------------------------------- /TransferLearningStackExchange/simple_exploration_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "_cell_guid": "249cb54e-4588-5b7d-2c01-60bba33d731e" 7 | }, 8 | "source": [ 9 | "Simple exploration notebook " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "_cell_guid": "50ab2790-3a51-98f3-1e8b-33926c862bfd" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "import seaborn as sns\n", 28 | "from wordcloud import WordCloud\n", 29 | "from nltk.corpus import stopwords\n", 30 | "\n", 31 | "%matplotlib inline" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "_cell_guid": "bbda81c2-5282-479c-a931-743437f6d84b" 38 | }, 39 | "source": [ 40 | "**Wordcloud on tags:**\n", 41 | "\n", 42 | "Let us create a word cloud on the tags column for all topics and see the important tags." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "_cell_guid": "24027035-3f69-5988-2c21-a9fd1522ccfb" 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "topics_list = ['biology', 'cooking', 'crypto', 'diy', 'robotics', 'travel']\n", 54 | "\n", 55 | "for ind, topic in enumerate(topics_list):\n", 56 | " tags = np.array(pd.read_csv(\"../input/\"+topic+\".csv\", usecols=['tags'])['tags'])\n", 57 | " text = ''\n", 58 | " for ind, tag in enumerate(tags):\n", 59 | " text = \" \".join([text, tag])\n", 60 | " text = text.strip()\n", 61 | " \n", 62 | " wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text)\n", 63 | " wordcloud.recolor(random_state=ind*312)\n", 64 | " plt.imshow(wordcloud)\n", 65 | " plt.title(\"Wordcloud for topic : \"+topic)\n", 66 | " plt.axis(\"off\")\n", 67 | " plt.show()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "_cell_guid": "c0f04269-7f45-7cf1-5625-13e04733bc6a" 74 | }, 75 | "source": [ 76 | "**Wordcloud for topic Biology:**\n", 77 | " \n", 78 | "Let us take a single topic 'biology' and then see how the word clouds from each of the three fields title, content, tags compare with each other." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "_cell_guid": "77bb82f3-5925-34e0-7c2c-903ae0ec20d4" 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "bio = pd.read_csv(\"../input/biology.csv\")\n", 90 | "title = np.array(bio['title'])\n", 91 | "content = np.array(bio['content'])\n", 92 | "tags = np.array(bio['tags'])\n", 93 | "\n", 94 | "# wordcloud for tags #\n", 95 | "text = ''\n", 96 | "for ind, tag in enumerate(tags):\n", 97 | " text = \" \".join([text, tag])\n", 98 | "text = text.strip()\n", 99 | "\n", 100 | "wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=80).generate(text)\n", 101 | "wordcloud.recolor(random_state=218)\n", 102 | "plt.imshow(wordcloud)\n", 103 | "plt.axis(\"off\")\n", 104 | "plt.title(\"Wordcloud on 'tags' for biology \")\n", 105 | "plt.show()\n", 106 | "\n", 107 | "# wordcloud for title #\n", 108 | "text = ''\n", 109 | "for ind, tag in enumerate(title):\n", 110 | " text = \" \".join([text, tag])\n", 111 | "text = text.strip()\n", 112 | "\n", 113 | "stop_words = set(stopwords.words('english') + ['sas', 'ss', 'fas', 'des', 'les', 'ess'])\n", 114 | "wordcloud = WordCloud(background_color='white', width=600, height=300, stopwords=stop_words, max_font_size=50, max_words=80).generate(text)\n", 115 | "wordcloud.recolor(random_state=218)\n", 116 | "plt.imshow(wordcloud)\n", 117 | "plt.axis(\"off\")\n", 118 | "plt.title(\"Wordcloud on 'title' for biology \")\n", 119 | "plt.show()\n", 120 | "\n", 121 | "### Commenting this out for now as it throws error while rendering and not while running it at the backend ###\n", 122 | "## wordcloud for content #\n", 123 | "#text = ''\n", 124 | "#for ind, tag in enumerate(content):\n", 125 | "# text = \" \".join([text, tag])\n", 126 | "#text = text.strip()\n", 127 | "\n", 128 | "#stop_words = set(stopwords.words('english') + ['rbs', 'sas', 'ss', 'fas', 'des', 'ess', 'les', 'bas', 'poses', 'los', 'ros', 'cs'])\n", 129 | "#wordcloud = WordCloud(background_color='white', width=600, height=300, stopwords=stop_words, max_font_size=50, max_words=80).generate(text)\n", 130 | "#wordcloud.recolor(random_state=218)\n", 131 | "#plt.imshow(wordcloud)\n", 132 | "#plt.axis(\"off\")\n", 133 | 
"#plt.title(\"Wordcloud on 'content' for biology \")\n", 134 | "#plt.show()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "_cell_guid": "6948ea6a-2d00-f199-e4ef-a761f640021e" 141 | }, 142 | "source": [ 143 | "As we can see, wordcloud from 'topic' is decent and has some important words related to the topic and can be used for tag creation.\n", 144 | "\n", 145 | "Wordcloud from 'content' has more irrelevant (html) words and so we need to do proper cleaning to remove those before we start with our modeling / learning.\n", 146 | "\n", 147 | "More to come...!" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "_change_revision": 0, 153 | "_is_fork": false, 154 | "kernelspec": { 155 | "display_name": "Python 3", 156 | "language": "python", 157 | "name": "python3" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 3 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython3", 169 | "version": "3.5.2" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 0 174 | } 175 | -------------------------------------------------------------------------------- /TwoSigmaConnect_RentHop/XGBStarterInPython.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "_cell_guid": "6e711393-7a75-17e3-539c-9169c1ae1225" 7 | }, 8 | "source": [ 9 | "It seems the current [high scoring script][1] is written in R using H2O. So let us do one in python using XGBoost. \n", 10 | "\n", 11 | "Thanks to [this script][2] for feature engineering ideas. \n", 12 | "\n", 13 | "We shall start with importing the necessary modules\n", 14 | "\n", 15 | "\n", 16 | " [1]: https://www.kaggle.com/gospursgo/two-sigma-connect-rental-listing-inquiries/h2o-starter-pack/run/835757\n", 17 | " [2]: https://www.kaggle.com/aikinogard/two-sigma-connect-rental-listing-inquiries/random-forest-starter-with-numerical-features" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "_cell_guid": "1952347b-6dc9-b9f1-fa25-94587a2aee77" 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "import sys\n", 30 | "import operator\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from scipy import sparse\n", 34 | "import xgboost as xgb\n", 35 | "from sklearn import model_selection, preprocessing, ensemble\n", 36 | "from sklearn.metrics import log_loss\n", 37 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "_cell_guid": "d7d59f0a-0026-8e33-6236-31637173734f" 44 | }, 45 | "source": [ 46 | "Now let us write a custom function to run the xgboost model." 
47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "_cell_guid": "af6e68af-f7a8-b0ac-c565-1d04818258f9" 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):\n", 58 | " param = {}\n", 59 | " param['objective'] = 'multi:softprob'\n", 60 | " param['eta'] = 0.1\n", 61 | " param['max_depth'] = 6\n", 62 | " param['silent'] = 1\n", 63 | " param['num_class'] = 3\n", 64 | " param['eval_metric'] = \"mlogloss\"\n", 65 | " param['min_child_weight'] = 1\n", 66 | " param['subsample'] = 0.7\n", 67 | " param['colsample_bytree'] = 0.7\n", 68 | " param['seed'] = seed_val\n", 69 | " num_rounds = num_rounds\n", 70 | "\n", 71 | " plst = list(param.items())\n", 72 | " xgtrain = xgb.DMatrix(train_X, label=train_y)\n", 73 | "\n", 74 | " if test_y is not None:\n", 75 | " xgtest = xgb.DMatrix(test_X, label=test_y)\n", 76 | " watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]\n", 77 | " model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)\n", 78 | " else:\n", 79 | " xgtest = xgb.DMatrix(test_X)\n", 80 | " model = xgb.train(plst, xgtrain, num_rounds)\n", 81 | "\n", 82 | " pred_test_y = model.predict(xgtest)\n", 83 | " return pred_test_y, model" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "_cell_guid": "c4a69cea-cb06-5d6a-83b7-16ee8ee241f6" 90 | }, 91 | "source": [ 92 | "Let us read the train and test files and store it." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": { 99 | "_cell_guid": "0108ce34-5e84-7f49-bd6f-6562d60a9082" 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "(49352, 15)\n", 107 | "(74659, 14)\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "data_path = \"../input/\"\n", 113 | "train_file = data_path + \"train.json\"\n", 114 | "test_file = data_path + \"test.json\"\n", 115 | "train_df = pd.read_json(train_file)\n", 116 | "test_df = pd.read_json(test_file)\n", 117 | "print(train_df.shape)\n", 118 | "print(test_df.shape)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "_cell_guid": "2bf65ce4-7375-c8e9-97d5-621736f3338d" 125 | }, 126 | "source": [ 127 | "We do not need any pre-processing for numerical features and so create a list with those features." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 4, 133 | "metadata": { 134 | "_cell_guid": "6462885f-97de-b2d1-2c1a-1958115c4c4d" 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "features_to_use = [\"bathrooms\", \"bedrooms\", \"latitude\", \"longitude\", \"price\"]" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "_cell_guid": "b7670810-6d0b-89d0-629e-f99624421229" 145 | }, 146 | "source": [ 147 | "Now let us create some new features from the given features." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 5, 153 | "metadata": { 154 | "_cell_guid": "e3b81db5-929d-b8b8-141c-1bbb4a5eaaf3" 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# count of photos #\n", 159 | "train_df[\"num_photos\"] = train_df[\"photos\"].apply(len)\n", 160 | "test_df[\"num_photos\"] = test_df[\"photos\"].apply(len)\n", 161 | "\n", 162 | "# count of \"features\" #\n", 163 | "train_df[\"num_features\"] = train_df[\"features\"].apply(len)\n", 164 | "test_df[\"num_features\"] = test_df[\"features\"].apply(len)\n", 165 | "\n", 166 | "# count of words present in description column #\n", 167 | "train_df[\"num_description_words\"] = train_df[\"description\"].apply(lambda x: len(x.split(\" \")))\n", 168 | "test_df[\"num_description_words\"] = test_df[\"description\"].apply(lambda x: len(x.split(\" \")))\n", 169 | "\n", 170 | "# convert the created column to datetime object so as to extract more features \n", 171 | "train_df[\"created\"] = pd.to_datetime(train_df[\"created\"])\n", 172 | "test_df[\"created\"] = pd.to_datetime(test_df[\"created\"])\n", 173 | "\n", 174 | "# Let us extract some features like year, month, day, hour from date columns #\n", 175 | "train_df[\"created_year\"] = train_df[\"created\"].dt.year\n", 176 | "test_df[\"created_year\"] = test_df[\"created\"].dt.year\n", 177 | "train_df[\"created_month\"] = train_df[\"created\"].dt.month\n", 178 | "test_df[\"created_month\"] = test_df[\"created\"].dt.month\n", 179 | "train_df[\"created_day\"] = train_df[\"created\"].dt.day\n", 180 | "test_df[\"created_day\"] = test_df[\"created\"].dt.day\n", 181 | "train_df[\"created_hour\"] = train_df[\"created\"].dt.hour\n", 182 | "test_df[\"created_hour\"] = test_df[\"created\"].dt.hour\n", 183 | "\n", 184 | "# adding all these new features to use list #\n", 185 | "features_to_use.extend([\"num_photos\", \"num_features\", \"num_description_words\",\"created_year\", \"created_month\", \"created_day\", \"listing_id\", \"created_hour\"])" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "_cell_guid": "3d9aa966-66a2-8ff8-2459-40e0187418a2" 192 | }, 193 | "source": [ 194 | "We have 4 categorical features in our data\n", 195 | "\n", 196 | " - display_address\n", 197 | " - manager_id\n", 198 | " - building_id\n", 199 | " - street_address\n", 200 | "\n", 201 | "So let us label encode these features." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 6, 207 | "metadata": { 208 | "_cell_guid": "af410ae2-6197-adce-ee68-360aa59eff7e" 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "categorical = [\"display_address\", \"manager_id\", \"building_id\", \"street_address\"]\n", 213 | "for f in categorical:\n", 214 | " if train_df[f].dtype=='object':\n", 215 | " #print(f)\n", 216 | " lbl = preprocessing.LabelEncoder()\n", 217 | " lbl.fit(list(train_df[f].values) + list(test_df[f].values))\n", 218 | " train_df[f] = lbl.transform(list(train_df[f].values))\n", 219 | " test_df[f] = lbl.transform(list(test_df[f].values))\n", 220 | " features_to_use.append(f)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "_cell_guid": "3f550f0f-0c6f-2432-2c07-d507632eaa2b" 227 | }, 228 | "source": [ 229 | "We have a 'features' column which is a list of string values. So we can first combine all the strings together to get a single string and then apply a count vectorizer on top of it."
230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 7, 235 | "metadata": { 236 | "_cell_guid": "d1ea3504-a12c-023a-bce6-d4f93ddb8019" 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "10 \n", 244 | "10000 Doorman Elevator Fitness_Center Cats_Allowed D...\n", 245 | "100004 Laundry_In_Building Dishwasher Hardwood_Floors...\n", 246 | "100007 Hardwood_Floors No_Fee\n", 247 | "100013 Pre-War\n", 248 | "Name: features, dtype: object\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "train_df['features'] = train_df[\"features\"].apply(lambda x: \" \".join([\"_\".join(i.split(\" \")) for i in x]))\n", 254 | "test_df['features'] = test_df[\"features\"].apply(lambda x: \" \".join([\"_\".join(i.split(\" \")) for i in x]))\n", 255 | "print(train_df[\"features\"].head())\n", 256 | "tfidf = CountVectorizer(stop_words='english', max_features=200)\n", 257 | "tr_sparse = tfidf.fit_transform(train_df[\"features\"])\n", 258 | "te_sparse = tfidf.transform(test_df[\"features\"])" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "_cell_guid": "2bfbcacc-e821-654b-f2b3-cda0f1a5a20b" 265 | }, 266 | "source": [ 267 | "Now let us stack both the dense and sparse features into a single dataset and also get the target variable." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 8, 273 | "metadata": { 274 | "_cell_guid": "9eeef912-2104-e97e-1948-c246652340e1" 275 | }, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "(49352, 217) (74659, 217)\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()\n", 287 | "test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()\n", 288 | "\n", 289 | "target_num_map = {'high':0, 'medium':1, 'low':2}\n", 290 | "train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))\n", 291 | "print(train_X.shape, test_X.shape)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "_cell_guid": "7d2e5fb7-7886-68b1-326f-6db491215001" 298 | }, 299 | "source": [ 300 | "Now let us do some cross validation to check the scores. \n", 301 | "\n", 302 | "Please run it in local to get the cv scores. I am commenting it out here for time." 
303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 9, 308 | "metadata": { 309 | "_cell_guid": "13fd60b9-a8b5-c76f-1fbd-2a56219da0d2" 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "[0]\ttrain-mlogloss:1.04114\ttest-mlogloss:1.04219\n", 317 | "Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.\n", 318 | "\n", 319 | "Will train until test-mlogloss hasn't improved in 20 rounds.\n", 320 | "[1]\ttrain-mlogloss:0.988799\ttest-mlogloss:0.990721\n", 321 | "[2]\ttrain-mlogloss:0.944048\ttest-mlogloss:0.94691\n", 322 | "[3]\ttrain-mlogloss:0.90518\ttest-mlogloss:0.908812\n", 323 | "[4]\ttrain-mlogloss:0.8718\ttest-mlogloss:0.876215\n", 324 | "[5]\ttrain-mlogloss:0.841498\ttest-mlogloss:0.847057\n", 325 | "[6]\ttrain-mlogloss:0.815614\ttest-mlogloss:0.821795\n", 326 | "[7]\ttrain-mlogloss:0.79312\ttest-mlogloss:0.799993\n", 327 | "[8]\ttrain-mlogloss:0.773194\ttest-mlogloss:0.780815\n", 328 | "[9]\ttrain-mlogloss:0.754598\ttest-mlogloss:0.763247\n", 329 | "[10]\ttrain-mlogloss:0.738162\ttest-mlogloss:0.747594\n", 330 | "[11]\ttrain-mlogloss:0.724634\ttest-mlogloss:0.734739\n", 331 | "[12]\ttrain-mlogloss:0.711331\ttest-mlogloss:0.722318\n", 332 | "[13]\ttrain-mlogloss:0.699821\ttest-mlogloss:0.711481\n", 333 | "[14]\ttrain-mlogloss:0.689142\ttest-mlogloss:0.701381\n", 334 | "[15]\ttrain-mlogloss:0.678446\ttest-mlogloss:0.691482\n", 335 | "[16]\ttrain-mlogloss:0.669268\ttest-mlogloss:0.683158\n", 336 | "[17]\ttrain-mlogloss:0.66185\ttest-mlogloss:0.67647\n", 337 | "[18]\ttrain-mlogloss:0.654386\ttest-mlogloss:0.669772\n", 338 | "[19]\ttrain-mlogloss:0.648071\ttest-mlogloss:0.664241\n", 339 | "[20]\ttrain-mlogloss:0.642589\ttest-mlogloss:0.659292\n", 340 | "[21]\ttrain-mlogloss:0.637133\ttest-mlogloss:0.654492\n", 341 | "[22]\ttrain-mlogloss:0.632064\ttest-mlogloss:0.650024\n", 342 | "[23]\ttrain-mlogloss:0.627592\ttest-mlogloss:0.646221\n", 343 | "[24]\ttrain-mlogloss:0.622447\ttest-mlogloss:0.641828\n", 344 | "[25]\ttrain-mlogloss:0.618027\ttest-mlogloss:0.638092\n", 345 | "[26]\ttrain-mlogloss:0.614181\ttest-mlogloss:0.635053\n", 346 | "[27]\ttrain-mlogloss:0.61114\ttest-mlogloss:0.632717\n", 347 | "[28]\ttrain-mlogloss:0.607278\ttest-mlogloss:0.629888\n", 348 | "[29]\ttrain-mlogloss:0.603595\ttest-mlogloss:0.627116\n", 349 | "[30]\ttrain-mlogloss:0.600566\ttest-mlogloss:0.624912\n", 350 | "[31]\ttrain-mlogloss:0.597396\ttest-mlogloss:0.622441\n", 351 | "[32]\ttrain-mlogloss:0.594581\ttest-mlogloss:0.620373\n", 352 | "[33]\ttrain-mlogloss:0.591807\ttest-mlogloss:0.618497\n", 353 | "[34]\ttrain-mlogloss:0.589131\ttest-mlogloss:0.616384\n", 354 | "[35]\ttrain-mlogloss:0.586585\ttest-mlogloss:0.614496\n", 355 | "[36]\ttrain-mlogloss:0.583978\ttest-mlogloss:0.612716\n", 356 | "[37]\ttrain-mlogloss:0.582015\ttest-mlogloss:0.611317\n", 357 | "[38]\ttrain-mlogloss:0.579514\ttest-mlogloss:0.609588\n", 358 | "[39]\ttrain-mlogloss:0.576912\ttest-mlogloss:0.607814\n", 359 | "[40]\ttrain-mlogloss:0.574746\ttest-mlogloss:0.606454\n", 360 | "[41]\ttrain-mlogloss:0.572975\ttest-mlogloss:0.605284\n", 361 | "[42]\ttrain-mlogloss:0.570366\ttest-mlogloss:0.603354\n", 362 | "[43]\ttrain-mlogloss:0.568138\ttest-mlogloss:0.602107\n", 363 | "[44]\ttrain-mlogloss:0.565862\ttest-mlogloss:0.600475\n", 364 | "[45]\ttrain-mlogloss:0.564646\ttest-mlogloss:0.599563\n", 365 | "[46]\ttrain-mlogloss:0.562649\ttest-mlogloss:0.598221\n", 366 | "[47]\ttrain-mlogloss:0.560823\ttest-mlogloss:0.597094\n", 367 
| "[48]\ttrain-mlogloss:0.559184\ttest-mlogloss:0.596101\n", 368 | "[49]\ttrain-mlogloss:0.557642\ttest-mlogloss:0.595268\n", 369 | "[50]\ttrain-mlogloss:0.555695\ttest-mlogloss:0.594217\n", 370 | "[51]\ttrain-mlogloss:0.553391\ttest-mlogloss:0.593256\n", 371 | "[52]\ttrain-mlogloss:0.551141\ttest-mlogloss:0.592129\n", 372 | "[53]\ttrain-mlogloss:0.549666\ttest-mlogloss:0.591489\n", 373 | "[54]\ttrain-mlogloss:0.547321\ttest-mlogloss:0.590389\n", 374 | "[55]\ttrain-mlogloss:0.546197\ttest-mlogloss:0.589846\n", 375 | "[56]\ttrain-mlogloss:0.544658\ttest-mlogloss:0.589096\n", 376 | "[57]\ttrain-mlogloss:0.543389\ttest-mlogloss:0.588546\n", 377 | "[58]\ttrain-mlogloss:0.541408\ttest-mlogloss:0.58737\n", 378 | "[59]\ttrain-mlogloss:0.540229\ttest-mlogloss:0.586951\n", 379 | "[60]\ttrain-mlogloss:0.538715\ttest-mlogloss:0.58633\n", 380 | "[61]\ttrain-mlogloss:0.537227\ttest-mlogloss:0.585638\n", 381 | "[62]\ttrain-mlogloss:0.535932\ttest-mlogloss:0.585132\n", 382 | "[63]\ttrain-mlogloss:0.534624\ttest-mlogloss:0.584407\n", 383 | "[64]\ttrain-mlogloss:0.533186\ttest-mlogloss:0.58367\n", 384 | "[65]\ttrain-mlogloss:0.531767\ttest-mlogloss:0.582788\n", 385 | "[66]\ttrain-mlogloss:0.530367\ttest-mlogloss:0.582063\n", 386 | "[67]\ttrain-mlogloss:0.529023\ttest-mlogloss:0.581331\n", 387 | "[68]\ttrain-mlogloss:0.527781\ttest-mlogloss:0.58068\n", 388 | "[69]\ttrain-mlogloss:0.526511\ttest-mlogloss:0.580342\n", 389 | "[70]\ttrain-mlogloss:0.525392\ttest-mlogloss:0.579888\n", 390 | "[71]\ttrain-mlogloss:0.52422\ttest-mlogloss:0.579319\n", 391 | "[72]\ttrain-mlogloss:0.523065\ttest-mlogloss:0.578852\n", 392 | "[73]\ttrain-mlogloss:0.522163\ttest-mlogloss:0.578434\n", 393 | "[74]\ttrain-mlogloss:0.520843\ttest-mlogloss:0.577687\n", 394 | "[75]\ttrain-mlogloss:0.520055\ttest-mlogloss:0.577254\n", 395 | "[76]\ttrain-mlogloss:0.519149\ttest-mlogloss:0.576857\n", 396 | "[77]\ttrain-mlogloss:0.517909\ttest-mlogloss:0.57638\n", 397 | "[78]\ttrain-mlogloss:0.516506\ttest-mlogloss:0.575721\n", 398 | "[79]\ttrain-mlogloss:0.515361\ttest-mlogloss:0.575472\n", 399 | "[80]\ttrain-mlogloss:0.514641\ttest-mlogloss:0.575183\n", 400 | "[81]\ttrain-mlogloss:0.513579\ttest-mlogloss:0.574743\n", 401 | "[82]\ttrain-mlogloss:0.512622\ttest-mlogloss:0.574371\n", 402 | "[83]\ttrain-mlogloss:0.511446\ttest-mlogloss:0.574089\n", 403 | "[84]\ttrain-mlogloss:0.510372\ttest-mlogloss:0.573719\n", 404 | "[85]\ttrain-mlogloss:0.509183\ttest-mlogloss:0.573575\n", 405 | "[86]\ttrain-mlogloss:0.508148\ttest-mlogloss:0.573277\n", 406 | "[87]\ttrain-mlogloss:0.50706\ttest-mlogloss:0.572957\n", 407 | "[88]\ttrain-mlogloss:0.50622\ttest-mlogloss:0.572635\n", 408 | "[89]\ttrain-mlogloss:0.505219\ttest-mlogloss:0.572276\n", 409 | "[90]\ttrain-mlogloss:0.504375\ttest-mlogloss:0.571933\n", 410 | "[91]\ttrain-mlogloss:0.503762\ttest-mlogloss:0.571746\n", 411 | "[92]\ttrain-mlogloss:0.502992\ttest-mlogloss:0.571413\n", 412 | "[93]\ttrain-mlogloss:0.502076\ttest-mlogloss:0.571129\n", 413 | "[94]\ttrain-mlogloss:0.500902\ttest-mlogloss:0.570822\n", 414 | "[95]\ttrain-mlogloss:0.500169\ttest-mlogloss:0.570567\n", 415 | "[96]\ttrain-mlogloss:0.499278\ttest-mlogloss:0.570131\n", 416 | "[97]\ttrain-mlogloss:0.498181\ttest-mlogloss:0.569639\n", 417 | "[98]\ttrain-mlogloss:0.497191\ttest-mlogloss:0.569336\n", 418 | "[99]\ttrain-mlogloss:0.496139\ttest-mlogloss:0.569146\n", 419 | "[100]\ttrain-mlogloss:0.495544\ttest-mlogloss:0.56896\n", 420 | "[101]\ttrain-mlogloss:0.494762\ttest-mlogloss:0.568668\n", 421 | 
"[102]\ttrain-mlogloss:0.493763\ttest-mlogloss:0.568456\n", 422 | "[103]\ttrain-mlogloss:0.492945\ttest-mlogloss:0.568271\n", 423 | "[104]\ttrain-mlogloss:0.491708\ttest-mlogloss:0.567905\n", 424 | "[105]\ttrain-mlogloss:0.490897\ttest-mlogloss:0.567701\n", 425 | "[106]\ttrain-mlogloss:0.490114\ttest-mlogloss:0.567514\n", 426 | "[107]\ttrain-mlogloss:0.48894\ttest-mlogloss:0.567149\n", 427 | "[108]\ttrain-mlogloss:0.488131\ttest-mlogloss:0.566846\n", 428 | "[109]\ttrain-mlogloss:0.487414\ttest-mlogloss:0.566577\n", 429 | "[110]\ttrain-mlogloss:0.486545\ttest-mlogloss:0.566364\n", 430 | "[111]\ttrain-mlogloss:0.485623\ttest-mlogloss:0.566043\n", 431 | "[112]\ttrain-mlogloss:0.484816\ttest-mlogloss:0.565925\n", 432 | "[113]\ttrain-mlogloss:0.484138\ttest-mlogloss:0.565711\n", 433 | "[114]\ttrain-mlogloss:0.483216\ttest-mlogloss:0.56544\n", 434 | "[115]\ttrain-mlogloss:0.482588\ttest-mlogloss:0.565323\n", 435 | "[116]\ttrain-mlogloss:0.481523\ttest-mlogloss:0.565\n", 436 | "[117]\ttrain-mlogloss:0.48092\ttest-mlogloss:0.564753\n", 437 | "[118]\ttrain-mlogloss:0.480238\ttest-mlogloss:0.564586\n", 438 | "[119]\ttrain-mlogloss:0.47942\ttest-mlogloss:0.564378\n", 439 | "[120]\ttrain-mlogloss:0.478738\ttest-mlogloss:0.564245\n", 440 | "[121]\ttrain-mlogloss:0.478011\ttest-mlogloss:0.56409\n", 441 | "[122]\ttrain-mlogloss:0.476949\ttest-mlogloss:0.56384\n", 442 | "[123]\ttrain-mlogloss:0.476118\ttest-mlogloss:0.563467\n", 443 | "[124]\ttrain-mlogloss:0.475843\ttest-mlogloss:0.563276\n", 444 | "[125]\ttrain-mlogloss:0.474954\ttest-mlogloss:0.562983\n", 445 | "[126]\ttrain-mlogloss:0.474088\ttest-mlogloss:0.562882\n", 446 | "[127]\ttrain-mlogloss:0.473533\ttest-mlogloss:0.562699\n", 447 | "[128]\ttrain-mlogloss:0.472967\ttest-mlogloss:0.562539\n", 448 | "[129]\ttrain-mlogloss:0.472171\ttest-mlogloss:0.562386\n", 449 | "[130]\ttrain-mlogloss:0.471264\ttest-mlogloss:0.562188\n", 450 | "[131]\ttrain-mlogloss:0.470706\ttest-mlogloss:0.562049\n", 451 | "[132]\ttrain-mlogloss:0.469903\ttest-mlogloss:0.561895\n", 452 | "[133]\ttrain-mlogloss:0.469176\ttest-mlogloss:0.561649\n", 453 | "[134]\ttrain-mlogloss:0.468483\ttest-mlogloss:0.561359\n", 454 | "[135]\ttrain-mlogloss:0.467675\ttest-mlogloss:0.561175\n", 455 | "[136]\ttrain-mlogloss:0.466944\ttest-mlogloss:0.560943\n", 456 | "[137]\ttrain-mlogloss:0.466573\ttest-mlogloss:0.560931\n", 457 | "[138]\ttrain-mlogloss:0.465994\ttest-mlogloss:0.560789\n", 458 | "[139]\ttrain-mlogloss:0.465236\ttest-mlogloss:0.560444\n", 459 | "[140]\ttrain-mlogloss:0.464364\ttest-mlogloss:0.560345\n", 460 | "[141]\ttrain-mlogloss:0.463396\ttest-mlogloss:0.560242\n", 461 | "[142]\ttrain-mlogloss:0.46274\ttest-mlogloss:0.560137\n", 462 | "[143]\ttrain-mlogloss:0.462101\ttest-mlogloss:0.55996\n", 463 | "[144]\ttrain-mlogloss:0.461377\ttest-mlogloss:0.559821\n", 464 | "[145]\ttrain-mlogloss:0.460638\ttest-mlogloss:0.559611\n", 465 | "[146]\ttrain-mlogloss:0.459958\ttest-mlogloss:0.559478\n", 466 | "[147]\ttrain-mlogloss:0.459362\ttest-mlogloss:0.559354\n", 467 | "[148]\ttrain-mlogloss:0.458515\ttest-mlogloss:0.559138\n", 468 | "[149]\ttrain-mlogloss:0.457808\ttest-mlogloss:0.559009\n", 469 | "[150]\ttrain-mlogloss:0.45738\ttest-mlogloss:0.558911\n", 470 | "[151]\ttrain-mlogloss:0.456855\ttest-mlogloss:0.55884\n", 471 | "[152]\ttrain-mlogloss:0.456063\ttest-mlogloss:0.558697\n", 472 | "[153]\ttrain-mlogloss:0.455421\ttest-mlogloss:0.558521\n", 473 | "[154]\ttrain-mlogloss:0.454662\ttest-mlogloss:0.558377\n", 474 | "[155]\ttrain-mlogloss:0.454117\ttest-mlogloss:0.558296\n", 475 | 
"[156]\ttrain-mlogloss:0.453326\ttest-mlogloss:0.558084\n", 476 | "[157]\ttrain-mlogloss:0.452753\ttest-mlogloss:0.557905\n", 477 | "[158]\ttrain-mlogloss:0.452359\ttest-mlogloss:0.557868\n", 478 | "[159]\ttrain-mlogloss:0.451707\ttest-mlogloss:0.557636\n", 479 | "[160]\ttrain-mlogloss:0.451068\ttest-mlogloss:0.557454\n", 480 | "[161]\ttrain-mlogloss:0.450408\ttest-mlogloss:0.557361\n", 481 | "[162]\ttrain-mlogloss:0.449685\ttest-mlogloss:0.557289\n", 482 | "[163]\ttrain-mlogloss:0.448961\ttest-mlogloss:0.557146\n", 483 | "[164]\ttrain-mlogloss:0.448501\ttest-mlogloss:0.557029\n", 484 | "[165]\ttrain-mlogloss:0.447691\ttest-mlogloss:0.556853\n", 485 | "[166]\ttrain-mlogloss:0.446992\ttest-mlogloss:0.556806\n", 486 | "[167]\ttrain-mlogloss:0.446296\ttest-mlogloss:0.556598\n", 487 | "[168]\ttrain-mlogloss:0.445686\ttest-mlogloss:0.556577\n", 488 | "[169]\ttrain-mlogloss:0.444956\ttest-mlogloss:0.556382\n", 489 | "[170]\ttrain-mlogloss:0.444435\ttest-mlogloss:0.556329\n", 490 | "[171]\ttrain-mlogloss:0.443592\ttest-mlogloss:0.556008\n", 491 | "[172]\ttrain-mlogloss:0.442805\ttest-mlogloss:0.555822\n", 492 | "[173]\ttrain-mlogloss:0.442412\ttest-mlogloss:0.555704\n", 493 | "[174]\ttrain-mlogloss:0.441773\ttest-mlogloss:0.555605\n", 494 | "[175]\ttrain-mlogloss:0.441135\ttest-mlogloss:0.555466\n", 495 | "[176]\ttrain-mlogloss:0.440742\ttest-mlogloss:0.555388\n", 496 | "[177]\ttrain-mlogloss:0.44027\ttest-mlogloss:0.555334\n", 497 | "[178]\ttrain-mlogloss:0.439462\ttest-mlogloss:0.555133\n", 498 | "[179]\ttrain-mlogloss:0.43881\ttest-mlogloss:0.554992\n", 499 | "[180]\ttrain-mlogloss:0.438174\ttest-mlogloss:0.554753\n", 500 | "[181]\ttrain-mlogloss:0.437383\ttest-mlogloss:0.554644\n", 501 | "[182]\ttrain-mlogloss:0.436838\ttest-mlogloss:0.554575\n", 502 | "[183]\ttrain-mlogloss:0.436125\ttest-mlogloss:0.554404\n", 503 | "[184]\ttrain-mlogloss:0.435588\ttest-mlogloss:0.554327\n", 504 | "[185]\ttrain-mlogloss:0.435114\ttest-mlogloss:0.55427\n", 505 | "[186]\ttrain-mlogloss:0.434355\ttest-mlogloss:0.554231\n", 506 | "[187]\ttrain-mlogloss:0.43382\ttest-mlogloss:0.554011\n", 507 | "[188]\ttrain-mlogloss:0.433208\ttest-mlogloss:0.553862\n", 508 | "[189]\ttrain-mlogloss:0.43253\ttest-mlogloss:0.553751\n", 509 | "[190]\ttrain-mlogloss:0.432027\ttest-mlogloss:0.553633\n", 510 | "[191]\ttrain-mlogloss:0.43148\ttest-mlogloss:0.553609\n", 511 | "[192]\ttrain-mlogloss:0.431025\ttest-mlogloss:0.553599\n", 512 | "[193]\ttrain-mlogloss:0.430441\ttest-mlogloss:0.553502\n", 513 | "[194]\ttrain-mlogloss:0.429787\ttest-mlogloss:0.553418\n", 514 | "[195]\ttrain-mlogloss:0.429262\ttest-mlogloss:0.553465\n", 515 | "[196]\ttrain-mlogloss:0.42865\ttest-mlogloss:0.553342\n", 516 | "[197]\ttrain-mlogloss:0.428045\ttest-mlogloss:0.553264\n", 517 | "[198]\ttrain-mlogloss:0.427341\ttest-mlogloss:0.553197\n", 518 | "[199]\ttrain-mlogloss:0.426563\ttest-mlogloss:0.552965\n", 519 | "[200]\ttrain-mlogloss:0.426066\ttest-mlogloss:0.552906\n", 520 | "[201]\ttrain-mlogloss:0.42541\ttest-mlogloss:0.552713\n", 521 | "[202]\ttrain-mlogloss:0.424861\ttest-mlogloss:0.552693\n", 522 | "[203]\ttrain-mlogloss:0.42421\ttest-mlogloss:0.552601\n", 523 | "[204]\ttrain-mlogloss:0.423567\ttest-mlogloss:0.552647\n", 524 | "[205]\ttrain-mlogloss:0.422962\ttest-mlogloss:0.552553\n", 525 | "[206]\ttrain-mlogloss:0.422326\ttest-mlogloss:0.552551\n", 526 | "[207]\ttrain-mlogloss:0.421518\ttest-mlogloss:0.55258\n", 527 | "[208]\ttrain-mlogloss:0.420897\ttest-mlogloss:0.552612\n", 528 | "[209]\ttrain-mlogloss:0.420392\ttest-mlogloss:0.552503\n", 529 | 
"[210]\ttrain-mlogloss:0.420065\ttest-mlogloss:0.552369\n", 530 | "[211]\ttrain-mlogloss:0.419603\ttest-mlogloss:0.55221\n", 531 | "[212]\ttrain-mlogloss:0.41903\ttest-mlogloss:0.552108\n", 532 | "[213]\ttrain-mlogloss:0.418522\ttest-mlogloss:0.551998\n", 533 | "[214]\ttrain-mlogloss:0.417667\ttest-mlogloss:0.551873\n", 534 | "[215]\ttrain-mlogloss:0.417187\ttest-mlogloss:0.551808\n", 535 | "[216]\ttrain-mlogloss:0.416637\ttest-mlogloss:0.551775\n", 536 | "[217]\ttrain-mlogloss:0.41618\ttest-mlogloss:0.55173\n", 537 | "[218]\ttrain-mlogloss:0.415826\ttest-mlogloss:0.55165\n", 538 | "[219]\ttrain-mlogloss:0.415501\ttest-mlogloss:0.551587\n", 539 | "[220]\ttrain-mlogloss:0.415265\ttest-mlogloss:0.551546\n", 540 | "[221]\ttrain-mlogloss:0.414692\ttest-mlogloss:0.551359\n", 541 | "[222]\ttrain-mlogloss:0.414234\ttest-mlogloss:0.551307\n", 542 | "[223]\ttrain-mlogloss:0.413624\ttest-mlogloss:0.551199\n", 543 | "[224]\ttrain-mlogloss:0.41308\ttest-mlogloss:0.551012\n", 544 | "[225]\ttrain-mlogloss:0.41247\ttest-mlogloss:0.550941\n", 545 | "[226]\ttrain-mlogloss:0.411947\ttest-mlogloss:0.550983\n", 546 | "[227]\ttrain-mlogloss:0.411371\ttest-mlogloss:0.550967\n", 547 | "[228]\ttrain-mlogloss:0.41081\ttest-mlogloss:0.550876\n", 548 | "[229]\ttrain-mlogloss:0.410216\ttest-mlogloss:0.550737\n", 549 | "[230]\ttrain-mlogloss:0.409747\ttest-mlogloss:0.550653\n", 550 | "[231]\ttrain-mlogloss:0.409131\ttest-mlogloss:0.550562\n", 551 | "[232]\ttrain-mlogloss:0.408654\ttest-mlogloss:0.55062\n", 552 | "[233]\ttrain-mlogloss:0.408119\ttest-mlogloss:0.550529\n", 553 | "[234]\ttrain-mlogloss:0.407361\ttest-mlogloss:0.550505\n", 554 | "[235]\ttrain-mlogloss:0.406824\ttest-mlogloss:0.550482\n", 555 | "[236]\ttrain-mlogloss:0.406348\ttest-mlogloss:0.55042\n", 556 | "[237]\ttrain-mlogloss:0.406023\ttest-mlogloss:0.550356\n", 557 | "[238]\ttrain-mlogloss:0.405309\ttest-mlogloss:0.550179\n", 558 | "[239]\ttrain-mlogloss:0.404664\ttest-mlogloss:0.55013\n", 559 | "[240]\ttrain-mlogloss:0.404285\ttest-mlogloss:0.550085\n", 560 | "[241]\ttrain-mlogloss:0.403685\ttest-mlogloss:0.55006\n", 561 | "[242]\ttrain-mlogloss:0.403308\ttest-mlogloss:0.549991\n", 562 | "[243]\ttrain-mlogloss:0.402697\ttest-mlogloss:0.549962\n", 563 | "[244]\ttrain-mlogloss:0.402272\ttest-mlogloss:0.549869\n", 564 | "[245]\ttrain-mlogloss:0.401685\ttest-mlogloss:0.549878\n", 565 | "[246]\ttrain-mlogloss:0.401243\ttest-mlogloss:0.549921\n", 566 | "[247]\ttrain-mlogloss:0.400637\ttest-mlogloss:0.549932\n", 567 | "[248]\ttrain-mlogloss:0.400319\ttest-mlogloss:0.549812\n", 568 | "[249]\ttrain-mlogloss:0.399861\ttest-mlogloss:0.549876\n", 569 | "[250]\ttrain-mlogloss:0.399276\ttest-mlogloss:0.549815\n", 570 | "[251]\ttrain-mlogloss:0.398666\ttest-mlogloss:0.549829\n", 571 | "[252]\ttrain-mlogloss:0.398211\ttest-mlogloss:0.549989\n", 572 | "[253]\ttrain-mlogloss:0.397705\ttest-mlogloss:0.549932\n", 573 | "[254]\ttrain-mlogloss:0.397121\ttest-mlogloss:0.550049\n", 574 | "[255]\ttrain-mlogloss:0.396528\ttest-mlogloss:0.550022\n", 575 | "[256]\ttrain-mlogloss:0.396249\ttest-mlogloss:0.550033\n", 576 | "[257]\ttrain-mlogloss:0.395951\ttest-mlogloss:0.549966\n", 577 | "[258]\ttrain-mlogloss:0.395331\ttest-mlogloss:0.549948\n", 578 | "[259]\ttrain-mlogloss:0.394668\ttest-mlogloss:0.549957\n", 579 | "[260]\ttrain-mlogloss:0.394171\ttest-mlogloss:0.549973\n", 580 | "[261]\ttrain-mlogloss:0.39384\ttest-mlogloss:0.549985\n", 581 | "[262]\ttrain-mlogloss:0.393273\ttest-mlogloss:0.550006\n", 582 | "[263]\ttrain-mlogloss:0.392843\ttest-mlogloss:0.5499\n", 583 | 
"[264]\ttrain-mlogloss:0.392273\ttest-mlogloss:0.549908\n", 584 | "[265]\ttrain-mlogloss:0.391828\ttest-mlogloss:0.549826\n", 585 | "[266]\ttrain-mlogloss:0.391468\ttest-mlogloss:0.549805\n", 586 | "[267]\ttrain-mlogloss:0.390976\ttest-mlogloss:0.549758\n", 587 | "[268]\ttrain-mlogloss:0.390481\ttest-mlogloss:0.549727\n", 588 | "[269]\ttrain-mlogloss:0.390038\ttest-mlogloss:0.549707\n", 589 | "[270]\ttrain-mlogloss:0.389536\ttest-mlogloss:0.549714\n", 590 | "[271]\ttrain-mlogloss:0.388936\ttest-mlogloss:0.549652\n", 591 | "[272]\ttrain-mlogloss:0.388576\ttest-mlogloss:0.549666\n", 592 | "[273]\ttrain-mlogloss:0.388062\ttest-mlogloss:0.549731\n", 593 | "[274]\ttrain-mlogloss:0.387869\ttest-mlogloss:0.549754\n", 594 | "[275]\ttrain-mlogloss:0.387572\ttest-mlogloss:0.549816\n", 595 | "[276]\ttrain-mlogloss:0.387073\ttest-mlogloss:0.549819\n", 596 | "[277]\ttrain-mlogloss:0.386474\ttest-mlogloss:0.54963\n", 597 | "[278]\ttrain-mlogloss:0.385841\ttest-mlogloss:0.549673\n", 598 | "[279]\ttrain-mlogloss:0.385482\ttest-mlogloss:0.549606\n", 599 | "[280]\ttrain-mlogloss:0.385114\ttest-mlogloss:0.549587\n", 600 | "[281]\ttrain-mlogloss:0.384674\ttest-mlogloss:0.54955\n", 601 | "[282]\ttrain-mlogloss:0.384137\ttest-mlogloss:0.549542\n", 602 | "[283]\ttrain-mlogloss:0.38372\ttest-mlogloss:0.549528\n", 603 | "[284]\ttrain-mlogloss:0.383234\ttest-mlogloss:0.549464\n", 604 | "[285]\ttrain-mlogloss:0.38272\ttest-mlogloss:0.549434\n", 605 | "[286]\ttrain-mlogloss:0.382295\ttest-mlogloss:0.549465\n", 606 | "[287]\ttrain-mlogloss:0.381834\ttest-mlogloss:0.549379\n", 607 | "[288]\ttrain-mlogloss:0.38132\ttest-mlogloss:0.54934\n", 608 | "[289]\ttrain-mlogloss:0.380894\ttest-mlogloss:0.549264\n", 609 | "[290]\ttrain-mlogloss:0.380498\ttest-mlogloss:0.549247\n", 610 | "[291]\ttrain-mlogloss:0.380062\ttest-mlogloss:0.549205\n", 611 | "[292]\ttrain-mlogloss:0.37965\ttest-mlogloss:0.549201\n", 612 | "[293]\ttrain-mlogloss:0.379019\ttest-mlogloss:0.549211\n", 613 | "[294]\ttrain-mlogloss:0.378508\ttest-mlogloss:0.549221\n", 614 | "[295]\ttrain-mlogloss:0.378046\ttest-mlogloss:0.549091\n", 615 | "[296]\ttrain-mlogloss:0.377815\ttest-mlogloss:0.549071\n", 616 | "[297]\ttrain-mlogloss:0.377491\ttest-mlogloss:0.549019\n", 617 | "[298]\ttrain-mlogloss:0.377001\ttest-mlogloss:0.549037\n", 618 | "[299]\ttrain-mlogloss:0.376494\ttest-mlogloss:0.549011\n", 619 | "[300]\ttrain-mlogloss:0.376066\ttest-mlogloss:0.548946\n", 620 | "[301]\ttrain-mlogloss:0.375527\ttest-mlogloss:0.548929\n", 621 | "[302]\ttrain-mlogloss:0.375013\ttest-mlogloss:0.54892\n", 622 | "[303]\ttrain-mlogloss:0.374521\ttest-mlogloss:0.549\n", 623 | "[304]\ttrain-mlogloss:0.373935\ttest-mlogloss:0.549171\n", 624 | "[305]\ttrain-mlogloss:0.373428\ttest-mlogloss:0.549223\n", 625 | "[306]\ttrain-mlogloss:0.373039\ttest-mlogloss:0.54916\n", 626 | "[307]\ttrain-mlogloss:0.372686\ttest-mlogloss:0.549035\n", 627 | "[308]\ttrain-mlogloss:0.37216\ttest-mlogloss:0.548995\n", 628 | "[309]\ttrain-mlogloss:0.371648\ttest-mlogloss:0.548941\n", 629 | "[310]\ttrain-mlogloss:0.371155\ttest-mlogloss:0.548814\n", 630 | "[311]\ttrain-mlogloss:0.370729\ttest-mlogloss:0.548765\n", 631 | "[312]\ttrain-mlogloss:0.37032\ttest-mlogloss:0.548888\n", 632 | "[313]\ttrain-mlogloss:0.369891\ttest-mlogloss:0.548985\n", 633 | "[314]\ttrain-mlogloss:0.369316\ttest-mlogloss:0.548926\n", 634 | "[315]\ttrain-mlogloss:0.368816\ttest-mlogloss:0.548971\n", 635 | "[316]\ttrain-mlogloss:0.368333\ttest-mlogloss:0.548876\n", 636 | "[317]\ttrain-mlogloss:0.368004\ttest-mlogloss:0.548885\n", 637 | 
"[318]\ttrain-mlogloss:0.367705\ttest-mlogloss:0.548927\n", 638 | "[319]\ttrain-mlogloss:0.367121\ttest-mlogloss:0.548788\n", 639 | "[320]\ttrain-mlogloss:0.366641\ttest-mlogloss:0.548706\n", 640 | "[321]\ttrain-mlogloss:0.366203\ttest-mlogloss:0.548571\n", 641 | "[322]\ttrain-mlogloss:0.365932\ttest-mlogloss:0.548489\n", 642 | "[323]\ttrain-mlogloss:0.365446\ttest-mlogloss:0.548531\n", 643 | "[324]\ttrain-mlogloss:0.365172\ttest-mlogloss:0.548617\n", 644 | "[325]\ttrain-mlogloss:0.364779\ttest-mlogloss:0.548644\n", 645 | "[326]\ttrain-mlogloss:0.364241\ttest-mlogloss:0.548594\n", 646 | "[327]\ttrain-mlogloss:0.363824\ttest-mlogloss:0.548602\n", 647 | "[328]\ttrain-mlogloss:0.3634\ttest-mlogloss:0.548548\n", 648 | "[329]\ttrain-mlogloss:0.363085\ttest-mlogloss:0.548491\n", 649 | "[330]\ttrain-mlogloss:0.362653\ttest-mlogloss:0.548437\n", 650 | "[331]\ttrain-mlogloss:0.362338\ttest-mlogloss:0.548367\n", 651 | "[332]\ttrain-mlogloss:0.361838\ttest-mlogloss:0.548419\n", 652 | "[333]\ttrain-mlogloss:0.361572\ttest-mlogloss:0.548516\n", 653 | "[334]\ttrain-mlogloss:0.361207\ttest-mlogloss:0.548434\n", 654 | "[335]\ttrain-mlogloss:0.360795\ttest-mlogloss:0.548389\n", 655 | "[336]\ttrain-mlogloss:0.360272\ttest-mlogloss:0.548249\n", 656 | "[337]\ttrain-mlogloss:0.359874\ttest-mlogloss:0.548235\n", 657 | "[338]\ttrain-mlogloss:0.359489\ttest-mlogloss:0.54823\n", 658 | "[339]\ttrain-mlogloss:0.358986\ttest-mlogloss:0.548271\n", 659 | "[340]\ttrain-mlogloss:0.358536\ttest-mlogloss:0.548283\n", 660 | "[341]\ttrain-mlogloss:0.358192\ttest-mlogloss:0.5482\n", 661 | "[342]\ttrain-mlogloss:0.357849\ttest-mlogloss:0.548229\n", 662 | "[343]\ttrain-mlogloss:0.357487\ttest-mlogloss:0.54821\n", 663 | "[344]\ttrain-mlogloss:0.356953\ttest-mlogloss:0.548181\n", 664 | "[345]\ttrain-mlogloss:0.356421\ttest-mlogloss:0.548106\n", 665 | "[346]\ttrain-mlogloss:0.355903\ttest-mlogloss:0.548063\n", 666 | "[347]\ttrain-mlogloss:0.355627\ttest-mlogloss:0.548068\n", 667 | "[348]\ttrain-mlogloss:0.355334\ttest-mlogloss:0.54803\n", 668 | "[349]\ttrain-mlogloss:0.354875\ttest-mlogloss:0.548005\n", 669 | "[350]\ttrain-mlogloss:0.354477\ttest-mlogloss:0.547958\n", 670 | "[351]\ttrain-mlogloss:0.354084\ttest-mlogloss:0.547862\n", 671 | "[352]\ttrain-mlogloss:0.353584\ttest-mlogloss:0.54775\n", 672 | "[353]\ttrain-mlogloss:0.353249\ttest-mlogloss:0.547744\n", 673 | "[354]\ttrain-mlogloss:0.35303\ttest-mlogloss:0.547778\n", 674 | "[355]\ttrain-mlogloss:0.352646\ttest-mlogloss:0.547696\n", 675 | "[356]\ttrain-mlogloss:0.352297\ttest-mlogloss:0.54783\n", 676 | "[357]\ttrain-mlogloss:0.351894\ttest-mlogloss:0.547775\n", 677 | "[358]\ttrain-mlogloss:0.351425\ttest-mlogloss:0.54786\n", 678 | "[359]\ttrain-mlogloss:0.350943\ttest-mlogloss:0.547774\n", 679 | "[360]\ttrain-mlogloss:0.350602\ttest-mlogloss:0.547771\n", 680 | "[361]\ttrain-mlogloss:0.350357\ttest-mlogloss:0.547768\n", 681 | "[362]\ttrain-mlogloss:0.34985\ttest-mlogloss:0.547881\n", 682 | "[363]\ttrain-mlogloss:0.349465\ttest-mlogloss:0.547835\n", 683 | "[364]\ttrain-mlogloss:0.348895\ttest-mlogloss:0.547832\n", 684 | "[365]\ttrain-mlogloss:0.348455\ttest-mlogloss:0.548\n", 685 | "[366]\ttrain-mlogloss:0.348064\ttest-mlogloss:0.547948\n", 686 | "[367]\ttrain-mlogloss:0.347629\ttest-mlogloss:0.548026\n", 687 | "[368]\ttrain-mlogloss:0.347153\ttest-mlogloss:0.547928\n", 688 | "[369]\ttrain-mlogloss:0.346734\ttest-mlogloss:0.547903\n", 689 | "[370]\ttrain-mlogloss:0.346251\ttest-mlogloss:0.547871\n", 690 | "[371]\ttrain-mlogloss:0.345869\ttest-mlogloss:0.547909\n", 691 | 
"[372]\ttrain-mlogloss:0.345424\ttest-mlogloss:0.547937\n", 692 | "[373]\ttrain-mlogloss:0.34505\ttest-mlogloss:0.548001\n", 693 | "[374]\ttrain-mlogloss:0.344615\ttest-mlogloss:0.547982\n", 694 | "[375]\ttrain-mlogloss:0.344206\ttest-mlogloss:0.54803\n", 695 | "Stopping. Best iteration:\n", 696 | "[355]\ttrain-mlogloss:0.352646\ttest-mlogloss:0.547696\n", 697 | "\n", 698 | "[0.54803037236074925]\n" 699 | ] 700 | } 701 | ], 702 | "source": [ 703 | "cv_scores = []\n", 704 | "kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)\n", 705 | "for dev_index, val_index in kf.split(range(train_X.shape[0])):\n", 706 | " dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]\n", 707 | " dev_y, val_y = train_y[dev_index], train_y[val_index]\n", 708 | " preds, model = runXGB(dev_X, dev_y, val_X, val_y)\n", 709 | " cv_scores.append(log_loss(val_y, preds))\n", 710 | " print(cv_scores)\n", 711 | " break" 712 | ] 713 | }, 714 | { 715 | "cell_type": "markdown", 716 | "metadata": { 717 | "_cell_guid": "5cff686f-2601-321d-8f81-5fa846ef7562" 718 | }, 719 | "source": [ 720 | "Now let us build the final model and get the predictions on the test set." 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 10, 726 | "metadata": { 727 | "_cell_guid": "4fb1954d-e3f0-9369-d50c-bd1b615c0077" 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)\n", 732 | "out_df = pd.DataFrame(preds)\n", 733 | "out_df.columns = [\"high\", \"medium\", \"low\"]\n", 734 | "out_df[\"listing_id\"] = test_df.listing_id.values\n", 735 | "out_df.to_csv(\"xgb_starter2.csv\", index=False)" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": { 741 | "_cell_guid": "b23cc080-cd12-dc7d-0877-66806a34bf4c" 742 | }, 743 | "source": [ 744 | "\n", 745 | "Hope this helps the python users as a good starting point." 
746 | ] 747 | } 748 | ], 749 | "metadata": { 750 | "_change_revision": 488, 751 | "_is_fork": false, 752 | "kernelspec": { 753 | "display_name": "Python 3", 754 | "language": "python", 755 | "name": "python3" 756 | }, 757 | "language_info": { 758 | "codemirror_mode": { 759 | "name": "ipython", 760 | "version": 3 761 | }, 762 | "file_extension": ".py", 763 | "mimetype": "text/x-python", 764 | "name": "python", 765 | "nbconvert_exporter": "python", 766 | "pygments_lexer": "ipython3", 767 | "version": "3.6.0" 768 | } 769 | }, 770 | "nbformat": 4, 771 | "nbformat_minor": 0 772 | } 773 | -------------------------------------------------------------------------------- /TwoSigmaConnect_RentHop/readme.md: -------------------------------------------------------------------------------- 1 | Codes and notebooks used for [Kaggle - Two Sigma Connect : RentHop Rental Listing Enquiries competition](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries) 2 | -------------------------------------------------------------------------------- /TwoSigmaFinancialModeling/readme.md: -------------------------------------------------------------------------------- 1 | Codes for Two Sigma Financial Modeling Challenge 2 | -------------------------------------------------------------------------------- /Walmart_TripType/NeuralNets/neural_net.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import operator 4 | import numpy as np 5 | import pandas as pd 6 | import scipy as sp 7 | import cPickle as pkl 8 | from scipy.sparse import csr_matrix 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.cross_validation import KFold 11 | from sklearn import ensemble, preprocessing 12 | from sklearn.metrics import mean_squared_error, roc_auc_score 13 | #sys.path.append("/home/sudalai/Softwares/xgboost-master/wrapper/") 14 | sys.path.append("/home/sudalai/Softwares/XGB_pointfour/xgboost-master/wrapper/") 15 | import xgboost as xgb 16 | 17 | np.random.seed(12345) 18 | from keras.models import Sequential 19 | from keras.optimizers import SGD 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.layers.core import Dense, Activation, Dropout 22 | from keras.constraints import maxnorm 23 | from keras.utils import np_utils 24 | from keras import regularizers 25 | from keras.layers.advanced_activations import PReLU 26 | 27 | 28 | def multiclassLogLoss(y_true, y_pred, eps=1e-15): 29 | """Multi class version of Logarithmic Loss metric. 
30 | https://www.kaggle.com/wiki/MultiClassLogLoss 31 | 32 | Parameters 33 | ---------- 34 | y_true : array, shape = [n_samples] 35 | true class, integers in [0, n_classes - 1] 36 | y_pred : array, shape = [n_samples, n_classes] 37 | 38 | Returns 39 | ------- 40 | loss : float 41 | """ 42 | predictions = np.clip(y_pred, eps, 1 - eps) 43 | 44 | # normalize row sums to 1 45 | predictions /= predictions.sum(axis=1)[:, np.newaxis] 46 | 47 | actual = np.zeros(y_pred.shape) 48 | n_samples = actual.shape[0] 49 | actual[np.arange(n_samples), y_true.astype(int)] = 1 50 | vectsum = np.sum(actual * np.log(predictions)) 51 | loss = -1.0 / n_samples * vectsum 52 | return loss 53 | 54 | 55 | def runNN(train_X, train_y, test_X=None, test_y=None): 56 | sc = preprocessing.StandardScaler() 57 | train_X = sc.fit_transform(train_X) 58 | #test_X = sc.transform(test_X) 59 | 60 | train_y = np_utils.to_categorical(train_y, 38) 61 | 62 | model = Sequential() 63 | #model.add(Dropout(0.2)) 64 | 65 | model.add(Dense(600, input_shape=(train_X.shape[1],), init='he_uniform', W_regularizer=regularizers.l1(0.002))) 66 | model.add(Activation('relu')) 67 | model.add(Dropout(0.3)) 68 | #model.add(BatchNormalization()) 69 | 70 | model.add(Dense(600, init='he_uniform')) 71 | model.add(Activation('relu')) 72 | model.add(Dropout(0.3)) 73 | #model.add(BatchNormalization()) 74 | 75 | #model.add(Dense(100, init='he_uniform')) 76 | #model.add(Activation('relu')) 77 | #model.add(Dropout(0.5)) 78 | 79 | model.add(Dense(38, init='he_uniform')) 80 | model.add(Activation('softmax')) 81 | 82 | #sgd_opt = SGD(lr=0.01) 83 | model.compile(loss='categorical_crossentropy', optimizer='adagrad') 84 | 85 | #for i in xrange(500): 86 | model.fit(train_X, train_y, batch_size=256, nb_epoch=200, validation_split=0.03, verbose=2, shuffle=True) 87 | #preds = model.predict(test_X, verbose=0) 88 | #print "Test preds shape : ",preds.shape 89 | #loss = multiclassLogLoss(test_y, preds) 90 | #print "At",(i+1)*2, "Epochs, Loss is : ", loss 91 | #print "ROC AUC score : ", metrics.roc_auc_score(test_y, preds) 92 | 93 | return model, sc 94 | 95 | if __name__ == "__main__": 96 | # setting the input path and reading the data into dataframe # 97 | print "Reading data.." 98 | data_path = "../Data/" 99 | train_X = pd.read_csv(data_path + "train_mod_v2.csv") 100 | 101 | print "Getting target and id" 102 | train_y = np.array(train_X["DV"]) 103 | train_id = np.array(train_X["VisitNumber"]) 104 | 105 | print "Dropping columns" 106 | drop_columns = ["DV"] 107 | train_X.drop(drop_columns+["VisitNumber"], axis=1, inplace=True) 108 | #test_X.drop(["VisitNumber"], axis=1, inplace=True) 109 | 110 | print "Converting to array" 111 | train_X = np.array(train_X) 112 | print "Train shape : ", train_X.shape 113 | 114 | print "Building model.." 115 | model, scaler = runNN(train_X, train_y) 116 | del train_X 117 | import gc 118 | gc.collect() 119 | 120 | print "Working on test data.." 121 | test_X = pd.read_csv(data_path + "test_mod_v2.csv") 122 | test_id = np.array(test_X["VisitNumber"]) 123 | test_X.drop(["VisitNumber"], axis=1, inplace=True) 124 | test_X = np.array(test_X) 125 | test_X = scaler.transform(test_X) 126 | 127 | print "Getting preds.." 
128 | preds = model.predict(test_X, verbose=0) 129 | 130 | sample = pd.read_csv(data_path + "sample_submission.csv") 131 | preds = pd.DataFrame(preds, index=test_id, columns=sample.columns[1:]) 132 | preds.to_csv("sub_nn.csv", index_label="VisitNumber") 133 | -------------------------------------------------------------------------------- /Walmart_TripType/NeuralNets/prepData.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from config_v2 import fineline_dict, header_list4 6 | 7 | map_type_dv_dict = {3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6, 12:7, 14:8, 15:9, 18:10, 19:11, 20:12, 21:13, 22:14, 23:15, 24:16, 25:17, 26:18, 27:19, 28:20, 29:21, 30:22, 31:23, 32:24, 33:25, 34:26, 35:27, 36:28, 37:29, 38:30, 39:31, 40:32, 41:33, 42:34, 43:35, 44:36, 999:37} 8 | map_dept_dict = {'COMM BREAD': 14, 'OPTICAL - FRAMES': 47, '1-HR PHOTO': 1, 'LIQUOR,WINE,BEER': 41, 'FABRICS AND CRAFTS': 20, 'MENS WEAR': 44, 'SEAFOOD': 59, 'AUTOMOTIVE': 3, 'BEDDING': 7, 'COOK AND DINE': 16, 'OPTICAL - LENSES': 48, 'HARDWARE': 26, 'SLEEPWEAR/FOUNDATIONS': 64, 'FINANCIAL SERVICES': 21, 'OTHER DEPARTMENTS': 49, 'ELECTRONICS': 19, 'LADIESWEAR': 38, 'HOME MANAGEMENT': 29, 'HOUSEHOLD PAPER GOODS': 32, 'FROZEN FOODS': 22, 'FURNITURE': 23, 'INFANT CONSUMABLE HARDLINES': 35, 'MENSWEAR': 45, 'PAINT AND ACCESSORIES': 50, 'GROCERY DRY GOODS': 25, 'BOYS WEAR': 9, 'SERVICE DELI': 61, 'ACCESSORIES': 2, 'DSD GROCERY': 18, 'MEDIA AND GAMING': 43, -999: 0, 'JEWELRY AND SUNGLASSES': 36, 'PLUS AND MATERNITY': 56, 'LARGE HOUSEHOLD GOODS': 39, 'HOUSEHOLD CHEMICALS/SUPP': 31, 'CAMERAS AND SUPPLIES': 11, 'BATH AND SHOWER': 5, 'SEASONAL': 60, 'IMPULSE MERCHANDISE': 33, 'BRAS & SHAPEWEAR': 10, 'PHARMACY OTC': 53, 'SPORTING GOODS': 65, 'BEAUTY': 6, 'PETS AND SUPPLIES': 52, 'LADIES SOCKS': 37, 'HOME DECOR': 28, 'WIRELESS': 68, 'DAIRY': 17, 'PERSONAL CARE': 51, 'TOYS': 67, 'CONCEPT STORES': 15, 'HEALTH AND BEAUTY AIDS': 27, 'OFFICE SUPPLIES': 46, 'LAWN AND GARDEN': 40, 'SHOES': 63, 'SHEER HOSIERY': 62, 'PRE PACKED DELI': 57, 'INFANT APPAREL': 34, 'HORTICULTURE AND ACCESS': 30, 'PLAYERS AND ELECTRONICS': 55, 'BAKERY': 4, 'PRODUCE': 58, 'CANDY, TOBACCO, COOKIES': 12, 'MEAT - FRESH & FROZEN': 42, 'PHARMACY RX': 54, 'BOOKS AND MAGAZINES': 8, 'GIRLS WEAR, 4-6X AND 7-14': 24, 'SWIMWEAR/OUTERWEAR': 66, 'CELEBRATION': 13} 9 | weekday_dict = {"Monday":0, "Tuesday":1, "Wednesday":2, "Thursday":3, "Friday":4, "Saturday":5, "Sunday":6} 10 | 11 | def getHeader(train): 12 | header_list1 = ["VisitNumber", "DayOfWeek", "NumberOfRows", "NoOfUPCs", "NumberOfItems", "NumberOfDepts", "NumberOfFineLine" ] 13 | header_list2 = ['Dept_-999', 'Dept_1-HR_PHOTO', 'Dept_ACCESSORIES', 'Dept_AUTOMOTIVE', 'Dept_BAKERY', 'Dept_BATH_AND_SHOWER', 'Dept_BEAUTY', 'Dept_BEDDING', 'Dept_BOOKS_AND_MAGAZINES', 'Dept_BOYS_WEAR', 'Dept_BRAS_&_SHAPEWEAR', 'Dept_CAMERAS_AND_SUPPLIES', 'Dept_CANDY,_TOBACCO,_COOKIES', 'Dept_CELEBRATION', 'Dept_COMM_BREAD', 'Dept_CONCEPT_STORES', 'Dept_COOK_AND_DINE', 'Dept_DAIRY', 'Dept_DSD_GROCERY', 'Dept_ELECTRONICS', 'Dept_FABRICS_AND_CRAFTS', 'Dept_FINANCIAL_SERVICES', 'Dept_FROZEN_FOODS', 'Dept_FURNITURE', 'Dept_GIRLS_WEAR,_4-6X__AND_7-14', 'Dept_GROCERY_DRY_GOODS', 'Dept_HARDWARE', 'Dept_HEALTH_AND_BEAUTY_AIDS', 'Dept_HOME_DECOR', 'Dept_HOME_MANAGEMENT', 'Dept_HORTICULTURE_AND_ACCESS', 'Dept_HOUSEHOLD_CHEMICALS/SUPP', 'Dept_HOUSEHOLD_PAPER_GOODS', 'Dept_IMPULSE_MERCHANDISE', 'Dept_INFANT_APPAREL', 'Dept_INFANT_CONSUMABLE_HARDLINES', 
'Dept_JEWELRY_AND_SUNGLASSES', 'Dept_LADIES_SOCKS', 'Dept_LADIESWEAR', 'Dept_LARGE_HOUSEHOLD_GOODS', 'Dept_LAWN_AND_GARDEN', 'Dept_LIQUOR,WINE,BEER', 'Dept_MEAT_-_FRESH_&_FROZEN', 'Dept_MEDIA_AND_GAMING', 'Dept_MENS_WEAR', 'Dept_MENSWEAR', 'Dept_OFFICE_SUPPLIES', 'Dept_OPTICAL_-_FRAMES', 'Dept_OPTICAL_-_LENSES', 'Dept_OTHER_DEPARTMENTS', 'Dept_PAINT_AND_ACCESSORIES', 'Dept_PERSONAL_CARE', 'Dept_PETS_AND_SUPPLIES', 'Dept_PHARMACY_OTC', 'Dept_PHARMACY_RX', 'Dept_PLAYERS_AND_ELECTRONICS', 'Dept_PLUS_AND_MATERNITY', 'Dept_PRE_PACKED_DELI', 'Dept_PRODUCE', 'Dept_SEAFOOD', 'Dept_SEASONAL', 'Dept_SERVICE_DELI', 'Dept_SHEER_HOSIERY', 'Dept_SHOES', 'Dept_SLEEPWEAR/FOUNDATIONS', 'Dept_SPORTING_GOODS', 'Dept_SWIMWEAR/OUTERWEAR', 'Dept_TOYS', 'Dept_WIRELESS'] 14 | #header_list3 = ["MinCountUPC", "MaxCountUPC", "MeanCountUPC", 'DeptScan_-999', 'DeptScan_1-HR_PHOTO', 'DeptScan_ACCESSORIES', 'DeptScan_AUTOMOTIVE', 'DeptScan_BAKERY', 'DeptScan_BATH_AND_SHOWER', 'DeptScan_BEAUTY', 'DeptScan_BEDDING', 'DeptScan_BOOKS_AND_MAGAZINES', 'DeptScan_BOYS_WEAR', 'DeptScan_BRAS_&_SHAPEWEAR', 'DeptScan_CAMERAS_AND_SUPPLIES', 'DeptScan_CANDY,_TOBACCO,_COOKIES', 'DeptScan_CELEBRATION', 'DeptScan_COMM_BREAD', 'DeptScan_CONCEPT_STORES', 'DeptScan_COOK_AND_DINE', 'DeptScan_DAIRY', 'DeptScan_DSD_GROCERY', 'DeptScan_ELECTRONICS', 'DeptScan_FABRICS_AND_CRAFTS', 'DeptScan_FINANCIAL_SERVICES', 'DeptScan_FROZEN_FOODS', 'DeptScan_FURNITURE', 'DeptScan_GIRLS_WEAR,_4-6X__AND_7-14', 'DeptScan_GROCERY_DRY_GOODS', 'DeptScan_HARDWARE', 'DeptScan_HEALTH_AND_BEAUTY_AIDS', 'DeptScan_HOME_DECOR', 'DeptScan_HOME_MANAGEMENT', 'DeptScan_HORTICULTURE_AND_ACCESS', 'DeptScan_HOUSEHOLD_CHEMICALS/SUPP', 'DeptScan_HOUSEHOLD_PAPER_GOODS', 'DeptScan_IMPULSE_MERCHANDISE', 'DeptScan_INFANT_APPAREL', 'DeptScan_INFANT_CONSUMABLE_HARDLINES', 'DeptScan_JEWELRY_AND_SUNGLASSES', 'DeptScan_LADIES_SOCKS', 'DeptScan_LADIESWEAR', 'DeptScan_LARGE_HOUSEHOLD_GOODS', 'DeptScan_LAWN_AND_GARDEN', 'DeptScan_LIQUOR,WINE,BEER', 'DeptScan_MEAT_-_FRESH_&_FROZEN', 'DeptScan_MEDIA_AND_GAMING', 'DeptScan_MENS_WEAR', 'DeptScan_MENSWEAR', 'DeptScan_OFFICE_SUPPLIES', 'DeptScan_OPTICAL_-_FRAMES', 'DeptScan_OPTICAL_-_LENSES', 'DeptScan_OTHER_DEPARTMENTS', 'DeptScan_PAINT_AND_ACCESSORIES', 'DeptScan_PERSONAL_CARE', 'DeptScan_PETS_AND_SUPPLIES', 'DeptScan_PHARMACY_OTC', 'DeptScan_PHARMACY_RX', 'DeptScan_PLAYERS_AND_ELECTRONICS', 'DeptScan_PLUS_AND_MATERNITY', 'DeptScan_PRE_PACKED_DELI', 'DeptScan_PRODUCE', 'DeptScan_SEAFOOD', 'DeptScan_SEASONAL', 'DeptScan_SERVICE_DELI', 'DeptScan_SHEER_HOSIERY', 'DeptScan_SHOES', 'DeptScan_SLEEPWEAR/FOUNDATIONS', 'DeptScan_SPORTING_GOODS', 'DeptScan_SWIMWEAR/OUTERWEAR', 'DeptScan_TOYS', 'DeptScan_WIRELESS', 'RatioItemsUPC', 'RatioItemsDept', 'RatioItemsFineLine', 'NoItemsLessZero'] 15 | header_list3 = ["MinCountUPC", "MaxCountUPC", "MeanCountUPC", 'RatioItemsUPC', 'RatioItemsDept', 'RatioItemsFineLine', 'NoItemsLessZero'] 16 | 17 | header_list = header_list1 + header_list2 + header_list3 + header_list4 18 | 19 | if train: 20 | return header_list + ["DV"] 21 | else: 22 | return header_list 23 | 24 | def getDeptCount(depts): 25 | dept_list = [0]*len(map_dept_dict.keys()) 26 | for dept in depts: 27 | dept_no = map_dept_dict[dept] 28 | dept_list[dept_no] += 1 29 | return dept_list 30 | 31 | fineline_len = len(fineline_dict.keys()) 32 | def getFineLineCount(finelines): 33 | fineline_list = [0]*fineline_len 34 | for fineline in finelines: 35 | fineline_no = fineline_dict.get(fineline,fineline_dict[-999]) 36 | fineline_list[fineline_no] += 1 37 | return 
fineline_list 38 | 39 | def getDeptScanCounts(depts, scans): 40 | dept_list = [0]*len(map_dept_dict.keys()) 41 | for index, dept in enumerate(depts): 42 | dept_no = map_dept_dict[dept] 43 | dept_list[dept_no] += scans[index] 44 | return dept_list 45 | 46 | 47 | def getVariables(name, grouped_df, train=0): 48 | try: 49 | out_list = [name, weekday_dict[np.array(grouped_df["Weekday"])[0]], grouped_df.shape[0]] 50 | except: 51 | raise 52 | 53 | no_upc = len( np.unique(grouped_df["Upc"]) ) 54 | out_list.append(no_upc) 55 | 56 | no_items = int(np.sum(grouped_df["ScanCount"]) ) 57 | out_list.append(no_items) 58 | 59 | no_depts = len( np.unique(grouped_df["DepartmentDescription"]) ) 60 | out_list.append(no_depts) 61 | 62 | no_fineline = len( np.unique(grouped_df["FinelineNumber"]) ) 63 | out_list.append(no_fineline) 64 | 65 | depts = grouped_df["DepartmentDescription"].tolist() 66 | out_list.extend( getDeptCount(depts) ) 67 | 68 | min_count_in_upc = int(np.min(grouped_df["ScanCount"])) 69 | out_list.append(min_count_in_upc) 70 | 71 | max_count_in_upc = int(np.max(grouped_df["ScanCount"])) 72 | out_list.append(max_count_in_upc) 73 | 74 | mean_count_in_upc = int(np.mean(grouped_df["ScanCount"])) 75 | out_list.append(mean_count_in_upc) 76 | 77 | #scans = grouped_df["ScanCount"].tolist() 78 | #out_list.extend( getDeptScanCounts(depts, scans) ) 79 | 80 | ratio_items_upc = no_items / no_upc 81 | out_list.append(ratio_items_upc) 82 | 83 | ratio_items_dept = no_items / no_depts 84 | out_list.append(ratio_items_dept) 85 | 86 | ratio_items_fineline = no_items / no_fineline 87 | out_list.append(ratio_items_fineline) 88 | 89 | no_items_less0 = np.sum( np.array(grouped_df["ScanCount"])<0 ) 90 | out_list.append(no_items_less0) 91 | 92 | finelines = grouped_df["FinelineNumber"].tolist() 93 | out_list.extend( getFineLineCount(finelines) ) 94 | 95 | if train: 96 | out_list.append( map_type_dv_dict[ np.array(grouped_df["TripType"])[0] ]) 97 | 98 | return out_list 99 | 100 | 101 | if __name__ == "__main__": 102 | data_path = "../Data/" 103 | train_file = data_path + "train.csv" 104 | test_file = data_path + "test.csv" 105 | train_out_file = data_path + "train_mod_v2.csv" 106 | test_out_file = data_path + "test_mod_v2.csv" 107 | 108 | train_df = pd.read_csv(train_file) 109 | test_df = pd.read_csv(test_file) 110 | train_out_handle = open(train_out_file, "w") 111 | test_out_handle = open(test_out_file, "w") 112 | train_writer = csv.writer(train_out_handle) 113 | test_writer = csv.writer(test_out_handle) 114 | 115 | train_df = train_df.fillna(-999) 116 | test_df = test_df.fillna(-999) 117 | 118 | train_header = getHeader(train=1) 119 | train_writer.writerow( train_header ) 120 | test_header = getHeader(train=0) 121 | test_writer.writerow( test_header ) 122 | train_header_len = len(train_header) 123 | test_header_len = len(test_header) 124 | 125 | print "Processing train.." 126 | print train_df.shape 127 | grouped_train_df = train_df.groupby("VisitNumber") 128 | counter = 0 129 | for name, group in grouped_train_df: 130 | out_row = getVariables(name, group, train=1) 131 | assert len(out_row) == train_header_len 132 | train_writer.writerow(out_row) 133 | counter += 1 134 | if counter%10000 == 0: 135 | print counter 136 | 137 | print "Processing test.." 
138 | grouped_test_df = test_df.groupby("VisitNumber") 139 | counter = 0 140 | for name, group in grouped_test_df: 141 | out_row = getVariables(name, group, train=0) 142 | assert len(out_row) == test_header_len 143 | test_writer.writerow(out_row) 144 | counter += 1 145 | if counter%10000 == 0: 146 | print counter 147 | 148 | train_out_handle.close() 149 | test_out_handle.close() 150 | -------------------------------------------------------------------------------- /Walmart_TripType/NeuralNets/readme.md: -------------------------------------------------------------------------------- 1 | Codes for best Neural Net model 2 | -------------------------------------------------------------------------------- /Walmart_TripType/XGB/prepData.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from config_v5 import fineline_dict, header_list4, upc_dict, header_list5 6 | 7 | map_type_dv_dict = {3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6, 12:7, 14:8, 15:9, 18:10, 19:11, 20:12, 21:13, 22:14, 23:15, 24:16, 25:17, 26:18, 27:19, 28:20, 29:21, 30:22, 31:23, 32:24, 33:25, 34:26, 35:27, 36:28, 37:29, 38:30, 39:31, 40:32, 41:33, 42:34, 43:35, 44:36, 999:37} 8 | map_dept_dict = {'COMM BREAD': 14, 'OPTICAL - FRAMES': 47, '1-HR PHOTO': 1, 'LIQUOR,WINE,BEER': 41, 'FABRICS AND CRAFTS': 20, 'MENS WEAR': 44, 'SEAFOOD': 59, 'AUTOMOTIVE': 3, 'BEDDING': 7, 'COOK AND DINE': 16, 'OPTICAL - LENSES': 48, 'HARDWARE': 26, 'SLEEPWEAR/FOUNDATIONS': 64, 'FINANCIAL SERVICES': 21, 'OTHER DEPARTMENTS': 49, 'ELECTRONICS': 19, 'LADIESWEAR': 38, 'HOME MANAGEMENT': 29, 'HOUSEHOLD PAPER GOODS': 32, 'FROZEN FOODS': 22, 'FURNITURE': 23, 'INFANT CONSUMABLE HARDLINES': 35, 'MENSWEAR': 45, 'PAINT AND ACCESSORIES': 50, 'GROCERY DRY GOODS': 25, 'BOYS WEAR': 9, 'SERVICE DELI': 61, 'ACCESSORIES': 2, 'DSD GROCERY': 18, 'MEDIA AND GAMING': 43, -999: 0, 'JEWELRY AND SUNGLASSES': 36, 'PLUS AND MATERNITY': 56, 'LARGE HOUSEHOLD GOODS': 39, 'HOUSEHOLD CHEMICALS/SUPP': 31, 'CAMERAS AND SUPPLIES': 11, 'BATH AND SHOWER': 5, 'SEASONAL': 60, 'IMPULSE MERCHANDISE': 33, 'BRAS & SHAPEWEAR': 10, 'PHARMACY OTC': 53, 'SPORTING GOODS': 65, 'BEAUTY': 6, 'PETS AND SUPPLIES': 52, 'LADIES SOCKS': 37, 'HOME DECOR': 28, 'WIRELESS': 68, 'DAIRY': 17, 'PERSONAL CARE': 51, 'TOYS': 67, 'CONCEPT STORES': 15, 'HEALTH AND BEAUTY AIDS': 27, 'OFFICE SUPPLIES': 46, 'LAWN AND GARDEN': 40, 'SHOES': 63, 'SHEER HOSIERY': 62, 'PRE PACKED DELI': 57, 'INFANT APPAREL': 34, 'HORTICULTURE AND ACCESS': 30, 'PLAYERS AND ELECTRONICS': 55, 'BAKERY': 4, 'PRODUCE': 58, 'CANDY, TOBACCO, COOKIES': 12, 'MEAT - FRESH & FROZEN': 42, 'PHARMACY RX': 54, 'BOOKS AND MAGAZINES': 8, 'GIRLS WEAR, 4-6X AND 7-14': 24, 'SWIMWEAR/OUTERWEAR': 66, 'CELEBRATION': 13} 9 | weekday_dict = {"Monday":0, "Tuesday":1, "Wednesday":2, "Thursday":3, "Friday":4, "Saturday":5, "Sunday":6} 10 | 11 | def getHeader(train): 12 | header_list1 = ["VisitNumber", "DayOfWeek", "NumberOfRows", "NoOfUPCs", "NumberOfItems", "NumberOfDepts", "NumberOfFineLine" ] 13 | header_list2 = ['Dept_-999', 'Dept_1-HR_PHOTO', 'Dept_ACCESSORIES', 'Dept_AUTOMOTIVE', 'Dept_BAKERY', 'Dept_BATH_AND_SHOWER', 'Dept_BEAUTY', 'Dept_BEDDING', 'Dept_BOOKS_AND_MAGAZINES', 'Dept_BOYS_WEAR', 'Dept_BRAS_&_SHAPEWEAR', 'Dept_CAMERAS_AND_SUPPLIES', 'Dept_CANDY,_TOBACCO,_COOKIES', 'Dept_CELEBRATION', 'Dept_COMM_BREAD', 'Dept_CONCEPT_STORES', 'Dept_COOK_AND_DINE', 'Dept_DAIRY', 'Dept_DSD_GROCERY', 'Dept_ELECTRONICS', 'Dept_FABRICS_AND_CRAFTS', 'Dept_FINANCIAL_SERVICES', 
'Dept_FROZEN_FOODS', 'Dept_FURNITURE', 'Dept_GIRLS_WEAR,_4-6X__AND_7-14', 'Dept_GROCERY_DRY_GOODS', 'Dept_HARDWARE', 'Dept_HEALTH_AND_BEAUTY_AIDS', 'Dept_HOME_DECOR', 'Dept_HOME_MANAGEMENT', 'Dept_HORTICULTURE_AND_ACCESS', 'Dept_HOUSEHOLD_CHEMICALS/SUPP', 'Dept_HOUSEHOLD_PAPER_GOODS', 'Dept_IMPULSE_MERCHANDISE', 'Dept_INFANT_APPAREL', 'Dept_INFANT_CONSUMABLE_HARDLINES', 'Dept_JEWELRY_AND_SUNGLASSES', 'Dept_LADIES_SOCKS', 'Dept_LADIESWEAR', 'Dept_LARGE_HOUSEHOLD_GOODS', 'Dept_LAWN_AND_GARDEN', 'Dept_LIQUOR,WINE,BEER', 'Dept_MEAT_-_FRESH_&_FROZEN', 'Dept_MEDIA_AND_GAMING', 'Dept_MENS_WEAR', 'Dept_MENSWEAR', 'Dept_OFFICE_SUPPLIES', 'Dept_OPTICAL_-_FRAMES', 'Dept_OPTICAL_-_LENSES', 'Dept_OTHER_DEPARTMENTS', 'Dept_PAINT_AND_ACCESSORIES', 'Dept_PERSONAL_CARE', 'Dept_PETS_AND_SUPPLIES', 'Dept_PHARMACY_OTC', 'Dept_PHARMACY_RX', 'Dept_PLAYERS_AND_ELECTRONICS', 'Dept_PLUS_AND_MATERNITY', 'Dept_PRE_PACKED_DELI', 'Dept_PRODUCE', 'Dept_SEAFOOD', 'Dept_SEASONAL', 'Dept_SERVICE_DELI', 'Dept_SHEER_HOSIERY', 'Dept_SHOES', 'Dept_SLEEPWEAR/FOUNDATIONS', 'Dept_SPORTING_GOODS', 'Dept_SWIMWEAR/OUTERWEAR', 'Dept_TOYS', 'Dept_WIRELESS'] 14 | #header_list3 = ["MinCountUPC", "MaxCountUPC", "MeanCountUPC", 'DeptScan_-999', 'DeptScan_1-HR_PHOTO', 'DeptScan_ACCESSORIES', 'DeptScan_AUTOMOTIVE', 'DeptScan_BAKERY', 'DeptScan_BATH_AND_SHOWER', 'DeptScan_BEAUTY', 'DeptScan_BEDDING', 'DeptScan_BOOKS_AND_MAGAZINES', 'DeptScan_BOYS_WEAR', 'DeptScan_BRAS_&_SHAPEWEAR', 'DeptScan_CAMERAS_AND_SUPPLIES', 'DeptScan_CANDY,_TOBACCO,_COOKIES', 'DeptScan_CELEBRATION', 'DeptScan_COMM_BREAD', 'DeptScan_CONCEPT_STORES', 'DeptScan_COOK_AND_DINE', 'DeptScan_DAIRY', 'DeptScan_DSD_GROCERY', 'DeptScan_ELECTRONICS', 'DeptScan_FABRICS_AND_CRAFTS', 'DeptScan_FINANCIAL_SERVICES', 'DeptScan_FROZEN_FOODS', 'DeptScan_FURNITURE', 'DeptScan_GIRLS_WEAR,_4-6X__AND_7-14', 'DeptScan_GROCERY_DRY_GOODS', 'DeptScan_HARDWARE', 'DeptScan_HEALTH_AND_BEAUTY_AIDS', 'DeptScan_HOME_DECOR', 'DeptScan_HOME_MANAGEMENT', 'DeptScan_HORTICULTURE_AND_ACCESS', 'DeptScan_HOUSEHOLD_CHEMICALS/SUPP', 'DeptScan_HOUSEHOLD_PAPER_GOODS', 'DeptScan_IMPULSE_MERCHANDISE', 'DeptScan_INFANT_APPAREL', 'DeptScan_INFANT_CONSUMABLE_HARDLINES', 'DeptScan_JEWELRY_AND_SUNGLASSES', 'DeptScan_LADIES_SOCKS', 'DeptScan_LADIESWEAR', 'DeptScan_LARGE_HOUSEHOLD_GOODS', 'DeptScan_LAWN_AND_GARDEN', 'DeptScan_LIQUOR,WINE,BEER', 'DeptScan_MEAT_-_FRESH_&_FROZEN', 'DeptScan_MEDIA_AND_GAMING', 'DeptScan_MENS_WEAR', 'DeptScan_MENSWEAR', 'DeptScan_OFFICE_SUPPLIES', 'DeptScan_OPTICAL_-_FRAMES', 'DeptScan_OPTICAL_-_LENSES', 'DeptScan_OTHER_DEPARTMENTS', 'DeptScan_PAINT_AND_ACCESSORIES', 'DeptScan_PERSONAL_CARE', 'DeptScan_PETS_AND_SUPPLIES', 'DeptScan_PHARMACY_OTC', 'DeptScan_PHARMACY_RX', 'DeptScan_PLAYERS_AND_ELECTRONICS', 'DeptScan_PLUS_AND_MATERNITY', 'DeptScan_PRE_PACKED_DELI', 'DeptScan_PRODUCE', 'DeptScan_SEAFOOD', 'DeptScan_SEASONAL', 'DeptScan_SERVICE_DELI', 'DeptScan_SHEER_HOSIERY', 'DeptScan_SHOES', 'DeptScan_SLEEPWEAR/FOUNDATIONS', 'DeptScan_SPORTING_GOODS', 'DeptScan_SWIMWEAR/OUTERWEAR', 'DeptScan_TOYS', 'DeptScan_WIRELESS', 'RatioItemsUPC', 'RatioItemsDept', 'RatioItemsFineLine', 'NoItemsLessZero'] 15 | header_list3 = ["MinCountUPC", "MaxCountUPC", "MeanCountUPC", 'RatioItemsUPC', 'RatioItemsDept', 'RatioItemsFineLine', 'NoItemsLessZero'] 16 | 17 | header_list = header_list1 + header_list2 + header_list3 + header_list4 + header_list5 18 | 19 | if train: 20 | return header_list + ["DV"] 21 | else: 22 | return header_list 23 | 24 | def getDeptCount(depts): 25 | dept_list = 
[0]*len(map_dept_dict.keys()) 26 | for dept in depts: 27 | dept_no = map_dept_dict[dept] 28 | dept_list[dept_no] += 1 29 | return dept_list 30 | 31 | fineline_len = len(fineline_dict.keys()) 32 | def getFineLineCount(finelines): 33 | fineline_list = [0]*fineline_len 34 | for fineline in finelines: 35 | fineline_no = fineline_dict.get(fineline,fineline_dict[-999]) 36 | fineline_list[fineline_no] += 1 37 | return fineline_list 38 | 39 | upc_len = len(upc_dict.keys()) 40 | def getUpcCount(upcs): 41 | upc_list = [0]*upc_len 42 | for upc in upcs: 43 | upc_no = upc_dict.get(upc,upc_dict[-999.0]) 44 | upc_list[upc_no] += 1 45 | return upc_list 46 | 47 | def getDeptScanCounts(depts, scans): 48 | dept_list = [0]*len(map_dept_dict.keys()) 49 | for index, dept in enumerate(depts): 50 | dept_no = map_dept_dict[dept] 51 | dept_list[dept_no] += scans[index] 52 | return dept_list 53 | 54 | 55 | def getVariables(name, grouped_df, train=0): 56 | try: 57 | out_list = [name, weekday_dict[np.array(grouped_df["Weekday"])[0]], grouped_df.shape[0]] 58 | except: 59 | raise 60 | 61 | no_upc = len( np.unique(grouped_df["Upc"]) ) 62 | out_list.append(no_upc) 63 | 64 | no_items = int(np.sum(grouped_df["ScanCount"]) ) 65 | out_list.append(no_items) 66 | 67 | no_depts = len( np.unique(grouped_df["DepartmentDescription"]) ) 68 | out_list.append(no_depts) 69 | 70 | no_fineline = len( np.unique(grouped_df["FinelineNumber"]) ) 71 | out_list.append(no_fineline) 72 | 73 | depts = grouped_df["DepartmentDescription"].tolist() 74 | out_list.extend( getDeptCount(depts) ) 75 | 76 | min_count_in_upc = int(np.min(grouped_df["ScanCount"])) 77 | out_list.append(min_count_in_upc) 78 | 79 | max_count_in_upc = int(np.max(grouped_df["ScanCount"])) 80 | out_list.append(max_count_in_upc) 81 | 82 | mean_count_in_upc = int(np.mean(grouped_df["ScanCount"])) 83 | out_list.append(mean_count_in_upc) 84 | 85 | #scans = grouped_df["ScanCount"].tolist() 86 | #out_list.extend( getDeptScanCounts(depts, scans) ) 87 | 88 | ratio_items_upc = no_items / no_upc 89 | out_list.append(ratio_items_upc) 90 | 91 | ratio_items_dept = no_items / no_depts 92 | out_list.append(ratio_items_dept) 93 | 94 | ratio_items_fineline = no_items / no_fineline 95 | out_list.append(ratio_items_fineline) 96 | 97 | no_items_less0 = np.sum( np.array(grouped_df["ScanCount"])<0 ) 98 | out_list.append(no_items_less0) 99 | 100 | finelines = grouped_df["FinelineNumber"].tolist() 101 | out_list.extend( getFineLineCount(finelines) ) 102 | 103 | upcs = grouped_df["Upc"].tolist() 104 | out_list.extend( getUpcCount(upcs) ) 105 | 106 | if train: 107 | out_list.append( map_type_dv_dict[ np.array(grouped_df["TripType"])[0] ]) 108 | 109 | return out_list 110 | 111 | 112 | if __name__ == "__main__": 113 | data_path = "../Data/" 114 | train_file = data_path + "train.csv" 115 | test_file = data_path + "test.csv" 116 | train_out_file = data_path + "train_mod_v5.csv" 117 | test_out_file = data_path + "test_mod_v5.csv" 118 | train_dv_out_file = data_path + "train_mod_v5_dv.csv" 119 | 120 | train_df = pd.read_csv(train_file) 121 | test_df = pd.read_csv(test_file) 122 | train_out_handle = open(train_out_file, "w") 123 | test_out_handle = open(test_out_file, "w") 124 | train_dv_out_handle = open(train_dv_out_file, "w") 125 | train_writer = csv.writer(train_out_handle) 126 | test_writer = csv.writer(test_out_handle) 127 | train_dv_writer = csv.writer(train_dv_out_handle) 128 | 129 | train_df = train_df.fillna(-999) 130 | test_df = test_df.fillna(-999) 131 | 132 | train_header = getHeader(train=0) 133 | 
train_writer.writerow( train_header ) 134 | test_header = getHeader(train=0) 135 | test_writer.writerow( test_header ) 136 | train_dv_header = ["VisitNumber", "DV"] 137 | train_dv_writer.writerow(train_dv_header) 138 | train_header_len = len(train_header) 139 | test_header_len = len(test_header) 140 | train_dv_header_len = len(train_dv_header) 141 | 142 | print "Processing train.." 143 | print train_df.shape 144 | grouped_train_df = train_df.groupby("VisitNumber") 145 | counter = 0 146 | for name, group in grouped_train_df: 147 | out_row = getVariables(name, group, train=1) 148 | dv = out_row[-1] 149 | out_row = out_row[:-1] 150 | dv_row = [name, dv] 151 | assert len(out_row) == train_header_len 152 | assert len(dv_row) == train_dv_header_len 153 | train_writer.writerow(out_row) 154 | train_dv_writer.writerow(dv_row) 155 | counter += 1 156 | if counter%10000 == 0: 157 | print counter 158 | 159 | print "Processing test.." 160 | grouped_test_df = test_df.groupby("VisitNumber") 161 | counter = 0 162 | for name, group in grouped_test_df: 163 | out_row = getVariables(name, group, train=0) 164 | assert len(out_row) == test_header_len 165 | test_writer.writerow(out_row) 166 | counter += 1 167 | if counter%10000 == 0: 168 | print counter 169 | 170 | train_out_handle.close() 171 | test_out_handle.close() 172 | train_dv_out_handle.close() 173 | 174 | -------------------------------------------------------------------------------- /Walmart_TripType/XGB/readme.md: -------------------------------------------------------------------------------- 1 | Codes for best XGB model 2 | -------------------------------------------------------------------------------- /Walmart_TripType/XGB/xgb_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import operator 4 | import numpy as np 5 | import pandas as pd 6 | import scipy as sp 7 | import cPickle as pkl 8 | from scipy.sparse import csr_matrix 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.cross_validation import KFold 11 | from sklearn import ensemble 12 | from sklearn.metrics import mean_squared_error, roc_auc_score 13 | #sys.path.append("/home/sudalai/Softwares/xgboost-master/wrapper/") 14 | sys.path.append("/home/sudalai/Softwares/XGB_pointfour/xgboost-master/wrapper/") 15 | import xgboost as xgb 16 | 17 | def multiclassLogLoss(y_true, y_pred, eps=1e-15): 18 | """Multi class version of Logarithmic Loss metric. 
19 | https://www.kaggle.com/wiki/MultiClassLogLoss 20 | 21 | Parameters 22 | ---------- 23 | y_true : array, shape = [n_samples] 24 | true class, integers in [0, n_classes - 1] 25 | y_pred : array, shape = [n_samples, n_classes] 26 | 27 | Returns 28 | ------- 29 | loss : float 30 | """ 31 | predictions = np.clip(y_pred, eps, 1 - eps) 32 | 33 | # normalize row sums to 1 34 | predictions /= predictions.sum(axis=1)[:, np.newaxis] 35 | 36 | actual = np.zeros(y_pred.shape) 37 | n_samples = actual.shape[0] 38 | actual[np.arange(n_samples), y_true.astype(int)] = 1 39 | vectsum = np.sum(actual * np.log(predictions)) 40 | loss = -1.0 / n_samples * vectsum 41 | return loss 42 | 43 | def getData(file_name): 44 | reader = csv.reader(open(file_name)) 45 | header = reader.next() 46 | 47 | row_list = [] 48 | col_list = [] 49 | data_list = [] 50 | row_ind = 0 51 | for row in reader: 52 | row = map(int, row) 53 | for col_ind, col_val in enumerate(row): 54 | if col_val != 0 : 55 | row_list.append(row_ind) 56 | col_list.append(col_ind) 57 | data_list.append(col_val) 58 | row_ind += 1 59 | 60 | sp_array = csr_matrix( (data_list, (row_list, col_list)), shape=(row_ind, len(header))) 61 | #pkl.dump(sp_array, open("train_mod_v5_sparse.pkl","w")) 62 | 63 | #sp_array = pkl.load(open("train_mod_v7_sparse.pkl")) 64 | return sp_array 65 | 66 | 67 | def getTestData(file_name): 68 | reader = csv.reader(open(file_name)) 69 | header = reader.next() 70 | 71 | row_list = [] 72 | col_list = [] 73 | data_list = [] 74 | row_ind = 0 75 | for row in reader: 76 | row = map(int, row) 77 | for col_ind, col_val in enumerate(row): 78 | if col_val != 0 : 79 | row_list.append(row_ind) 80 | col_list.append(col_ind) 81 | data_list.append(col_val) 82 | row_ind += 1 83 | 84 | sp_array = csr_matrix( (data_list, (row_list, col_list)), shape=(row_ind, len(header))) 85 | #pkl.dump(sp_array, open("test_mod_v7_sparse.pkl","w")) 86 | 87 | #sp_array = pkl.load(open("test_mod_v7_sparse.pkl")) 88 | return sp_array 89 | 90 | 91 | def runXGB(train_X, train_y): 92 | xg_train = xgb.DMatrix(train_X, label=train_y) 93 | 94 | ## Setting up the params ## 95 | param = {} 96 | # multi-class classification with per-class probability outputs 97 | param['objective'] = 'multi:softprob' 98 | # learning rate 99 | param['eta'] = 0.05 100 | param['max_depth'] = 6 101 | param['silent'] = 1 102 | param['num_class'] = 38 103 | param['eval_metric'] = "mlogloss" 104 | #param['min_child_weight'] = 2 105 | param['subsample'] = 0.9 106 | param['colsample_bytree'] = 0.7 107 | param['gamma'] = 1 108 | 109 | #watchlist = [ (xg_train,'train'), (xg_test, 'test') ] 110 | num_round = 4200 111 | bst = xgb.train( param, xg_train, num_round) 112 | return bst 113 | 114 | if __name__ == "__main__": 115 | # setting the input path and reading the data into dataframe # 116 | print "Reading data.." 117 | data_path = "../Data/" 118 | train_X = getData(data_path + "train_mod_v5.csv") 119 | train_y = np.array( pd.read_csv(data_path + "train_mod_v5_dv.csv")["DV"] ) 120 | print "Train shape : ", train_X.shape 121 | 122 | bst = runXGB(train_X, train_y) 123 | del train_X 124 | del train_y 125 | import gc 126 | gc.collect() 127 | 128 | print "Working on test.." 
129 | test_X = getTestData(data_path + "test_mod_v5.csv") 130 | test_id = np.array( pd.read_csv(data_path+"test_mod_v5.csv", usecols=["VisitNumber"])["VisitNumber"] ) 131 | print test_X.shape 132 | xg_test = xgb.DMatrix(test_X) 133 | preds = bst.predict( xg_test )#.reshape( test_X.shape[0], param['num_class'] ) 134 | 135 | sample = pd.read_csv(data_path + "sample_submission.csv") 136 | preds = pd.DataFrame(preds, index=test_id, columns=sample.columns[1:]) 137 | preds.to_csv("sub6.csv", index_label="VisitNumber") 138 | 139 | -------------------------------------------------------------------------------- /Walmart_TripType/readme.md: -------------------------------------------------------------------------------- 1 | This folder has the codes used for the Kaggle competition [Walmart - Trip Type Classification](https://www.kaggle.com/c/walmart-recruiting-trip-type-classification) 2 | 3 | Finished 23rd out of >1000 people in this competition 4 | 5 | Approach: 6 | 1. Built a few XGB models by converting the features to sparse format; the best model is present in the XGB folder 7 | 2. Built a few Neural Net models with Keras on features excluding UPCs; the best model is present in the NeuralNets folder 8 | 3. My final model is an ensemble of the XGB and NN predictions; a minimal blending sketch is given below 9 | --------------------------------------------------------------------------------
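The ensembling step in point 3 above is not shipped as a script in this repository, so here is a minimal blending sketch. It assumes the XGB submission (`sub6.csv`, written by `XGB/xgb_model.py`) and the NN submission (`sub_nn.csv`, written by `NeuralNets/neural_net.py`) are in the working directory; the 0.6/0.4 weights and the `sub_ensemble.csv` output name are illustrative assumptions, not the values actually used in the competition.

```python
# Hypothetical blending sketch (not the exact competition ensemble).
# Both inputs are class-probability submissions indexed by VisitNumber.
import pandas as pd

xgb_sub = pd.read_csv("sub6.csv", index_col="VisitNumber")    # output of XGB/xgb_model.py
nn_sub = pd.read_csv("sub_nn.csv", index_col="VisitNumber")   # output of NeuralNets/neural_net.py

# assumed weights; in practice they would be tuned against the CV log loss
w_xgb, w_nn = 0.6, 0.4
blend = w_xgb * xgb_sub + w_nn * nn_sub.loc[xgb_sub.index, xgb_sub.columns]

# renormalize each row so the 38 class probabilities sum to 1
blend = blend.div(blend.sum(axis=1), axis=0)
blend.to_csv("sub_ensemble.csv", index_label="VisitNumber")
```

A weighted average of predicted probabilities is the simplest way to combine the two model families; rank averaging or a small weight search on the cross-validation score are natural refinements.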