├── Avazu-CTR ├── finalModel.py └── readme.md ├── BNP ├── bayesian_encode_fivelevel_withint.py └── readme.md ├── Datasets-Indian_Premier_League ├── IPL_Exploration.ipynb ├── IPL_Win_Prediction.ipynb └── readme.md ├── Expedia ├── DataPrep │ ├── getBookings.py │ ├── getClicks.py │ ├── getLeakFree.py │ ├── getLeakRows_test.py │ ├── getLeakRows_val.py │ ├── readme.md │ ├── splitDevVal.py │ ├── splitDevVal_Bookings.py │ ├── splitDevVal_Clicks.py │ └── splitNonLeak_Dist.py └── readme.md ├── GhoulsGoblinsGhost ├── kaggle_simple_exploration_notebook.ipynb └── readme.md ├── LibertyMutual ├── featureSelection.py ├── finalModel.py ├── predict.py ├── prepareData.py └── readme.md ├── MMM15 ├── finalModel.py ├── prepareData.py ├── readme.md └── seed_model.py ├── OutBrain ├── ftrl.py └── readme.md ├── README.md ├── SantanderReco ├── keras_starter_kaggle.py ├── multilabel_classification.py ├── readme.md └── santander_exploartion.ipynb ├── SpookyAuthor ├── readme.md └── simple_fe_notebook_spooky_author.ipynb ├── Titanic ├── Titanic_Exploration.ipynb └── readme.md ├── TransferLearningStackExchange ├── frequent_words_model.py ├── readme.md └── simple_exploration_notebook.ipynb ├── TwoSigmaConnect_RentHop ├── SimpleExplorationNotebook.ipynb ├── XGBStarterInPython.ipynb └── readme.md ├── TwoSigmaFinancialModeling ├── OverfittingCheck.ipynb ├── SimpleExplorationNotebook.ipynb ├── UnivariateAnalysis.ipynb └── readme.md └── Walmart_TripType ├── NeuralNets ├── config_v2.py ├── neural_net.py ├── prepData.py └── readme.md ├── XGB ├── config_v5.py ├── prepData.py ├── readme.md └── xgb_model.py └── readme.md /Avazu-CTR/finalModel.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Nov 21 18:20:23 2014 5 | 6 | Used interactions using this post 7 | http://www.kaggle.com/c/avazu-ctr-prediction/forums/t/11331/feature-vector-dimension 8 | """ 9 | 10 | from datetime import datetime 11 | from csv import DictReader 12 | from math import exp, log, sqrt 13 | 14 | 15 | # TL; DR, the main training process starts on line: 250, 16 | # you may want to start reading the code from there 17 | 18 | 19 | ############################################################################## 20 | # parameters ################################################################# 21 | ############################################################################## 22 | 23 | # A, paths 24 | data_path = "Path to data" 25 | train = data_path+'train.csv' # path to training file 26 | test = data_path+'test.csv' # path to testing file 27 | submission = 'submission.csv' # path of to be outputted submission file 28 | 29 | # B, model 30 | alpha = .08 # learning rate 31 | beta = 1. # smoothing parameter for adaptive learning rate 32 | L1 = 3. # L1 regularization, larger value means more regularized 33 | L2 = 1. 
# L2 regularization, larger value means more regularized 34 | 35 | # C, feature/hash trick 36 | D = 2 ** 25 # number of weights to use 37 | interaction = False # whether to enable poly2 feature interactions 38 | 39 | # D, training/validation 40 | epoch = 1 # learn training data for N passes 41 | holdafter = None # data after date N (exclusive) are used as validation 42 | holdout = None # use every N training instance for holdout validation 43 | 44 | 45 | ############################################################################## 46 | # class, function, generator definitions ##################################### 47 | ############################################################################## 48 | 49 | class ftrl_proximal(object): 50 | ''' Our main algorithm: Follow the regularized leader - proximal 51 | 52 | In short, 53 | this is an adaptive-learning-rate sparse logistic-regression with 54 | efficient L1-L2-regularization 55 | 56 | Reference: 57 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 58 | ''' 59 | 60 | def __init__(self, alpha, beta, L1, L2, D, interaction): 61 | # parameters 62 | self.alpha = alpha 63 | self.beta = beta 64 | self.L1 = L1 65 | self.L2 = L2 66 | 67 | # feature related parameters 68 | self.D = D 69 | self.interaction = interaction 70 | 71 | # model 72 | # n: squared sum of past gradients 73 | # z: weights 74 | # w: lazy weights 75 | self.n = [0.] * D 76 | self.z = [0.] * D 77 | self.w = {} 78 | 79 | def _indices(self, x): 80 | ''' A helper generator that yields the indices in x 81 | 82 | The purpose of this generator is to make the following 83 | code a bit cleaner when doing feature interaction. 84 | ''' 85 | 86 | # first yield index of the bias term 87 | yield 0 88 | 89 | # then yield the normal indices 90 | for index in x: 91 | yield index 92 | 93 | # now yield interactions (if applicable) 94 | if self.interaction: 95 | D = self.D 96 | L = len(x) 97 | 98 | x = sorted(x) 99 | for i in xrange(L): 100 | for j in xrange(i+1, L): 101 | # one-hot encode interactions with hash trick 102 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 103 | 104 | def predict(self, x): 105 | ''' Get probability estimation on x 106 | 107 | INPUT: 108 | x: features 109 | 110 | OUTPUT: 111 | probability of p(y = 1 | x; w) 112 | ''' 113 | 114 | # parameters 115 | alpha = self.alpha 116 | beta = self.beta 117 | L1 = self.L1 118 | L2 = self.L2 119 | 120 | # model 121 | n = self.n 122 | z = self.z 123 | w = {} 124 | 125 | # wTx is the inner product of w and x 126 | wTx = 0. 127 | for i in self._indices(x): 128 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 129 | 130 | # build w on the fly using z and n, hence the name - lazy weights 131 | # we are doing this at prediction instead of update time is because 132 | # this allows us for not storing the complete w 133 | if sign * z[i] <= L1: 134 | # w[i] vanishes due to L1 regularization 135 | w[i] = 0. 136 | else: 137 | # apply prediction time L1, L2 regularization to z and get w 138 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 139 | 140 | wTx += w[i] 141 | 142 | # cache the current w for update stage 143 | self.w = w 144 | 145 | # bounded sigmoid function, this is the probability estimation 146 | return 1. / (1. 
+ exp(-max(min(wTx, 35.), -35.))) 147 | 148 | def update(self, x, p, y): 149 | ''' Update model using x, p, y 150 | 151 | INPUT: 152 | x: feature, a list of indices 153 | p: click probability prediction of our model 154 | y: answer 155 | 156 | MODIFIES: 157 | self.n: increase by squared gradient 158 | self.z: weights 159 | ''' 160 | 161 | # parameter 162 | alpha = self.alpha 163 | 164 | # model 165 | n = self.n 166 | z = self.z 167 | w = self.w 168 | 169 | # gradient under logloss 170 | g = p - y 171 | 172 | # update z and n 173 | for i in self._indices(x): 174 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 175 | z[i] += g - sigma * w[i] 176 | n[i] += g * g 177 | 178 | 179 | def logloss(p, y): 180 | ''' FUNCTION: Bounded logloss 181 | 182 | INPUT: 183 | p: our prediction 184 | y: real answer 185 | 186 | OUTPUT: 187 | logarithmic loss of p given y 188 | ''' 189 | 190 | p = max(min(p, 1. - 10e-15), 10e-15) 191 | return -log(p) if y == 1. else -log(1. - p) 192 | 193 | 194 | def data(path, D): 195 | ''' GENERATOR: Apply hash-trick to the original csv row 196 | and for simplicity, we one-hot-encode everything 197 | 198 | INPUT: 199 | path: path to training or testing file 200 | D: the max index that we can hash to 201 | 202 | YIELDS: 203 | ID: id of the instance, mainly useless 204 | x: a list of hashed and one-hot-encoded 'indices' 205 | we only need the index since all values are either 0 or 1 206 | y: y = 1 if we have a click, else we have y = 0 207 | ''' 208 | 209 | for t, row in enumerate(DictReader(open(path))): 210 | # process id 211 | ID = row['id'] 212 | del row['id'] 213 | 214 | # process clicks 215 | y = 0. 216 | if 'click' in row: 217 | if row['click'] == '1': 218 | y = 1. 219 | del row['click'] 220 | 221 | # extract date 222 | date = int(row['hour'][4:6]) 223 | 224 | # turn hour really into hour, it was originally YYMMDDHH 225 | row['hour'] = row['hour'][6:] 226 | 227 | # creating two way feature interactions for some variables based on the feature explanations (not an ideal method though!) 
228 | row['C1_bannerpos'] = row['C1']+ "_" + row['banner_pos'] 229 | row['site_app_category'] = row['site_category'] + '_' +row['app_category'] 230 | row['site_domin_app_category'] = row['site_domain'] + '_' + row['app_category'] 231 | row['app_domain_site_category'] = row['app_domain'] + '_' + row['site_category'] 232 | row['banner_pos_site_id'] = row['banner_pos'] + '_' + row['site_id'] 233 | row['banner_pos_app_id'] = row['banner_pos'] + '_' + row['app_id'] 234 | row['banner_pos_device_model'] = row['banner_pos'] + '_' + row['device_model'] 235 | row['banner_pos_device_conn_type'] = row['banner_pos'] + '_' + row['device_conn_type'] 236 | row['site_id_device_model'] = row['site_id'] + '_' + row['device_model'] 237 | row['app_id_device_model'] = row['app_id'] + '_' + row['device_model'] 238 | row['site_id_app_id'] = row['site_id'] + '_' + row['app_id'] 239 | row['site_id_device_conn_type'] = row['site_id'] + '_' +row['device_conn_type'] 240 | row['app_id_device_conn_type'] = row['app_id'] + '_' +row['device_conn_type'] 241 | row['C14_C17'] = row['C14'] + '_' + row['C17'] 242 | row['C14_C20'] = row['C14'] + '_' + row['C20'] 243 | row['C15_C16'] = row['C15'] + '_' + row['C16'] 244 | row['C15_C18'] = row['C15'] + '_' + row['C18'] 245 | row['C17_C20'] = row['C17'] + '_' + row['C20'] 246 | row['C16_C18'] = row['C16'] + '_' + row['C18'] 247 | row['C19_C21'] = row['C19'] + '_' + row['C21'] 248 | row['C20_C21'] = row['C20'] + '_' + row['C21'] 249 | row['C20_site_id'] = row['C20'] + '_' + row['site_id'] 250 | row['C17_site_id'] = row['C17'] + '_' + row['site_id'] 251 | row['C20_app_id'] = row['C20'] + '_' + row['app_id'] 252 | row['C17_app_id'] = row['C17'] + '_' + row['app_id'] 253 | 254 | 255 | # build x 256 | x = [] 257 | for key in row: 258 | value = row[key] 259 | 260 | # one-hot encode everything with hash trick 261 | index = abs(hash(key + '_' + value)) % D 262 | x.append(index) 263 | 264 | yield t, date, ID, x, y 265 | 266 | 267 | ############################################################################## 268 | # start training ############################################################# 269 | ############################################################################## 270 | 271 | start = datetime.now() 272 | 273 | # initialize ourselves a learner 274 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 275 | 276 | # start training 277 | for e in xrange(epoch): 278 | loss = 0. 
279 | count = 0 280 | 281 | for t, date, ID, x, y in data(train, D): # data is a generator 282 | # t: just a instance counter 283 | # date: you know what this is 284 | # ID: id provided in original data 285 | # x: features 286 | # y: label (click) 287 | 288 | # step 1, get prediction from learner 289 | p = learner.predict(x) 290 | 291 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 292 | # step 2-1, calculate validation loss 293 | # we do not train with the validation data so that our 294 | # validation loss is an accurate estimation 295 | # 296 | # holdafter: train instances from day 1 to day N 297 | # validate with instances from day N + 1 and after 298 | # 299 | # holdout: validate with every N instance, train with others 300 | loss += logloss(p, y) 301 | count += 1 302 | else: 303 | # step 2-2, update learner with label (click) information 304 | learner.update(x, p, y) 305 | 306 | #print('Epoch %d finished, validation logloss: %f, elapsed time: %s' % ( 307 | # e, loss/count, str(datetime.now() - start))) 308 | 309 | 310 | ############################################################################## 311 | # start testing, and build Kaggle's submission file ########################## 312 | ############################################################################## 313 | 314 | with open(submission, 'w') as outfile: 315 | outfile.write('id,click\n') 316 | for t, date, ID, x, y in data(test, D): 317 | p = learner.predict(x) 318 | outfile.write('%s,%s\n' % (ID, str(p))) 319 | -------------------------------------------------------------------------------- /Avazu-CTR/readme.md: -------------------------------------------------------------------------------- 1 | This folder consists of the codes written for the [Avazu - Click Through Rate](https://www.kaggle.com/c/avazu-ctr-prediction) Kaggle competition. 2 | 3 | The approach followed here is Follow The Regularized Leader - Proximal (FTRL). It is an online learning algorithm and the algorithm can be read from [Google paper](http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf) and this [paper](http://people.csail.mit.edu/romer/papers/TISTRespPredAds.pdf) 4 | 5 | Thanks to Tintgru for the [base code of the algorithm](https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory) implemented in python 6 | 7 | Also from this [paper](http://quinonero.net/Publications/predicting-clicks-facebook.pdf), it is shown that feature interactions improved the performance in the similar problems. This idea is added to the base code to get better result. 
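As a rough, hedged illustration of that idea (a sketch, not code taken from `finalModel.py`), the snippet below shows how two categorical fields can be concatenated into a two-way interaction and hashed into a fixed-size index space; the field names and the hash-space size `D` here are illustrative assumptions.

```python
# Minimal sketch of hashed two-way feature interactions; field names and D are examples.
import hashlib

D = 2 ** 24  # size of the hashed feature space

def stable_hash(s):
    # md5 instead of the built-in hash() so indices are reproducible across runs
    return int(hashlib.md5(s.encode("utf-8")).hexdigest(), 16) % D

def hashed_indices(row, interaction_pairs):
    """One-hot indices for the raw fields plus the selected two-way interactions."""
    indices = [stable_hash(key + "_" + value) for key, value in row.items()]
    for a, b in interaction_pairs:
        indices.append(stable_hash(a + "_" + b + "_" + row[a] + "_" + row[b]))
    return indices

row = {"banner_pos": "1", "site_category": "28905ebd", "device_conn_type": "0"}
print(hashed_indices(row, [("banner_pos", "site_category")]))
```

The script in this folder takes the same route inside its `data()` generator: the chosen interactions are added as extra string columns, and every key-value pair is then one-hot encoded with `abs(hash(key + '_' + value)) % D`.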
8 | -------------------------------------------------------------------------------- /BNP/bayesian_encode_fivelevel_withint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | import operator 5 | from sklearn import preprocessing 6 | from sklearn.cross_validation import KFold 7 | from sklearn import ensemble 8 | from sklearn.metrics import roc_auc_score,log_loss 9 | import xgboost as xgb 10 | 11 | 12 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 13 | grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count') 14 | grouped_df.columns = [var_name, "var_count"] 15 | 16 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 17 | merged_df.fillna(-1, inplace=True) 18 | return list(merged_df["var_count"]) 19 | 20 | def create_feature_map(features): 21 | outfile = open('xgb.fmap', 'w') 22 | for i, feat in enumerate(features): 23 | outfile.write('{0}\t{1}\tq\n'.format(i,feat)) 24 | outfile.close() 25 | 26 | 27 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="target", min_cutoff=5): 28 | grouped_df = target_df.groupby(var_name, as_index=False)["target"].agg(["mean", "count"]) 29 | grouped_df.columns = ["target_mean", "count_var"] 30 | grouped_df.reset_index(level=var_name, inplace=True) 31 | grouped_df["count_var"][grouped_df["count_var"]<min_cutoff] = 0 32 | grouped_df["count_var"][grouped_df["count_var"]>=min_cutoff] = 1 33 | grouped_df["target_mean"] = grouped_df["target_mean"] * grouped_df["count_var"] 34 | 35 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 36 | merged_df.fillna(-1, inplace=True) 37 | return list(merged_df["target_mean"]) 38 | 39 | 40 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None): 41 | params = {} 42 | params["objective"] = "binary:logistic" 43 | params['eval_metric'] = 'logloss' 44 | params["eta"] = 0.02 45 | params["min_child_weight"] = 1 46 | params["subsample"] = 0.85 47 | params["colsample_bytree"] = 0.3 48 | params["silent"] = 1 49 | params["max_depth"] = 10 50 | params["seed"] = 232345 51 | #params["gamma"] = 0.5 52 | num_rounds = 600 53 | 54 | plst = list(params.items()) 55 | xgtrain = xgb.DMatrix(train_X, label=train_y) 56 | 57 | if test_y is not None: 58 | xgtest = xgb.DMatrix(test_X, label=test_y) 59 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 60 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=1000) 61 | else: 62 | xgtest = xgb.DMatrix(test_X) 63 | model = xgb.train(plst, xgtrain, num_rounds) 64 | 65 | if feature_names: 66 | create_feature_map(feature_names) 67 | importance = model.get_fscore(fmap='xgb.fmap') 68 | importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True) 69 | imp_df = pd.DataFrame(importance, columns=['feature','fscore']) 70 | imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum() 71 | imp_df.to_csv("imp_feat.txt", index=False) 72 | 73 | pred_test_y = model.predict(xgtest) 74 | 75 | if test_y is not None: 76 | loss = log_loss(test_y, pred_test_y) 77 | print loss 78 | 79 | return pred_test_y, loss 80 | else: 81 | return pred_test_y 82 | 83 | def prepData(var4_col="v52"): 84 | import datetime 85 | start_time = datetime.datetime.now() 86 | print "Start time : ", start_time 87 | 88 | print "Reading files.." 89 | train = pd.read_csv('../Data/train.csv') 90 | test = pd.read_csv('../Data/test.csv') 91 | print train.shape, test.shape 92 | 93 | print "Filling NA.."
94 | train = train.fillna(-1) 95 | test = test.fillna(-1) 96 | 97 | print "Label encoding.." 98 | cat_columns = ["v129", "v72", "v62", "v38"] 99 | for f in train.columns: 100 | if train[f].dtype=='object': 101 | print(f), len(np.unique(train[f].values)) 102 | #if f != 'v22': 103 | cat_columns.append(f) 104 | lbl = preprocessing.LabelEncoder() 105 | lbl.fit(list(train[f].values) + list(test[f].values)) 106 | train[f] = lbl.transform(list(train[f].values)) 107 | test[f] = lbl.transform(list(test[f].values)) 108 | new_train = pd.concat([ train[['v1',f]], test[['v1',f]] ]) 109 | train["CountVar_"+str(f)] = getCountVar(train[['v1',f]], new_train[['v1', f]], f) 110 | test["CountVar_"+str(f)] = getCountVar(test[['v1',f]], new_train[['v1',f]], f) 111 | cat_columns_copy = cat_columns[:] 112 | 113 | 114 | print "Encoding train...." 115 | for f in cat_columns: 116 | print f 117 | val_list = np.zeros(train.shape[0]) 118 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 119 | for fold_index in xrange(1,6): 120 | dev_index = np.where(folds_array != fold_index)[0] 121 | val_index = np.where(folds_array == fold_index)[0] 122 | new_train = train[["v1", f, "target"]] 123 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 124 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=0 )) 125 | val_list[val_index] = enc_list 126 | train["DVEncode_"+str(f)] = val_list 127 | 128 | 129 | print "Encoding test.." 130 | for f in cat_columns: 131 | print f 132 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=0) 133 | 134 | 135 | 136 | print "Two way encoding.." 137 | other_cols=[] 138 | new_var_list = [] 139 | cat_columns = [col for col in cat_columns_copy] 140 | for ind, var1 in enumerate(cat_columns): 141 | rem_cols = cat_columns[ind+1:] 142 | #if var1 in "v30": 143 | # break 144 | for var2 in rem_cols: 145 | print var1, var2 146 | new_var = var1+"_"+var2 147 | 148 | train[new_var] = train[var1].astype("str") +"_" + train[var2].astype("str") 149 | test[new_var] = test[var1].astype("str") + "_" + test[var2].astype("str") 150 | #print train[new_var][:10] 151 | #print test[new_var][:10] 152 | 153 | lbl = preprocessing.LabelEncoder() 154 | lbl.fit(list(train[new_var].values) + list(test[new_var].values)) 155 | train[new_var] = lbl.transform(list(train[new_var].values)) 156 | test[new_var] = lbl.transform(list(test[new_var].values)) 157 | 158 | new_train = pd.concat([ train[['v1',new_var]], test[['v1',new_var]] ]) 159 | test["Count_"+new_var] = getCountVar(test[['v1',new_var]], new_train[['v1', new_var]], new_var) 160 | train["Count_"+new_var] = getCountVar(train[['v1',new_var]], new_train[['v1', new_var]], new_var) 161 | new_var_list.append(new_var) 162 | 163 | print "Train.." 164 | for f in new_var_list: 165 | print f 166 | val_list = np.zeros(train.shape[0]) 167 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 168 | for fold_index in xrange(1,6): 169 | dev_index = np.where(folds_array != fold_index)[0] 170 | val_index = np.where(folds_array == fold_index)[0] 171 | new_train = train[["v1", f, "target"]] 172 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 173 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=0 ) ) 174 | val_list[val_index] = enc_list 175 | train["DVEncode_"+str(f)] = val_list 176 | 177 | print "Test.." 
178 | for f in new_var_list: 179 | print f 180 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=0) 181 | train = train.drop(new_var_list, axis=1) 182 | test = test.drop(new_var_list, axis=1) 183 | 184 | 185 | 186 | 187 | 188 | print "Three way encoding.." 189 | other_cols=[] 190 | new_var_list = [] 191 | var3 = "v22" 192 | cat_columns = [col for col in cat_columns_copy if col!= var3] 193 | for ind, var1 in enumerate(cat_columns): 194 | rem_cols = cat_columns[ind+1:] 195 | #if var1 in "v30": 196 | # break 197 | for var2 in rem_cols: 198 | print var1, var2 199 | new_var = var1+"_"+var2+"_"+var3 200 | 201 | train[new_var] = train[var1].astype("str") +"_" + train[var2].astype("str") +"_" + train[var3].astype("str") 202 | test[new_var] = test[var1].astype("str") + "_" + test[var2].astype("str") + "_" + test[var3].astype("str") 203 | #print train[new_var][:10] 204 | #print test[new_var][:10] 205 | 206 | lbl = preprocessing.LabelEncoder() 207 | lbl.fit(list(train[new_var].values) + list(test[new_var].values)) 208 | train[new_var] = lbl.transform(list(train[new_var].values)) 209 | test[new_var] = lbl.transform(list(test[new_var].values)) 210 | 211 | new_train = pd.concat([ train[['v1',new_var]], test[['v1',new_var]] ]) 212 | test["Count_"+new_var] = getCountVar(test[['v1',new_var]], new_train[['v1', new_var]], new_var) 213 | train["Count_"+new_var] = getCountVar(train[['v1',new_var]], new_train[['v1', new_var]], new_var) 214 | new_var_list.append(new_var) 215 | 216 | print "Train.." 217 | for f in new_var_list: 218 | print f 219 | val_list = np.zeros(train.shape[0]) 220 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 221 | for fold_index in xrange(1,6): 222 | dev_index = np.where(folds_array != fold_index)[0] 223 | val_index = np.where(folds_array == fold_index)[0] 224 | new_train = train[["v1", f, "target"]] 225 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 226 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=0) ) 227 | val_list[val_index] = enc_list 228 | train["DVEncode_"+str(f)] = val_list 229 | 230 | print "Test.." 231 | for f in new_var_list: 232 | print f 233 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=0) 234 | train = train.drop(new_var_list, axis=1) 235 | test = test.drop(new_var_list, axis=1) 236 | 237 | 238 | 239 | 240 | 241 | 242 | print "Four way encoding.." 
243 | other_cols=[] 244 | new_var_list = [] 245 | for var4_col in ["v52", "v66", "v24", "v56", "v125", "v30"]: 246 | var1_cols = ["v22"] 247 | var4 = var4_col 248 | other_cols.append(var4) 249 | cat_columns = [col for col in cat_columns_copy if col not in var1_cols if col not in other_cols] 250 | for var1 in var1_cols: 251 | for ind, var2 in enumerate(cat_columns): 252 | rem_cols = cat_columns[ind+1:] 253 | #if var1 in "v30": 254 | # break 255 | for var3 in rem_cols: 256 | print var1, var4, var2, var3 257 | new_var = var1+"_"+var4+"_"+var2+"_"+var3 258 | 259 | train[new_var] = train[var1].astype("str") +"_" + train[var2].astype("str") + "_"+ train[var3].astype("str") + "_" +train[var4].astype("str") 260 | test[new_var] = test[var1].astype("str") + "_" + test[var2].astype("str") + "_" +test[var3].astype("str") + "_" + test[var4].astype("str") 261 | #print train[new_var][:10] 262 | #print test[new_var][:10] 263 | 264 | lbl = preprocessing.LabelEncoder() 265 | lbl.fit(list(train[new_var].values) + list(test[new_var].values)) 266 | train[new_var] = lbl.transform(list(train[new_var].values)) 267 | test[new_var] = lbl.transform(list(test[new_var].values)) 268 | 269 | new_train = pd.concat([ train[['v1',new_var]], test[['v1',new_var]] ]) 270 | test["Count_"+new_var] = getCountVar(test[['v1',new_var]], new_train[['v1', new_var]], new_var) 271 | train["Count_"+new_var] = getCountVar(train[['v1',new_var]], new_train[['v1', new_var]], new_var) 272 | new_var_list.append(new_var) 273 | 274 | print "Train.." 275 | for f in new_var_list: 276 | print f 277 | val_list = np.zeros(train.shape[0]) 278 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 279 | for fold_index in xrange(1,6): 280 | dev_index = np.where(folds_array != fold_index)[0] 281 | val_index = np.where(folds_array == fold_index)[0] 282 | new_train = train[["v1", f, "target"]] 283 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 284 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=2) ) 285 | val_list[val_index] = enc_list 286 | train["DVEncode_"+str(f)] = val_list 287 | 288 | print "Test.." 289 | for f in new_var_list: 290 | print f 291 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=2) 292 | train = train.drop(new_var_list, axis=1) 293 | test = test.drop(new_var_list, axis=1) 294 | 295 | 296 | print "Five way encoding.." 
297 | new_var_list = [] 298 | for var4_col, var5_col in [["v52", "v66"], ["v24", "v56"], ["v125", "v30"], ["v52","v56"], ["v71", "v91"], ["v112","v113"]]: 299 | var1_cols = ["v22"] 300 | var4 = var4_col 301 | var5 = var5_col 302 | cat_columns = [col for col in cat_columns_copy if col not in var1_cols if col != var4 if col!=var5] 303 | for var1 in var1_cols: 304 | for ind, var2 in enumerate(cat_columns): 305 | rem_cols = cat_columns[ind+1:] 306 | #if var1 in "v30": 307 | # break 308 | for var3 in rem_cols: 309 | print var1, var4, var5, var2, var3 310 | new_var = var1+"_"+var4+"_"+var5+"_"+var2+"_"+var3 311 | 312 | train[new_var] = train[var1].astype("str") + "_"+ train[var2].astype("str") + "_"+train[var3].astype("str") + "_"+ train[var4].astype("str") + "_"+ train[var5].astype("str") 313 | test[new_var] = test[var1].astype("str") + "_"+ test[var2].astype("str") + "_"+ test[var3].astype("str") + "_"+ test[var4].astype("str") + "_"+ test[var5].astype("str") 314 | 315 | lbl = preprocessing.LabelEncoder() 316 | lbl.fit(list(train[new_var].values) + list(test[new_var].values)) 317 | train[new_var] = lbl.transform(list(train[new_var].values)) 318 | test[new_var] = lbl.transform(list(test[new_var].values)) 319 | 320 | new_train = pd.concat([ train[['v1',new_var]], test[['v1',new_var]] ]) 321 | test["Count_"+new_var] = getCountVar(test[['v1',new_var]], new_train[['v1', new_var]], new_var) 322 | train["Count_"+new_var] = getCountVar(train[['v1',new_var]], new_train[['v1', new_var]], new_var) 323 | new_var_list.append(new_var) 324 | 325 | print "Train.." 326 | for f in new_var_list: 327 | print f 328 | val_list = np.zeros(train.shape[0]) 329 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 330 | for fold_index in xrange(1,6): 331 | dev_index = np.where(folds_array != fold_index)[0] 332 | val_index = np.where(folds_array == fold_index)[0] 333 | new_train = train[["v1", f, "target"]] 334 | dev, val = new_train.iloc[dev_index,:], new_train.iloc[val_index,:] 335 | enc_list = np.array( getDVEncodeVar(val[["v1", f]], dev[["v1", f, "target"]], f, min_cutoff=2) ) 336 | val_list[val_index] = enc_list 337 | train["DVEncode_"+str(f)] = val_list 338 | 339 | print "Test.." 
340 | for f in new_var_list: 341 | print f 342 | test["DVEncode_"+str(f)] = getDVEncodeVar(test[["v1", f]], train[["v1", f, "target"]], f, min_cutoff=2) 343 | 344 | 345 | 346 | 347 | 348 | 349 | train = train.drop(new_var_list, axis=1) 350 | test = test.drop(new_var_list, axis=1) 351 | train.to_csv("train_5levelenc_withint.csv", index=False) 352 | test.to_csv("test_5levelenc_withint.csv", index=False) 353 | 354 | end_time = datetime.datetime.now() 355 | print "End time : ",end_time 356 | 357 | print end_time - start_time 358 | 359 | def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0): 360 | clf = ensemble.ExtraTreesClassifier( 361 | n_estimators = n_est_val, 362 | max_depth = depth_val, 363 | min_samples_split = split_val, 364 | min_samples_leaf = leaf_val, 365 | max_features = feat_val, 366 | criterion='entropy', 367 | n_jobs = jobs_val, 368 | random_state = random_state_val) 369 | clf.fit(train_X, train_y) 370 | pred_train_y = clf.predict_proba(train_X)[:,1] 371 | pred_test_y = clf.predict_proba(test_X)[:,1] 372 | 373 | if validation: 374 | train_loss = log_loss(train_y, pred_train_y) 375 | loss = log_loss(test_y, pred_test_y) 376 | print "Train, Test loss : ", train_loss, loss 377 | return pred_test_y, loss 378 | else: 379 | return pred_test_y 380 | 381 | 382 | def prepModel(var4_col="v52"): 383 | print "Reading files.." 384 | train = pd.read_csv('./train_5levelenc_withint.csv') 385 | test = pd.read_csv('./test_5levelenc_withint.csv') 386 | print train.shape, test.shape 387 | 388 | print "Getting DV and ID.." 389 | train_y = train.target.values 390 | train_ID = train.ID.values 391 | test_ID = test.ID.values 392 | train = train.drop(['ID', "target"], axis=1) 393 | test = test.drop(['ID'], axis=1) 394 | 395 | print "Filling NA.." 396 | train = train.fillna(-1) 397 | test = test.fillna(-1) 398 | 399 | feat_names = list(train.columns) 400 | print "Converting to array.." 401 | train = np.array(train) 402 | test = np.array(test) 403 | print train.shape, test.shape 404 | 405 | assert train.shape[1] == test.shape[1] 406 | print "Cross validating.." 407 | cv_scores = [] 408 | train_preds = np.zeros(train.shape[0]) 409 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 410 | for fold_index in xrange(1,6): 411 | dev_index = np.where(folds_array != fold_index)[0] 412 | val_index = np.where(folds_array == fold_index)[0] 413 | dev_X, val_X = train[dev_index,:], train[val_index,:] 414 | dev_y, val_y = train_y[dev_index], train_y[val_index] 415 | 416 | #preds, loss = runXGB(dev_X, dev_y, val_X, val_y, feature_names=feat_names) 417 | #for feat in [60, 100, 150]: 418 | preds, loss = runET(dev_X, dev_y, val_X, val_y, validation=1, n_est_val=500, depth_val=40, split_val=4, leaf_val=2, feat_val=180, jobs_val=4, random_state_val=98751) 419 | #print feat, loss 420 | cv_scores.append(loss) 421 | print cv_scores 422 | train_preds[val_index] = preds 423 | print cv_scores, np.mean(cv_scores) 424 | 425 | out_df = pd.DataFrame({"ID":train_ID}) 426 | out_df["et1_srk_5levelenc_withint"] = train_preds 427 | out_df.to_csv("prval_et1_srk_5levelenc_withint.csv", index=False) 428 | 429 | print "Final model.." 
430 | preds = runET(train, train_y, test, validation=0, n_est_val=500, depth_val=40, split_val=4, leaf_val=2, feat_val=180, jobs_val=4, random_state_val=98751) 431 | out_df = pd.DataFrame({"ID":test_ID}) 432 | out_df["et1_srk_5levelenc_withint"] = preds 433 | out_df.to_csv("prfull_et1_srk_5levelenc_withint.csv", index=False) 434 | 435 | 436 | 437 | def prepModelXGB(var4_col="v52"): 438 | print "Reading files.." 439 | train = pd.read_csv('./train_5levelenc_withint.csv') 440 | print train.shape 441 | 442 | print "Getting DV and ID.." 443 | train_y = train.target.values 444 | train_ID = train.ID.values 445 | train = train.drop(['ID', "target"], axis=1) 446 | 447 | print "Filling NA.." 448 | train = train.fillna(-1) 449 | 450 | feat_names = list(train.columns) 451 | print "Converting to array.." 452 | train = np.array(train) 453 | print train.shape 454 | 455 | 456 | print "Cross validating.." 457 | cv_scores = [] 458 | train_preds = np.zeros(train.shape[0]) 459 | folds_array = np.array( pd.read_csv("./xfolds.csv")["fold5"] ) 460 | for fold_index in xrange(1,6): 461 | dev_index = np.where(folds_array != fold_index)[0] 462 | val_index = np.where(folds_array == fold_index)[0] 463 | dev_X, val_X = train[dev_index,:], train[val_index,:] 464 | dev_y, val_y = train_y[dev_index], train_y[val_index] 465 | 466 | preds, loss = runXGB(dev_X, dev_y, val_X, val_y, feature_names=feat_names) 467 | #for feat in [60, 100, 150]: 468 | #preds, loss = runET(dev_X, dev_y, val_X, val_y, validation=1, n_est_val=600, depth_val=40, split_val=4, leaf_val=2, feat_val=180, jobs_val=4, random_state_val=8111) 469 | #print feat, loss 470 | cv_scores.append(loss) 471 | print cv_scores 472 | train_preds[val_index] = preds 473 | print cv_scores, np.mean(cv_scores) 474 | 475 | out_df = pd.DataFrame({"ID":train_ID}) 476 | out_df["xg1_srk_5levelenc_withint"] = train_preds 477 | out_df.to_csv("prval_xg1_srk_5levelenc_withint.csv", index=False) 478 | 479 | import gc 480 | del dev_X 481 | del val_X 482 | gc.collect() 483 | 484 | 485 | print "Final model.." 
486 | test = pd.read_csv('./test_5levelenc_withint.csv') 487 | print train.shape, test.shape 488 | test_ID = test.ID.values 489 | test = test.drop(['ID'], axis=1) 490 | test = test.fillna(-1) 491 | test = np.array(test) 492 | print train.shape, test.shape 493 | 494 | assert train.shape[1] == test.shape[1] 495 | #preds = runET(train, train_y, test, validation=0, n_est_val=600, depth_val=40, split_val=4, leaf_val=2, feat_val=180, jobs_val=4, random_state_val=8111) 496 | preds = runXGB(train, train_y, test, feature_names=feat_names) 497 | out_df = pd.DataFrame({"ID":test_ID}) 498 | out_df["xg1_srk_5levelenc_withint"] = preds 499 | out_df.to_csv("prfull_xg1_srk_5levelenc_withint.csv", index=False) 500 | 501 | 502 | if __name__ == "__main__": 503 | #for var4_col_name in ["v52", "v66", "v24", "v56", "v125", "v30"]: 504 | for var4_col_name in ["v30"]: 505 | try: 506 | prepData(var4_col_name) 507 | prepModelXGB(var4_col_name) 508 | prepModel(var4_col_name) 509 | except Exception,e: 510 | print e 511 | pass 512 | -------------------------------------------------------------------------------- /BNP/readme.md: -------------------------------------------------------------------------------- 1 | Code for Kaggle - BNP competition where we finished in 12th position 2 | -------------------------------------------------------------------------------- /Datasets-Indian_Premier_League/readme.md: -------------------------------------------------------------------------------- 1 | Ipython notebooks created for exploring the [Indian Premier League dataset](https://www.kaggle.com/manasgarg/ipl) present in Kaggle is present in this folder. 2 | 3 | File explanations: 4 | 5 | 1. IPL_Exploration.ipynb - Notebook which has some exploratory analysis on the IPL data 6 | 2. IPL_Win_Prediction.ipynb - Notebook to predict the win probability of the given team at the end of each over 7 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getBookings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to get the bookings from the train file 3 | __author__ : SRK 4 | """ 5 | import csv 6 | 7 | train_file_handle = open("../../Data/train.csv") 8 | train_out_file_handle = open("../../Data/train_bookings.csv","w") 9 | 10 | reader = csv.reader(train_file_handle) 11 | writer = csv.writer(train_out_file_handle) 12 | 13 | header = reader.next() 14 | writer.writerow(["id"] + header) 15 | 16 | is_booking_index = header.index("is_booking") 17 | print "Booking index is : ", is_booking_index 18 | 19 | total_count = 0 20 | count = 0 21 | for row in reader: 22 | if row[is_booking_index] == "1": 23 | writer.writerow([total_count] + row) 24 | count += 1 25 | total_count += 1 26 | if total_count % 100000 == 0: 27 | print total_count, count 28 | 29 | print "Total count : ", total_count 30 | print "Booking count : ", count 31 | 32 | train_file_handle.close() 33 | train_out_file_handle.close() 34 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getClicks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to get the bookings from the train file 3 | __author__ : SRK 4 | """ 5 | import csv 6 | 7 | train_file_handle = open("../../Data/train.csv") 8 | train_out_file_handle = open("../../Data/train_clicks.csv","w") 9 | 10 | reader = csv.reader(train_file_handle) 11 | writer = csv.writer(train_out_file_handle) 12 | 13 | header = reader.next() 14 | 
writer.writerow(["id"] +header) 15 | 16 | is_booking_index = header.index("is_booking") 17 | print "Booking index is : ", is_booking_index 18 | 19 | total_count = 0 20 | count = 0 21 | for row in reader: 22 | if row[is_booking_index] == "0": 23 | writer.writerow([total_count]+row) 24 | count += 1 25 | total_count += 1 26 | if total_count % 100000 == 0: 27 | print total_count, count 28 | 29 | print "Total count : ", total_count 30 | print "Booking count : ", count 31 | 32 | train_file_handle.close() 33 | train_out_file_handle.close() 34 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getLeakFree.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datetime import datetime 3 | 4 | with open("../../Data/val.csv") as train_file: 5 | with open("../../Data/val_leak_preds.csv") as leak_file: 6 | reader = csv.reader(train_file) 7 | leak_reader = csv.DictReader(leak_file) 8 | 9 | out_file = open("../../Data/val_woleak.csv","w") 10 | out_writer = csv.writer(out_file) 11 | out_file2 = open("../../Data/val_withleak.csv","w") 12 | out_writer2 = csv.writer(out_file2) 13 | 14 | header = reader.next() 15 | out_writer.writerow(header) 16 | 17 | leak_count = 0 18 | for index, row in enumerate(reader): 19 | leak_row = leak_reader.next() 20 | if leak_row["hotel_cluster"] == "": 21 | out_writer.writerow(row) 22 | else: 23 | out_writer2.writerow(row) 24 | leak_count +=1 25 | print "Leak count is : ", leak_count 26 | 27 | out_file.close() 28 | 29 | 30 | with open("../../Data/test.csv") as train_file: 31 | with open("../../Data/test_leak_preds.csv") as leak_file: 32 | reader = csv.reader(train_file) 33 | leak_reader = csv.DictReader(leak_file) 34 | 35 | out_file = open("../../Data/test_woleak.csv","w") 36 | out_writer = csv.writer(out_file) 37 | out_file2 = open("../../Data/test_withleak.csv","w") 38 | out_writer2 = csv.writer(out_file2) 39 | 40 | header = reader.next() 41 | out_writer.writerow(header) 42 | 43 | leak_count = 0 44 | for index, row in enumerate(reader): 45 | leak_row = leak_reader.next() 46 | if leak_row["hotel_cluster"] == "": 47 | out_writer.writerow(row) 48 | else: 49 | out_writer2.writerow(row) 50 | leak_count +=1 51 | print "Leak count is : ", leak_count 52 | 53 | out_file.close() 54 | 55 | 56 | # get only the bookings from the validation sample # 57 | train_file_handle = open("../../Data/val_woleak.csv") 58 | train_out_file_handle = open("../../Data/val_bookings_woleak.csv","w") 59 | 60 | reader = csv.reader(train_file_handle) 61 | writer = csv.writer(train_out_file_handle) 62 | 63 | header = reader.next() 64 | writer.writerow(header) 65 | 66 | is_booking_index = header.index("is_booking") 67 | print "Booking index is : ", is_booking_index 68 | 69 | total_count = 0 70 | count = 0 71 | for row in reader: 72 | if row[is_booking_index] == "1": 73 | writer.writerow(row) 74 | count += 1 75 | total_count += 1 76 | if total_count % 100000 == 0: 77 | print total_count, count 78 | 79 | print "Total count : ", total_count 80 | print "Booking count : ", count 81 | 82 | train_file_handle.close() 83 | train_out_file_handle.close() 84 | 85 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getLeakRows_test.py: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | from collections import defaultdict 3 | from datetime import datetime 4 | 5 | start = datetime.now() 6 | 7 | def get_top5(d): 8 
| return sorted(d, key=d.get, reverse=True)[:5] 9 | 10 | destination_clusters = defaultdict(lambda: defaultdict(int)) 11 | destination_clusters2 = defaultdict(lambda: defaultdict(int)) 12 | destination_clusters3 = defaultdict(lambda: defaultdict(int)) 13 | destination_clusters4 = defaultdict(lambda: defaultdict(int)) 14 | 15 | print "Reading the train.." 16 | for i, row in enumerate(DictReader(open("../../Data/train.csv"))): 17 | key = row["user_location_country"] + "_" + row["user_location_region"] + "_" + row["user_location_city"] + "_" + row["hotel_market"] + "_"+ row["orig_destination_distance"] 18 | #key2 = row["user_id"] + "_" + row["srch_destination_id"] 19 | #key3 = row["srch_destination_id"] + "_" + row["hotel_market"] 20 | #key4 = row["hotel_market"] 21 | destination_clusters[key][row["hotel_cluster"]] += 1 22 | #destination_clusters2[key2][row["hotel_cluster"]] += 1 23 | #destination_clusters3[key3][row["hotel_cluster"]] += 1 24 | #destination_clusters4[key4][row["hotel_cluster"]] += 1 25 | if i % 1000000 == 0: 26 | print("%s\t%s"%(i, datetime.now() - start)) 27 | 28 | most_frequent = defaultdict(str) 29 | most_frequent2 = defaultdict(str) 30 | most_frequent3 = defaultdict(str) 31 | most_frequent4 = defaultdict(str) 32 | 33 | print "Getting top 5 list.." 34 | for k in destination_clusters: 35 | top5_list = get_top5(destination_clusters[k]) 36 | most_frequent[k] = top5_list[:] 37 | del destination_clusters 38 | import gc 39 | gc.collect() 40 | 41 | #for k in destination_clusters2: 42 | # top5_list = get_top5(destination_clusters2[k]) 43 | # most_frequent2[k] = top5_list[:] 44 | #del destination_clusters2 45 | #gc.collect() 46 | # 47 | #for k in destination_clusters3: 48 | # top5_list = get_top5(destination_clusters3[k]) 49 | # most_frequent3[k] = top5_list[:] 50 | #del destination_clusters3 51 | #gc.collect() 52 | # 53 | #for k in destination_clusters4: 54 | # top5_list = get_top5(destination_clusters4[k]) 55 | # most_frequent4[k] = top5_list[:] 56 | #del destination_clusters4 57 | #gc.collect() 58 | 59 | 60 | 61 | 62 | 63 | 64 | print "Predicting on test.." 
65 | with open("../../Data/test_leak_preds.csv", "w") as outfile: 66 | outfile.write("id,hotel_cluster\n") 67 | for i, row in enumerate(DictReader(open("../../Data/test.csv"))): 68 | key = row["user_location_country"] + "_" + row["user_location_region"] + "_" + row["user_location_city"] + "_" + row["hotel_market"] + "_"+ row["orig_destination_distance"] 69 | #key2 = row["user_id"] + "_" + row["srch_destination_id"] 70 | #key3 = row["srch_destination_id"] + "_" + row["hotel_market"] 71 | #key4 = row["hotel_market"] 72 | 73 | if row["orig_destination_distance"] == "": 74 | top5_list = [] 75 | else: 76 | top5_list = most_frequent[key][:] 77 | if isinstance(top5_list, str): 78 | top5_list = [] 79 | 80 | 81 | #if len(top5_list) < 5: 82 | # temp_top5_list = most_frequent2.get(key2,[]) 83 | # for v in temp_top5_list: 84 | # if v not in top5_list: 85 | # top5_list.append(v) 86 | # if len(top5_list) == 5: 87 | # break 88 | 89 | #if len(top5_list) < 5: 90 | # temp_top5_list = most_frequent3[key3] 91 | # for v in temp_top5_list: 92 | # if v not in top5_list: 93 | # top5_list.append(v) 94 | # if len(top5_list) == 5: 95 | # break 96 | 97 | #if len(top5_list) < 5: 98 | # temp_top5_list = most_frequent4[key4] 99 | # for v in temp_top5_list: 100 | # if v not in top5_list: 101 | # top5_list.append(v) 102 | # if len(top5_list) == 5: 103 | # break 104 | 105 | top5_clusters = " ".join(top5_list) 106 | 107 | outfile.write("%d,%s\n"%(i,top5_clusters)) 108 | if i % 1000000 == 0: 109 | print("%s\t%s"%(i, datetime.now() - start)) 110 | del most_frequent 111 | del most_frequent2 112 | del most_frequent3 113 | del most_frequent4 114 | gc.collect() 115 | 116 | -------------------------------------------------------------------------------- /Expedia/DataPrep/getLeakRows_val.py: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | from collections import defaultdict 3 | from datetime import datetime 4 | 5 | start = datetime.now() 6 | 7 | def get_top5(d): 8 | return sorted(d, key=d.get, reverse=True)[:5] 9 | 10 | destination_clusters = defaultdict(lambda: defaultdict(int)) 11 | destination_clusters2 = defaultdict(lambda: defaultdict(int)) 12 | destination_clusters3 = defaultdict(lambda: defaultdict(int)) 13 | destination_clusters4 = defaultdict(lambda: defaultdict(int)) 14 | 15 | print "Reading the train.." 16 | for i, row in enumerate(DictReader(open("../../Data/dev.csv"))): 17 | key = row["user_location_country"] + "_" + row["user_location_region"] + "_" + row["user_location_city"] + "_" + row["hotel_market"] + "_"+ row["orig_destination_distance"] 18 | #key2 = row["user_id"] + "_" + row["srch_destination_id"] 19 | #key3 = row["srch_destination_id"] + "_" + row["hotel_market"] 20 | #key4 = row["hotel_market"] 21 | destination_clusters[key][row["hotel_cluster"]] += 1 22 | #destination_clusters2[key2][row["hotel_cluster"]] += 1 23 | #destination_clusters3[key3][row["hotel_cluster"]] += 1 24 | #destination_clusters4[key4][row["hotel_cluster"]] += 1 25 | if i % 1000000 == 0: 26 | print("%s\t%s"%(i, datetime.now() - start)) 27 | 28 | most_frequent = defaultdict(str) 29 | most_frequent2 = defaultdict(str) 30 | most_frequent3 = defaultdict(str) 31 | most_frequent4 = defaultdict(str) 32 | 33 | print "Getting top 5 list.." 
34 | for k in destination_clusters: 35 | top5_list = get_top5(destination_clusters[k]) 36 | most_frequent[k] = top5_list[:] 37 | del destination_clusters 38 | import gc 39 | gc.collect() 40 | 41 | #for k in destination_clusters2: 42 | # top5_list = get_top5(destination_clusters2[k]) 43 | # most_frequent2[k] = top5_list[:] 44 | #del destination_clusters2 45 | #gc.collect() 46 | # 47 | #for k in destination_clusters3: 48 | # top5_list = get_top5(destination_clusters3[k]) 49 | # most_frequent3[k] = top5_list[:] 50 | #del destination_clusters3 51 | #gc.collect() 52 | # 53 | #for k in destination_clusters4: 54 | # top5_list = get_top5(destination_clusters4[k]) 55 | # most_frequent4[k] = top5_list[:] 56 | #del destination_clusters4 57 | #gc.collect() 58 | 59 | 60 | 61 | 62 | 63 | 64 | print "Predicting on test.." 65 | with open("../../Data/val_leak_preds.csv", "w") as outfile: 66 | outfile.write("id,hotel_cluster\n") 67 | for i, row in enumerate(DictReader(open("../../Data/val.csv"))): 68 | key = row["user_location_country"] + "_" + row["user_location_region"] + "_" + row["user_location_city"] + "_" + row["hotel_market"] + "_"+ row["orig_destination_distance"] 69 | #key2 = row["user_id"] + "_" + row["srch_destination_id"] 70 | #key3 = row["srch_destination_id"] + "_" + row["hotel_market"] 71 | #key4 = row["hotel_market"] 72 | 73 | if row["orig_destination_distance"] == "": 74 | top5_list = [] 75 | else: 76 | top5_list = most_frequent[key][:] 77 | if isinstance(top5_list, str): 78 | top5_list = [] 79 | 80 | 81 | #if len(top5_list) < 5: 82 | # temp_top5_list = most_frequent2.get(key2,[]) 83 | # for v in temp_top5_list: 84 | # if v not in top5_list: 85 | # top5_list.append(v) 86 | # if len(top5_list) == 5: 87 | # break 88 | 89 | #if len(top5_list) < 5: 90 | # temp_top5_list = most_frequent3[key3] 91 | # for v in temp_top5_list: 92 | # if v not in top5_list: 93 | # top5_list.append(v) 94 | # if len(top5_list) == 5: 95 | # break 96 | 97 | #if len(top5_list) < 5: 98 | # temp_top5_list = most_frequent4[key4] 99 | # for v in temp_top5_list: 100 | # if v not in top5_list: 101 | # top5_list.append(v) 102 | # if len(top5_list) == 5: 103 | # break 104 | 105 | top5_clusters = " ".join(top5_list) 106 | 107 | outfile.write("%d,%s\n"%(i,top5_clusters)) 108 | if i % 1000000 == 0: 109 | print("%s\t%s"%(i, datetime.now() - start)) 110 | del most_frequent 111 | del most_frequent2 112 | del most_frequent3 113 | del most_frequent4 114 | gc.collect() 115 | 116 | sys.exit() 117 | 118 | ### Code to get the mapk value ### 119 | print "Getting Eval Metric" 120 | import pandas as pd 121 | import numpy as np 122 | from ml_metrics import mapk 123 | 124 | preds_df = pd.read_csv("val_leak_preds.csv") 125 | preds = np.array( preds_df["hotel_cluster"].apply(lambda x: str(x).split(" ")) ) 126 | #preds = [pred for pred in preds] 127 | print preds[:10] 128 | found_count= 0 129 | total_count = 0 130 | item_count = 0 131 | for pred in preds: 132 | if pred != ['nan']: 133 | found_count+=1 134 | item_count += len(pred) 135 | total_count+=1 136 | print "Item, Found and total : ", item_count,found_count, total_count 137 | 138 | 139 | actuals = np.array( pd.read_csv("../../Data/val.csv", usecols = ["hotel_cluster"])).astype('str') 140 | actuals = actuals.reshape(len(actuals),1) 141 | #actuals = [list(actual) for actual in actuals] 142 | print actuals[:10] 143 | 144 | print mapk(actuals, preds, k=5) 145 | 146 | -------------------------------------------------------------------------------- /Expedia/DataPrep/readme.md: 
-------------------------------------------------------------------------------- 1 | 1. splitDevVal.py - Code to split the train data into dev and val samples based on time 2 | 2. getBookings.py - Code to get the bookings from the train file 3 | 3. getClicks.py - Code to get the clicks from train file 4 | 4. splitDevVal_Bookings.py - Code to split the bookings into dev and val sample based on time 5 | 5. splitDevVal_Clicks.py - Code to split the clicks into dev and val sample based on time 6 | 6. getLeakRows_val.py - Code to get the leaky rows of validation sample and save it in csv file 7 | 7. getLeakRows_test.py - Code to get the leaky rows of test sample and save it in csv file 8 | 8. getLeakFree.py - Code to get the leak free rows for both test and val sample 9 | -------------------------------------------------------------------------------- /Expedia/DataPrep/splitDevVal.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to split the train data into two samples - dev and val. Last four months of 2014 is used as val sample 3 | __author__ : SRK 4 | """ 5 | import csv 6 | from datetime import datetime 7 | 8 | with open("../../Data/train.csv") as train_file: 9 | dev_file = open("../../Data/dev.csv","w") 10 | val_file = open("../../Data/val.csv","w") 11 | 12 | dev_writer = csv.writer(dev_file) 13 | val_writer = csv.writer(val_file) 14 | 15 | reader = csv.reader(train_file) 16 | header = reader.next() 17 | dev_writer.writerow(["id"] + header) 18 | val_writer.writerow(["id"] + header) 19 | date_index = header.index("date_time") 20 | 21 | dev_counter = 0 22 | val_counter = 0 23 | total_counter = 0 24 | for row in reader: 25 | #print row 26 | date_val = datetime.strptime(row[date_index], "%Y-%m-%d %H:%M:%S") 27 | if date_val.year == 2014 and date_val.month >= 9: 28 | val_writer.writerow([total_counter]+row) 29 | val_counter += 1 30 | else: 31 | dev_writer.writerow([total_counter]+row) 32 | dev_counter += 1 33 | total_counter += 1 34 | if total_counter % 1000000 == 0: 35 | print total_counter, dev_counter, val_counter 36 | 37 | dev_file.close() 38 | val_file.close() 39 | 40 | -------------------------------------------------------------------------------- /Expedia/DataPrep/splitDevVal_Bookings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to split the train bookings data into two samples - dev and val. 
Last four months of 2014 is used as val sample 3 | __author__ : SRK 4 | """ 5 | 6 | import csv 7 | from datetime import datetime 8 | 9 | with open("../../Data/train_bookings.csv") as train_file: 10 | dev_file = open("../../Data/dev_bookings.csv","w") 11 | val_file = open("../../Data/val_bookings.csv","w") 12 | 13 | dev_writer = csv.writer(dev_file) 14 | val_writer = csv.writer(val_file) 15 | 16 | reader = csv.reader(train_file) 17 | header = reader.next() 18 | dev_writer.writerow(header) 19 | val_writer.writerow(header) 20 | date_index = header.index("date_time") 21 | 22 | dev_counter = 0 23 | val_counter = 0 24 | total_counter = 0 25 | for row in reader: 26 | #print row 27 | date_val = datetime.strptime(row[date_index], "%Y-%m-%d %H:%M:%S") 28 | if date_val.year == 2014 and date_val.month >= 9: 29 | val_writer.writerow(row) 30 | val_counter += 1 31 | else: 32 | dev_writer.writerow(row) 33 | dev_counter += 1 34 | total_counter += 1 35 | if total_counter % 1000000 == 0: 36 | print total_counter, dev_counter, val_counter 37 | 38 | dev_file.close() 39 | val_file.close() 40 | 41 | -------------------------------------------------------------------------------- /Expedia/DataPrep/splitDevVal_Clicks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to split the train bookings data into two samples - dev and val. Last four months of 2014 is used as val sample 3 | __author__ : SRK 4 | """ 5 | 6 | import csv 7 | from datetime import datetime 8 | 9 | with open("../../Data/train_clicks.csv") as train_file: 10 | dev_file = open("../../Data/dev_clicks.csv","w") 11 | val_file = open("../../Data/val_clicks.csv","w") 12 | 13 | dev_writer = csv.writer(dev_file) 14 | val_writer = csv.writer(val_file) 15 | 16 | reader = csv.reader(train_file) 17 | header = reader.next() 18 | dev_writer.writerow(header) 19 | val_writer.writerow(header) 20 | date_index = header.index("date_time") 21 | 22 | dev_counter = 0 23 | val_counter = 0 24 | total_counter = 0 25 | for row in reader: 26 | #print row 27 | date_val = datetime.strptime(row[date_index], "%Y-%m-%d %H:%M:%S") 28 | if date_val.year == 2014 and date_val.month >= 9: 29 | val_writer.writerow(row) 30 | val_counter += 1 31 | else: 32 | dev_writer.writerow(row) 33 | dev_counter += 1 34 | total_counter += 1 35 | if total_counter % 1000000 == 0: 36 | print total_counter, dev_counter, val_counter 37 | 38 | dev_file.close() 39 | val_file.close() 40 | 41 | -------------------------------------------------------------------------------- /Expedia/DataPrep/splitNonLeak_Dist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | 4 | from datetime import datetime 5 | 6 | with open("../../Data/val_bookings_woleak.csv") as train_file: 7 | reader = csv.reader(train_file) 8 | #leak_reader = csv.DictReader(leak_file) 9 | 10 | out_file = open("../../Data/val_bookings_woleak_wodist.csv","w") 11 | out_writer = csv.writer(out_file) 12 | out_file2 = open("../../Data/val_bookings_woleak_dist.csv","w") 13 | out_writer2 = csv.writer(out_file2) 14 | 15 | header = reader.next() 16 | dist_index = header.index("orig_destination_distance") 17 | out_writer.writerow(header) 18 | out_writer2.writerow(header) 19 | 20 | leak_count = 0 21 | for index, row in enumerate(reader): 22 | if row[dist_index] == "": 23 | out_writer.writerow(row) 24 | else: 25 | out_writer2.writerow(row) 26 | leak_count +=1 27 | print "With Dist count is : ", leak_count 28 | print index 29 | 30 | out_file.close() 
31 | 32 | -------------------------------------------------------------------------------- /Expedia/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the [Kaggle Expedia Competition](https://www.kaggle.com/c/expedia-hotel-recommendations) 2 | -------------------------------------------------------------------------------- /GhoulsGoblinsGhost/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Kaggle competition - [Ghouls, Goblins and Ghosts](https://www.kaggle.com/c/ghouls-goblins-and-ghosts-boo/) are present in this folder 2 | 3 | 1. [Kaggle kernel for exploration](https://www.kaggle.com/sudalairajkumar/ghouls-goblins-and-ghosts-boo/simple-exploration-notebook) 4 | -------------------------------------------------------------------------------- /LibertyMutual/featureSelection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jul 22 17:17:40 2014 4 | 5 | @author: Sudalai Rajkumar S 6 | 7 | This code is for selecting the features to run the final model 8 | Feature sets 2 and 3 in the final model are selected based on this code 9 | Feature selection uses stepwise forward feature selection algorithm that maximizes weighted gini coefficient 10 | """ 11 | from __future__ import division 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDClassifier, LinearRegression 15 | from sklearn.cross_validation import cross_val_score, KFold 16 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 17 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 18 | from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score 19 | 20 | import pandas as pd 21 | import numpy as np 22 | 23 | def weighted_gini(act,pred,weight): 24 | df = pd.DataFrame({"act":act,"pred":pred,"weight":weight}) 25 | df = df.sort('pred',ascending=False) 26 | df["random"] = (df.weight / df.weight.sum()).cumsum() 27 | total_pos = (df.act * df.weight).sum() 28 | df["cum_pos_found"] = (df.act * df.weight).cumsum() 29 | df["lorentz"] = df.cum_pos_found / total_pos 30 | #n = df.shape[0] 31 | #df["gini"] = (df.lorentz - df.random) * df.weight 32 | #return df.gini.sum() 33 | gini = sum(df.lorentz[1:].values * (df.random[:-1])) - sum(df.lorentz[:-1].values * (df.random[1:])) 34 | return gini 35 | 36 | def normalized_weighted_gini(act,pred,weight): 37 | return weighted_gini(act,pred,weight) / weighted_gini(act,act,weight) 38 | 39 | data_path = "Path to data" 40 | 41 | tr = np.load(data_path+"train.npy") 42 | ts = np.load(data_path+"test.npy") 43 | train_y = np.load(data_path+"train_y.npy") 44 | 45 | 46 | ### Feature selection using stepwise fashion based on cross validation ### 47 | # This code will select one variable at a time from the given input variables using greedy approach which maximizes weighted gini metric # 48 | print "Cross Validating.." 
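# (Hedged note, not part of the original script) The loop below is one pass of
# greedy forward selection: each candidate column index i is tried on top of the
# already-selected set (the commented-out tr[:, [...]] lines show later passes),
# scored with 5-fold cross-validated Ridge regression using the normalized
# weighted gini defined above, and the best-scoring column of the pass is kept
# in selected_index. Repeating such passes yields the hard-coded index lists
# used as feature sets 2 and 3 in finalModel.py.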
49 | wt_gini = 0 50 | kf = KFold(tr.shape[0], n_folds=5) 51 | for i in xrange(tr.shape[1]): 52 | cv_gini_list=[] 53 | for dev_index, val_index in kf: 54 | tr_new = tr[:,[i]] 55 | #tr_new = tr[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378,i]] 56 | #tr_new = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3, i]] 57 | X_dev, X_val = tr_new[dev_index,:], tr_new[val_index,:] 58 | y_dev, y_val = train_y[dev_index], train_y[val_index] 59 | wt_dev, wt_val = tr[dev_index,1], tr[val_index,1] 60 | clf = Ridge() 61 | clf.fit(X_dev, y_dev) 62 | preds = clf.predict(X_val) 63 | 64 | cv_gini_list.append(normalized_weighted_gini(y_val,preds,wt_val)) 65 | print cv_gini_list 66 | print np.mean(cv_gini_list) 67 | if np.mean(cv_gini_list) > wt_gini: 68 | wt_gini = np.mean(cv_gini_list) 69 | selected_index = i 70 | if i % 50 == 0: 71 | print "Processed : ",i 72 | print wt_gini 73 | print selected_index 74 | -------------------------------------------------------------------------------- /LibertyMutual/finalModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Jul 28 17:17:40 2014 4 | 5 | @author: Sudalai Rajkumar S 6 | """ 7 | from __future__ import division 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDClassifier, LinearRegression 11 | from sklearn.cross_validation import cross_val_score, KFold 12 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 13 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 14 | from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score 15 | 16 | import pandas as pd 17 | import numpy as np 18 | 19 | def weighted_gini(act,pred,weight): 20 | df = pd.DataFrame({"act":act,"pred":pred,"weight":weight}) 21 | df = df.sort('pred',ascending=False) 22 | df["random"] = (df.weight / df.weight.sum()).cumsum() 23 | total_pos = (df.act * df.weight).sum() 24 | df["cum_pos_found"] = (df.act * df.weight).cumsum() 25 | df["lorentz"] = df.cum_pos_found / total_pos 26 | #n = df.shape[0] 27 | #df["gini"] = (df.lorentz - df.random) * df.weight 28 | #return df.gini.sum() 29 | gini = sum(df.lorentz[1:].values * (df.random[:-1])) - sum(df.lorentz[:-1].values * (df.random[1:])) 30 | return gini 31 | 32 | def normalized_weighted_gini(act,pred,weight): 33 | return weighted_gini(act,pred,weight) / weighted_gini(act,act,weight) 34 | 35 | data_path = "Path to data" 36 | 37 | tr = np.load(data_path+"train.npy") 38 | ts = np.load(data_path+"test.npy") 39 | train_y = np.load(data_path+"train_y.npy") 40 | sample = pd.read_csv(data_path+'sampleSubmission.csv') 41 | 42 | ### Three training sets are created based on different feature selection methodologies ### 43 | ### Set1 - Run univariate regression to get the top 30 features ### 44 | feature_selector = SelectKBest(score_func=f_regression, k=30) 45 | feature_selector.fit(tr, train_y) 46 | tr1 = feature_selector.transform(tr) 47 | ts1 = feature_selector.transform(ts) 48 | 49 | ### Set 2 & 3 - Features selected based on stepwise cross validation ( (tr2,ts2) and (tr3,ts3) )### 50 | tr2 = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 
331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 51 | ts2 = ts[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 52 | 53 | tr3 = tr[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378]] 54 | ts3 = ts[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378]] 55 | 56 | ### Running ridge regression using all three train samples then make predictions on test set separately ### 57 | # Model 1 # 58 | clf = Ridge() 59 | clf.fit(tr1, train_y) 60 | preds1 = clf.predict(ts1) 61 | 62 | # Model 2 # 63 | clf = Ridge() 64 | clf.fit(tr2, train_y) 65 | preds2 = clf.predict(ts2) 66 | 67 | # Model 3# 68 | clf = Ridge() 69 | clf.fit(tr3, train_y) 70 | preds3 = clf.predict(ts3) 71 | 72 | ### Ensembling the models together ### 73 | preds = (0.2*preds1) + (0.32*preds2)+ (0.48*preds3) 74 | 75 | ### Writing the outputs to out file ### 76 | sample['target'] = preds 77 | sample.to_csv('submission.csv', index = False) 78 | -------------------------------------------------------------------------------- /LibertyMutual/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jul 22 17:17:40 2014 4 | 5 | @author: Sudalai Rajkumar S 6 | """ 7 | from __future__ import division 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDClassifier, LinearRegression 11 | from sklearn.cross_validation import cross_val_score, KFold 12 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 13 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 14 | from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score 15 | 16 | import pandas as pd 17 | import numpy as np 18 | 19 | def weighted_gini(act,pred,weight): 20 | df = pd.DataFrame({"act":act,"pred":pred,"weight":weight}) 21 | df = df.sort('pred',ascending=False) 22 | df["random"] = (df.weight / df.weight.sum()).cumsum() 23 | total_pos = (df.act * df.weight).sum() 24 | df["cum_pos_found"] = (df.act * df.weight).cumsum() 25 | df["lorentz"] = df.cum_pos_found / total_pos 26 | #n = df.shape[0] 27 | #df["gini"] = (df.lorentz - df.random) * df.weight 28 | #return df.gini.sum() 29 | gini = sum(df.lorentz[1:].values * (df.random[:-1])) - sum(df.lorentz[:-1].values * (df.random[1:])) 30 | return gini 31 | 32 | def normalized_weighted_gini(act,pred,weight): 33 | return weighted_gini(act,pred,weight) / weighted_gini(act,act,weight) 34 | 35 | data_path = "C:/Sudalai/Others/Comp/Kaggle/LibertyMutual/Data/" 36 | 37 | #train = pd.read_csv(data_path+'train.csv') 38 | #test = pd.read_csv(data_path+'test.csv') 39 | 40 | #train_var1 = pd.get_dummies(train['var1']) 41 | #test_var1 = pd.get_dummies(test['var1']) 42 | 43 | #train_var2 = pd.get_dummies(train['var2']) 44 | #test_var2 = pd.get_dummies(test['var2']) 45 | 46 | #train_var3 = pd.get_dummies(train['var3']) 47 | #test_var3 = pd.get_dummies(test['var3']) 48 | 49 | #train_var4 = pd.get_dummies(train['var4']) 50 | #test_var4 = pd.get_dummies(test['var4']) 51 | 52 | #train_var5 = pd.get_dummies(train['var5']) 53 | #test_var5 = pd.get_dummies(test['var5']) 54 | 55 | #train_var6 = pd.get_dummies(train['var6']) 56 | #test_var6 = pd.get_dummies(test['var6']) 57 | 58 | #train_var7 = pd.get_dummies(train['var7']) 59 | 
#test_var7 = pd.get_dummies(test['var7']) 60 | 61 | #train_var8 = pd.get_dummies(train['var8']) 62 | #test_var8 = pd.get_dummies(test['var8']) 63 | 64 | #train_var9 = pd.get_dummies(train['var9']) 65 | #test_var9 = pd.get_dummies(test['var9']) 66 | 67 | #train = np.hstack([train.iloc[:,11:19], train.iloc[:,20:], train_var1, train_var2, train_var3, train_var4, train_var5, train_var6, train_var7, train_var8, train_var9]) 68 | #test = np.hstack([test.iloc[:,10:18], test.iloc[:,19:], test_var1, test_var2, test_var3, test_var4, test_var5, test_var6, test_var7, test_var8, test_var9]) 69 | 70 | #train = np.nan_to_num(np.array(train)).astype('float64') 71 | #test = np.nan_to_num(np.array(test)).astype('float64') 72 | #print train.shape 73 | #print test.shape 74 | 75 | #np.save("train.npy", train) 76 | #np.save("test.npy", test) 77 | #print ts.shape 78 | 79 | #np.save("train_y.npy", train['target'].values) 80 | 81 | #sys.exit() 82 | tr = np.load(data_path+"train.npy") 83 | ts = np.load(data_path+"test.npy") 84 | train_y = np.load(data_path+"train_y.npy") 85 | sample = pd.read_csv(data_path+'sampleSubmission.csv') 86 | 87 | print tr.shape 88 | #print ts.shape 89 | 90 | #tr = train[['var11', 'var12', 'var13', 'var14', 'var15', 'var16', 'var17']] 91 | #ts = test[['var11', 'var12', 'var13', 'var14', 'var15', 'var16', 'var17']] 92 | 93 | #tr = tr.iloc[:,2:] 94 | #ts = ts.iloc[:,2:] 95 | 96 | #for k in xrange(2,30): 97 | #feature_selector = SelectKBest(score_func=f_regression, k=30) 98 | #feature_selector.fit(tr, train_y) 99 | #tr1 = feature_selector.transform(tr) 100 | #ts1 = feature_selector.transform(ts) 101 | 102 | #tr = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 103 | #ts = ts[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 104 | 105 | tr2 = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 106 | ts2 = ts[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 107 | 108 | tr3 = tr[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378]] 109 | ts3 = ts[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378]] 110 | 111 | #print tr3.shape 112 | #print ts.shape 113 | 114 | #train_y_cat = train_y[:] 115 | #train_y_cat[train_y_cat>0] = 1 116 | 117 | #tr = np.nan_to_num(np.array(tr)) 118 | #ts = np.nan_to_num(np.array(ts)) 119 | 120 | """ 121 | print "Cross Validating.." 
122 | #clf = Ridge() 123 | #train_y[train_y>0]=1 124 | wt_gini = 0 125 | #whole_cv_list = [] 126 | kf = KFold(tr.shape[0], n_folds=5) 127 | #for i in xrange(tr.shape[1]): 128 | for i in xrange(1): 129 | cv_gini_list=[] 130 | for dev_index, val_index in kf: 131 | #tr_new = tr[:,[0,2,3,5,6,7,40,157,245,288,305,310,312,321,323,338,372,378,i]] 132 | #tr_new = tr[:,[288, 334, 50, 359, 29, 238, 45, 369, 188, 183, 225, 370, 310, 40, 63, 321, 226, 119, 2, 300, 291, 157, 303, 214, 46, 282, 349, 155, 32, 120, 100, 264, 382, 331, 180, 302, 295, 312, 372, 1, 335, 385, 387, 378, 338, 381, 6, 5, 0, 3]] 133 | #X_dev, X_val = np.array([tr[dev_index,i]]).T, np.array([tr[val_index,i]]).T 134 | #X_dev, X_val = tr_new[dev_index,:], tr_new[val_index,:] 135 | #X_dev, X_val = tr1[dev_index,:], tr1[val_index,:] 136 | y_dev, y_val = train_y[dev_index], train_y[val_index] 137 | wt_dev, wt_val = tr[dev_index,1], tr[val_index,1] 138 | #print X_dev.shape 139 | #for i in xrange(1): 140 | #clf = Ridge() 141 | #clf.fit(X_dev, y_dev) 142 | #preds1 = clf.predict(X_val) 143 | 144 | X_dev, X_val = tr2[dev_index,:], tr2[val_index,:] 145 | clf = Ridge() 146 | clf.fit(X_dev, y_dev) 147 | preds2 = clf.predict(X_val) 148 | 149 | X_dev, X_val = tr3[dev_index,:], tr3[val_index,:] 150 | clf = Ridge() 151 | clf.fit(X_dev, y_dev) 152 | preds3 = clf.predict(X_val) 153 | 154 | preds = (0.4*preds2)+ (0.6*preds3) 155 | cv_gini_list.append(normalized_weighted_gini(y_val,preds,wt_val)) 156 | #clf = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_split = 1000, min_samples_leaf=20, random_state=0) 157 | #cv_scores = cross_val_score(clf, tr[:,[3,5,6,7,40,53,161,251,312,335,338,372,378, i]], train_y, cv=5, scoring = "mean_squared_error") 158 | #cv_scores = cross_val_score(clf, tr, train_y, cv=5, scoring = "roc_auc") 159 | #print c_value 160 | #print cv_scores 161 | #print np.mean(cv_scores) 162 | #if abs(np.mean(cv_scores)) < min_rms: 163 | # min_rms = abs(np.mean(cv_scores)) 164 | # selected_index = i 165 | print cv_gini_list 166 | print np.mean(cv_gini_list) 167 | #whole_cv_list.append(np.mean(cv_gini_list)) 168 | if np.mean(cv_gini_list) > wt_gini: 169 | wt_gini = np.mean(cv_gini_list) 170 | selected_index = i 171 | if i % 50 == 0: 172 | print "Processed : ",i 173 | print wt_gini 174 | print selected_index 175 | """ 176 | 177 | """ 178 | kf = KFold(tr.shape[0], n_folds=5) 179 | f1_cv_list = [] 180 | roc_cv_list = [] 181 | for dev_index, val_index in kf: 182 | X_dev, X_val = tr[dev_index,:], tr[val_index,:] 183 | y_dev, y_val = train_y[dev_index], train_y[val_index] 184 | y_dev_cat, y_val_cat = train_y_cat[dev_index], train_y_cat[val_index] 185 | #y_dev_cat = y_dev[:] 186 | #y_val_cat = y_val[:] 187 | #y_dev_cat[y_dev_cat>0]=1 188 | #y_val_cat[y_val_cat>0]=1 189 | #clf = Ridge() 190 | #clf = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_split = 1000, min_samples_leaf=20, random_state=0) 191 | clf = LogisticRegression(penalty='l2', class_weight='auto') 192 | #clf = SGDClassifier(loss='log', alpha=0.00001, n_iter=50) 193 | clf.fit(X_dev, y_dev_cat) 194 | pred_y_val = clf.predict_proba(X_val)[:,1] 195 | #f1_err = f1_score(y_val_cat, pred_y_val) 196 | #f1_cv_list.append(f1_err) 197 | #print "f1",f1_err 198 | roc_err = roc_auc_score(y_val_cat, pred_y_val) 199 | roc_cv_list.append(roc_err) 200 | print "roc", roc_err 201 | print roc_cv_list 202 | print np.mean(roc_cv_list) 203 | print f1_cv_list 204 | print np.mean(f1_cv_list) 205 | """ 206 | 207 | #clf = Ridge() 208 | #clf.fit(tr1, train_y) 209 | #preds1 = 
clf.predict(ts1) 210 | 211 | clf = Ridge() 212 | clf.fit(tr2, train_y) 213 | preds2 = clf.predict(ts2) 214 | 215 | clf = Ridge() 216 | clf.fit(tr3, train_y) 217 | preds3 = clf.predict(ts3) 218 | 219 | preds = (0.4*preds2)+ (0.6*preds3) 220 | 221 | ##preds[preds<0] = 0 222 | sample['target'] = preds 223 | sample.to_csv('submission23.csv', index = False) 224 | -------------------------------------------------------------------------------- /LibertyMutual/prepareData.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jul 22 17:17:40 2014 5 | 6 | @author: Sudalai Rajkumar S 7 | """ 8 | from __future__ import division 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDClassifier, LinearRegression 12 | from sklearn.cross_validation import cross_val_score, KFold 13 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 14 | from sklearn.feature_selection import SelectKBest, f_regression, f_classif 15 | from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score 16 | 17 | import pandas as pd 18 | import numpy as np 19 | 20 | ### Reading the Input files ### 21 | data_path = "Path to data" 22 | train = pd.read_csv(data_path+'train.csv') 23 | test = pd.read_csv(data_path+'test.csv') 24 | 25 | ### creating dummy variables from categorical variables ### 26 | train_var1 = pd.get_dummies(train['var1']) 27 | test_var1 = pd.get_dummies(test['var1']) 28 | 29 | train_var2 = pd.get_dummies(train['var2']) 30 | test_var2 = pd.get_dummies(test['var2']) 31 | 32 | train_var3 = pd.get_dummies(train['var3']) 33 | test_var3 = pd.get_dummies(test['var3']) 34 | 35 | train_var4 = pd.get_dummies(train['var4']) 36 | test_var4 = pd.get_dummies(test['var4']) 37 | 38 | train_var5 = pd.get_dummies(train['var5']) 39 | test_var5 = pd.get_dummies(test['var5']) 40 | 41 | train_var6 = pd.get_dummies(train['var6']) 42 | test_var6 = pd.get_dummies(test['var6']) 43 | 44 | train_var7 = pd.get_dummies(train['var7']) 45 | test_var7 = pd.get_dummies(test['var7']) 46 | 47 | train_var8 = pd.get_dummies(train['var8']) 48 | test_var8 = pd.get_dummies(test['var8']) 49 | 50 | train_var9 = pd.get_dummies(train['var9']) 51 | test_var9 = pd.get_dummies(test['var9']) 52 | 53 | ### Stacking the dummy variables together with the numerical variables ### 54 | train = np.hstack([train.iloc[:,11:19], train.iloc[:,20:], train_var1, train_var2, train_var3, train_var4, train_var5, train_var6, train_var7, train_var8, train_var9]) 55 | test = np.hstack([test.iloc[:,10:18], test.iloc[:,19:], test_var1, test_var2, test_var3, test_var4, test_var5, test_var6, test_var7, test_var8, test_var9]) 56 | 57 | ### Replacing the missing values with zero ### 58 | train = np.nan_to_num(np.array(train)).astype('float64') 59 | test = np.nan_to_num(np.array(test)).astype('float64') 60 | 61 | ### Saving the outputs as .npy file ### 62 | np.save("train.npy", train) 63 | np.save("test.npy", test) 64 | np.save("train_y.npy", train['target'].values) 65 | -------------------------------------------------------------------------------- /LibertyMutual/readme.md: -------------------------------------------------------------------------------- 1 | This folder has the codes which I have used in the Kaggle - [Liberty Mutual Competition](http://www.kaggle.com/c/liberty-mutual-fire-peril) 2 | 3 | prepareData.py - File used to do data preprocessing and then to create features from the given raw 
file 4 | 5 | finalModel.py - Module used to train the final model and to make predictions 6 | -------------------------------------------------------------------------------- /MMM15/finalModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 02 15:28:42 2015 4 | 5 | @author: Sudalai Rajkumar S 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 12 | from sklearn.metrics import roc_auc_score 13 | from sklearn.cross_validation import cross_val_score 14 | 15 | def runLogistic(X, y, test_X, C_val = 1, penalty_val='l1'): 16 | clf = LogisticRegression(C = C_val, penalty=penalty_val, random_state=0) 17 | clf.fit(X, y) 18 | scores = clf.predict_proba(test_X)[:,1] 19 | return scores 20 | 21 | def runRF(X, y, test_X, estimator_val=200, max_depth_val=5, min_samples_val = 10): 22 | clf = RandomForestClassifier(n_estimators=estimator_val, max_depth = max_depth_val, min_samples_split= min_samples_val, random_state=0) 23 | clf.fit(X, y) 24 | scores = clf.predict_proba(test_X)[:,1] 25 | return scores 26 | 27 | 28 | if __name__ == "__main__": 29 | data_path = "/home/sudalai/Others/Kaggle/MMM15/Data/" 30 | train_file = data_path + "train_v4.csv" 31 | test_file = data_path + "test_v4.csv" 32 | sub_file = "sub8.csv" 33 | 34 | train_data = pd.read_csv(train_file) 35 | test_data = pd.read_csv(test_file) 36 | 37 | X = train_data.iloc[:,:-1] 38 | y = train_data['DV'].astype('int') 39 | test_X = test_data.iloc[:,1:] 40 | id_val = test_data['id'] 41 | 42 | scores = runLogistic(X, y, test_X, C_val=1, penalty_val='l1') 43 | #scores = runRF(X, y, test_X, estimator_val=200, max_depth_val=6, min_samples_val = 200) 44 | 45 | #print X.shape 46 | #print y.shape 47 | 48 | sub_file_handle = open(sub_file, 'w') 49 | sub_file_handle.write('id,pred\n') 50 | for i in xrange(len(scores)): 51 | sub_file_handle.write(str(id_val[i])+','+ str(scores[i]) +'\n') 52 | sub_file_handle.close() 53 | -------------------------------------------------------------------------------- /MMM15/prepareData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 02 12:12:45 2015 4 | 5 | @author: Sudalai Rajkumar S 6 | """ 7 | 8 | import sys 9 | import csv 10 | import numpy as np 11 | import pandas as pd 12 | 13 | def getSeasonStats(season_file): 14 | season_file_handle = open(season_file, 'r') 15 | reader = csv.DictReader(season_file_handle) 16 | out_dict = {} 17 | 18 | for row in reader: 19 | season_dict = out_dict.get(row['season'], {}) 20 | wteam_dict = season_dict.get(row['wteam'], {}) 21 | lteam_dict = season_dict.get(row['lteam'], {}) 22 | 23 | wteam_dict['NoOfWins'] = wteam_dict.get('NoOfWins',0) + 1 24 | wteam_dict['NoOfGames'] = wteam_dict.get('NoOfGames',0) + 1 25 | wteam_dict['TotalScore'] = wteam_dict.get('TotalScore',0) + int(row['wscore']) 26 | wteam_dict['NumOT'] = wteam_dict.get('NumOT',0) + int(row['numot']) 27 | if row['wloc'] == "H": 28 | wteam_dict['NoOfHomeWins'] = wteam_dict.get('NoOfHomeWins',0) + 1 29 | elif row['wloc'] == 'N': 30 | wteam_dict['NoOfNeutralWins'] = wteam_dict.get('NoOfNeutralWins',0) + 1 31 | elif row['wloc'] == 'A': 32 | wteam_dict['NoOfAwayWins'] = wteam_dict.get('NoOfAwayWins',0) + 1 33 | else: 34 | print row['wloc'] 35 | sys.exit() 36 | wteam_dict['fgm'] = wteam_dict.get('fgm',0) 
+ int(row['wfgm']) 37 | wteam_dict['fga'] = wteam_dict.get('fga',0) + int(row['wfga']) 38 | wteam_dict['fgm3'] = wteam_dict.get('fgm3',0) + int(row['wfgm3']) 39 | wteam_dict['fga3'] = wteam_dict.get('fga3',0) + int(row['wfga3']) 40 | wteam_dict['ftm'] = wteam_dict.get('ftm',0) + int(row['wftm']) 41 | wteam_dict['fta'] = wteam_dict.get('fta',0) + int(row['wfta']) 42 | wteam_dict['or'] = wteam_dict.get('or',0) + int(row['wor']) 43 | wteam_dict['dr'] = wteam_dict.get('dr',0) + int(row['wdr']) 44 | wteam_dict['ast'] = wteam_dict.get('ast',0) + int(row['wast']) 45 | wteam_dict['to'] = wteam_dict.get('to',0) + int(row['wto']) 46 | wteam_dict['pf'] = wteam_dict.get('pf',0) + int(row['wpf']) 47 | wteam_dict['stl'] = wteam_dict.get('stl',0) + int(row['wstl']) 48 | wteam_dict['blk'] = wteam_dict.get('blk',0) + int(row['wblk']) 49 | 50 | 51 | 52 | 53 | lteam_dict['NoOfLoss'] = lteam_dict.get('NoOfLoss',0) + 1 54 | lteam_dict['NoOfGames'] = lteam_dict.get('NoOfGames',0) + 1 55 | lteam_dict['TotalScore'] = lteam_dict.get('TotalScore',0) + int(row['lscore']) 56 | lteam_dict['NumOT'] = lteam_dict.get('NumOT',0) + int(row['numot']) 57 | if row['wloc'] == "H": 58 | lteam_dict['NoOfAwayLoss'] = lteam_dict.get('NoOfAwayLoss',0) + 1 59 | elif row['wloc'] == 'N': 60 | lteam_dict['NoOfNeutralLoss'] = lteam_dict.get('NoOfNeutralLoss',0) + 1 61 | elif row['wloc'] == 'A': 62 | lteam_dict['NoOfHomeLoss'] = lteam_dict.get('NoOfHomeLoss',0) + 1 63 | else: 64 | print row['wloc'] 65 | sys.exit() 66 | lteam_dict['fgm'] = lteam_dict.get('fgm',0) + int(row['lfgm']) 67 | lteam_dict['fga'] = lteam_dict.get('fga',0) + int(row['lfga']) 68 | lteam_dict['fgm3'] = lteam_dict.get('fgm3',0) + int(row['lfgm3']) 69 | lteam_dict['fga3'] = lteam_dict.get('fga3',0) + int(row['lfga3']) 70 | lteam_dict['ftm'] = lteam_dict.get('ftm',0) + int(row['lftm']) 71 | lteam_dict['fta'] = lteam_dict.get('fta',0) + int(row['lfta']) 72 | lteam_dict['or'] = lteam_dict.get('or',0) + int(row['lor']) 73 | lteam_dict['dr'] = lteam_dict.get('dr',0) + int(row['ldr']) 74 | lteam_dict['ast'] = lteam_dict.get('ast',0) + int(row['last']) 75 | lteam_dict['to'] = lteam_dict.get('to',0) + int(row['lto']) 76 | lteam_dict['pf'] = lteam_dict.get('pf',0) + int(row['lpf']) 77 | lteam_dict['stl'] = lteam_dict.get('stl',0) + int(row['lstl']) 78 | lteam_dict['blk'] = lteam_dict.get('blk',0) + int(row['lblk']) 79 | lteam_dict['blk'] = lteam_dict.get('blk',0) + int(row['lblk']) 80 | 81 | 82 | season_dict[row['wteam']] = wteam_dict 83 | season_dict[row['lteam']] = lteam_dict 84 | out_dict[row['season']] = season_dict 85 | 86 | #print out_dict['1985']['1228'] 87 | #print out_dict['1985']['1328'] 88 | #print out_dict['1985'].keys() 89 | #print len(out_dict['1985'].keys()) 90 | return out_dict 91 | 92 | def prepareTrainData(tourney_results_file, season_dict, seeds_dict): 93 | tourney_file_handle = open(tourney_results_file, 'r') 94 | reader = csv.DictReader(tourney_file_handle) 95 | header_list = ['WinPercentage','LossPercentage','AverageScore','HomeWinPerc','AwayWinPerc','NeutralWinPerc','HomeLossPerc','AwayLossPerc','NeutralLossPerc','NumOTPerc', 'NoOfGames', 'OppWinPercentage','OppLossPercentage','OppAverageScore','OppHomeWinPerc','OppAwayWinPerc','OppNeutralWinPerc','OppHomeLossPerc','OppAwayLossPerc','OppNeutralLossPerc', 'OppNumOTPerc', 'OppNoOfGames', 'fgm','fga','fgm3','fga3','ftm','fta','or','dr','to','pf','ast','stl','blk', 'ofgm','ofga','ofgm3','ofga3','oftm','ofta','oor','odr','oto','opf','oast','ostl','oblk', 'DV'] 96 | 97 | out_list = [] 98 | for row in reader: 99 
| season = row['season'] 100 | if season == '2011': 101 | break 102 | 103 | # Get the winning and losing team # 104 | wteam = row['wteam'] 105 | lteam = row['lteam'] 106 | # Get the stats of both teams # 107 | season_wteam_dict = season_dict[season][wteam] 108 | season_lteam_dict = season_dict[season][lteam] 109 | 110 | # Get the seeds of both teams # 111 | #seed_wteam = seeds_dict[season][wteam] 112 | #seed_lteam = seeds_dict[season][lteam] 113 | # Get win percentage for both teams # 114 | win_perc_wteam = season_wteam_dict.get('NoOfWins',0) / float(season_wteam_dict['NoOfGames']) 115 | win_perc_lteam = season_lteam_dict.get('NoOfWins',0) / float(season_lteam_dict['NoOfGames']) 116 | # Get Loss percentage for both teams # 117 | loss_perc_wteam = season_wteam_dict.get('NoOfLoss',0) / float(season_wteam_dict['NoOfGames']) 118 | loss_perc_lteam = season_lteam_dict.get('NoOfLoss',0) / float(season_lteam_dict['NoOfGames']) 119 | # Get the average score for both teams # 120 | avg_score_wteam = season_wteam_dict.get('TotalScore',0) / float(season_wteam_dict['NoOfGames']) 121 | avg_score_lteam = season_lteam_dict.get('TotalScore',0) / float(season_lteam_dict['NoOfGames']) 122 | # Get the home win percentage for both teams # 123 | home_win_perc_wteam = season_wteam_dict.get('NoOfHomeWins',0) / float(season_wteam_dict['NoOfGames']) 124 | home_win_perc_lteam = season_lteam_dict.get('NoOfHomeWins',0) / float(season_lteam_dict['NoOfGames']) 125 | # Get the Away win percentage for both teams # 126 | away_win_perc_wteam = season_wteam_dict.get('NoOfAwayWins',0) / float(season_wteam_dict['NoOfGames']) 127 | away_win_perc_lteam = season_lteam_dict.get('NoOfAwayWins',0) / float(season_lteam_dict['NoOfGames']) 128 | # Get the neutral win percentage # 129 | neutral_win_perc_wteam = season_wteam_dict.get('NoOfNeutralWins',0) / float(season_wteam_dict['NoOfGames']) 130 | neutral_win_perc_lteam = season_lteam_dict.get('NoOfNeutralWins',0) / float(season_lteam_dict['NoOfGames']) 131 | # Get the home loss percentage for both teams # 132 | home_loss_perc_wteam = season_wteam_dict.get('NoOfHomeLoss',0) / float(season_wteam_dict['NoOfGames']) 133 | home_loss_perc_lteam = season_lteam_dict.get('NoOfHomeLoss',0) / float(season_lteam_dict['NoOfGames']) 134 | # Get the Away loss percentage for both teams # 135 | away_loss_perc_wteam = season_wteam_dict.get('NoOfAwayLoss',0) / float(season_wteam_dict['NoOfGames']) 136 | away_loss_perc_lteam = season_lteam_dict.get('NoOfAwayLoss',0) / float(season_lteam_dict['NoOfGames']) 137 | # Get the neutral loss percentage # 138 | neutral_loss_perc_wteam = season_wteam_dict.get('NoOfNeutralLoss',0) / float(season_wteam_dict['NoOfGames']) 139 | neutral_loss_perc_lteam = season_lteam_dict.get('NoOfNeutralLoss',0) / float(season_lteam_dict['NoOfGames']) 140 | # Get the number of overtime matches for both teams # 141 | ot_wteam = season_wteam_dict['NumOT'] 142 | ot_lteam = season_lteam_dict['NumOT'] 143 | # Get the number of matches for both teams # 144 | num_games_wteam = season_wteam_dict['NoOfGames'] 145 | num_games_lteam = season_lteam_dict['NoOfGames'] 146 | 147 | fgm_wteam = season_wteam_dict.get('fgm',0) / float(season_wteam_dict['NoOfGames']) 148 | fga_wteam = season_wteam_dict.get('fga',0) / float(season_wteam_dict['NoOfGames']) 149 | fgm3_wteam = season_wteam_dict.get('fgm3',0) / float(season_wteam_dict['NoOfGames']) 150 | fga3_wteam = season_wteam_dict.get('fga3',0) / float(season_wteam_dict['NoOfGames']) 151 | ftm_wteam = season_wteam_dict.get('ftm',0) / 
float(season_wteam_dict['NoOfGames']) 152 | fta_wteam = season_wteam_dict.get('fta',0) / float(season_wteam_dict['NoOfGames']) 153 | or_wteam = season_wteam_dict.get('or',0) / float(season_wteam_dict['NoOfGames']) 154 | dr_wteam = season_wteam_dict.get('dr',0) / float(season_wteam_dict['NoOfGames']) 155 | to_wteam = season_wteam_dict.get('to',0) / float(season_wteam_dict['NoOfGames']) 156 | pf_wteam = season_wteam_dict.get('pf',0) / float(season_wteam_dict['NoOfGames']) 157 | ast_wteam = season_wteam_dict.get('ast',0) / float(season_wteam_dict['NoOfGames']) 158 | stl_wteam = season_wteam_dict.get('stl',0) / float(season_wteam_dict['NoOfGames']) 159 | blk_wteam = season_wteam_dict.get('blk',0) / float(season_wteam_dict['NoOfGames']) 160 | 161 | fgm_lteam = season_lteam_dict.get('fgm',0) / float(season_lteam_dict['NoOfGames']) 162 | fga_lteam = season_lteam_dict.get('fga',0) / float(season_lteam_dict['NoOfGames']) 163 | fgm3_lteam = season_lteam_dict.get('fgm3',0) / float(season_lteam_dict['NoOfGames']) 164 | fga3_lteam = season_lteam_dict.get('fga3',0) / float(season_lteam_dict['NoOfGames']) 165 | ftm_lteam = season_lteam_dict.get('ftm',0) / float(season_lteam_dict['NoOfGames']) 166 | fta_lteam = season_lteam_dict.get('fta',0) / float(season_lteam_dict['NoOfGames']) 167 | or_lteam = season_lteam_dict.get('or',0) / float(season_lteam_dict['NoOfGames']) 168 | dr_lteam = season_lteam_dict.get('dr',0) / float(season_lteam_dict['NoOfGames']) 169 | to_lteam = season_lteam_dict.get('to',0) / float(season_lteam_dict['NoOfGames']) 170 | pf_lteam = season_lteam_dict.get('pf',0) / float(season_lteam_dict['NoOfGames']) 171 | ast_lteam = season_lteam_dict.get('ast',0) / float(season_lteam_dict['NoOfGames']) 172 | stl_lteam = season_lteam_dict.get('stl',0) / float(season_lteam_dict['NoOfGames']) 173 | blk_lteam = season_lteam_dict.get('blk',0) / float(season_lteam_dict['NoOfGames']) 174 | 175 | 176 | 177 | # Appending the features to out list # 178 | out_list.append([ win_perc_wteam, loss_perc_wteam, avg_score_wteam, home_win_perc_wteam, away_win_perc_wteam, neutral_win_perc_wteam, home_loss_perc_wteam, away_loss_perc_wteam, neutral_loss_perc_wteam, ot_wteam, num_games_wteam, win_perc_lteam, loss_perc_lteam, avg_score_lteam, home_win_perc_lteam, away_win_perc_lteam, neutral_win_perc_lteam, home_loss_perc_lteam, away_loss_perc_lteam, neutral_loss_perc_lteam, ot_lteam, num_games_lteam, fgm_wteam,fga_wteam,fgm3_wteam,fga3_wteam,ftm_wteam,fta_wteam,or_wteam,dr_wteam,to_wteam,pf_wteam,ast_wteam,stl_wteam,blk_wteam, fgm_lteam,fga_lteam,fgm3_lteam,fga3_lteam,ftm_lteam,fta_lteam,or_lteam,dr_lteam,to_lteam,pf_lteam,ast_lteam,stl_lteam,blk_lteam, 1]) 179 | out_list.append([win_perc_lteam, loss_perc_lteam, avg_score_lteam, home_win_perc_lteam, away_win_perc_lteam, neutral_win_perc_lteam, home_loss_perc_lteam, away_loss_perc_lteam, neutral_loss_perc_lteam, ot_lteam, num_games_lteam, win_perc_wteam, loss_perc_wteam, avg_score_wteam, home_win_perc_wteam, away_win_perc_wteam, neutral_win_perc_wteam, home_loss_perc_wteam, away_loss_perc_wteam, neutral_loss_perc_wteam, ot_wteam, num_games_wteam, fgm_lteam,fga_lteam,fgm3_lteam,fga3_lteam,ftm_lteam,fta_lteam,or_lteam,dr_lteam,to_lteam,pf_lteam,ast_lteam,stl_lteam,blk_lteam, fgm_wteam,fga_wteam,fgm3_wteam,fga3_wteam,ftm_wteam,fta_wteam,or_wteam,dr_wteam,to_wteam,pf_wteam,ast_wteam,stl_wteam,blk_wteam, 0]) 180 | 181 | out_df = pd.DataFrame(np.array(out_list)) 182 | out_df.columns = header_list 183 | return out_df 184 | 185 | def prepareTestData(fixture_file, season_dict, 
seeds_dict): 186 | fixture_file_handle = open(fixture_file, 'r') 187 | reader = csv.DictReader(fixture_file_handle) 188 | header_list = ['id', 'WinPercentage','LossPercentage','AverageScore','HomeWinPerc','AwayWinPerc','NeutralWinPerc','HomeLossPerc','AwayLossPerc','NeutralLossPerc','NumOTPerc', 'NoOfGames', 'OppWinPercentage','OppLossPercentage','OppAverageScore','OppHomeWinPerc','OppAwayWinPerc','OppNeutralWinPerc','OppHomeLossPerc','OppAwayLossPerc','OppNeutralLossPerc', 'OppNumOTPerc', 'OppNoOfGames', 'fgm','fga','fgm3','fga3','ftm','fta','or','dr','to','pf','ast','stl','blk', 'ofgm','ofga','ofgm3','ofga3','oftm','ofta','oor','odr','oto','opf','oast','ostl','oblk'] 189 | 190 | out_list = [] 191 | out_list = [] 192 | for row in reader: 193 | id_val = row['id'] 194 | season = id_val.split("_")[0] 195 | 196 | # Get the winning and losing team # 197 | wteam = id_val.split("_")[1] 198 | lteam = id_val.split("_")[2] 199 | # Get the stats of both teams # 200 | season_wteam_dict = season_dict[season][wteam] 201 | season_lteam_dict = season_dict[season][lteam] 202 | 203 | # Get the seeds of both teams # 204 | #seed_wteam = seeds_dict[season][wteam] 205 | #seed_lteam = seeds_dict[season][lteam] 206 | # Get win percentage for both teams # 207 | win_perc_wteam = season_wteam_dict.get('NoOfWins',0) / float(season_wteam_dict['NoOfGames']) 208 | win_perc_lteam = season_lteam_dict.get('NoOfWins',0) / float(season_lteam_dict['NoOfGames']) 209 | # Get Loss percentage for both teams # 210 | loss_perc_wteam = season_wteam_dict.get('NoOfLoss',0) / float(season_wteam_dict['NoOfGames']) 211 | loss_perc_lteam = season_lteam_dict.get('NoOfLoss',0) / float(season_lteam_dict['NoOfGames']) 212 | # Get the average score for both teams # 213 | avg_score_wteam = season_wteam_dict.get('TotalScore',0) / float(season_wteam_dict['NoOfGames']) 214 | avg_score_lteam = season_lteam_dict.get('TotalScore',0) / float(season_lteam_dict['NoOfGames']) 215 | # Get the home win percentage for both teams # 216 | home_win_perc_wteam = season_wteam_dict.get('NoOfHomeWins',0) / float(season_wteam_dict['NoOfGames']) 217 | home_win_perc_lteam = season_lteam_dict.get('NoOfHomeWins',0) / float(season_lteam_dict['NoOfGames']) 218 | # Get the Away win percentage for both teams # 219 | away_win_perc_wteam = season_wteam_dict.get('NoOfAwayWins',0) / float(season_wteam_dict['NoOfGames']) 220 | away_win_perc_lteam = season_lteam_dict.get('NoOfAwayWins',0) / float(season_lteam_dict['NoOfGames']) 221 | # Get the neutral win percentage # 222 | neutral_win_perc_wteam = season_wteam_dict.get('NoOfNeutralWins',0) / float(season_wteam_dict['NoOfGames']) 223 | neutral_win_perc_lteam = season_lteam_dict.get('NoOfNeutralWins',0) / float(season_lteam_dict['NoOfGames']) 224 | # Get the home loss percentage for both teams # 225 | home_loss_perc_wteam = season_wteam_dict.get('NoOfHomeLoss',0) / float(season_wteam_dict['NoOfGames']) 226 | home_loss_perc_lteam = season_lteam_dict.get('NoOfHomeLoss',0) / float(season_lteam_dict['NoOfGames']) 227 | # Get the Away loss percentage for both teams # 228 | away_loss_perc_wteam = season_wteam_dict.get('NoOfAwayLoss',0) / float(season_wteam_dict['NoOfGames']) 229 | away_loss_perc_lteam = season_lteam_dict.get('NoOfAwayLoss',0) / float(season_lteam_dict['NoOfGames']) 230 | # Get the neutral loss percentage # 231 | neutral_loss_perc_wteam = season_wteam_dict.get('NoOfNeutralLoss',0) / float(season_wteam_dict['NoOfGames']) 232 | neutral_loss_perc_lteam = season_lteam_dict.get('NoOfNeutralLoss',0) / 
float(season_lteam_dict['NoOfGames']) 233 | # Get the number of overtime matches for both teams # 234 | ot_wteam = season_wteam_dict['NumOT'] 235 | ot_lteam = season_lteam_dict['NumOT'] 236 | # Get the number of matches for both teams # 237 | num_games_wteam = season_wteam_dict['NoOfGames'] 238 | num_games_lteam = season_lteam_dict['NoOfGames'] 239 | 240 | fgm_wteam = season_wteam_dict.get('fgm',0) / float(season_wteam_dict['NoOfGames']) 241 | fga_wteam = season_wteam_dict.get('fga',0) / float(season_wteam_dict['NoOfGames']) 242 | fgm3_wteam = season_wteam_dict.get('fgm3',0) / float(season_wteam_dict['NoOfGames']) 243 | fga3_wteam = season_wteam_dict.get('fga3',0) / float(season_wteam_dict['NoOfGames']) 244 | ftm_wteam = season_wteam_dict.get('ftm',0) / float(season_wteam_dict['NoOfGames']) 245 | fta_wteam = season_wteam_dict.get('fta',0) / float(season_wteam_dict['NoOfGames']) 246 | or_wteam = season_wteam_dict.get('or',0) / float(season_wteam_dict['NoOfGames']) 247 | dr_wteam = season_wteam_dict.get('dr',0) / float(season_wteam_dict['NoOfGames']) 248 | to_wteam = season_wteam_dict.get('to',0) / float(season_wteam_dict['NoOfGames']) 249 | pf_wteam = season_wteam_dict.get('pf',0) / float(season_wteam_dict['NoOfGames']) 250 | ast_wteam = season_wteam_dict.get('ast',0) / float(season_wteam_dict['NoOfGames']) 251 | stl_wteam = season_wteam_dict.get('stl',0) / float(season_wteam_dict['NoOfGames']) 252 | blk_wteam = season_wteam_dict.get('blk',0) / float(season_wteam_dict['NoOfGames']) 253 | 254 | fgm_lteam = season_lteam_dict.get('fgm',0) / float(season_lteam_dict['NoOfGames']) 255 | fga_lteam = season_lteam_dict.get('fga',0) / float(season_lteam_dict['NoOfGames']) 256 | fgm3_lteam = season_lteam_dict.get('fgm3',0) / float(season_lteam_dict['NoOfGames']) 257 | fga3_lteam = season_lteam_dict.get('fga3',0) / float(season_lteam_dict['NoOfGames']) 258 | ftm_lteam = season_lteam_dict.get('ftm',0) / float(season_lteam_dict['NoOfGames']) 259 | fta_lteam = season_lteam_dict.get('fta',0) / float(season_lteam_dict['NoOfGames']) 260 | or_lteam = season_lteam_dict.get('or',0) / float(season_lteam_dict['NoOfGames']) 261 | dr_lteam = season_lteam_dict.get('dr',0) / float(season_lteam_dict['NoOfGames']) 262 | to_lteam = season_lteam_dict.get('to',0) / float(season_lteam_dict['NoOfGames']) 263 | pf_lteam = season_lteam_dict.get('pf',0) / float(season_lteam_dict['NoOfGames']) 264 | ast_lteam = season_lteam_dict.get('ast',0) / float(season_lteam_dict['NoOfGames']) 265 | stl_lteam = season_lteam_dict.get('stl',0) / float(season_lteam_dict['NoOfGames']) 266 | blk_lteam = season_lteam_dict.get('blk',0) / float(season_lteam_dict['NoOfGames']) 267 | 268 | 269 | # Appending the features to out list # 270 | out_list.append([id_val, win_perc_wteam, loss_perc_wteam, avg_score_wteam, home_win_perc_wteam, away_win_perc_wteam, neutral_win_perc_wteam, home_loss_perc_wteam, away_loss_perc_wteam, neutral_loss_perc_wteam, ot_wteam, num_games_wteam, win_perc_lteam, loss_perc_lteam, avg_score_lteam, home_win_perc_lteam, away_win_perc_lteam, neutral_win_perc_lteam, home_loss_perc_lteam, away_loss_perc_lteam, neutral_loss_perc_lteam, ot_lteam, num_games_lteam, fgm_wteam,fga_wteam,fgm3_wteam,fga3_wteam,ftm_wteam,fta_wteam,or_wteam,dr_wteam,to_wteam,pf_wteam,ast_wteam,stl_wteam,blk_wteam, fgm_lteam,fga_lteam,fgm3_lteam,fga3_lteam,ftm_lteam,fta_lteam,or_lteam,dr_lteam,to_lteam,pf_lteam,ast_lteam,stl_lteam,blk_lteam]) 271 | 272 | out_df = pd.DataFrame(np.array(out_list)) 273 | out_df.columns = header_list 274 | return out_df 275 | 276 
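# --- Editor's sketch (added): prepareTrainData and prepareTestData above repeat
# the same per-game averages for each team. A small shared helper would keep the
# two in sync; the names below are illustrative and nothing in the original
# script calls this function.
def per_game_stats(team_dict, keys=('fgm', 'fga', 'fgm3', 'fga3', 'ftm', 'fta',
                                    'or', 'dr', 'to', 'pf', 'ast', 'stl', 'blk')):
    games = float(team_dict['NoOfGames'])
    return [team_dict.get(k, 0) / games for k in keys]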
| 277 | 278 | def getSeedStats(seeds_file): 279 | seeds_file_handle = open(seeds_file, 'r') 280 | reader = csv.DictReader(seeds_file_handle) 281 | out_dict = {} 282 | 283 | for row in reader: 284 | season_dict = out_dict.get(row['season'], {}) 285 | season_dict[row['team']] = int(row['seed'][1:3]) 286 | out_dict[row['season']] = season_dict 287 | 288 | return out_dict 289 | 290 | 291 | if __name__ == "__main__": 292 | data_path = "/home/sudalai/Others/Kaggle/MMM15/Data/" 293 | regular_seasons_file = data_path + "regular_season_detailed_results.csv" 294 | tourney_seeds_file= data_path + "tourney_seeds.csv" 295 | tourney_results_file = data_path + "tourney_detailed_results.csv" 296 | test_fixture_file = data_path + "sample_submission.csv" 297 | train_file = data_path + "train_v4.csv" 298 | test_file = data_path + "test_v4.csv" 299 | 300 | season_dict = getSeasonStats(regular_seasons_file) 301 | seeds_dict = getSeedStats(tourney_seeds_file) 302 | 303 | for year in season_dict.keys(): 304 | for team in seeds_dict[year]: 305 | season_dict[year][team] 306 | 307 | train_df = prepareTrainData(tourney_results_file, season_dict, seeds_dict) 308 | train_df.to_csv(train_file, index=False) 309 | 310 | test_df = prepareTestData(test_fixture_file, season_dict, seeds_dict) 311 | test_df.to_csv(test_file, index=False) 312 | -------------------------------------------------------------------------------- /MMM15/readme.md: -------------------------------------------------------------------------------- 1 | This folder has the codes for [Kaggle Competition](http://www.kaggle.com/competitions) - [March Machine Learning Mania 2015](http://www.kaggle.com/c/march-machine-learning-mania-2015) 2 | -------------------------------------------------------------------------------- /MMM15/seed_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 02 10:17:40 2015 4 | 5 | @author: Sudalai Rajkumar S 6 | 7 | Module to produce the seed based bench mark given in the competition 8 | """ 9 | 10 | import csv 11 | import numpy as np 12 | import pandas as pd 13 | 14 | def getSeedStats(seeds_file): 15 | seeds_file_handle = open(seeds_file, 'r') 16 | reader = csv.DictReader(seeds_file_handle) 17 | out_dict = {} 18 | 19 | for row in reader: 20 | season_dict = out_dict.get(row['season'], {}) 21 | season_dict[row['team']] = int(row['seed'][1:3]) 22 | out_dict[row['season']] = season_dict 23 | 24 | return out_dict 25 | 26 | if __name__ == "__main__": 27 | data_path = "/home/sudalai/Others/Kaggle/MMM15/Data/" 28 | test_fixture_file = data_path + "sample_submission.csv" 29 | tourney_seeds_file= data_path + "tourney_seeds.csv" 30 | sub_file = open("sub_seedmodel.csv","w") 31 | sub_file.write('id,pred\n') 32 | 33 | seeds_dict = getSeedStats(tourney_seeds_file) 34 | 35 | test_file_handle = open(test_fixture_file,'r') 36 | reader = csv.DictReader(test_file_handle) 37 | for row in reader: 38 | id_val = row['id'] 39 | season = id_val.split("_")[0] 40 | fteam = id_val.split("_")[1] 41 | steam = id_val.split("_")[2] 42 | 43 | fteam_seed = seeds_dict[season][fteam] 44 | steam_seed = seeds_dict[season][steam] 45 | 46 | pred_val = 0.5 + ((steam_seed - fteam_seed)*0.03) 47 | 48 | sub_file.write(str(id_val) + "," + str(pred_val) + "\n") 49 | sub_file.close() 50 | -------------------------------------------------------------------------------- /OutBrain/ftrl.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | """ 3 | Thanks to tinrtgu for the wonderful base script 4 | Use pypy for faster computations.! 5 | """ 6 | import csv 7 | from datetime import datetime 8 | from csv import DictReader 9 | from math import exp, log, sqrt 10 | 11 | 12 | ############################################################################## 13 | # parameters ################################################################# 14 | ############################################################################## 15 | 16 | # A, paths 17 | data_path = "../input/" 18 | train = data_path+'clicks_train.csv' # path to training file 19 | test = data_path+'clicks_test.csv' # path to testing file 20 | submission = 'sub_proba.csv' # path of to be outputted submission file 21 | 22 | # B, model 23 | alpha = .1 # learning rate 24 | beta = 0. # smoothing parameter for adaptive learning rate 25 | L1 = 0. # L1 regularization, larger value means more regularized 26 | L2 = 0. # L2 regularization, larger value means more regularized 27 | 28 | # C, feature/hash trick 29 | D = 2 ** 20 # number of weights to use 30 | interaction = False # whether to enable poly2 feature interactions 31 | 32 | # D, training/validation 33 | epoch = 1 # learn training data for N passes 34 | holdafter = None # data after date N (exclusive) are used as validation 35 | holdout = None # use every N training instance for holdout validation 36 | 37 | 38 | ############################################################################## 39 | # class, function, generator definitions ##################################### 40 | ############################################################################## 41 | 42 | class ftrl_proximal(object): 43 | ''' Our main algorithm: Follow the regularized leader - proximal 44 | 45 | In short, 46 | this is an adaptive-learning-rate sparse logistic-regression with 47 | efficient L1-L2-regularization 48 | 49 | Reference: 50 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 51 | ''' 52 | 53 | def __init__(self, alpha, beta, L1, L2, D, interaction): 54 | # parameters 55 | self.alpha = alpha 56 | self.beta = beta 57 | self.L1 = L1 58 | self.L2 = L2 59 | 60 | # feature related parameters 61 | self.D = D 62 | self.interaction = interaction 63 | 64 | # model 65 | # n: squared sum of past gradients 66 | # z: weights 67 | # w: lazy weights 68 | self.n = [0.] * D 69 | self.z = [0.] * D 70 | self.w = {} 71 | 72 | def _indices(self, x): 73 | ''' A helper generator that yields the indices in x 74 | 75 | The purpose of this generator is to make the following 76 | code a bit cleaner when doing feature interaction. 77 | ''' 78 | 79 | # first yield index of the bias term 80 | yield 0 81 | 82 | # then yield the normal indices 83 | for index in x: 84 | yield index 85 | 86 | # now yield interactions (if applicable) 87 | if self.interaction: 88 | D = self.D 89 | L = len(x) 90 | 91 | x = sorted(x) 92 | for i in xrange(L): 93 | for j in xrange(i+1, L): 94 | # one-hot encode interactions with hash trick 95 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 96 | 97 | def predict(self, x): 98 | ''' Get probability estimation on x 99 | 100 | INPUT: 101 | x: features 102 | 103 | OUTPUT: 104 | probability of p(y = 1 | x; w) 105 | ''' 106 | 107 | # parameters 108 | alpha = self.alpha 109 | beta = self.beta 110 | L1 = self.L1 111 | L2 = self.L2 112 | 113 | # model 114 | n = self.n 115 | z = self.z 116 | w = {} 117 | 118 | # wTx is the inner product of w and x 119 | wTx = 0. 120 | for i in self._indices(x): 121 | sign = -1. 
if z[i] < 0 else 1. # get sign of z[i] 122 | 123 | # build w on the fly using z and n, hence the name - lazy weights 124 | # we are doing this at prediction instead of update time is because 125 | # this allows us for not storing the complete w 126 | if sign * z[i] <= L1: 127 | # w[i] vanishes due to L1 regularization 128 | w[i] = 0. 129 | else: 130 | # apply prediction time L1, L2 regularization to z and get w 131 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 132 | 133 | wTx += w[i] 134 | 135 | # cache the current w for update stage 136 | self.w = w 137 | 138 | # bounded sigmoid function, this is the probability estimation 139 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 140 | 141 | def update(self, x, p, y): 142 | ''' Update model using x, p, y 143 | 144 | INPUT: 145 | x: feature, a list of indices 146 | p: click probability prediction of our model 147 | y: answer 148 | 149 | MODIFIES: 150 | self.n: increase by squared gradient 151 | self.z: weights 152 | ''' 153 | 154 | # parameter 155 | alpha = self.alpha 156 | 157 | # model 158 | n = self.n 159 | z = self.z 160 | w = self.w 161 | 162 | # gradient under logloss 163 | g = p - y 164 | 165 | # update z and n 166 | for i in self._indices(x): 167 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 168 | z[i] += g - sigma * w[i] 169 | n[i] += g * g 170 | 171 | 172 | def logloss(p, y): 173 | ''' FUNCTION: Bounded logloss 174 | 175 | INPUT: 176 | p: our prediction 177 | y: real answer 178 | 179 | OUTPUT: 180 | logarithmic loss of p given y 181 | ''' 182 | 183 | p = max(min(p, 1. - 10e-15), 10e-15) 184 | return -log(p) if y == 1. else -log(1. - p) 185 | 186 | 187 | def data(path, D): 188 | ''' GENERATOR: Apply hash-trick to the original csv row 189 | and for simplicity, we one-hot-encode everything 190 | 191 | INPUT: 192 | path: path to training or testing file 193 | D: the max index that we can hash to 194 | 195 | YIELDS: 196 | ID: id of the instance, mainly useless 197 | x: a list of hashed and one-hot-encoded 'indices' 198 | we only need the index since all values are either 0 or 1 199 | y: y = 1 if we have a click, else we have y = 0 200 | ''' 201 | 202 | for t, row in enumerate(DictReader(open(path))): 203 | # process id 204 | disp_id = int(row['display_id']) 205 | ad_id = int(row['ad_id']) 206 | 207 | # process clicks 208 | y = 0. 209 | if 'clicked' in row: 210 | if row['clicked'] == '1': 211 | y = 1. 
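# --- Editor's note (added): the block just below is the hashing trick. Every
# 'field_value' string is hashed into one of D = 2**20 buckets and only the
# bucket index is kept, so each row becomes a short list of active one-hot
# indices no matter how many distinct categories exist. Collisions are accepted
# by design; they bound memory at the cost of a little accuracy. Python 3
# randomizes str hashes per process (PYTHONHASHSEED), so hashed indices are only
# reproducible across runs under Python 2 / pypy as assumed here, or with the
# hash seed fixed.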
212 | del row['clicked'] 213 | 214 | x = [] 215 | for key in row: 216 | x.append(abs(hash(key + '_' + row[key])) % D) 217 | 218 | row = prcont_dict.get(ad_id, []) 219 | # build x 220 | ad_doc_id = -1 221 | for ind, val in enumerate(row): 222 | if ind==0: 223 | ad_doc_id = int(val) 224 | x.append(abs(hash(prcont_header[ind] + '_' + val)) % D) 225 | 226 | row = event_dict.get(disp_id, []) 227 | ## build x 228 | disp_doc_id = -1 229 | for ind, val in enumerate(row): 230 | if ind==0: 231 | uuid_val = val 232 | if ind==1: 233 | disp_doc_id = int(val) 234 | x.append(abs(hash(event_header[ind] + '_' + val)) % D) 235 | 236 | if (ad_doc_id in leak_uuid_dict) and (uuid_val in leak_uuid_dict[ad_doc_id]): 237 | x.append(abs(hash('leakage_row_found_1'))%D) 238 | else: 239 | x.append(abs(hash('leakage_row_not_found'))%D) 240 | 241 | yield t, disp_id, ad_id, x, y 242 | 243 | 244 | ############################################################################## 245 | # start training ############################################################# 246 | ############################################################################## 247 | 248 | start = datetime.now() 249 | 250 | # initialize ourselves a learner 251 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 252 | 253 | print("Content..") 254 | with open(data_path + "promoted_content.csv") as infile: 255 | prcont = csv.reader(infile) 256 | #prcont_header = (prcont.next())[1:] 257 | prcont_header = next(prcont)[1:] 258 | prcont_dict = {} 259 | for ind,row in enumerate(prcont): 260 | prcont_dict[int(row[0])] = row[1:] 261 | if ind%100000 == 0: 262 | print(ind) 263 | print(len(prcont_dict)) 264 | del prcont 265 | 266 | print("Events..") 267 | with open(data_path + "events.csv") as infile: 268 | events = csv.reader(infile) 269 | #events.next() 270 | next(events) 271 | event_header = ['uuid', 'document_id', 'platform', 'geo_location', 'loc_country', 'loc_state', 'loc_dma'] 272 | event_dict = {} 273 | for ind,row in enumerate(events): 274 | tlist = row[1:3] + row[4:6] 275 | loc = row[5].split('>') 276 | if len(loc) == 3: 277 | tlist.extend(loc[:]) 278 | elif len(loc) == 2: 279 | tlist.extend( loc[:]+['']) 280 | elif len(loc) == 1: 281 | tlist.extend( loc[:]+['','']) 282 | else: 283 | tlist.append(['','','']) 284 | event_dict[int(row[0])] = tlist[:] 285 | if ind%100000 == 0: 286 | print("Events : ", ind) 287 | print(len(event_dict)) 288 | del events 289 | 290 | print("Leakage file..") 291 | leak_uuid_dict= {} 292 | """ 293 | with open(data_path+"leak_uuid_doc.csv") as infile: 294 | doc = csv.reader(infile) 295 | doc.next() 296 | leak_uuid_dict = {} 297 | for ind, row in enumerate(doc): 298 | doc_id = int(row[0]) 299 | leak_uuid_dict[doc_id] = set(row[1].split(' ')) 300 | if ind%100000==0: 301 | print("Leakage file : ", ind) 302 | print(len(leak_uuid_dict)) 303 | del doc 304 | """ 305 | 306 | # start training 307 | for e in range(epoch): 308 | loss = 0. 
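# --- Editor's note (added): `loss` and `count` below accumulate the bounded
# logloss over held-out rows only (the holdafter/holdout branch further down),
# so loss/count is a running estimate of validation logloss. In this version
# holdafter and holdout are both None and loss is never printed, so every row is
# used for training; to monitor validation one could, for example, set
# holdout = 100 and print loss/count alongside the progress message (an editor
# suggestion, not part of the original).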
309 | count = 0 310 | date = 0 311 | 312 | for t, disp_id, ad_id, x, y in data(train, D): # data is a generator 313 | # t: just a instance counter 314 | # date: you know what this is 315 | # ID: id provided in original data 316 | # x: features 317 | # y: label (click) 318 | 319 | # step 1, get prediction from learner 320 | p = learner.predict(x) 321 | 322 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 323 | # step 2-1, calculate validation loss 324 | # we do not train with the validation data so that our 325 | # validation loss is an accurate estimation 326 | # 327 | # holdafter: train instances from day 1 to day N 328 | # validate with instances from day N + 1 and after 329 | # 330 | # holdout: validate with every N instance, train with others 331 | loss += logloss(p, y) 332 | count += 1 333 | else: 334 | # step 2-2, update learner with label (click) information 335 | learner.update(x, p, y) 336 | 337 | if t%1000000 == 0: 338 | print("Processed : ", t, datetime.now()) 339 | 340 | 341 | 342 | ############################################################################## 343 | # start testing, and build Kaggle's submission file ########################## 344 | ############################################################################## 345 | 346 | with open(submission, 'w') as outfile: 347 | outfile.write('display_id,ad_id,clicked\n') 348 | for t, disp_id, ad_id, x, y in data(test, D): 349 | p = learner.predict(x) 350 | outfile.write('%s,%s,%s\n' % (disp_id, ad_id, str(p))) 351 | if t%1000000 == 0: 352 | print("Processed : ", t, datetime.now()) 353 | -------------------------------------------------------------------------------- /OutBrain/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Kaggle competition - [Outbrain Click Prediction](https://www.kaggle.com/c/outbrain-click-prediction) 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle 2 | This repository contains the codes that has been written for Kaggle competitions 3 | 4 | Kaggle Profile : http://www.kaggle.com/sudalairajkumar 5 | -------------------------------------------------------------------------------- /SantanderReco/keras_starter_kaggle.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | 6 | from keras.models import Sequential 7 | from keras.layers.core import Dense, Activation, Merge, Reshape, Dropout 8 | from keras.layers.embeddings import Embedding 9 | from keras.optimizers import SGD 10 | from keras.layers.normalization import BatchNormalization 11 | import cPickle as pkl 12 | 13 | np.random.seed(12345) 14 | 15 | # mapping dict to map the categories to numerical values # 16 | mapping_dict = { 17 | 'ind_empleado' : {'N':0, -99:1, 'B':2, 'F':3, 'A':4, 'S':5}, 18 | 'sexo' : {'V':0, 'H':1, -99:2}, 19 | 'ind_nuevo' : {0.0:0, 1.0:1, -99.0:2}, 20 | 'indrel' : {1.0:0, 99.0:1, -99.0:2}, 21 | 'indrel_1mes' : {-99:0, 1.0:1, 1:1, 2.0:2, 2:2, 3.0:3, 3:3, 4.0:4, 4:4, 'P':5}, 22 | 'tiprel_1mes' : {-99:0, 'I':1, 'A':2, 'P':3, 'R':4, 'N':5}, 23 | 'indresi' : {-99:0, 'S':1, 'N':2}, 24 | 'indext' : {-99:0, 'S':1, 'N':2}, 25 | 'conyuemp' : {-99:0, 'S':1, 'N':2}, 26 | 'indfall' : {-99:0, 'S':1, 'N':2}, 27 | 'tipodom' : {-99.0:0, 1.0:1}, 28 | 'ind_actividad_cliente' : {0.0:0, 1.0:1, 
-99.0:2}, 29 | 'segmento' : {'02 - PARTICULARES':0, '03 - UNIVERSITARIO':1, '01 - TOP':2, -99:2}, 30 | 'pais_residencia' : {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62, 'JP': 82, 'JM': 116, 'BR': 17, 'BY': 64, 'BZ': 113, 'RU': 43, 'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73, 'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96, 'GH': 88, 'OM': 100, 'HR': 67, 'HU': 106, 'HK': 34, 'HN': 22, 'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20, 'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75, 'EC': 19, 'AL': 25, 'VN': 90, 'ET': 54, 'ZW': 114, 'ES': 0, 'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118, 'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38, 'FI': 23, 'NI': 33, 'NL': 7, 'NO': 46, 'NG': 83, 'NZ': 93, 'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4, 'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32, 'CU': 72, 'KE': 65, 'KH': 95, 'SV': 53, 'SK': 69, 'KR': 87, 'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66, 'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80, 'MK': 105, -99: 1, 'LB': 81, 'TW': 29, 'TR': 70, 'TN': 85, 'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37, 'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63, 'AT': 6, 'IN': 31, 'IE': 5, 'QA': 58, 'MZ': 27}, 31 | 'canal_entrada' : {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161, 'KHS': 162, 'KHK': 10, 'KHL': 0, 'KHM': 12, 'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9, 'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159, 'KAC': 57, 'KAB': 28, 'KAA': 39, 'KAG': 26, 'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51, 'KAJ': 41, 'KAI': 35, 'KAH': 31, 'KAO': 94, 'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70, 'KAR': 32, 'KAQ': 37, 'KAP': 46, 'KAW': 76, 'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7, 'KAY': 54, 'KBJ': 133, 'KBH': 90, 'KBN': 122, 'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131, 'KBF': 102, 'KBG': 17, 'KBD': 109, 'KBE': 119, 'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101, 'KBS': 118, 'KBP': 121, 'KBQ': 62, 'KBV': 100, 'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85, 'KCG': 59, 'KCF': 105, 'KCA': 73, 'KCC': 29, 'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104, 'KCN': 81, 'KCI': 65, 'KCH': 84, 'KCK': 52, 'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106, 'KCQ': 154, 'KCP': 129, 'KCS': 77, 'KCR': 153, 'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130, 'KDN': 151, 'KDO': 60, 'KDH': 14, 'KDI': 150, 'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126, 'KDA': 63, 'KDB': 117, 'KDC': 75, 'KDX': 69, 'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79, 'KDV': 91, 'KDW': 132, 'KDP': 103, 'KDQ': 80, 'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96, 'KEN': 137, 'KEM': 155, 'KEL': 125, 'KEK': 145, 'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136, 'KEF': 128, 'KEE': 152, 'KED': 143, 'KEC': 66, 'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93, 'KEW': 98, 'KEV': 87, 'KEU': 72, 'KES': 68, 'KEQ': 138, -99: 6, 'KFV': 48, 'KFT': 92, 'KFU': 36, 'KFR': 144, 'KFS': 38, 'KFP': 40, 'KFF': 45, 'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146, 'KFC': 4, 'KFA': 3, 'KFN': 42, 'KFL': 34, 'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140, 'KFI': 134, '007': 71, '004': 83, 'KGU': 149, 'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24, 'KGC': 18, 'KGN': 11}, 32 | 'nomprov' : {'ZARAGOZA': 2, 'BURGOS': 11, 'GRANADA': 46, 'MADRID': 18, 'CIUDAD REAL': 1, 'GIRONA': 7, 'TARRAGONA': 50, 'LEON': 4, 'SORIA': 20, 'SANTA CRUZ DE TENERIFE': 48, 'CEUTA': 52, 'HUESCA': 12, 'VALLADOLID': 24, 'LERIDA': 17, 'ZAMORA': 8, 'CUENCA': 31, 'RIOJA, LA': 34, 'TERUEL': 27, 'PONTEVEDRA': 25, 'MELILLA': 49, 'CORDOBA': 44, 'SEVILLA': 21, -99: 
39, 'ALICANTE': 19, 'CASTELLON': 33, 'OURENSE': 29, 'VALENCIA': 26, 'CORU\xc3\x91A, A': 28, 'HUELVA': 45, 'ALBACETE': 35, 'JAEN': 30, 'CADIZ': 38, 'BADAJOZ': 36, 'TOLEDO': 3, 'AVILA': 14, 'BARCELONA': 9, 'SEGOVIA': 15, 'NAVARRA': 13, 'MALAGA': 0, 'SALAMANCA': 10, 'PALENCIA': 42, 'ALMERIA': 40, 'MURCIA': 37, 'GUADALAJARA': 41, 'ASTURIAS': 47, 'BALEARS, ILLES': 23, 'ALAVA': 51, 'LUGO': 16, 'CANTABRIA': 22, 'CACERES': 6, 'PALMAS, LAS': 43, 'GIPUZKOA': 5, 'BIZKAIA': 32, 'CORUNA, A':28} 33 | } 34 | 35 | dtype_list = {'ind_cco_fin_ult1': 'float16', 'ind_deme_fin_ult1': 'float16', 'ind_aval_fin_ult1': 'float16', 'ind_valo_fin_ult1': 'float16', 'ind_reca_fin_ult1': 'float16', 'ind_ctju_fin_ult1': 'float16', 'ind_cder_fin_ult1': 'float16', 'ind_plan_fin_ult1': 'float16', 'ind_fond_fin_ult1': 'float16', 'ind_hip_fin_ult1': 'float16', 'ind_pres_fin_ult1': 'float16', 'ind_nomina_ult1': 'float16', 'ind_cno_fin_ult1': 'float16', 'ncodpers': 'int64', 'ind_ctpp_fin_ult1': 'float16', 'ind_ahor_fin_ult1': 'float16', 'ind_dela_fin_ult1': 'float16', 'ind_ecue_fin_ult1': 'float16', 'ind_nom_pens_ult1': 'float16', 'ind_recibo_ult1': 'float16', 'ind_deco_fin_ult1': 'float16', 'ind_tjcr_fin_ult1': 'float16', 'ind_ctop_fin_ult1': 'float16', 'ind_viv_fin_ult1': 'float16', 'ind_ctma_fin_ult1': 'float16'} 36 | 37 | # categorical columns to use # 38 | cols_to_use = mapping_dict.keys() 39 | print(cols_to_use) 40 | 41 | # target columns to predict # 42 | target_cols = ['ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1','ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1','ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1','ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1','ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1'] 43 | print(target_cols) 44 | 45 | # one hot encode fit for all the categorical variables # 46 | ohes = [] 47 | feat_count = 0 48 | for col in cols_to_use: 49 | ohe = preprocessing.OneHotEncoder() 50 | ohe.fit(np.array(mapping_dict[col].values()).reshape(-1,1)) 51 | feat_count += ohe.n_values_[0] 52 | print(col, feat_count) 53 | ohes.append(ohe) 54 | 55 | 56 | def batch_generator(file_name, batch_size, shuffle, train_input=True): 57 | while (True): 58 | if train_input: 59 | chunked_df = pd.read_csv(file_name, usecols=['ncodpers']+cols_to_use+target_cols, chunksize=batch_size) 60 | else: 61 | chunked_df = pd.read_csv(file_name, usecols=['ncodpers']+cols_to_use, chunksize=batch_size) 62 | 63 | nrows = 0 64 | for chunk_df in chunked_df: 65 | chunk_X = chunk_df[cols_to_use] 66 | chunk_X = chunk_X.fillna(-99) 67 | for col_ind, col in enumerate(cols_to_use): 68 | chunk_X[col] = chunk_X[col].apply(lambda x: mapping_dict[col][x]) 69 | ohe = ohes[col_ind] 70 | temp_X = ohe.transform( np.array(chunk_X[col]).reshape(-1,1) ) 71 | if col_ind == 0: 72 | X = temp_X.todense().copy() 73 | else: 74 | X = np.hstack((X, temp_X.todense())) 75 | 76 | if train_input: 77 | y = np.array(chunk_df[target_cols].fillna(0)) 78 | 79 | if shuffle: 80 | shuffle_index = np.random.shuffle(np.arange(X.shape[0])) 81 | X = X[shuffle_index,:] 82 | if train_input: 83 | y = y[shuffle_index,:] 84 | 85 | 86 | if train_input: 87 | yield X, y 88 | else: 89 | yield X 90 | 91 | nrows += batch_size 92 | if train_input and nrows >= train_size: 93 | break 94 | 95 | 96 | def keras_embedding_model(): 97 | # keras model architecture # 98 | 
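# Architecture note: the layers defined below form a small multi-label network -- a single
# 50-unit ReLU hidden layer on top of the one-hot encoded categorical features, and a
# sigmoid output with one unit per entry in target_cols; trained with binary_crossentropy,
# each of the 24 products is scored as an independent probability rather than a softmax.
# The calls here use the Keras 1.x API; on Keras 2.x the rough equivalents (assumed, not
# part of the original script) are init= -> kernel_initializer=, nb_epoch= -> epochs=,
# samples_per_epoch= -> steps_per_epoch= (counted in batches), and predict_generator's
# val_samples= -> steps= (number of batches rather than samples).
# Also note that in batch_generator above, np.random.shuffle returns None, so the
# shuffle=True branch does not yield a usable index; the script only ever calls the
# generator with shuffle=False, so that branch is never exercised.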
final_model = Sequential() 99 | final_model.add(Dense(50, input_dim=feat_count, init='he_uniform')) 100 | final_model.add(Activation('relu')) 101 | final_model.add(Dense(len(target_cols), init='zero')) 102 | final_model.add(Activation('sigmoid')) 103 | final_model.compile(loss='binary_crossentropy', optimizer='adam') 104 | return final_model 105 | 106 | if __name__ == "__main__": 107 | train = "../input/train_ver2.csv" 108 | test = "../input/test_ver2.csv" 109 | #train_size = 13647309 110 | train_size = 1000000 111 | test_size = 929615 112 | print("Initialize the model..") 113 | model = keras_embedding_model() 114 | print("Model fit..") 115 | fit= model.fit_generator( 116 | generator = batch_generator(train, 500, False), 117 | nb_epoch = 1, 118 | samples_per_epoch = train_size 119 | ) 120 | preds = model.predict_generator(generator=batch_generator(test, 10000, False, False), val_samples=test_size) 121 | print("Predictions : ", preds.shape) 122 | 123 | last_instance_df = pd.read_csv(train, usecols=['ncodpers']+target_cols, dtype=dtype_list) 124 | last_instance_df = last_instance_df.drop_duplicates('ncodpers', keep='last') 125 | last_instance_df = last_instance_df.fillna(0).astype('int') 126 | cust_dict = {} 127 | target_cols = np.array(target_cols) 128 | for ind, row in last_instance_df.iterrows(): 129 | cust = row['ncodpers'] 130 | used_products = set(target_cols[np.array(row[1:])==1]) 131 | cust_dict[cust] = used_products 132 | del last_instance_df 133 | 134 | target_cols = np.array(target_cols) 135 | preds = np.argsort(preds, axis=1) 136 | preds = np.fliplr(preds) 137 | test_id = np.array(pd.read_csv(test, usecols=['ncodpers'])['ncodpers']) 138 | final_preds = [] 139 | for ind, pred in enumerate(preds): 140 | cust = test_id[ind] 141 | top_products = target_cols[pred] 142 | used_products = cust_dict.get(cust,[]) 143 | new_top_products = [] 144 | for product in top_products: 145 | if product not in used_products: 146 | new_top_products.append(product) 147 | if len(new_top_products) == 7: 148 | break 149 | final_preds.append(" ".join(new_top_products)) 150 | out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds}) 151 | out_df.to_csv('sub_keras.csv', index=False) 152 | 153 | 154 | -------------------------------------------------------------------------------- /SantanderReco/multilabel_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import cPickle as pkl 4 | from sklearn import preprocessing, ensemble 5 | 6 | # columns to be used as features # 7 | #feature_cols = ["ind_empleado","pais_residencia","sexo","age","ind_nuevo","antiguedad","indrel","ult_fec_cli_1t","indrel_1mes","tiprel_1mes","indresi","indext","conyuemp","canal_entrada","indfall","tipodom","cod_prov","nomprov","ind_actividad_cliente","renta","segmento"] 8 | feature_cols = ["ind_empleado","pais_residencia","sexo","age", "ind_nuevo", "antiguedad", "nomprov", "segmento", "ind_actividad_cliente", "indresi"] 9 | feature_cols = ["ind_empleado","pais_residencia","sexo","age"] 10 | feature_cols = ["ind_empleado","pais_residencia"] 11 | 12 | target_cols = 
['ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1','ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1','ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1','ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1','ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1'] 13 | 14 | dtype_list = {'ind_cco_fin_ult1': 'float16', 'ind_deme_fin_ult1': 'float16', 'ind_aval_fin_ult1': 'float16', 'ind_valo_fin_ult1': 'float16', 'ind_reca_fin_ult1': 'float16', 'ind_ctju_fin_ult1': 'float16', 'ind_cder_fin_ult1': 'float16', 'ind_plan_fin_ult1': 'float16', 'ind_fond_fin_ult1': 'float16', 'ind_hip_fin_ult1': 'float16', 'ind_pres_fin_ult1': 'float16', 'ind_nomina_ult1': 'float16', 'ind_cno_fin_ult1': 'float16', 'ncodpers': 'int64', 'ind_ctpp_fin_ult1': 'float16', 'ind_ahor_fin_ult1': 'float16', 'ind_dela_fin_ult1': 'float16', 'ind_ecue_fin_ult1': 'float16', 'ind_nom_pens_ult1': 'float16', 'ind_recibo_ult1': 'float16', 'ind_deco_fin_ult1': 'float16', 'ind_tjcr_fin_ult1': 'float16', 'ind_ctop_fin_ult1': 'float16', 'ind_viv_fin_ult1': 'float16', 'ind_ctma_fin_ult1': 'float16'} 15 | 16 | if __name__ == "__main__": 17 | data_path = "../input/" 18 | train_file = data_path + "train_ver2.csv" 19 | test_file = data_path + "test_ver2.csv" 20 | train_size = 13647309 21 | nrows = 1000000 # change this value to read more rows from train 22 | 23 | start_index = train_size - nrows 24 | for ind, col in enumerate(feature_cols): 25 | print(col) 26 | train = pd.read_csv(train_file, usecols=[col]) 27 | test = pd.read_csv(test_file, usecols=[col]) 28 | train.fillna(-99, inplace=True) 29 | test.fillna(-99, inplace=True) 30 | if train[col].dtype == "object": 31 | le = preprocessing.LabelEncoder() 32 | le.fit(list(train[col].values) + list(test[col].values)) 33 | temp_train_X = le.transform(list(train[col].values)).reshape(-1,1)[start_index:,:] 34 | temp_test_X = le.transform(list(test[col].values)).reshape(-1,1) 35 | else: 36 | temp_train_X = np.array(train[col]).reshape(-1,1)[start_index:,:] 37 | temp_test_X = np.array(test[col]).reshape(-1,1) 38 | if ind == 0: 39 | train_X = temp_train_X.copy() 40 | test_X = temp_test_X.copy() 41 | else: 42 | train_X = np.hstack([train_X, temp_train_X]) 43 | test_X = np.hstack([test_X, temp_test_X]) 44 | print(train_X.shape, test_X.shape) 45 | del train 46 | del test 47 | 48 | train_y = pd.read_csv(train_file, usecols=['ncodpers']+target_cols, dtype=dtype_list) 49 | last_instance_df = train_y.drop_duplicates('ncodpers', keep='last') 50 | train_y = np.array(train_y.fillna(0)).astype('int')[start_index:,1:] 51 | print(train_X.shape, train_y.shape) 52 | print(test_X.shape) 53 | 54 | print("Running Model..") 55 | model = ensemble.RandomForestClassifier(n_estimators=5, max_depth=10, min_samples_leaf=10, n_jobs=4, random_state=2016) 56 | model.fit(train_X, train_y) 57 | del train_X, train_y 58 | print("Predicting..") 59 | preds = np.array(model.predict_proba(test_X))[:,:,1].T 60 | del test_X 61 | #print preds.shape 62 | 63 | print("Getting last instance dict..") 64 | last_instance_df = last_instance_df.fillna(0).astype('int') 65 | cust_dict = {} 66 | target_cols = np.array(target_cols) 67 | for ind, row in last_instance_df.iterrows(): 68 | cust = row['ncodpers'] 69 | used_products = set(target_cols[np.array(row[1:])==1]) 70 | cust_dict[cust] = used_products 71 | del last_instance_df 72 | print row 
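# cust_dict maps each customer id (ncodpers) to the set of products flagged 1 in that
# customer's most recent training row, e.g. {<ncodpers>: {'ind_cco_fin_ult1', ...}}
# (illustrative values); it is used further down to drop products a customer already
# owns before picking the top-7 recommendations. The bare print statements just above
# and below are Python 2 style debug output for the last processed row and customer.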
73 | print cust, cust_dict[cust] 74 | 75 | print("Creating submission..") 76 | preds = np.argsort(preds, axis=1) 77 | preds = np.fliplr(preds) 78 | #print preds.shape 79 | test_id = np.array(pd.read_csv(test_file, usecols=['ncodpers'])['ncodpers']) 80 | final_preds = [] 81 | for ind, pred in enumerate(preds): 82 | cust = test_id[ind] 83 | top_products = target_cols[pred] 84 | used_products = cust_dict.get(cust,[]) 85 | print cust, used_products 86 | new_top_products = [] 87 | for product in top_products: 88 | if product not in used_products: 89 | new_top_products.append(product) 90 | if len(new_top_products) == 7: 91 | break 92 | final_preds.append(" ".join(new_top_products)) 93 | out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds}) 94 | out_df.to_csv('sub_rf.csv', index=False) 95 | 96 | 97 | -------------------------------------------------------------------------------- /SantanderReco/readme.md: -------------------------------------------------------------------------------- 1 | Codes for Kaggle competition - [Santander Product recommendation](https://www.kaggle.com/c/santander-product-recommendation) is present in this folder. 2 | -------------------------------------------------------------------------------- /SpookyAuthor/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Kaggle - Spooky Author challenge 2 | -------------------------------------------------------------------------------- /Titanic/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the titanic competition 2 | -------------------------------------------------------------------------------- /TransferLearningStackExchange/frequent_words_model.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | import operator 4 | from collections import defaultdict 5 | 6 | stop_words = set(['a', "a's", 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', "c'mon", "c's", 'came', 'can', "can't", 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', "couldn't", 'course', 'currently', 'd', 'definitely', 'described', 'despite', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', "don't", 'done', 'down', 'downwards', 'during', 'e', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'f', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 
'from', 'further', 'furthermore', 'g', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'h', 'had', "hadn't", 'happens', 'hardly', 'has', "hasn't", 'have', "haven't", 'having', 'he', "he's", 'hello', 'help', 'hence', 'her', 'here', "here's", 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', 'j', 'just', 'k', 'keep', 'keeps', 'kept', 'know', 'knows', 'known', 'l', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'm', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'n', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'o', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'p', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'q', 'que', 'quite', 'qv', 'r', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 't', "t's", 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', "there's", 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', "they'd", "they'll", "they're", "they've", 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'uucp', 'v', 'value', 'various', 'very', 'via', 'viz', 'vs', 'w', 'want', 'wants', 'was', "wasn't", 'way', 'we', "we'd", "we'll", "we're", "we've", 'welcome', 'well', 'went', 'were', "weren't", 'what', "what's", 'whatever', 'when', 'whence', 'whenever', 'where', "where's", 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', "who's", 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 
"won't", 'wonder', 'would', 'would', "wouldn't", 'x', 'y', 'yes', 'yet', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 'z', 'zero', '']) 7 | def f1_score(tp, fp, fn): 8 | p = (tp*1.) / (tp+fp) 9 | r = (tp*1.) / (tp+fn) 10 | f1 = (2*p*r)/(p+r) 11 | return f1 12 | 13 | def clean_html(raw_html): 14 | cleanr = re.compile('<.*?>') 15 | cleantext = re.sub(cleanr, '', raw_html) 16 | return cleantext 17 | 18 | def get_words(text): 19 | word_split = re.compile('[^a-zA-Z0-9_\\+\\-/]') 20 | return [word.strip().lower() for word in word_split.split(text)] 21 | 22 | data_path = "../input/" 23 | in_file = open(data_path+"test.csv") 24 | out_file = open("sub_freq.csv", "w") 25 | reader = csv.DictReader(in_file) 26 | writer = csv.writer(out_file) 27 | writer.writerow(['id','tags']) 28 | for ind, row in enumerate(reader): 29 | text = clean_html(row["title"]) + " " + clean_html(row['content']) 30 | frequency_dict = defaultdict(int) 31 | for word in get_words(text): 32 | if word not in stop_words: 33 | frequency_dict[word] += 1 34 | pred_tags = set(sorted(frequency_dict, key=frequency_dict.get, reverse=True)[:3]) 35 | writer.writerow([row['id'], " ".join(pred_tags)]) 36 | if ind%50000 == 0: 37 | print("Processed : ", ind) 38 | 39 | 40 | in_file.close() 41 | out_file.close() 42 | -------------------------------------------------------------------------------- /TransferLearningStackExchange/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Kaggle competition - Transfer Learning on Stack Exchange Tags is present here. 2 | -------------------------------------------------------------------------------- /TransferLearningStackExchange/simple_exploration_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "_cell_guid": "249cb54e-4588-5b7d-2c01-60bba33d731e" 7 | }, 8 | "source": [ 9 | "Simple exploration notebook " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "_cell_guid": "50ab2790-3a51-98f3-1e8b-33926c862bfd" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "import seaborn as sns\n", 28 | "from wordcloud import WordCloud\n", 29 | "from nltk.corpus import stopwords\n", 30 | "\n", 31 | "%matplotlib inline" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "_cell_guid": "bbda81c2-5282-479c-a931-743437f6d84b" 38 | }, 39 | "source": [ 40 | "**Wordcloud on tags:**\n", 41 | "\n", 42 | "Let us create a word cloud on the tags column for all topics and see the important tags." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "_cell_guid": "24027035-3f69-5988-2c21-a9fd1522ccfb" 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "topics_list = ['biology', 'cooking', 'crypto', 'diy', 'robotics', 'travel']\n", 54 | "\n", 55 | "for ind, topic in enumerate(topics_list):\n", 56 | " tags = np.array(pd.read_csv(\"../input/\"+topic+\".csv\", usecols=['tags'])['tags'])\n", 57 | " text = ''\n", 58 | " for ind, tag in enumerate(tags):\n", 59 | " text = \" \".join([text, tag])\n", 60 | " text = text.strip()\n", 61 | " \n", 62 | " wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text)\n", 63 | " wordcloud.recolor(random_state=ind*312)\n", 64 | " plt.imshow(wordcloud)\n", 65 | " plt.title(\"Wordcloud for topic : \"+topic)\n", 66 | " plt.axis(\"off\")\n", 67 | " plt.show()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "_cell_guid": "c0f04269-7f45-7cf1-5625-13e04733bc6a" 74 | }, 75 | "source": [ 76 | "**Wordcloud for topic Biology:**\n", 77 | " \n", 78 | "Let us take a single topic 'biology' and then see how the word clouds from each of the three fields title, content, tags compare with each other." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "_cell_guid": "77bb82f3-5925-34e0-7c2c-903ae0ec20d4" 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "bio = pd.read_csv(\"../input/biology.csv\")\n", 90 | "title = np.array(bio['title'])\n", 91 | "content = np.array(bio['content'])\n", 92 | "tags = np.array(bio['tags'])\n", 93 | "\n", 94 | "# wordcloud for tags #\n", 95 | "text = ''\n", 96 | "for ind, tag in enumerate(tags):\n", 97 | " text = \" \".join([text, tag])\n", 98 | "text = text.strip()\n", 99 | "\n", 100 | "wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=80).generate(text)\n", 101 | "wordcloud.recolor(random_state=218)\n", 102 | "plt.imshow(wordcloud)\n", 103 | "plt.axis(\"off\")\n", 104 | "plt.title(\"Wordcloud on 'tags' for biology \")\n", 105 | "plt.show()\n", 106 | "\n", 107 | "# wordcloud for title #\n", 108 | "text = ''\n", 109 | "for ind, tag in enumerate(title):\n", 110 | " text = \" \".join([text, tag])\n", 111 | "text = text.strip()\n", 112 | "\n", 113 | "stop_words = set(stopwords.words('english') + ['sas', 'ss', 'fas', 'des', 'les', 'ess'])\n", 114 | "wordcloud = WordCloud(background_color='white', width=600, height=300, stopwords=stop_words, max_font_size=50, max_words=80).generate(text)\n", 115 | "wordcloud.recolor(random_state=218)\n", 116 | "plt.imshow(wordcloud)\n", 117 | "plt.axis(\"off\")\n", 118 | "plt.title(\"Wordcloud on 'title' for biology \")\n", 119 | "plt.show()\n", 120 | "\n", 121 | "### Commenting this out for now as it throws error while rendering and not while running it at the backend ###\n", 122 | "## wordcloud for content #\n", 123 | "#text = ''\n", 124 | "#for ind, tag in enumerate(content):\n", 125 | "# text = \" \".join([text, tag])\n", 126 | "#text = text.strip()\n", 127 | "\n", 128 | "#stop_words = set(stopwords.words('english') + ['rbs', 'sas', 'ss', 'fas', 'des', 'ess', 'les', 'bas', 'poses', 'los', 'ros', 'cs'])\n", 129 | "#wordcloud = WordCloud(background_color='white', width=600, height=300, stopwords=stop_words, max_font_size=50, max_words=80).generate(text)\n", 130 | "#wordcloud.recolor(random_state=218)\n", 131 | "#plt.imshow(wordcloud)\n", 132 | "#plt.axis(\"off\")\n", 133 | 
"#plt.title(\"Wordcloud on 'content' for biology \")\n", 134 | "#plt.show()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "_cell_guid": "6948ea6a-2d00-f199-e4ef-a761f640021e" 141 | }, 142 | "source": [ 143 | "As we can see, wordcloud from 'topic' is decent and has some important words related to the topic and can be used for tag creation.\n", 144 | "\n", 145 | "Wordcloud from 'content' has more irrelevant (html) words and so we need to do proper cleaning to remove those before we start with our modeling / learning.\n", 146 | "\n", 147 | "More to come...!" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "_change_revision": 0, 153 | "_is_fork": false, 154 | "kernelspec": { 155 | "display_name": "Python 3", 156 | "language": "python", 157 | "name": "python3" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 3 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython3", 169 | "version": "3.5.2" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 0 174 | } 175 | -------------------------------------------------------------------------------- /TwoSigmaConnect_RentHop/XGBStarterInPython.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "_cell_guid": "6e711393-7a75-17e3-539c-9169c1ae1225" 7 | }, 8 | "source": [ 9 | "It seems the current [high scoring script][1] is written in R using H2O. So let us do one in python using XGBoost. \n", 10 | "\n", 11 | "Thanks to [this script][2] for feature engineering ideas. \n", 12 | "\n", 13 | "We shall start with importing the necessary modules\n", 14 | "\n", 15 | "\n", 16 | " [1]: https://www.kaggle.com/gospursgo/two-sigma-connect-rental-listing-inquiries/h2o-starter-pack/run/835757\n", 17 | " [2]: https://www.kaggle.com/aikinogard/two-sigma-connect-rental-listing-inquiries/random-forest-starter-with-numerical-features" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "_cell_guid": "1952347b-6dc9-b9f1-fa25-94587a2aee77" 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "import sys\n", 30 | "import operator\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from scipy import sparse\n", 34 | "import xgboost as xgb\n", 35 | "from sklearn import model_selection, preprocessing, ensemble\n", 36 | "from sklearn.metrics import log_loss\n", 37 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "_cell_guid": "d7d59f0a-0026-8e33-6236-31637173734f" 44 | }, 45 | "source": [ 46 | "Now let us write a custom function to run the xgboost model." 
47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "_cell_guid": "af6e68af-f7a8-b0ac-c565-1d04818258f9" 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):\n", 58 | " param = {}\n", 59 | " param['objective'] = 'multi:softprob'\n", 60 | " param['eta'] = 0.1\n", 61 | " param['max_depth'] = 6\n", 62 | " param['silent'] = 1\n", 63 | " param['num_class'] = 3\n", 64 | " param['eval_metric'] = \"mlogloss\"\n", 65 | " param['min_child_weight'] = 1\n", 66 | " param['subsample'] = 0.7\n", 67 | " param['colsample_bytree'] = 0.7\n", 68 | " param['seed'] = seed_val\n", 69 | " num_rounds = num_rounds\n", 70 | "\n", 71 | " plst = list(param.items())\n", 72 | " xgtrain = xgb.DMatrix(train_X, label=train_y)\n", 73 | "\n", 74 | " if test_y is not None:\n", 75 | " xgtest = xgb.DMatrix(test_X, label=test_y)\n", 76 | " watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]\n", 77 | " model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)\n", 78 | " else:\n", 79 | " xgtest = xgb.DMatrix(test_X)\n", 80 | " model = xgb.train(plst, xgtrain, num_rounds)\n", 81 | "\n", 82 | " pred_test_y = model.predict(xgtest)\n", 83 | " return pred_test_y, model" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "_cell_guid": "c4a69cea-cb06-5d6a-83b7-16ee8ee241f6" 90 | }, 91 | "source": [ 92 | "Let us read the train and test files and store it." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": { 99 | "_cell_guid": "0108ce34-5e84-7f49-bd6f-6562d60a9082" 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "(49352, 15)\n", 107 | "(74659, 14)\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "data_path = \"../input/\"\n", 113 | "train_file = data_path + \"train.json\"\n", 114 | "test_file = data_path + \"test.json\"\n", 115 | "train_df = pd.read_json(train_file)\n", 116 | "test_df = pd.read_json(test_file)\n", 117 | "print(train_df.shape)\n", 118 | "print(test_df.shape)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "_cell_guid": "2bf65ce4-7375-c8e9-97d5-621736f3338d" 125 | }, 126 | "source": [ 127 | "We do not need any pre-processing for numerical features and so create a list with those features." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 4, 133 | "metadata": { 134 | "_cell_guid": "6462885f-97de-b2d1-2c1a-1958115c4c4d" 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "features_to_use = [\"bathrooms\", \"bedrooms\", \"latitude\", \"longitude\", \"price\"]" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "_cell_guid": "b7670810-6d0b-89d0-629e-f99624421229" 145 | }, 146 | "source": [ 147 | "Now let us create some new features from the given features." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 5, 153 | "metadata": { 154 | "_cell_guid": "e3b81db5-929d-b8b8-141c-1bbb4a5eaaf3" 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# count of photos #\n", 159 | "train_df[\"num_photos\"] = train_df[\"photos\"].apply(len)\n", 160 | "test_df[\"num_photos\"] = test_df[\"photos\"].apply(len)\n", 161 | "\n", 162 | "# count of \"features\" #\n", 163 | "train_df[\"num_features\"] = train_df[\"features\"].apply(len)\n", 164 | "test_df[\"num_features\"] = test_df[\"features\"].apply(len)\n", 165 | "\n", 166 | "# count of words present in description column #\n", 167 | "train_df[\"num_description_words\"] = train_df[\"description\"].apply(lambda x: len(x.split(\" \")))\n", 168 | "test_df[\"num_description_words\"] = test_df[\"description\"].apply(lambda x: len(x.split(\" \")))\n", 169 | "\n", 170 | "# convert the created column to datetime object so as to extract more features \n", 171 | "train_df[\"created\"] = pd.to_datetime(train_df[\"created\"])\n", 172 | "test_df[\"created\"] = pd.to_datetime(test_df[\"created\"])\n", 173 | "\n", 174 | "# Let us extract some features like year, month, day, hour from date columns #\n", 175 | "train_df[\"created_year\"] = train_df[\"created\"].dt.year\n", 176 | "test_df[\"created_year\"] = test_df[\"created\"].dt.year\n", 177 | "train_df[\"created_month\"] = train_df[\"created\"].dt.month\n", 178 | "test_df[\"created_month\"] = test_df[\"created\"].dt.month\n", 179 | "train_df[\"created_day\"] = train_df[\"created\"].dt.day\n", 180 | "test_df[\"created_day\"] = test_df[\"created\"].dt.day\n", 181 | "train_df[\"created_hour\"] = train_df[\"created\"].dt.hour\n", 182 | "test_df[\"created_hour\"] = test_df[\"created\"].dt.hour\n", 183 | "\n", 184 | "# adding all these new features to use list #\n", 185 | "features_to_use.extend([\"num_photos\", \"num_features\", \"num_description_words\",\"created_year\", \"created_month\", \"created_day\", \"listing_id\", \"created_hour\"])" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "_cell_guid": "3d9aa966-66a2-8ff8-2459-40e0187418a2" 192 | }, 193 | "source": [ 194 | "We have 4 categorical features in our data\n", 195 | "\n", 196 | " - display_address\n", 197 | " - manager_id\n", 198 | " - building_id\n", 199 | " - street_address\n", 200 | "\n", 201 | "So let us label encode these features." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 6, 207 | "metadata": { 208 | "_cell_guid": "af410ae2-6197-adce-ee68-360aa59eff7e" 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "categorical = [\"display_address\", \"manager_id\", \"building_id\", \"street_address\"]\n", 213 | "for f in categorical:\n", 214 | " if train_df[f].dtype=='object':\n", 215 | " #print(f)\n", 216 | " lbl = preprocessing.LabelEncoder()\n", 217 | " lbl.fit(list(train_df[f].values) + list(test_df[f].values))\n", 218 | " train_df[f] = lbl.transform(list(train_df[f].values))\n", 219 | " test_df[f] = lbl.transform(list(test_df[f].values))\n", 220 | " features_to_use.append(f)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "_cell_guid": "3f550f0f-0c6f-2432-2c07-d507632eaa2b" 227 | }, 228 | "source": [ 229 | "We have a 'features' column which is a list of string values. So we can first combine all the strings together to get a single string and then apply a count vectorizer on top of it."
230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 7, 235 | "metadata": { 236 | "_cell_guid": "d1ea3504-a12c-023a-bce6-d4f93ddb8019" 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "10 \n", 244 | "10000 Doorman Elevator Fitness_Center Cats_Allowed D...\n", 245 | "100004 Laundry_In_Building Dishwasher Hardwood_Floors...\n", 246 | "100007 Hardwood_Floors No_Fee\n", 247 | "100013 Pre-War\n", 248 | "Name: features, dtype: object\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "train_df['features'] = train_df[\"features\"].apply(lambda x: \" \".join([\"_\".join(i.split(\" \")) for i in x]))\n", 254 | "test_df['features'] = test_df[\"features\"].apply(lambda x: \" \".join([\"_\".join(i.split(\" \")) for i in x]))\n", 255 | "print(train_df[\"features\"].head())\n", 256 | "tfidf = CountVectorizer(stop_words='english', max_features=200)\n", 257 | "tr_sparse = tfidf.fit_transform(train_df[\"features\"])\n", 258 | "te_sparse = tfidf.transform(test_df[\"features\"])" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "_cell_guid": "2bfbcacc-e821-654b-f2b3-cda0f1a5a20b" 265 | }, 266 | "source": [ 267 | "Now let us stack both the dense and sparse features into a single dataset and also get the target variable." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 8, 273 | "metadata": { 274 | "_cell_guid": "9eeef912-2104-e97e-1948-c246652340e1" 275 | }, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "(49352, 217) (74659, 217)\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()\n", 287 | "test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()\n", 288 | "\n", 289 | "target_num_map = {'high':0, 'medium':1, 'low':2}\n", 290 | "train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))\n", 291 | "print(train_X.shape, test_X.shape)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "_cell_guid": "7d2e5fb7-7886-68b1-326f-6db491215001" 298 | }, 299 | "source": [ 300 | "Now let us do some cross validation to check the scores. \n", 301 | "\n", 302 | "Please run it in local to get the cv scores. I am commenting it out here for time." 
303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 9, 308 | "metadata": { 309 | "_cell_guid": "13fd60b9-a8b5-c76f-1fbd-2a56219da0d2" 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "[0]\ttrain-mlogloss:1.04114\ttest-mlogloss:1.04219\n", 317 | "Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.\n", 318 | "\n", 319 | "Will train until test-mlogloss hasn't improved in 20 rounds.\n", 320 | "[1]\ttrain-mlogloss:0.988799\ttest-mlogloss:0.990721\n", 321 | "[2]\ttrain-mlogloss:0.944048\ttest-mlogloss:0.94691\n", 322 | "[3]\ttrain-mlogloss:0.90518\ttest-mlogloss:0.908812\n", 323 | "[4]\ttrain-mlogloss:0.8718\ttest-mlogloss:0.876215\n", 324 | "[5]\ttrain-mlogloss:0.841498\ttest-mlogloss:0.847057\n", 325 | "[6]\ttrain-mlogloss:0.815614\ttest-mlogloss:0.821795\n", 326 | "[7]\ttrain-mlogloss:0.79312\ttest-mlogloss:0.799993\n", 327 | "[8]\ttrain-mlogloss:0.773194\ttest-mlogloss:0.780815\n", 328 | "[9]\ttrain-mlogloss:0.754598\ttest-mlogloss:0.763247\n", 329 | "[10]\ttrain-mlogloss:0.738162\ttest-mlogloss:0.747594\n", 330 | "[11]\ttrain-mlogloss:0.724634\ttest-mlogloss:0.734739\n", 331 | "[12]\ttrain-mlogloss:0.711331\ttest-mlogloss:0.722318\n", 332 | "[13]\ttrain-mlogloss:0.699821\ttest-mlogloss:0.711481\n", 333 | "[14]\ttrain-mlogloss:0.689142\ttest-mlogloss:0.701381\n", 334 | "[15]\ttrain-mlogloss:0.678446\ttest-mlogloss:0.691482\n", 335 | "[16]\ttrain-mlogloss:0.669268\ttest-mlogloss:0.683158\n", 336 | "[17]\ttrain-mlogloss:0.66185\ttest-mlogloss:0.67647\n", 337 | "[18]\ttrain-mlogloss:0.654386\ttest-mlogloss:0.669772\n", 338 | "[19]\ttrain-mlogloss:0.648071\ttest-mlogloss:0.664241\n", 339 | "[20]\ttrain-mlogloss:0.642589\ttest-mlogloss:0.659292\n", 340 | "[21]\ttrain-mlogloss:0.637133\ttest-mlogloss:0.654492\n", 341 | "[22]\ttrain-mlogloss:0.632064\ttest-mlogloss:0.650024\n", 342 | "[23]\ttrain-mlogloss:0.627592\ttest-mlogloss:0.646221\n", 343 | "[24]\ttrain-mlogloss:0.622447\ttest-mlogloss:0.641828\n", 344 | "[25]\ttrain-mlogloss:0.618027\ttest-mlogloss:0.638092\n", 345 | "[26]\ttrain-mlogloss:0.614181\ttest-mlogloss:0.635053\n", 346 | "[27]\ttrain-mlogloss:0.61114\ttest-mlogloss:0.632717\n", 347 | "[28]\ttrain-mlogloss:0.607278\ttest-mlogloss:0.629888\n", 348 | "[29]\ttrain-mlogloss:0.603595\ttest-mlogloss:0.627116\n", 349 | "[30]\ttrain-mlogloss:0.600566\ttest-mlogloss:0.624912\n", 350 | "[31]\ttrain-mlogloss:0.597396\ttest-mlogloss:0.622441\n", 351 | "[32]\ttrain-mlogloss:0.594581\ttest-mlogloss:0.620373\n", 352 | "[33]\ttrain-mlogloss:0.591807\ttest-mlogloss:0.618497\n", 353 | "[34]\ttrain-mlogloss:0.589131\ttest-mlogloss:0.616384\n", 354 | "[35]\ttrain-mlogloss:0.586585\ttest-mlogloss:0.614496\n", 355 | "[36]\ttrain-mlogloss:0.583978\ttest-mlogloss:0.612716\n", 356 | "[37]\ttrain-mlogloss:0.582015\ttest-mlogloss:0.611317\n", 357 | "[38]\ttrain-mlogloss:0.579514\ttest-mlogloss:0.609588\n", 358 | "[39]\ttrain-mlogloss:0.576912\ttest-mlogloss:0.607814\n", 359 | "[40]\ttrain-mlogloss:0.574746\ttest-mlogloss:0.606454\n", 360 | "[41]\ttrain-mlogloss:0.572975\ttest-mlogloss:0.605284\n", 361 | "[42]\ttrain-mlogloss:0.570366\ttest-mlogloss:0.603354\n", 362 | "[43]\ttrain-mlogloss:0.568138\ttest-mlogloss:0.602107\n", 363 | "[44]\ttrain-mlogloss:0.565862\ttest-mlogloss:0.600475\n", 364 | "[45]\ttrain-mlogloss:0.564646\ttest-mlogloss:0.599563\n", 365 | "[46]\ttrain-mlogloss:0.562649\ttest-mlogloss:0.598221\n", 366 | "[47]\ttrain-mlogloss:0.560823\ttest-mlogloss:0.597094\n", 367 
| "[48]\ttrain-mlogloss:0.559184\ttest-mlogloss:0.596101\n", 368 | "[49]\ttrain-mlogloss:0.557642\ttest-mlogloss:0.595268\n", 369 | "[50]\ttrain-mlogloss:0.555695\ttest-mlogloss:0.594217\n", 370 | "[51]\ttrain-mlogloss:0.553391\ttest-mlogloss:0.593256\n", 371 | "[52]\ttrain-mlogloss:0.551141\ttest-mlogloss:0.592129\n", 372 | "[53]\ttrain-mlogloss:0.549666\ttest-mlogloss:0.591489\n", 373 | "[54]\ttrain-mlogloss:0.547321\ttest-mlogloss:0.590389\n", 374 | "[55]\ttrain-mlogloss:0.546197\ttest-mlogloss:0.589846\n", 375 | "[56]\ttrain-mlogloss:0.544658\ttest-mlogloss:0.589096\n", 376 | "[57]\ttrain-mlogloss:0.543389\ttest-mlogloss:0.588546\n", 377 | "[58]\ttrain-mlogloss:0.541408\ttest-mlogloss:0.58737\n", 378 | "[59]\ttrain-mlogloss:0.540229\ttest-mlogloss:0.586951\n", 379 | "[60]\ttrain-mlogloss:0.538715\ttest-mlogloss:0.58633\n", 380 | "[61]\ttrain-mlogloss:0.537227\ttest-mlogloss:0.585638\n", 381 | "[62]\ttrain-mlogloss:0.535932\ttest-mlogloss:0.585132\n", 382 | "[63]\ttrain-mlogloss:0.534624\ttest-mlogloss:0.584407\n", 383 | "[64]\ttrain-mlogloss:0.533186\ttest-mlogloss:0.58367\n", 384 | "[65]\ttrain-mlogloss:0.531767\ttest-mlogloss:0.582788\n", 385 | "[66]\ttrain-mlogloss:0.530367\ttest-mlogloss:0.582063\n", 386 | "[67]\ttrain-mlogloss:0.529023\ttest-mlogloss:0.581331\n", 387 | "[68]\ttrain-mlogloss:0.527781\ttest-mlogloss:0.58068\n", 388 | "[69]\ttrain-mlogloss:0.526511\ttest-mlogloss:0.580342\n", 389 | "[70]\ttrain-mlogloss:0.525392\ttest-mlogloss:0.579888\n", 390 | "[71]\ttrain-mlogloss:0.52422\ttest-mlogloss:0.579319\n", 391 | "[72]\ttrain-mlogloss:0.523065\ttest-mlogloss:0.578852\n", 392 | "[73]\ttrain-mlogloss:0.522163\ttest-mlogloss:0.578434\n", 393 | "[74]\ttrain-mlogloss:0.520843\ttest-mlogloss:0.577687\n", 394 | "[75]\ttrain-mlogloss:0.520055\ttest-mlogloss:0.577254\n", 395 | "[76]\ttrain-mlogloss:0.519149\ttest-mlogloss:0.576857\n", 396 | "[77]\ttrain-mlogloss:0.517909\ttest-mlogloss:0.57638\n", 397 | "[78]\ttrain-mlogloss:0.516506\ttest-mlogloss:0.575721\n", 398 | "[79]\ttrain-mlogloss:0.515361\ttest-mlogloss:0.575472\n", 399 | "[80]\ttrain-mlogloss:0.514641\ttest-mlogloss:0.575183\n", 400 | "[81]\ttrain-mlogloss:0.513579\ttest-mlogloss:0.574743\n", 401 | "[82]\ttrain-mlogloss:0.512622\ttest-mlogloss:0.574371\n", 402 | "[83]\ttrain-mlogloss:0.511446\ttest-mlogloss:0.574089\n", 403 | "[84]\ttrain-mlogloss:0.510372\ttest-mlogloss:0.573719\n", 404 | "[85]\ttrain-mlogloss:0.509183\ttest-mlogloss:0.573575\n", 405 | "[86]\ttrain-mlogloss:0.508148\ttest-mlogloss:0.573277\n", 406 | "[87]\ttrain-mlogloss:0.50706\ttest-mlogloss:0.572957\n", 407 | "[88]\ttrain-mlogloss:0.50622\ttest-mlogloss:0.572635\n", 408 | "[89]\ttrain-mlogloss:0.505219\ttest-mlogloss:0.572276\n", 409 | "[90]\ttrain-mlogloss:0.504375\ttest-mlogloss:0.571933\n", 410 | "[91]\ttrain-mlogloss:0.503762\ttest-mlogloss:0.571746\n", 411 | "[92]\ttrain-mlogloss:0.502992\ttest-mlogloss:0.571413\n", 412 | "[93]\ttrain-mlogloss:0.502076\ttest-mlogloss:0.571129\n", 413 | "[94]\ttrain-mlogloss:0.500902\ttest-mlogloss:0.570822\n", 414 | "[95]\ttrain-mlogloss:0.500169\ttest-mlogloss:0.570567\n", 415 | "[96]\ttrain-mlogloss:0.499278\ttest-mlogloss:0.570131\n", 416 | "[97]\ttrain-mlogloss:0.498181\ttest-mlogloss:0.569639\n", 417 | "[98]\ttrain-mlogloss:0.497191\ttest-mlogloss:0.569336\n", 418 | "[99]\ttrain-mlogloss:0.496139\ttest-mlogloss:0.569146\n", 419 | "[100]\ttrain-mlogloss:0.495544\ttest-mlogloss:0.56896\n", 420 | "[101]\ttrain-mlogloss:0.494762\ttest-mlogloss:0.568668\n", 421 | 
"[102]\ttrain-mlogloss:0.493763\ttest-mlogloss:0.568456\n", 422 | "[103]\ttrain-mlogloss:0.492945\ttest-mlogloss:0.568271\n", 423 | "[104]\ttrain-mlogloss:0.491708\ttest-mlogloss:0.567905\n", 424 | "[105]\ttrain-mlogloss:0.490897\ttest-mlogloss:0.567701\n", 425 | "[106]\ttrain-mlogloss:0.490114\ttest-mlogloss:0.567514\n", 426 | "[107]\ttrain-mlogloss:0.48894\ttest-mlogloss:0.567149\n", 427 | "[108]\ttrain-mlogloss:0.488131\ttest-mlogloss:0.566846\n", 428 | "[109]\ttrain-mlogloss:0.487414\ttest-mlogloss:0.566577\n", 429 | "[110]\ttrain-mlogloss:0.486545\ttest-mlogloss:0.566364\n", 430 | "[111]\ttrain-mlogloss:0.485623\ttest-mlogloss:0.566043\n", 431 | "[112]\ttrain-mlogloss:0.484816\ttest-mlogloss:0.565925\n", 432 | "[113]\ttrain-mlogloss:0.484138\ttest-mlogloss:0.565711\n", 433 | "[114]\ttrain-mlogloss:0.483216\ttest-mlogloss:0.56544\n", 434 | "[115]\ttrain-mlogloss:0.482588\ttest-mlogloss:0.565323\n", 435 | "[116]\ttrain-mlogloss:0.481523\ttest-mlogloss:0.565\n", 436 | "[117]\ttrain-mlogloss:0.48092\ttest-mlogloss:0.564753\n", 437 | "[118]\ttrain-mlogloss:0.480238\ttest-mlogloss:0.564586\n", 438 | "[119]\ttrain-mlogloss:0.47942\ttest-mlogloss:0.564378\n", 439 | "[120]\ttrain-mlogloss:0.478738\ttest-mlogloss:0.564245\n", 440 | "[121]\ttrain-mlogloss:0.478011\ttest-mlogloss:0.56409\n", 441 | "[122]\ttrain-mlogloss:0.476949\ttest-mlogloss:0.56384\n", 442 | "[123]\ttrain-mlogloss:0.476118\ttest-mlogloss:0.563467\n", 443 | "[124]\ttrain-mlogloss:0.475843\ttest-mlogloss:0.563276\n", 444 | "[125]\ttrain-mlogloss:0.474954\ttest-mlogloss:0.562983\n", 445 | "[126]\ttrain-mlogloss:0.474088\ttest-mlogloss:0.562882\n", 446 | "[127]\ttrain-mlogloss:0.473533\ttest-mlogloss:0.562699\n", 447 | "[128]\ttrain-mlogloss:0.472967\ttest-mlogloss:0.562539\n", 448 | "[129]\ttrain-mlogloss:0.472171\ttest-mlogloss:0.562386\n", 449 | "[130]\ttrain-mlogloss:0.471264\ttest-mlogloss:0.562188\n", 450 | "[131]\ttrain-mlogloss:0.470706\ttest-mlogloss:0.562049\n", 451 | "[132]\ttrain-mlogloss:0.469903\ttest-mlogloss:0.561895\n", 452 | "[133]\ttrain-mlogloss:0.469176\ttest-mlogloss:0.561649\n", 453 | "[134]\ttrain-mlogloss:0.468483\ttest-mlogloss:0.561359\n", 454 | "[135]\ttrain-mlogloss:0.467675\ttest-mlogloss:0.561175\n", 455 | "[136]\ttrain-mlogloss:0.466944\ttest-mlogloss:0.560943\n", 456 | "[137]\ttrain-mlogloss:0.466573\ttest-mlogloss:0.560931\n", 457 | "[138]\ttrain-mlogloss:0.465994\ttest-mlogloss:0.560789\n", 458 | "[139]\ttrain-mlogloss:0.465236\ttest-mlogloss:0.560444\n", 459 | "[140]\ttrain-mlogloss:0.464364\ttest-mlogloss:0.560345\n", 460 | "[141]\ttrain-mlogloss:0.463396\ttest-mlogloss:0.560242\n", 461 | "[142]\ttrain-mlogloss:0.46274\ttest-mlogloss:0.560137\n", 462 | "[143]\ttrain-mlogloss:0.462101\ttest-mlogloss:0.55996\n", 463 | "[144]\ttrain-mlogloss:0.461377\ttest-mlogloss:0.559821\n", 464 | "[145]\ttrain-mlogloss:0.460638\ttest-mlogloss:0.559611\n", 465 | "[146]\ttrain-mlogloss:0.459958\ttest-mlogloss:0.559478\n", 466 | "[147]\ttrain-mlogloss:0.459362\ttest-mlogloss:0.559354\n", 467 | "[148]\ttrain-mlogloss:0.458515\ttest-mlogloss:0.559138\n", 468 | "[149]\ttrain-mlogloss:0.457808\ttest-mlogloss:0.559009\n", 469 | "[150]\ttrain-mlogloss:0.45738\ttest-mlogloss:0.558911\n", 470 | "[151]\ttrain-mlogloss:0.456855\ttest-mlogloss:0.55884\n", 471 | "[152]\ttrain-mlogloss:0.456063\ttest-mlogloss:0.558697\n", 472 | "[153]\ttrain-mlogloss:0.455421\ttest-mlogloss:0.558521\n", 473 | "[154]\ttrain-mlogloss:0.454662\ttest-mlogloss:0.558377\n", 474 | "[155]\ttrain-mlogloss:0.454117\ttest-mlogloss:0.558296\n", 475 | 
"[156]\ttrain-mlogloss:0.453326\ttest-mlogloss:0.558084\n", 476 | "[157]\ttrain-mlogloss:0.452753\ttest-mlogloss:0.557905\n", 477 | "[158]\ttrain-mlogloss:0.452359\ttest-mlogloss:0.557868\n", 478 | "[159]\ttrain-mlogloss:0.451707\ttest-mlogloss:0.557636\n", 479 | "[160]\ttrain-mlogloss:0.451068\ttest-mlogloss:0.557454\n", 480 | "[161]\ttrain-mlogloss:0.450408\ttest-mlogloss:0.557361\n", 481 | "[162]\ttrain-mlogloss:0.449685\ttest-mlogloss:0.557289\n", 482 | "[163]\ttrain-mlogloss:0.448961\ttest-mlogloss:0.557146\n", 483 | "[164]\ttrain-mlogloss:0.448501\ttest-mlogloss:0.557029\n", 484 | "[165]\ttrain-mlogloss:0.447691\ttest-mlogloss:0.556853\n", 485 | "[166]\ttrain-mlogloss:0.446992\ttest-mlogloss:0.556806\n", 486 | "[167]\ttrain-mlogloss:0.446296\ttest-mlogloss:0.556598\n", 487 | "[168]\ttrain-mlogloss:0.445686\ttest-mlogloss:0.556577\n", 488 | "[169]\ttrain-mlogloss:0.444956\ttest-mlogloss:0.556382\n", 489 | "[170]\ttrain-mlogloss:0.444435\ttest-mlogloss:0.556329\n", 490 | "[171]\ttrain-mlogloss:0.443592\ttest-mlogloss:0.556008\n", 491 | "[172]\ttrain-mlogloss:0.442805\ttest-mlogloss:0.555822\n", 492 | "[173]\ttrain-mlogloss:0.442412\ttest-mlogloss:0.555704\n", 493 | "[174]\ttrain-mlogloss:0.441773\ttest-mlogloss:0.555605\n", 494 | "[175]\ttrain-mlogloss:0.441135\ttest-mlogloss:0.555466\n", 495 | "[176]\ttrain-mlogloss:0.440742\ttest-mlogloss:0.555388\n", 496 | "[177]\ttrain-mlogloss:0.44027\ttest-mlogloss:0.555334\n", 497 | "[178]\ttrain-mlogloss:0.439462\ttest-mlogloss:0.555133\n", 498 | "[179]\ttrain-mlogloss:0.43881\ttest-mlogloss:0.554992\n", 499 | "[180]\ttrain-mlogloss:0.438174\ttest-mlogloss:0.554753\n", 500 | "[181]\ttrain-mlogloss:0.437383\ttest-mlogloss:0.554644\n", 501 | "[182]\ttrain-mlogloss:0.436838\ttest-mlogloss:0.554575\n", 502 | "[183]\ttrain-mlogloss:0.436125\ttest-mlogloss:0.554404\n", 503 | "[184]\ttrain-mlogloss:0.435588\ttest-mlogloss:0.554327\n", 504 | "[185]\ttrain-mlogloss:0.435114\ttest-mlogloss:0.55427\n", 505 | "[186]\ttrain-mlogloss:0.434355\ttest-mlogloss:0.554231\n", 506 | "[187]\ttrain-mlogloss:0.43382\ttest-mlogloss:0.554011\n", 507 | "[188]\ttrain-mlogloss:0.433208\ttest-mlogloss:0.553862\n", 508 | "[189]\ttrain-mlogloss:0.43253\ttest-mlogloss:0.553751\n", 509 | "[190]\ttrain-mlogloss:0.432027\ttest-mlogloss:0.553633\n", 510 | "[191]\ttrain-mlogloss:0.43148\ttest-mlogloss:0.553609\n", 511 | "[192]\ttrain-mlogloss:0.431025\ttest-mlogloss:0.553599\n", 512 | "[193]\ttrain-mlogloss:0.430441\ttest-mlogloss:0.553502\n", 513 | "[194]\ttrain-mlogloss:0.429787\ttest-mlogloss:0.553418\n", 514 | "[195]\ttrain-mlogloss:0.429262\ttest-mlogloss:0.553465\n", 515 | "[196]\ttrain-mlogloss:0.42865\ttest-mlogloss:0.553342\n", 516 | "[197]\ttrain-mlogloss:0.428045\ttest-mlogloss:0.553264\n", 517 | "[198]\ttrain-mlogloss:0.427341\ttest-mlogloss:0.553197\n", 518 | "[199]\ttrain-mlogloss:0.426563\ttest-mlogloss:0.552965\n", 519 | "[200]\ttrain-mlogloss:0.426066\ttest-mlogloss:0.552906\n", 520 | "[201]\ttrain-mlogloss:0.42541\ttest-mlogloss:0.552713\n", 521 | "[202]\ttrain-mlogloss:0.424861\ttest-mlogloss:0.552693\n", 522 | "[203]\ttrain-mlogloss:0.42421\ttest-mlogloss:0.552601\n", 523 | "[204]\ttrain-mlogloss:0.423567\ttest-mlogloss:0.552647\n", 524 | "[205]\ttrain-mlogloss:0.422962\ttest-mlogloss:0.552553\n", 525 | "[206]\ttrain-mlogloss:0.422326\ttest-mlogloss:0.552551\n", 526 | "[207]\ttrain-mlogloss:0.421518\ttest-mlogloss:0.55258\n", 527 | "[208]\ttrain-mlogloss:0.420897\ttest-mlogloss:0.552612\n", 528 | "[209]\ttrain-mlogloss:0.420392\ttest-mlogloss:0.552503\n", 529 | 
"[210]\ttrain-mlogloss:0.420065\ttest-mlogloss:0.552369\n", 530 | "[211]\ttrain-mlogloss:0.419603\ttest-mlogloss:0.55221\n", 531 | "[212]\ttrain-mlogloss:0.41903\ttest-mlogloss:0.552108\n", 532 | "[213]\ttrain-mlogloss:0.418522\ttest-mlogloss:0.551998\n", 533 | "[214]\ttrain-mlogloss:0.417667\ttest-mlogloss:0.551873\n", 534 | "[215]\ttrain-mlogloss:0.417187\ttest-mlogloss:0.551808\n", 535 | "[216]\ttrain-mlogloss:0.416637\ttest-mlogloss:0.551775\n", 536 | "[217]\ttrain-mlogloss:0.41618\ttest-mlogloss:0.55173\n", 537 | "[218]\ttrain-mlogloss:0.415826\ttest-mlogloss:0.55165\n", 538 | "[219]\ttrain-mlogloss:0.415501\ttest-mlogloss:0.551587\n", 539 | "[220]\ttrain-mlogloss:0.415265\ttest-mlogloss:0.551546\n", 540 | "[221]\ttrain-mlogloss:0.414692\ttest-mlogloss:0.551359\n", 541 | "[222]\ttrain-mlogloss:0.414234\ttest-mlogloss:0.551307\n", 542 | "[223]\ttrain-mlogloss:0.413624\ttest-mlogloss:0.551199\n", 543 | "[224]\ttrain-mlogloss:0.41308\ttest-mlogloss:0.551012\n", 544 | "[225]\ttrain-mlogloss:0.41247\ttest-mlogloss:0.550941\n", 545 | "[226]\ttrain-mlogloss:0.411947\ttest-mlogloss:0.550983\n", 546 | "[227]\ttrain-mlogloss:0.411371\ttest-mlogloss:0.550967\n", 547 | "[228]\ttrain-mlogloss:0.41081\ttest-mlogloss:0.550876\n", 548 | "[229]\ttrain-mlogloss:0.410216\ttest-mlogloss:0.550737\n", 549 | "[230]\ttrain-mlogloss:0.409747\ttest-mlogloss:0.550653\n", 550 | "[231]\ttrain-mlogloss:0.409131\ttest-mlogloss:0.550562\n", 551 | "[232]\ttrain-mlogloss:0.408654\ttest-mlogloss:0.55062\n", 552 | "[233]\ttrain-mlogloss:0.408119\ttest-mlogloss:0.550529\n", 553 | "[234]\ttrain-mlogloss:0.407361\ttest-mlogloss:0.550505\n", 554 | "[235]\ttrain-mlogloss:0.406824\ttest-mlogloss:0.550482\n", 555 | "[236]\ttrain-mlogloss:0.406348\ttest-mlogloss:0.55042\n", 556 | "[237]\ttrain-mlogloss:0.406023\ttest-mlogloss:0.550356\n", 557 | "[238]\ttrain-mlogloss:0.405309\ttest-mlogloss:0.550179\n", 558 | "[239]\ttrain-mlogloss:0.404664\ttest-mlogloss:0.55013\n", 559 | "[240]\ttrain-mlogloss:0.404285\ttest-mlogloss:0.550085\n", 560 | "[241]\ttrain-mlogloss:0.403685\ttest-mlogloss:0.55006\n", 561 | "[242]\ttrain-mlogloss:0.403308\ttest-mlogloss:0.549991\n", 562 | "[243]\ttrain-mlogloss:0.402697\ttest-mlogloss:0.549962\n", 563 | "[244]\ttrain-mlogloss:0.402272\ttest-mlogloss:0.549869\n", 564 | "[245]\ttrain-mlogloss:0.401685\ttest-mlogloss:0.549878\n", 565 | "[246]\ttrain-mlogloss:0.401243\ttest-mlogloss:0.549921\n", 566 | "[247]\ttrain-mlogloss:0.400637\ttest-mlogloss:0.549932\n", 567 | "[248]\ttrain-mlogloss:0.400319\ttest-mlogloss:0.549812\n", 568 | "[249]\ttrain-mlogloss:0.399861\ttest-mlogloss:0.549876\n", 569 | "[250]\ttrain-mlogloss:0.399276\ttest-mlogloss:0.549815\n", 570 | "[251]\ttrain-mlogloss:0.398666\ttest-mlogloss:0.549829\n", 571 | "[252]\ttrain-mlogloss:0.398211\ttest-mlogloss:0.549989\n", 572 | "[253]\ttrain-mlogloss:0.397705\ttest-mlogloss:0.549932\n", 573 | "[254]\ttrain-mlogloss:0.397121\ttest-mlogloss:0.550049\n", 574 | "[255]\ttrain-mlogloss:0.396528\ttest-mlogloss:0.550022\n", 575 | "[256]\ttrain-mlogloss:0.396249\ttest-mlogloss:0.550033\n", 576 | "[257]\ttrain-mlogloss:0.395951\ttest-mlogloss:0.549966\n", 577 | "[258]\ttrain-mlogloss:0.395331\ttest-mlogloss:0.549948\n", 578 | "[259]\ttrain-mlogloss:0.394668\ttest-mlogloss:0.549957\n", 579 | "[260]\ttrain-mlogloss:0.394171\ttest-mlogloss:0.549973\n", 580 | "[261]\ttrain-mlogloss:0.39384\ttest-mlogloss:0.549985\n", 581 | "[262]\ttrain-mlogloss:0.393273\ttest-mlogloss:0.550006\n", 582 | "[263]\ttrain-mlogloss:0.392843\ttest-mlogloss:0.5499\n", 583 | 
"[264]\ttrain-mlogloss:0.392273\ttest-mlogloss:0.549908\n", 584 | "[265]\ttrain-mlogloss:0.391828\ttest-mlogloss:0.549826\n", 585 | "[266]\ttrain-mlogloss:0.391468\ttest-mlogloss:0.549805\n", 586 | "[267]\ttrain-mlogloss:0.390976\ttest-mlogloss:0.549758\n", 587 | "[268]\ttrain-mlogloss:0.390481\ttest-mlogloss:0.549727\n", 588 | "[269]\ttrain-mlogloss:0.390038\ttest-mlogloss:0.549707\n", 589 | "[270]\ttrain-mlogloss:0.389536\ttest-mlogloss:0.549714\n", 590 | "[271]\ttrain-mlogloss:0.388936\ttest-mlogloss:0.549652\n", 591 | "[272]\ttrain-mlogloss:0.388576\ttest-mlogloss:0.549666\n", 592 | "[273]\ttrain-mlogloss:0.388062\ttest-mlogloss:0.549731\n", 593 | "[274]\ttrain-mlogloss:0.387869\ttest-mlogloss:0.549754\n", 594 | "[275]\ttrain-mlogloss:0.387572\ttest-mlogloss:0.549816\n", 595 | "[276]\ttrain-mlogloss:0.387073\ttest-mlogloss:0.549819\n", 596 | "[277]\ttrain-mlogloss:0.386474\ttest-mlogloss:0.54963\n", 597 | "[278]\ttrain-mlogloss:0.385841\ttest-mlogloss:0.549673\n", 598 | "[279]\ttrain-mlogloss:0.385482\ttest-mlogloss:0.549606\n", 599 | "[280]\ttrain-mlogloss:0.385114\ttest-mlogloss:0.549587\n", 600 | "[281]\ttrain-mlogloss:0.384674\ttest-mlogloss:0.54955\n", 601 | "[282]\ttrain-mlogloss:0.384137\ttest-mlogloss:0.549542\n", 602 | "[283]\ttrain-mlogloss:0.38372\ttest-mlogloss:0.549528\n", 603 | "[284]\ttrain-mlogloss:0.383234\ttest-mlogloss:0.549464\n", 604 | "[285]\ttrain-mlogloss:0.38272\ttest-mlogloss:0.549434\n", 605 | "[286]\ttrain-mlogloss:0.382295\ttest-mlogloss:0.549465\n", 606 | "[287]\ttrain-mlogloss:0.381834\ttest-mlogloss:0.549379\n", 607 | "[288]\ttrain-mlogloss:0.38132\ttest-mlogloss:0.54934\n", 608 | "[289]\ttrain-mlogloss:0.380894\ttest-mlogloss:0.549264\n", 609 | "[290]\ttrain-mlogloss:0.380498\ttest-mlogloss:0.549247\n", 610 | "[291]\ttrain-mlogloss:0.380062\ttest-mlogloss:0.549205\n", 611 | "[292]\ttrain-mlogloss:0.37965\ttest-mlogloss:0.549201\n", 612 | "[293]\ttrain-mlogloss:0.379019\ttest-mlogloss:0.549211\n", 613 | "[294]\ttrain-mlogloss:0.378508\ttest-mlogloss:0.549221\n", 614 | "[295]\ttrain-mlogloss:0.378046\ttest-mlogloss:0.549091\n", 615 | "[296]\ttrain-mlogloss:0.377815\ttest-mlogloss:0.549071\n", 616 | "[297]\ttrain-mlogloss:0.377491\ttest-mlogloss:0.549019\n", 617 | "[298]\ttrain-mlogloss:0.377001\ttest-mlogloss:0.549037\n", 618 | "[299]\ttrain-mlogloss:0.376494\ttest-mlogloss:0.549011\n", 619 | "[300]\ttrain-mlogloss:0.376066\ttest-mlogloss:0.548946\n", 620 | "[301]\ttrain-mlogloss:0.375527\ttest-mlogloss:0.548929\n", 621 | "[302]\ttrain-mlogloss:0.375013\ttest-mlogloss:0.54892\n", 622 | "[303]\ttrain-mlogloss:0.374521\ttest-mlogloss:0.549\n", 623 | "[304]\ttrain-mlogloss:0.373935\ttest-mlogloss:0.549171\n", 624 | "[305]\ttrain-mlogloss:0.373428\ttest-mlogloss:0.549223\n", 625 | "[306]\ttrain-mlogloss:0.373039\ttest-mlogloss:0.54916\n", 626 | "[307]\ttrain-mlogloss:0.372686\ttest-mlogloss:0.549035\n", 627 | "[308]\ttrain-mlogloss:0.37216\ttest-mlogloss:0.548995\n", 628 | "[309]\ttrain-mlogloss:0.371648\ttest-mlogloss:0.548941\n", 629 | "[310]\ttrain-mlogloss:0.371155\ttest-mlogloss:0.548814\n", 630 | "[311]\ttrain-mlogloss:0.370729\ttest-mlogloss:0.548765\n", 631 | "[312]\ttrain-mlogloss:0.37032\ttest-mlogloss:0.548888\n", 632 | "[313]\ttrain-mlogloss:0.369891\ttest-mlogloss:0.548985\n", 633 | "[314]\ttrain-mlogloss:0.369316\ttest-mlogloss:0.548926\n", 634 | "[315]\ttrain-mlogloss:0.368816\ttest-mlogloss:0.548971\n", 635 | "[316]\ttrain-mlogloss:0.368333\ttest-mlogloss:0.548876\n", 636 | "[317]\ttrain-mlogloss:0.368004\ttest-mlogloss:0.548885\n", 637 | 
"[318]\ttrain-mlogloss:0.367705\ttest-mlogloss:0.548927\n", 638 | "[319]\ttrain-mlogloss:0.367121\ttest-mlogloss:0.548788\n", 639 | "[320]\ttrain-mlogloss:0.366641\ttest-mlogloss:0.548706\n", 640 | "[321]\ttrain-mlogloss:0.366203\ttest-mlogloss:0.548571\n", 641 | "[322]\ttrain-mlogloss:0.365932\ttest-mlogloss:0.548489\n", 642 | "[323]\ttrain-mlogloss:0.365446\ttest-mlogloss:0.548531\n", 643 | "[324]\ttrain-mlogloss:0.365172\ttest-mlogloss:0.548617\n", 644 | "[325]\ttrain-mlogloss:0.364779\ttest-mlogloss:0.548644\n", 645 | "[326]\ttrain-mlogloss:0.364241\ttest-mlogloss:0.548594\n", 646 | "[327]\ttrain-mlogloss:0.363824\ttest-mlogloss:0.548602\n", 647 | "[328]\ttrain-mlogloss:0.3634\ttest-mlogloss:0.548548\n", 648 | "[329]\ttrain-mlogloss:0.363085\ttest-mlogloss:0.548491\n", 649 | "[330]\ttrain-mlogloss:0.362653\ttest-mlogloss:0.548437\n", 650 | "[331]\ttrain-mlogloss:0.362338\ttest-mlogloss:0.548367\n", 651 | "[332]\ttrain-mlogloss:0.361838\ttest-mlogloss:0.548419\n", 652 | "[333]\ttrain-mlogloss:0.361572\ttest-mlogloss:0.548516\n", 653 | "[334]\ttrain-mlogloss:0.361207\ttest-mlogloss:0.548434\n", 654 | "[335]\ttrain-mlogloss:0.360795\ttest-mlogloss:0.548389\n", 655 | "[336]\ttrain-mlogloss:0.360272\ttest-mlogloss:0.548249\n", 656 | "[337]\ttrain-mlogloss:0.359874\ttest-mlogloss:0.548235\n", 657 | "[338]\ttrain-mlogloss:0.359489\ttest-mlogloss:0.54823\n", 658 | "[339]\ttrain-mlogloss:0.358986\ttest-mlogloss:0.548271\n", 659 | "[340]\ttrain-mlogloss:0.358536\ttest-mlogloss:0.548283\n", 660 | "[341]\ttrain-mlogloss:0.358192\ttest-mlogloss:0.5482\n", 661 | "[342]\ttrain-mlogloss:0.357849\ttest-mlogloss:0.548229\n", 662 | "[343]\ttrain-mlogloss:0.357487\ttest-mlogloss:0.54821\n", 663 | "[344]\ttrain-mlogloss:0.356953\ttest-mlogloss:0.548181\n", 664 | "[345]\ttrain-mlogloss:0.356421\ttest-mlogloss:0.548106\n", 665 | "[346]\ttrain-mlogloss:0.355903\ttest-mlogloss:0.548063\n", 666 | "[347]\ttrain-mlogloss:0.355627\ttest-mlogloss:0.548068\n", 667 | "[348]\ttrain-mlogloss:0.355334\ttest-mlogloss:0.54803\n", 668 | "[349]\ttrain-mlogloss:0.354875\ttest-mlogloss:0.548005\n", 669 | "[350]\ttrain-mlogloss:0.354477\ttest-mlogloss:0.547958\n", 670 | "[351]\ttrain-mlogloss:0.354084\ttest-mlogloss:0.547862\n", 671 | "[352]\ttrain-mlogloss:0.353584\ttest-mlogloss:0.54775\n", 672 | "[353]\ttrain-mlogloss:0.353249\ttest-mlogloss:0.547744\n", 673 | "[354]\ttrain-mlogloss:0.35303\ttest-mlogloss:0.547778\n", 674 | "[355]\ttrain-mlogloss:0.352646\ttest-mlogloss:0.547696\n", 675 | "[356]\ttrain-mlogloss:0.352297\ttest-mlogloss:0.54783\n", 676 | "[357]\ttrain-mlogloss:0.351894\ttest-mlogloss:0.547775\n", 677 | "[358]\ttrain-mlogloss:0.351425\ttest-mlogloss:0.54786\n", 678 | "[359]\ttrain-mlogloss:0.350943\ttest-mlogloss:0.547774\n", 679 | "[360]\ttrain-mlogloss:0.350602\ttest-mlogloss:0.547771\n", 680 | "[361]\ttrain-mlogloss:0.350357\ttest-mlogloss:0.547768\n", 681 | "[362]\ttrain-mlogloss:0.34985\ttest-mlogloss:0.547881\n", 682 | "[363]\ttrain-mlogloss:0.349465\ttest-mlogloss:0.547835\n", 683 | "[364]\ttrain-mlogloss:0.348895\ttest-mlogloss:0.547832\n", 684 | "[365]\ttrain-mlogloss:0.348455\ttest-mlogloss:0.548\n", 685 | "[366]\ttrain-mlogloss:0.348064\ttest-mlogloss:0.547948\n", 686 | "[367]\ttrain-mlogloss:0.347629\ttest-mlogloss:0.548026\n", 687 | "[368]\ttrain-mlogloss:0.347153\ttest-mlogloss:0.547928\n", 688 | "[369]\ttrain-mlogloss:0.346734\ttest-mlogloss:0.547903\n", 689 | "[370]\ttrain-mlogloss:0.346251\ttest-mlogloss:0.547871\n", 690 | "[371]\ttrain-mlogloss:0.345869\ttest-mlogloss:0.547909\n", 691 | 
"[372]\ttrain-mlogloss:0.345424\ttest-mlogloss:0.547937\n", 692 | "[373]\ttrain-mlogloss:0.34505\ttest-mlogloss:0.548001\n", 693 | "[374]\ttrain-mlogloss:0.344615\ttest-mlogloss:0.547982\n", 694 | "[375]\ttrain-mlogloss:0.344206\ttest-mlogloss:0.54803\n", 695 | "Stopping. Best iteration:\n", 696 | "[355]\ttrain-mlogloss:0.352646\ttest-mlogloss:0.547696\n", 697 | "\n", 698 | "[0.54803037236074925]\n" 699 | ] 700 | } 701 | ], 702 | "source": [ 703 | "cv_scores = []\n", 704 | "kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)\n", 705 | "for dev_index, val_index in kf.split(range(train_X.shape[0])):\n", 706 | " dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]\n", 707 | " dev_y, val_y = train_y[dev_index], train_y[val_index]\n", 708 | " preds, model = runXGB(dev_X, dev_y, val_X, val_y)\n", 709 | " cv_scores.append(log_loss(val_y, preds))\n", 710 | " print(cv_scores)\n", 711 | " break" 712 | ] 713 | }, 714 | { 715 | "cell_type": "markdown", 716 | "metadata": { 717 | "_cell_guid": "5cff686f-2601-321d-8f81-5fa846ef7562" 718 | }, 719 | "source": [ 720 | "Now let us build the final model and get the predictions on the test set." 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 10, 726 | "metadata": { 727 | "_cell_guid": "4fb1954d-e3f0-9369-d50c-bd1b615c0077" 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)\n", 732 | "out_df = pd.DataFrame(preds)\n", 733 | "out_df.columns = [\"high\", \"medium\", \"low\"]\n", 734 | "out_df[\"listing_id\"] = test_df.listing_id.values\n", 735 | "out_df.to_csv(\"xgb_starter2.csv\", index=False)" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": { 741 | "_cell_guid": "b23cc080-cd12-dc7d-0877-66806a34bf4c" 742 | }, 743 | "source": [ 744 | "\n", 745 | "Hope this helps the python users as a good starting point." 
746 | ] 747 | } 748 | ], 749 | "metadata": { 750 | "_change_revision": 488, 751 | "_is_fork": false, 752 | "kernelspec": { 753 | "display_name": "Python 3", 754 | "language": "python", 755 | "name": "python3" 756 | }, 757 | "language_info": { 758 | "codemirror_mode": { 759 | "name": "ipython", 760 | "version": 3 761 | }, 762 | "file_extension": ".py", 763 | "mimetype": "text/x-python", 764 | "name": "python", 765 | "nbconvert_exporter": "python", 766 | "pygments_lexer": "ipython3", 767 | "version": "3.6.0" 768 | } 769 | }, 770 | "nbformat": 4, 771 | "nbformat_minor": 0 772 | } 773 | -------------------------------------------------------------------------------- /TwoSigmaConnect_RentHop/readme.md: -------------------------------------------------------------------------------- 1 | Codes and notebooks used for [Kaggle - Two Sigma Connect : RentHop Rental Listing Enquiries competition](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries) 2 | -------------------------------------------------------------------------------- /TwoSigmaFinancialModeling/readme.md: -------------------------------------------------------------------------------- 1 | Codes for Two Sigma Financial Modeling Challenge 2 | -------------------------------------------------------------------------------- /Walmart_TripType/NeuralNets/neural_net.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import operator 4 | import numpy as np 5 | import pandas as pd 6 | import scipy as sp 7 | import cPickle as pkl 8 | from scipy.sparse import csr_matrix 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.cross_validation import KFold 11 | from sklearn import ensemble, preprocessing 12 | from sklearn.metrics import mean_squared_error, roc_auc_score 13 | #sys.path.append("/home/sudalai/Softwares/xgboost-master/wrapper/") 14 | sys.path.append("/home/sudalai/Softwares/XGB_pointfour/xgboost-master/wrapper/") 15 | import xgboost as xgb 16 | 17 | np.random.seed(12345) 18 | from keras.models import Sequential 19 | from keras.optimizers import SGD 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.layers.core import Dense, Activation, Dropout 22 | from keras.constraints import maxnorm 23 | from keras.utils import np_utils 24 | from keras import regularizers 25 | from keras.layers.advanced_activations import PReLU 26 | 27 | 28 | def multiclassLogLoss(y_true, y_pred, eps=1e-15): 29 | """Multi class version of Logarithmic Loss metric. 
30 | https://www.kaggle.com/wiki/MultiClassLogLoss 31 | 32 | Parameters 33 | ---------- 34 | y_true : array, shape = [n_samples] 35 | true class, integers in [0, n_classes - 1] 36 | y_pred : array, shape = [n_samples, n_classes] 37 | 38 | Returns 39 | ------- 40 | loss : float 41 | """ 42 | predictions = np.clip(y_pred, eps, 1 - eps) 43 | 44 | # normalize row sums to 1 45 | predictions /= predictions.sum(axis=1)[:, np.newaxis] 46 | 47 | actual = np.zeros(y_pred.shape) 48 | n_samples = actual.shape[0] 49 | actual[np.arange(n_samples), y_true.astype(int)] = 1 50 | vectsum = np.sum(actual * np.log(predictions)) 51 | loss = -1.0 / n_samples * vectsum 52 | return loss 53 | 54 | 55 | def runNN(train_X, train_y, test_X=None, test_y=None): 56 | sc = preprocessing.StandardScaler() 57 | train_X = sc.fit_transform(train_X) 58 | #test_X = sc.transform(test_X) 59 | 60 | train_y = np_utils.to_categorical(train_y, 38) 61 | 62 | model = Sequential() 63 | #model.add(Dropout(0.2)) 64 | 65 | model.add(Dense(600, input_shape=(train_X.shape[1],), init='he_uniform', W_regularizer=regularizers.l1(0.002))) 66 | model.add(Activation('relu')) 67 | model.add(Dropout(0.3)) 68 | #model.add(BatchNormalization()) 69 | 70 | model.add(Dense(600, init='he_uniform')) 71 | model.add(Activation('relu')) 72 | model.add(Dropout(0.3)) 73 | #model.add(BatchNormalization()) 74 | 75 | #model.add(Dense(100, init='he_uniform')) 76 | #model.add(Activation('relu')) 77 | #model.add(Dropout(0.5)) 78 | 79 | model.add(Dense(38, init='he_uniform')) 80 | model.add(Activation('softmax')) 81 | 82 | #sgd_opt = SGD(lr=0.01) 83 | model.compile(loss='categorical_crossentropy', optimizer='adagrad') 84 | 85 | #for i in xrange(500): 86 | model.fit(train_X, train_y, batch_size=256, nb_epoch=200, validation_split=0.03, verbose=2, shuffle=True) 87 | #preds = model.predict(test_X, verbose=0) 88 | #print "Test preds shape : ",preds.shape 89 | #loss = multiclassLogLoss(test_y, preds) 90 | #print "At",(i+1)*2, "Epochs, Loss is : ", loss 91 | #print "ROC AUC score : ", metrics.roc_auc_score(test_y, preds) 92 | 93 | return model, sc 94 | 95 | if __name__ == "__main__": 96 | # setting the input path and reading the data into dataframe # 97 | print "Reading data.." 98 | data_path = "../Data/" 99 | train_X = pd.read_csv(data_path + "train_mod_v2.csv") 100 | 101 | print "Getting target and id" 102 | train_y = np.array(train_X["DV"]) 103 | train_id = np.array(train_X["VisitNumber"]) 104 | 105 | print "Dropping columns" 106 | drop_columns = ["DV"] 107 | train_X.drop(drop_columns+["VisitNumber"], axis=1, inplace=True) 108 | #test_X.drop(["VisitNumber"], axis=1, inplace=True) 109 | 110 | print "Converting to array" 111 | train_X = np.array(train_X) 112 | print "Train shape : ", train_X.shape 113 | 114 | print "Building model.." 115 | model, scaler = runNN(train_X, train_y) 116 | del train_X 117 | import gc 118 | gc.collect() 119 | 120 | print "Working on test data.." 121 | test_X = pd.read_csv(data_path + "test_mod_v2.csv") 122 | test_id = np.array(test_X["VisitNumber"]) 123 | test_X.drop(["VisitNumber"], axis=1, inplace=True) 124 | test_X = np.array(test_X) 125 | test_X = scaler.transform(test_X) 126 | 127 | print "Getting preds.." 
128 | preds = model.predict(test_X, verbose=0) 129 | 130 | sample = pd.read_csv(data_path + "sample_submission.csv") 131 | preds = pd.DataFrame(preds, index=test_id, columns=sample.columns[1:]) 132 | preds.to_csv("sub_nn.csv", index_label="VisitNumber") 133 | -------------------------------------------------------------------------------- /Walmart_TripType/NeuralNets/prepData.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from config_v2 import fineline_dict, header_list4 6 | 7 | map_type_dv_dict = {3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6, 12:7, 14:8, 15:9, 18:10, 19:11, 20:12, 21:13, 22:14, 23:15, 24:16, 25:17, 26:18, 27:19, 28:20, 29:21, 30:22, 31:23, 32:24, 33:25, 34:26, 35:27, 36:28, 37:29, 38:30, 39:31, 40:32, 41:33, 42:34, 43:35, 44:36, 999:37} 8 | map_dept_dict = {'COMM BREAD': 14, 'OPTICAL - FRAMES': 47, '1-HR PHOTO': 1, 'LIQUOR,WINE,BEER': 41, 'FABRICS AND CRAFTS': 20, 'MENS WEAR': 44, 'SEAFOOD': 59, 'AUTOMOTIVE': 3, 'BEDDING': 7, 'COOK AND DINE': 16, 'OPTICAL - LENSES': 48, 'HARDWARE': 26, 'SLEEPWEAR/FOUNDATIONS': 64, 'FINANCIAL SERVICES': 21, 'OTHER DEPARTMENTS': 49, 'ELECTRONICS': 19, 'LADIESWEAR': 38, 'HOME MANAGEMENT': 29, 'HOUSEHOLD PAPER GOODS': 32, 'FROZEN FOODS': 22, 'FURNITURE': 23, 'INFANT CONSUMABLE HARDLINES': 35, 'MENSWEAR': 45, 'PAINT AND ACCESSORIES': 50, 'GROCERY DRY GOODS': 25, 'BOYS WEAR': 9, 'SERVICE DELI': 61, 'ACCESSORIES': 2, 'DSD GROCERY': 18, 'MEDIA AND GAMING': 43, -999: 0, 'JEWELRY AND SUNGLASSES': 36, 'PLUS AND MATERNITY': 56, 'LARGE HOUSEHOLD GOODS': 39, 'HOUSEHOLD CHEMICALS/SUPP': 31, 'CAMERAS AND SUPPLIES': 11, 'BATH AND SHOWER': 5, 'SEASONAL': 60, 'IMPULSE MERCHANDISE': 33, 'BRAS & SHAPEWEAR': 10, 'PHARMACY OTC': 53, 'SPORTING GOODS': 65, 'BEAUTY': 6, 'PETS AND SUPPLIES': 52, 'LADIES SOCKS': 37, 'HOME DECOR': 28, 'WIRELESS': 68, 'DAIRY': 17, 'PERSONAL CARE': 51, 'TOYS': 67, 'CONCEPT STORES': 15, 'HEALTH AND BEAUTY AIDS': 27, 'OFFICE SUPPLIES': 46, 'LAWN AND GARDEN': 40, 'SHOES': 63, 'SHEER HOSIERY': 62, 'PRE PACKED DELI': 57, 'INFANT APPAREL': 34, 'HORTICULTURE AND ACCESS': 30, 'PLAYERS AND ELECTRONICS': 55, 'BAKERY': 4, 'PRODUCE': 58, 'CANDY, TOBACCO, COOKIES': 12, 'MEAT - FRESH & FROZEN': 42, 'PHARMACY RX': 54, 'BOOKS AND MAGAZINES': 8, 'GIRLS WEAR, 4-6X AND 7-14': 24, 'SWIMWEAR/OUTERWEAR': 66, 'CELEBRATION': 13} 9 | weekday_dict = {"Monday":0, "Tuesday":1, "Wednesday":2, "Thursday":3, "Friday":4, "Saturday":5, "Sunday":6} 10 | 11 | def getHeader(train): 12 | header_list1 = ["VisitNumber", "DayOfWeek", "NumberOfRows", "NoOfUPCs", "NumberOfItems", "NumberOfDepts", "NumberOfFineLine" ] 13 | header_list2 = ['Dept_-999', 'Dept_1-HR_PHOTO', 'Dept_ACCESSORIES', 'Dept_AUTOMOTIVE', 'Dept_BAKERY', 'Dept_BATH_AND_SHOWER', 'Dept_BEAUTY', 'Dept_BEDDING', 'Dept_BOOKS_AND_MAGAZINES', 'Dept_BOYS_WEAR', 'Dept_BRAS_&_SHAPEWEAR', 'Dept_CAMERAS_AND_SUPPLIES', 'Dept_CANDY,_TOBACCO,_COOKIES', 'Dept_CELEBRATION', 'Dept_COMM_BREAD', 'Dept_CONCEPT_STORES', 'Dept_COOK_AND_DINE', 'Dept_DAIRY', 'Dept_DSD_GROCERY', 'Dept_ELECTRONICS', 'Dept_FABRICS_AND_CRAFTS', 'Dept_FINANCIAL_SERVICES', 'Dept_FROZEN_FOODS', 'Dept_FURNITURE', 'Dept_GIRLS_WEAR,_4-6X__AND_7-14', 'Dept_GROCERY_DRY_GOODS', 'Dept_HARDWARE', 'Dept_HEALTH_AND_BEAUTY_AIDS', 'Dept_HOME_DECOR', 'Dept_HOME_MANAGEMENT', 'Dept_HORTICULTURE_AND_ACCESS', 'Dept_HOUSEHOLD_CHEMICALS/SUPP', 'Dept_HOUSEHOLD_PAPER_GOODS', 'Dept_IMPULSE_MERCHANDISE', 'Dept_INFANT_APPAREL', 'Dept_INFANT_CONSUMABLE_HARDLINES', 
'Dept_JEWELRY_AND_SUNGLASSES', 'Dept_LADIES_SOCKS', 'Dept_LADIESWEAR', 'Dept_LARGE_HOUSEHOLD_GOODS', 'Dept_LAWN_AND_GARDEN', 'Dept_LIQUOR,WINE,BEER', 'Dept_MEAT_-_FRESH_&_FROZEN', 'Dept_MEDIA_AND_GAMING', 'Dept_MENS_WEAR', 'Dept_MENSWEAR', 'Dept_OFFICE_SUPPLIES', 'Dept_OPTICAL_-_FRAMES', 'Dept_OPTICAL_-_LENSES', 'Dept_OTHER_DEPARTMENTS', 'Dept_PAINT_AND_ACCESSORIES', 'Dept_PERSONAL_CARE', 'Dept_PETS_AND_SUPPLIES', 'Dept_PHARMACY_OTC', 'Dept_PHARMACY_RX', 'Dept_PLAYERS_AND_ELECTRONICS', 'Dept_PLUS_AND_MATERNITY', 'Dept_PRE_PACKED_DELI', 'Dept_PRODUCE', 'Dept_SEAFOOD', 'Dept_SEASONAL', 'Dept_SERVICE_DELI', 'Dept_SHEER_HOSIERY', 'Dept_SHOES', 'Dept_SLEEPWEAR/FOUNDATIONS', 'Dept_SPORTING_GOODS', 'Dept_SWIMWEAR/OUTERWEAR', 'Dept_TOYS', 'Dept_WIRELESS'] 14 | #header_list3 = ["MinCountUPC", "MaxCountUPC", "MeanCountUPC", 'DeptScan_-999', 'DeptScan_1-HR_PHOTO', 'DeptScan_ACCESSORIES', 'DeptScan_AUTOMOTIVE', 'DeptScan_BAKERY', 'DeptScan_BATH_AND_SHOWER', 'DeptScan_BEAUTY', 'DeptScan_BEDDING', 'DeptScan_BOOKS_AND_MAGAZINES', 'DeptScan_BOYS_WEAR', 'DeptScan_BRAS_&_SHAPEWEAR', 'DeptScan_CAMERAS_AND_SUPPLIES', 'DeptScan_CANDY,_TOBACCO,_COOKIES', 'DeptScan_CELEBRATION', 'DeptScan_COMM_BREAD', 'DeptScan_CONCEPT_STORES', 'DeptScan_COOK_AND_DINE', 'DeptScan_DAIRY', 'DeptScan_DSD_GROCERY', 'DeptScan_ELECTRONICS', 'DeptScan_FABRICS_AND_CRAFTS', 'DeptScan_FINANCIAL_SERVICES', 'DeptScan_FROZEN_FOODS', 'DeptScan_FURNITURE', 'DeptScan_GIRLS_WEAR,_4-6X__AND_7-14', 'DeptScan_GROCERY_DRY_GOODS', 'DeptScan_HARDWARE', 'DeptScan_HEALTH_AND_BEAUTY_AIDS', 'DeptScan_HOME_DECOR', 'DeptScan_HOME_MANAGEMENT', 'DeptScan_HORTICULTURE_AND_ACCESS', 'DeptScan_HOUSEHOLD_CHEMICALS/SUPP', 'DeptScan_HOUSEHOLD_PAPER_GOODS', 'DeptScan_IMPULSE_MERCHANDISE', 'DeptScan_INFANT_APPAREL', 'DeptScan_INFANT_CONSUMABLE_HARDLINES', 'DeptScan_JEWELRY_AND_SUNGLASSES', 'DeptScan_LADIES_SOCKS', 'DeptScan_LADIESWEAR', 'DeptScan_LARGE_HOUSEHOLD_GOODS', 'DeptScan_LAWN_AND_GARDEN', 'DeptScan_LIQUOR,WINE,BEER', 'DeptScan_MEAT_-_FRESH_&_FROZEN', 'DeptScan_MEDIA_AND_GAMING', 'DeptScan_MENS_WEAR', 'DeptScan_MENSWEAR', 'DeptScan_OFFICE_SUPPLIES', 'DeptScan_OPTICAL_-_FRAMES', 'DeptScan_OPTICAL_-_LENSES', 'DeptScan_OTHER_DEPARTMENTS', 'DeptScan_PAINT_AND_ACCESSORIES', 'DeptScan_PERSONAL_CARE', 'DeptScan_PETS_AND_SUPPLIES', 'DeptScan_PHARMACY_OTC', 'DeptScan_PHARMACY_RX', 'DeptScan_PLAYERS_AND_ELECTRONICS', 'DeptScan_PLUS_AND_MATERNITY', 'DeptScan_PRE_PACKED_DELI', 'DeptScan_PRODUCE', 'DeptScan_SEAFOOD', 'DeptScan_SEASONAL', 'DeptScan_SERVICE_DELI', 'DeptScan_SHEER_HOSIERY', 'DeptScan_SHOES', 'DeptScan_SLEEPWEAR/FOUNDATIONS', 'DeptScan_SPORTING_GOODS', 'DeptScan_SWIMWEAR/OUTERWEAR', 'DeptScan_TOYS', 'DeptScan_WIRELESS', 'RatioItemsUPC', 'RatioItemsDept', 'RatioItemsFineLine', 'NoItemsLessZero'] 15 | header_list3 = ["MinCountUPC", "MaxCountUPC", "MeanCountUPC", 'RatioItemsUPC', 'RatioItemsDept', 'RatioItemsFineLine', 'NoItemsLessZero'] 16 | 17 | header_list = header_list1 + header_list2 + header_list3 + header_list4 18 | 19 | if train: 20 | return header_list + ["DV"] 21 | else: 22 | return header_list 23 | 24 | def getDeptCount(depts): 25 | dept_list = [0]*len(map_dept_dict.keys()) 26 | for dept in depts: 27 | dept_no = map_dept_dict[dept] 28 | dept_list[dept_no] += 1 29 | return dept_list 30 | 31 | fineline_len = len(fineline_dict.keys()) 32 | def getFineLineCount(finelines): 33 | fineline_list = [0]*fineline_len 34 | for fineline in finelines: 35 | fineline_no = fineline_dict.get(fineline,fineline_dict[-999]) 36 | fineline_list[fineline_no] += 1 37 | return 
fineline_list 38 | 39 | def getDeptScanCounts(depts, scans): 40 | dept_list = [0]*len(map_dept_dict.keys()) 41 | for index, dept in enumerate(depts): 42 | dept_no = map_dept_dict[dept] 43 | dept_list[dept_no] += scans[index] 44 | return dept_list 45 | 46 | 47 | def getVariables(name, grouped_df, train=0): 48 | try: 49 | out_list = [name, weekday_dict[np.array(grouped_df["Weekday"])[0]], grouped_df.shape[0]] 50 | except: 51 | raise 52 | 53 | no_upc = len( np.unique(grouped_df["Upc"]) ) 54 | out_list.append(no_upc) 55 | 56 | no_items = int(np.sum(grouped_df["ScanCount"]) ) 57 | out_list.append(no_items) 58 | 59 | no_depts = len( np.unique(grouped_df["DepartmentDescription"]) ) 60 | out_list.append(no_depts) 61 | 62 | no_fineline = len( np.unique(grouped_df["FinelineNumber"]) ) 63 | out_list.append(no_fineline) 64 | 65 | depts = grouped_df["DepartmentDescription"].tolist() 66 | out_list.extend( getDeptCount(depts) ) 67 | 68 | min_count_in_upc = int(np.min(grouped_df["ScanCount"])) 69 | out_list.append(min_count_in_upc) 70 | 71 | max_count_in_upc = int(np.max(grouped_df["ScanCount"])) 72 | out_list.append(max_count_in_upc) 73 | 74 | mean_count_in_upc = int(np.mean(grouped_df["ScanCount"])) 75 | out_list.append(mean_count_in_upc) 76 | 77 | #scans = grouped_df["ScanCount"].tolist() 78 | #out_list.extend( getDeptScanCounts(depts, scans) ) 79 | 80 | ratio_items_upc = no_items / no_upc 81 | out_list.append(ratio_items_upc) 82 | 83 | ratio_items_dept = no_items / no_depts 84 | out_list.append(ratio_items_dept) 85 | 86 | ratio_items_fineline = no_items / no_fineline 87 | out_list.append(ratio_items_fineline) 88 | 89 | no_items_less0 = np.sum( np.array(grouped_df["ScanCount"])<0 ) 90 | out_list.append(no_items_less0) 91 | 92 | finelines = grouped_df["FinelineNumber"].tolist() 93 | out_list.extend( getFineLineCount(finelines) ) 94 | 95 | if train: 96 | out_list.append( map_type_dv_dict[ np.array(grouped_df["TripType"])[0] ]) 97 | 98 | return out_list 99 | 100 | 101 | if __name__ == "__main__": 102 | data_path = "../Data/" 103 | train_file = data_path + "train.csv" 104 | test_file = data_path + "test.csv" 105 | train_out_file = data_path + "train_mod_v2.csv" 106 | test_out_file = data_path + "test_mod_v2.csv" 107 | 108 | train_df = pd.read_csv(train_file) 109 | test_df = pd.read_csv(test_file) 110 | train_out_handle = open(train_out_file, "w") 111 | test_out_handle = open(test_out_file, "w") 112 | train_writer = csv.writer(train_out_handle) 113 | test_writer = csv.writer(test_out_handle) 114 | 115 | train_df = train_df.fillna(-999) 116 | test_df = test_df.fillna(-999) 117 | 118 | train_header = getHeader(train=1) 119 | train_writer.writerow( train_header ) 120 | test_header = getHeader(train=0) 121 | test_writer.writerow( test_header ) 122 | train_header_len = len(train_header) 123 | test_header_len = len(test_header) 124 | 125 | print "Processing train.." 126 | print train_df.shape 127 | grouped_train_df = train_df.groupby("VisitNumber") 128 | counter = 0 129 | for name, group in grouped_train_df: 130 | out_row = getVariables(name, group, train=1) 131 | assert len(out_row) == train_header_len 132 | train_writer.writerow(out_row) 133 | counter += 1 134 | if counter%10000 == 0: 135 | print counter 136 | 137 | print "Processing test.." 
138 | grouped_test_df = test_df.groupby("VisitNumber") 139 | counter = 0 140 | for name, group in grouped_test_df: 141 | out_row = getVariables(name, group, train=0) 142 | assert len(out_row) == test_header_len 143 | test_writer.writerow(out_row) 144 | counter += 1 145 | if counter%10000 == 0: 146 | print counter 147 | 148 | train_out_handle.close() 149 | test_out_handle.close() 150 | -------------------------------------------------------------------------------- /Walmart_TripType/NeuralNets/readme.md: -------------------------------------------------------------------------------- 1 | Codes for best Neural Net model 2 | -------------------------------------------------------------------------------- /Walmart_TripType/XGB/prepData.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from config_v5 import fineline_dict, header_list4, upc_dict, header_list5 6 | 7 | map_type_dv_dict = {3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6, 12:7, 14:8, 15:9, 18:10, 19:11, 20:12, 21:13, 22:14, 23:15, 24:16, 25:17, 26:18, 27:19, 28:20, 29:21, 30:22, 31:23, 32:24, 33:25, 34:26, 35:27, 36:28, 37:29, 38:30, 39:31, 40:32, 41:33, 42:34, 43:35, 44:36, 999:37} 8 | map_dept_dict = {'COMM BREAD': 14, 'OPTICAL - FRAMES': 47, '1-HR PHOTO': 1, 'LIQUOR,WINE,BEER': 41, 'FABRICS AND CRAFTS': 20, 'MENS WEAR': 44, 'SEAFOOD': 59, 'AUTOMOTIVE': 3, 'BEDDING': 7, 'COOK AND DINE': 16, 'OPTICAL - LENSES': 48, 'HARDWARE': 26, 'SLEEPWEAR/FOUNDATIONS': 64, 'FINANCIAL SERVICES': 21, 'OTHER DEPARTMENTS': 49, 'ELECTRONICS': 19, 'LADIESWEAR': 38, 'HOME MANAGEMENT': 29, 'HOUSEHOLD PAPER GOODS': 32, 'FROZEN FOODS': 22, 'FURNITURE': 23, 'INFANT CONSUMABLE HARDLINES': 35, 'MENSWEAR': 45, 'PAINT AND ACCESSORIES': 50, 'GROCERY DRY GOODS': 25, 'BOYS WEAR': 9, 'SERVICE DELI': 61, 'ACCESSORIES': 2, 'DSD GROCERY': 18, 'MEDIA AND GAMING': 43, -999: 0, 'JEWELRY AND SUNGLASSES': 36, 'PLUS AND MATERNITY': 56, 'LARGE HOUSEHOLD GOODS': 39, 'HOUSEHOLD CHEMICALS/SUPP': 31, 'CAMERAS AND SUPPLIES': 11, 'BATH AND SHOWER': 5, 'SEASONAL': 60, 'IMPULSE MERCHANDISE': 33, 'BRAS & SHAPEWEAR': 10, 'PHARMACY OTC': 53, 'SPORTING GOODS': 65, 'BEAUTY': 6, 'PETS AND SUPPLIES': 52, 'LADIES SOCKS': 37, 'HOME DECOR': 28, 'WIRELESS': 68, 'DAIRY': 17, 'PERSONAL CARE': 51, 'TOYS': 67, 'CONCEPT STORES': 15, 'HEALTH AND BEAUTY AIDS': 27, 'OFFICE SUPPLIES': 46, 'LAWN AND GARDEN': 40, 'SHOES': 63, 'SHEER HOSIERY': 62, 'PRE PACKED DELI': 57, 'INFANT APPAREL': 34, 'HORTICULTURE AND ACCESS': 30, 'PLAYERS AND ELECTRONICS': 55, 'BAKERY': 4, 'PRODUCE': 58, 'CANDY, TOBACCO, COOKIES': 12, 'MEAT - FRESH & FROZEN': 42, 'PHARMACY RX': 54, 'BOOKS AND MAGAZINES': 8, 'GIRLS WEAR, 4-6X AND 7-14': 24, 'SWIMWEAR/OUTERWEAR': 66, 'CELEBRATION': 13} 9 | weekday_dict = {"Monday":0, "Tuesday":1, "Wednesday":2, "Thursday":3, "Friday":4, "Saturday":5, "Sunday":6} 10 | 11 | def getHeader(train): 12 | header_list1 = ["VisitNumber", "DayOfWeek", "NumberOfRows", "NoOfUPCs", "NumberOfItems", "NumberOfDepts", "NumberOfFineLine" ] 13 | header_list2 = ['Dept_-999', 'Dept_1-HR_PHOTO', 'Dept_ACCESSORIES', 'Dept_AUTOMOTIVE', 'Dept_BAKERY', 'Dept_BATH_AND_SHOWER', 'Dept_BEAUTY', 'Dept_BEDDING', 'Dept_BOOKS_AND_MAGAZINES', 'Dept_BOYS_WEAR', 'Dept_BRAS_&_SHAPEWEAR', 'Dept_CAMERAS_AND_SUPPLIES', 'Dept_CANDY,_TOBACCO,_COOKIES', 'Dept_CELEBRATION', 'Dept_COMM_BREAD', 'Dept_CONCEPT_STORES', 'Dept_COOK_AND_DINE', 'Dept_DAIRY', 'Dept_DSD_GROCERY', 'Dept_ELECTRONICS', 'Dept_FABRICS_AND_CRAFTS', 'Dept_FINANCIAL_SERVICES', 
'Dept_FROZEN_FOODS', 'Dept_FURNITURE', 'Dept_GIRLS_WEAR,_4-6X__AND_7-14', 'Dept_GROCERY_DRY_GOODS', 'Dept_HARDWARE', 'Dept_HEALTH_AND_BEAUTY_AIDS', 'Dept_HOME_DECOR', 'Dept_HOME_MANAGEMENT', 'Dept_HORTICULTURE_AND_ACCESS', 'Dept_HOUSEHOLD_CHEMICALS/SUPP', 'Dept_HOUSEHOLD_PAPER_GOODS', 'Dept_IMPULSE_MERCHANDISE', 'Dept_INFANT_APPAREL', 'Dept_INFANT_CONSUMABLE_HARDLINES', 'Dept_JEWELRY_AND_SUNGLASSES', 'Dept_LADIES_SOCKS', 'Dept_LADIESWEAR', 'Dept_LARGE_HOUSEHOLD_GOODS', 'Dept_LAWN_AND_GARDEN', 'Dept_LIQUOR,WINE,BEER', 'Dept_MEAT_-_FRESH_&_FROZEN', 'Dept_MEDIA_AND_GAMING', 'Dept_MENS_WEAR', 'Dept_MENSWEAR', 'Dept_OFFICE_SUPPLIES', 'Dept_OPTICAL_-_FRAMES', 'Dept_OPTICAL_-_LENSES', 'Dept_OTHER_DEPARTMENTS', 'Dept_PAINT_AND_ACCESSORIES', 'Dept_PERSONAL_CARE', 'Dept_PETS_AND_SUPPLIES', 'Dept_PHARMACY_OTC', 'Dept_PHARMACY_RX', 'Dept_PLAYERS_AND_ELECTRONICS', 'Dept_PLUS_AND_MATERNITY', 'Dept_PRE_PACKED_DELI', 'Dept_PRODUCE', 'Dept_SEAFOOD', 'Dept_SEASONAL', 'Dept_SERVICE_DELI', 'Dept_SHEER_HOSIERY', 'Dept_SHOES', 'Dept_SLEEPWEAR/FOUNDATIONS', 'Dept_SPORTING_GOODS', 'Dept_SWIMWEAR/OUTERWEAR', 'Dept_TOYS', 'Dept_WIRELESS'] 14 | #header_list3 = ["MinCountUPC", "MaxCountUPC", "MeanCountUPC", 'DeptScan_-999', 'DeptScan_1-HR_PHOTO', 'DeptScan_ACCESSORIES', 'DeptScan_AUTOMOTIVE', 'DeptScan_BAKERY', 'DeptScan_BATH_AND_SHOWER', 'DeptScan_BEAUTY', 'DeptScan_BEDDING', 'DeptScan_BOOKS_AND_MAGAZINES', 'DeptScan_BOYS_WEAR', 'DeptScan_BRAS_&_SHAPEWEAR', 'DeptScan_CAMERAS_AND_SUPPLIES', 'DeptScan_CANDY,_TOBACCO,_COOKIES', 'DeptScan_CELEBRATION', 'DeptScan_COMM_BREAD', 'DeptScan_CONCEPT_STORES', 'DeptScan_COOK_AND_DINE', 'DeptScan_DAIRY', 'DeptScan_DSD_GROCERY', 'DeptScan_ELECTRONICS', 'DeptScan_FABRICS_AND_CRAFTS', 'DeptScan_FINANCIAL_SERVICES', 'DeptScan_FROZEN_FOODS', 'DeptScan_FURNITURE', 'DeptScan_GIRLS_WEAR,_4-6X__AND_7-14', 'DeptScan_GROCERY_DRY_GOODS', 'DeptScan_HARDWARE', 'DeptScan_HEALTH_AND_BEAUTY_AIDS', 'DeptScan_HOME_DECOR', 'DeptScan_HOME_MANAGEMENT', 'DeptScan_HORTICULTURE_AND_ACCESS', 'DeptScan_HOUSEHOLD_CHEMICALS/SUPP', 'DeptScan_HOUSEHOLD_PAPER_GOODS', 'DeptScan_IMPULSE_MERCHANDISE', 'DeptScan_INFANT_APPAREL', 'DeptScan_INFANT_CONSUMABLE_HARDLINES', 'DeptScan_JEWELRY_AND_SUNGLASSES', 'DeptScan_LADIES_SOCKS', 'DeptScan_LADIESWEAR', 'DeptScan_LARGE_HOUSEHOLD_GOODS', 'DeptScan_LAWN_AND_GARDEN', 'DeptScan_LIQUOR,WINE,BEER', 'DeptScan_MEAT_-_FRESH_&_FROZEN', 'DeptScan_MEDIA_AND_GAMING', 'DeptScan_MENS_WEAR', 'DeptScan_MENSWEAR', 'DeptScan_OFFICE_SUPPLIES', 'DeptScan_OPTICAL_-_FRAMES', 'DeptScan_OPTICAL_-_LENSES', 'DeptScan_OTHER_DEPARTMENTS', 'DeptScan_PAINT_AND_ACCESSORIES', 'DeptScan_PERSONAL_CARE', 'DeptScan_PETS_AND_SUPPLIES', 'DeptScan_PHARMACY_OTC', 'DeptScan_PHARMACY_RX', 'DeptScan_PLAYERS_AND_ELECTRONICS', 'DeptScan_PLUS_AND_MATERNITY', 'DeptScan_PRE_PACKED_DELI', 'DeptScan_PRODUCE', 'DeptScan_SEAFOOD', 'DeptScan_SEASONAL', 'DeptScan_SERVICE_DELI', 'DeptScan_SHEER_HOSIERY', 'DeptScan_SHOES', 'DeptScan_SLEEPWEAR/FOUNDATIONS', 'DeptScan_SPORTING_GOODS', 'DeptScan_SWIMWEAR/OUTERWEAR', 'DeptScan_TOYS', 'DeptScan_WIRELESS', 'RatioItemsUPC', 'RatioItemsDept', 'RatioItemsFineLine', 'NoItemsLessZero'] 15 | header_list3 = ["MinCountUPC", "MaxCountUPC", "MeanCountUPC", 'RatioItemsUPC', 'RatioItemsDept', 'RatioItemsFineLine', 'NoItemsLessZero'] 16 | 17 | header_list = header_list1 + header_list2 + header_list3 + header_list4 + header_list5 18 | 19 | if train: 20 | return header_list + ["DV"] 21 | else: 22 | return header_list 23 | 24 | def getDeptCount(depts): 25 | dept_list = 
[0]*len(map_dept_dict.keys()) 26 | for dept in depts: 27 | dept_no = map_dept_dict[dept] 28 | dept_list[dept_no] += 1 29 | return dept_list 30 | 31 | fineline_len = len(fineline_dict.keys()) 32 | def getFineLineCount(finelines): 33 | fineline_list = [0]*fineline_len 34 | for fineline in finelines: 35 | fineline_no = fineline_dict.get(fineline,fineline_dict[-999]) 36 | fineline_list[fineline_no] += 1 37 | return fineline_list 38 | 39 | upc_len = len(upc_dict.keys()) 40 | def getUpcCount(upcs): 41 | upc_list = [0]*upc_len 42 | for upc in upcs: 43 | upc_no = upc_dict.get(upc,upc_dict[-999.0]) 44 | upc_list[upc_no] += 1 45 | return upc_list 46 | 47 | def getDeptScanCounts(depts, scans): 48 | dept_list = [0]*len(map_dept_dict.keys()) 49 | for index, dept in enumerate(depts): 50 | dept_no = map_dept_dict[dept] 51 | dept_list[dept_no] += scans[index] 52 | return dept_list 53 | 54 | 55 | def getVariables(name, grouped_df, train=0): 56 | try: 57 | out_list = [name, weekday_dict[np.array(grouped_df["Weekday"])[0]], grouped_df.shape[0]] 58 | except: 59 | raise 60 | 61 | no_upc = len( np.unique(grouped_df["Upc"]) ) 62 | out_list.append(no_upc) 63 | 64 | no_items = int(np.sum(grouped_df["ScanCount"]) ) 65 | out_list.append(no_items) 66 | 67 | no_depts = len( np.unique(grouped_df["DepartmentDescription"]) ) 68 | out_list.append(no_depts) 69 | 70 | no_fineline = len( np.unique(grouped_df["FinelineNumber"]) ) 71 | out_list.append(no_fineline) 72 | 73 | depts = grouped_df["DepartmentDescription"].tolist() 74 | out_list.extend( getDeptCount(depts) ) 75 | 76 | min_count_in_upc = int(np.min(grouped_df["ScanCount"])) 77 | out_list.append(min_count_in_upc) 78 | 79 | max_count_in_upc = int(np.max(grouped_df["ScanCount"])) 80 | out_list.append(max_count_in_upc) 81 | 82 | mean_count_in_upc = int(np.mean(grouped_df["ScanCount"])) 83 | out_list.append(mean_count_in_upc) 84 | 85 | #scans = grouped_df["ScanCount"].tolist() 86 | #out_list.extend( getDeptScanCounts(depts, scans) ) 87 | 88 | ratio_items_upc = no_items / no_upc 89 | out_list.append(ratio_items_upc) 90 | 91 | ratio_items_dept = no_items / no_depts 92 | out_list.append(ratio_items_dept) 93 | 94 | ratio_items_fineline = no_items / no_fineline 95 | out_list.append(ratio_items_fineline) 96 | 97 | no_items_less0 = np.sum( np.array(grouped_df["ScanCount"])<0 ) 98 | out_list.append(no_items_less0) 99 | 100 | finelines = grouped_df["FinelineNumber"].tolist() 101 | out_list.extend( getFineLineCount(finelines) ) 102 | 103 | upcs = grouped_df["Upc"].tolist() 104 | out_list.extend( getUpcCount(upcs) ) 105 | 106 | if train: 107 | out_list.append( map_type_dv_dict[ np.array(grouped_df["TripType"])[0] ]) 108 | 109 | return out_list 110 | 111 | 112 | if __name__ == "__main__": 113 | data_path = "../Data/" 114 | train_file = data_path + "train.csv" 115 | test_file = data_path + "test.csv" 116 | train_out_file = data_path + "train_mod_v5.csv" 117 | test_out_file = data_path + "test_mod_v5.csv" 118 | train_dv_out_file = data_path + "train_mod_v5_dv.csv" 119 | 120 | train_df = pd.read_csv(train_file) 121 | test_df = pd.read_csv(test_file) 122 | train_out_handle = open(train_out_file, "w") 123 | test_out_handle = open(test_out_file, "w") 124 | train_dv_out_handle = open(train_dv_out_file, "w") 125 | train_writer = csv.writer(train_out_handle) 126 | test_writer = csv.writer(test_out_handle) 127 | train_dv_writer = csv.writer(train_dv_out_handle) 128 | 129 | train_df = train_df.fillna(-999) 130 | test_df = test_df.fillna(-999) 131 | 132 | train_header = getHeader(train=0) 133 | 
train_writer.writerow( train_header ) 134 | test_header = getHeader(train=0) 135 | test_writer.writerow( test_header ) 136 | train_dv_header = ["VisitNumber", "DV"] 137 | train_dv_writer.writerow(train_dv_header) 138 | train_header_len = len(train_header) 139 | test_header_len = len(test_header) 140 | train_dv_header_len = len(train_dv_header) 141 | 142 | print "Processing train.." 143 | print train_df.shape 144 | grouped_train_df = train_df.groupby("VisitNumber") 145 | counter = 0 146 | for name, group in grouped_train_df: 147 | out_row = getVariables(name, group, train=1) 148 | dv = out_row[-1] 149 | out_row = out_row[:-1] 150 | dv_row = [name, dv] 151 | assert len(out_row) == train_header_len 152 | assert len(dv_row) == train_dv_header_len 153 | train_writer.writerow(out_row) 154 | train_dv_writer.writerow(dv_row) 155 | counter += 1 156 | if counter%10000 == 0: 157 | print counter 158 | 159 | print "Processing test.." 160 | grouped_test_df = test_df.groupby("VisitNumber") 161 | counter = 0 162 | for name, group in grouped_test_df: 163 | out_row = getVariables(name, group, train=0) 164 | assert len(out_row) == test_header_len 165 | test_writer.writerow(out_row) 166 | counter += 1 167 | if counter%10000 == 0: 168 | print counter 169 | 170 | train_out_handle.close() 171 | test_out_handle.close() 172 | train_dv_out_handle.close() 173 | 174 | -------------------------------------------------------------------------------- /Walmart_TripType/XGB/readme.md: -------------------------------------------------------------------------------- 1 | Codes for best XGB model 2 | -------------------------------------------------------------------------------- /Walmart_TripType/XGB/xgb_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import operator 4 | import numpy as np 5 | import pandas as pd 6 | import scipy as sp 7 | import cPickle as pkl 8 | from scipy.sparse import csr_matrix 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.cross_validation import KFold 11 | from sklearn import ensemble 12 | from sklearn.metrics import mean_squared_error, roc_auc_score 13 | #sys.path.append("/home/sudalai/Softwares/xgboost-master/wrapper/") 14 | sys.path.append("/home/sudalai/Softwares/XGB_pointfour/xgboost-master/wrapper/") 15 | import xgboost as xgb 16 | 17 | def multiclassLogLoss(y_true, y_pred, eps=1e-15): 18 | """Multi class version of Logarithmic Loss metric. 
19 | https://www.kaggle.com/wiki/MultiClassLogLoss 20 | 21 | Parameters 22 | ---------- 23 | y_true : array, shape = [n_samples] 24 | true class, integers in [0, n_classes - 1] 25 | y_pred : array, shape = [n_samples, n_classes] 26 | 27 | Returns 28 | ------- 29 | loss : float 30 | """ 31 | predictions = np.clip(y_pred, eps, 1 - eps) 32 | 33 | # normalize row sums to 1 34 | predictions /= predictions.sum(axis=1)[:, np.newaxis] 35 | 36 | actual = np.zeros(y_pred.shape) 37 | n_samples = actual.shape[0] 38 | actual[np.arange(n_samples), y_true.astype(int)] = 1 39 | vectsum = np.sum(actual * np.log(predictions)) 40 | loss = -1.0 / n_samples * vectsum 41 | return loss 42 | 43 | def getData(file_name): 44 | reader = csv.reader(open(file_name)) 45 | header = reader.next() 46 | 47 | row_list = [] 48 | col_list = [] 49 | data_list = [] 50 | row_ind = 0 51 | for row in reader: 52 | row = map(int, row) 53 | for col_ind, col_val in enumerate(row): 54 | if col_val != 0 : 55 | row_list.append(row_ind) 56 | col_list.append(col_ind) 57 | data_list.append(col_val) 58 | row_ind += 1 59 | 60 | sp_array = csr_matrix( (data_list, (row_list, col_list)), shape=(row_ind, len(header))) 61 | #pkl.dump(sp_array, open("train_mod_v5_sparse.pkl","w")) 62 | 63 | #sp_array = pkl.load(open("train_mod_v7_sparse.pkl")) 64 | return sp_array 65 | 66 | 67 | def getTestData(file_name): 68 | reader = csv.reader(open(file_name)) 69 | header = reader.next() 70 | 71 | row_list = [] 72 | col_list = [] 73 | data_list = [] 74 | row_ind = 0 75 | for row in reader: 76 | row = map(int, row) 77 | for col_ind, col_val in enumerate(row): 78 | if col_val != 0 : 79 | row_list.append(row_ind) 80 | col_list.append(col_ind) 81 | data_list.append(col_val) 82 | row_ind += 1 83 | 84 | sp_array = csr_matrix( (data_list, (row_list, col_list)), shape=(row_ind, len(header))) 85 | #pkl.dump(sp_array, open("test_mod_v7_sparse.pkl","w")) 86 | 87 | #sp_array = pkl.load(open("test_mod_v7_sparse.pkl")) 88 | return sp_array 89 | 90 | 91 | def runXGB(train_X, train_y): 92 | xg_train = xgb.DMatrix(train_X, label=train_y) 93 | 94 | ## Setting up the params ## 95 | param = {} 96 | # multi-class classification with per-class probability outputs 97 | param['objective'] = 'multi:softprob' 98 | # learning rate 99 | param['eta'] = 0.05 100 | param['max_depth'] = 6 101 | param['silent'] = 1 102 | param['num_class'] = 38 103 | param['eval_metric'] = "mlogloss" 104 | #param['min_child_weight'] = 2 105 | param['subsample'] = 0.9 106 | param['colsample_bytree'] = 0.7 107 | param['gamma'] = 1 108 | 109 | #watchlist = [ (xg_train,'train'), (xg_test, 'test') ] 110 | num_round = 4200 111 | bst = xgb.train( param, xg_train, num_round) 112 | return bst 113 | 114 | if __name__ == "__main__": 115 | # setting the input path and reading the data into dataframe # 116 | print "Reading data.." 117 | data_path = "../Data/" 118 | train_X = getData(data_path + "train_mod_v5.csv") 119 | train_y = np.array( pd.read_csv(data_path + "train_mod_v5_dv.csv")["DV"] ) 120 | print "Train shape : ", train_X.shape 121 | 122 | bst = runXGB(train_X, train_y) 123 | del train_X 124 | del train_y 125 | import gc 126 | gc.collect() 127 | 128 | print "Working on test.." 
129 | test_X = getTestData(data_path + "test_mod_v5.csv") 130 | test_id = np.array( pd.read_csv(data_path+"test_mod_v5.csv", usecols=["VisitNumber"])["VisitNumber"] ) 131 | print test_X.shape 132 | xg_test = xgb.DMatrix(test_X) 133 | preds = bst.predict( xg_test )#.reshape( test_X.shape[0], param['num_class'] ) 134 | 135 | sample = pd.read_csv(data_path + "sample_submission.csv") 136 | preds = pd.DataFrame(preds, index=test_id, columns=sample.columns[1:]) 137 | preds.to_csv("sub6.csv", index_label="VisitNumber") 138 | 139 | -------------------------------------------------------------------------------- /Walmart_TripType/readme.md: -------------------------------------------------------------------------------- 1 | This folder has the codes used for the Kaggle competition [Walmart - Trip Type Classification](https://www.kaggle.com/c/walmart-recruiting-trip-type-classification) 2 | 3 | Finished 23rd out of >1000 people in this competition 4 | 5 | Approach: 6 | 1. Built a few XGB models by converting the features to sparse format; the best model is present in the XGB folder 7 | 2. Built a few Neural Net models with Keras on features excluding UPCs; the best model is present in the NeuralNets folder 8 | 3. My final model is an ensemble of the XGB and NN predictions; a minimal blending sketch is given below 9 | --------------------------------------------------------------------------------
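The ensembling step in point 3 above is not shipped as a script in this repository, so here is a minimal blending sketch. It assumes the XGB submission (`sub6.csv`, written by `XGB/xgb_model.py`) and the NN submission (`sub_nn.csv`, written by `NeuralNets/neural_net.py`) are in the working directory; the 0.6/0.4 weights and the `sub_ensemble.csv` output name are illustrative assumptions, not the values actually used in the competition.

```python
# Hypothetical blending sketch (not the exact competition ensemble).
# Both inputs are class-probability submissions indexed by VisitNumber.
import pandas as pd

xgb_sub = pd.read_csv("sub6.csv", index_col="VisitNumber")    # output of XGB/xgb_model.py
nn_sub = pd.read_csv("sub_nn.csv", index_col="VisitNumber")   # output of NeuralNets/neural_net.py

# assumed weights; in practice they would be tuned against the CV log loss
w_xgb, w_nn = 0.6, 0.4
blend = w_xgb * xgb_sub + w_nn * nn_sub.loc[xgb_sub.index, xgb_sub.columns]

# renormalize each row so the 38 class probabilities sum to 1
blend = blend.div(blend.sum(axis=1), axis=0)
blend.to_csv("sub_ensemble.csv", index_label="VisitNumber")
```

A weighted average of predicted probabilities is the simplest way to combine the two model families; rank averaging or a small weight search on the cross-validation score are natural refinements.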