├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── majorityvote_modelselection.py ├── parallel.py ├── submission ├── majority_rfs50_5.23_shuffle_GAfix_4of7of10.csv └── majority_rfs50_5.23_shuffle_GAfix_5of9of50.csv └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | documentation/ 3 | data/*.csv 4 | *.xlsx 5 | 6 | ################# 7 | ## Eclipse 8 | ################# 9 | 10 | *.pydevproject 11 | .project 12 | .metadata 13 | bin/ 14 | tmp/ 15 | *.tmp 16 | *.bak 17 | *.swp 18 | *~.nib 19 | local.properties 20 | .classpath 21 | .settings/ 22 | .loadpath 23 | 24 | # External tool builders 25 | .externalToolBuilders/ 26 | 27 | # Locally stored "Eclipse launch configurations" 28 | *.launch 29 | 30 | # CDT-specific 31 | .cproject 32 | 33 | # PDT-specific 34 | .buildpath 35 | 36 | 37 | ################# 38 | ## Visual Studio 39 | ################# 40 | 41 | ## Ignore Visual Studio temporary files, build results, and 42 | ## files generated by popular Visual Studio add-ons. 
43 | 44 | # User-specific files 45 | *.suo 46 | *.user 47 | *.sln.docstates 48 | 49 | # Build results 50 | 51 | [Dd]ebug/ 52 | [Rr]elease/ 53 | x64/ 54 | build/ 55 | [Bb]in/ 56 | [Oo]bj/ 57 | 58 | # MSTest test Results 59 | [Tt]est[Rr]esult*/ 60 | [Bb]uild[Ll]og.* 61 | 62 | *_i.c 63 | *_p.c 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.pch 68 | *.pdb 69 | *.pgc 70 | *.pgd 71 | *.rsp 72 | *.sbr 73 | *.tlb 74 | *.tli 75 | *.tlh 76 | *.tmp 77 | *.tmp_proj 78 | *.log 79 | *.vspscc 80 | *.vssscc 81 | .builds 82 | *.pidb 83 | *.log 84 | *.scc 85 | 86 | # Visual C++ cache files 87 | ipch/ 88 | *.aps 89 | *.ncb 90 | *.opensdf 91 | *.sdf 92 | *.cachefile 93 | 94 | # Visual Studio profiler 95 | *.psess 96 | *.vsp 97 | *.vspx 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | 106 | # TeamCity is a build add-in 107 | _TeamCity* 108 | 109 | # DotCover is a Code Coverage Tool 110 | *.dotCover 111 | 112 | # NCrunch 113 | *.ncrunch* 114 | .*crunch*.local.xml 115 | 116 | # Installshield output folder 117 | [Ee]xpress/ 118 | 119 | # DocProject is a documentation generator add-in 120 | DocProject/buildhelp/ 121 | DocProject/Help/*.HxT 122 | DocProject/Help/*.HxC 123 | DocProject/Help/*.hhc 124 | DocProject/Help/*.hhk 125 | DocProject/Help/*.hhp 126 | DocProject/Help/Html2 127 | DocProject/Help/html 128 | 129 | # Click-Once directory 130 | publish/ 131 | 132 | # Publish Web Output 133 | *.Publish.xml 134 | *.pubxml 135 | 136 | # NuGet Packages Directory 137 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 138 | #packages/ 139 | 140 | # Windows Azure Build Output 141 | csx 142 | *.build.csdef 143 | 144 | # Windows Store app package directory 145 | AppPackages/ 146 | 147 | # Others 148 | sql/ 149 | *.Cache 150 | ClientBin/ 151 | [Ss]tyle[Cc]op.* 152 | ~$* 153 | *~ 154 | *.dbmdl 155 | *.[Pp]ublish.xml 156 | *.pfx 157 | *.publishsettings 158 | 159 | # RIA/Silverlight projects 160 | Generated_Code/ 161 | 162 | # Backup & report files from converting an old project file to a newer 163 | # Visual Studio version. Backup files are not needed, because we have git ;-) 164 | _UpgradeReport_Files/ 165 | Backup*/ 166 | UpgradeLog*.XML 167 | UpgradeLog*.htm 168 | 169 | # SQL Server files 170 | App_Data/*.mdf 171 | App_Data/*.ldf 172 | 173 | ############# 174 | ## Windows detritus 175 | ############# 176 | 177 | # Windows image file caches 178 | Thumbs.db 179 | ehthumbs.db 180 | 181 | # Folder config file 182 | Desktop.ini 183 | 184 | # Recycle Bin used on file shares 185 | $RECYCLE.BIN/ 186 | 187 | # Mac crap 188 | .DS_Store 189 | 190 | 191 | ############# 192 | ## Python 193 | ############# 194 | 195 | *.py[co] 196 | 197 | # Packages 198 | *.egg 199 | *.egg-info 200 | dist/ 201 | build/ 202 | eggs/ 203 | parts/ 204 | var/ 205 | sdist/ 206 | develop-eggs/ 207 | .installed.cfg 208 | 209 | # Installer logs 210 | pip-log.txt 211 | 212 | # Unit test / coverage reports 213 | .coverage 214 | .tox 215 | 216 | #Translations 217 | *.mo 218 | 219 | #Mr Developer 220 | .mr.developer.cfg 221 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2014 Alessandro Mariani 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Allstate Purchase Prediction Challenge 2 | 3 | ### Requirements 4 | Python 2.7.5 with Scikit-Learn 0.14a1, Numpy 1.8, Pandas 0.12
5 | Windows 8, Intel i5-3230M @ 2.60GHz, 16GB RAM
6 | Developed on an HP Envy 17 j100tx laptop
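If you want to confirm that your installation matches these pins before launching the full run, a quick check along these lines works (this snippet is illustrative and not part of the repository):

```python
# Quick sanity check of the pinned dependencies (Python 2 syntax, matching the repo).
import sklearn, numpy, pandas
print "scikit-learn:", sklearn.__version__   # expected around 0.14a1
print "numpy:       ", numpy.__version__     # expected around 1.8
print "pandas:      ", pandas.__version__    # expected around 0.12
```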
7 | 
8 | ### How to generate the solution
9 | Run "python majorityvote_modelselection.py" from a command prompt, or
10 | simply double-click the script on Windows. Keep an eye on memory usage,
11 | although with the default settings it should not exceed 8 GB.
12 | 
13 | ### Comments
14 | With the default settings, the script fits the models and creates a
15 | submission that scores 0.53705 on the private LB. This is the setting
16 | which, combined with the Breakfast Pirate ABCEDF combination, scored
17 | 0.53715 on the private LB and 0.54535 on the public LB. On the system
18 | described above this takes approximately 3 hours. If you are impatient,
19 | set N=10 and NS=7: it will score 0.53710 in just 30 minutes! If that is
20 | still too slow, try N=8, NS=6, params=[(30,5,23)]: it runs even faster
21 | and still scores 0.53705, the same as my best submission, although lower
22 | on the public LB. If that is still too slow, get a better computer!
23 | 
24 | The script performs the following steps (see the sketch after this list):
25 | 
26 | 1. Prepare the data (load the files, transform and clean them, and
27 | create the engineered features)
28 | 2. Fit the Random Forests
29 | 3. Predict product G
30 | 4. Select the best Random Forests given the train set accuracy
31 | 5. Do a majority vote using all N models and print the score on the
32 | cross-validation set
33 | 6. Do a majority vote using the NS selected models and print the score
34 | on the cross-validation set
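Steps 5 and 6 boil down to a per-customer vote over the class predicted by each forest, falling back to the last quoted plan when no clear majority exists. A minimal, self-contained sketch (the names and toy values here are made up for illustration; the real logic lives in majority_vote() inside majorityvote_modelselection.py):

```python
# Minimal sketch of the majority-vote step (illustrative only).
import numpy as np

def sketch_majority_vote(baseline, model_predictions):
    # model_predictions: (#samples x #models) matrix of predicted G values (1..4)
    counts = np.vstack([np.bincount(p, minlength=5) for p in model_predictions])
    majority = np.max(counts, axis=1) >= 1 + model_predictions.shape[1] // 2
    preds = baseline.copy()
    preds[majority] = np.argmax(counts[majority], axis=1)
    return preds

baseline = np.array([1, 2, 3])                       # last quoted G per customer
votes = np.array([[1, 1, 4], [2, 3, 3], [1, 2, 4]])  # three models' predictions
print sketch_majority_vote(baseline, votes)          # -> [1 3 3]
```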
35 | 
36 | Then:
37 | a. if submit is set to False, the script records the performance of each
38 | fold and continues the k-fold loop;
39 | b. if submit is set to True, it exits the loop after the first fold,
40 | makes the prediction on the test set, does a majority vote using the
41 | selected models, fixes the products according to the state rules and
42 | creates the submission file.
43 | 
44 | ### License
45 | Please refer to the LICENSE.txt file.
46 | 
--------------------------------------------------------------------------------
/majorityvote_modelselection.py:
--------------------------------------------------------------------------------
1 | # Allstate Purchase Prediction Challenge
2 | # Author: Alessandro Mariani
3 | # https://www.kaggle.com/c/allstate-purchase-prediction-challenge
4 | 
5 | '''
6 | This module trains the models, cross-validates them and makes the final prediction.
7 | '''
8 | 
9 | import pandas as pd, numpy as np
10 | import matplotlib.pyplot as plt
11 | import operator
12 | 
13 | from sklearn import cross_validation, ensemble
14 | from utils import prepare_data, concat, expval, stateFix
15 | from parallel import RandomForestsParallel
16 | from time import time
17 | 
18 | def majority_vote(baseline,model_predictions):
19 |     # given a baseline and a matrix of predictions (#samples x #models),
20 |     # return the majority prediction where at least 1+#models/2 models
21 |     # agree on the same product, otherwise return the baseline
22 |     prcnt = np.vstack([np.bincount(p,minlength=5) for p in model_predictions])
23 |     prmax = np.max(prcnt,axis=1) >= (1+(model_predictions.shape[1]/2))
24 |     preds = baseline+0; preds[prmax] = np.argmax(prcnt[prmax],axis=1)
25 |     return preds
26 | 
27 | def make_ptscores(y_true,y_pred,y_base,pt,vmask):
28 |     # measure the increase of "plan" accuracy given a prediction for the product (G)
29 |     return [np.mean(vmask[pt==ipt]&(y_true[pt==ipt] == y_pred[pt==ipt])) - np.mean(vmask[pt==ipt]&(y_true[pt==ipt] == y_base[pt==ipt])) for ipt in range(1,11)]
30 | 
31 | if __name__ == '__main__':
32 |     ############################################################################
33 |     ## SETTINGS ################################################################
34 |     # submit: if True, create a submission file and train the models for submission
35 |     # N: number of models to build
36 |     # NS: number of models to select for the majority vote
37 |     # kfold: number of k-folds to perform, if not submitting
38 |     # N_proc: number of processes to spawn, default #CPU(s)-1
39 |     # include_from_pt: minimum shopping_pt included in the data set
40 |     # verbose_selection: print all details while selecting the models
41 |     # tn: test set distribution of shopping_pt (#10-11 merged)
42 |     ############################################################################
43 |     submit = True; N = 50; NS = 9; kfold = 3; N_proc = None;
44 |     include_from_pt = 1; verbose_selection = False
45 |     tn = np.array([18943,13298,9251,6528,4203,2175,959,281,78])
46 |     ############################################################################
47 |     # Random Forest settings ###################################################
48 |     # Must be a list containing tuples of (ntree,maxfea,leafsize)
49 |     params = [(50,5,23)]
50 |     # ex. 
[(x,5,23) for x in [35,50,75]] # [(50,x,23) for x in range(4,12)] 51 | # anything you'd like to try, here is the place for the modifications 52 | ############################################################################ 53 | 54 | print "Majority vote using %i models, selecting %i\n" % (N,NS) 55 | # initialize data 56 | data,test,con,cat,extra,conf,conf_f,encoders = prepare_data() 57 | data = data[data.shopping_pt >=include_from_pt]; print "Including from shopping_pt #%i\n" % data.shopping_pt.min(), 58 | # features, target, weights (not used) 59 | X = data[con+cat+conf+extra]; y = data['G_f'] ; w = np.ones(y.shape) 60 | 61 | vmask = reduce(operator.and_,data[conf[:-1]].values.T==data[conf_f[:-1]].values.T) 62 | scores,imp,ptscores = {},{},{} 63 | for n,m,l in params: 64 | t = time(); 65 | scores[(m,l)],imp[(m,l)],ptscores[(m,l)] = [],[],[] 66 | col_trscores,col_cvscores = [],[] 67 | 68 | # initialize the ensemble of forests to run in parallel 69 | # class is also structured to handle single-process 70 | rfs = RandomForestsParallel(N, n, m, l, N_proc) 71 | 72 | # cross validation is use to find the best parameters 73 | for ifold,(itr,icv) in enumerate(cross_validation.KFold(len(y),kfold,indices=False)): 74 | if submit: 75 | # just a lame way to re-using the same code for fitting & selecting when submitting :) 76 | itr = np.ones(y.shape,dtype=bool); icv = -itr 77 | print "\nHEY! CREATING SUBMISSION!\n" 78 | else: 79 | # redo expected value for the current training & cv set 80 | for c in [x for x in X.columns if x[-4:] == '_exp']: 81 | X[c] = expval(data,c[:-4],'G_f',itr) 82 | 83 | # fits the random forests at the same time 84 | rfs.fit(X[itr],y[itr],w[itr]) 85 | 86 | print "predicting...", 87 | allpreds = rfs.predict(X) 88 | rftscores = [] 89 | print "selecting models..." 90 | for irf in range(len(rfs.rfs)): 91 | # SELECTION of the best random forest, even though probably 92 | # is just getting rid of very unlucky seeds ... 93 | pG = allpreds[:,irf]; ipt2 = data.shopping_pt > 1 94 | ptscore = make_ptscores(y[icv],pG[icv],data.G[icv],data.shopping_pt[icv],vmask[icv]) 95 | tptscore = make_ptscores(y[itr],pG[itr],data.G[itr],data.shopping_pt[itr],vmask[itr]) 96 | rftscores.append((tn.dot(tptscore[1:]),irf)) 97 | print "%i,%i %.5f %.5f %.5f %.5f" % ( 98 | ifold,irf, 99 | np.mean(pG[itr]==y[itr]),np.mean(vmask[itr]&(pG[itr]==y[itr])), 100 | np.mean(pG[ipt2&itr]==y[ipt2&itr]),np.mean(vmask[ipt2&itr]&(pG[ipt2&itr]==y[ipt2&itr]))), 101 | if verbose_selection: 102 | print " ".join(["%.5f" %pts for pts in ptscore]), 103 | print " ".join(["%.5f" %pts for pts in tptscore]), 104 | print "%.2f %.2f" %(tn.dot(tptscore[1:]),tn.dot(ptscore[1:])) 105 | 106 | # select the best models for the majority vote 107 | rftscores.sort(reverse=1); selected = [x[1] for x in rftscores[:NS]] 108 | 109 | print "counting votes..." 
110 | # print also the score using all the models 111 | pG = majority_vote(data.G,allpreds) 112 | ptscore = make_ptscores(y[icv],pG[icv],data.G[icv],data.shopping_pt[icv],vmask[icv]) 113 | # ifold,a : majority vote score using all models 114 | print str(ifold)+",a "+" ".join(["%.5f" %pts for pts in ptscore])+" %.2f" % tn.dot(ptscore[1:]) 115 | 116 | # results for selected models 117 | pG = majority_vote(data.G,allpreds[:,selected]) 118 | ptscore = make_ptscores(y[icv],pG[icv],data.G[icv],data.shopping_pt[icv],vmask[icv]) 119 | # ifold,s : majority vote score using selected models 120 | print str(ifold)+",s "+" ".join(["%.5f" %pts for pts in ptscore])+" %.2f" % tn.dot(ptscore[1:]) 121 | 122 | # append features importances & scores 123 | col_trscores.append(np.mean(pG[itr]==y[itr])) # append train score 124 | col_cvscores.append(np.mean(pG[icv]==y[icv])) # append cv score 125 | imp[(m,l)].append(rfs.impf) 126 | scores[(m,l)].append(tn.dot(ptscore[1:])) 127 | ptscores[(m,l)].append(ptscore) 128 | 129 | # skip any following fold if we're submitting 130 | if submit: break 131 | 132 | print "%i %i %i\t %.2f %.2f %.4f %.4f %.2f - %.2fm" % ( 133 | n,m,l, 134 | np.mean(scores[(m,l)]), np.std(scores[(m,l)]), # for best params & variance 135 | np.mean(col_trscores), np.mean(col_cvscores), # use x diagnostic training set overfit 136 | tn.dot(np.mean(ptscores[(m,l)],axis=0)[1:]), # score 137 | (time()-t)/60), # k-fold time 138 | print " ".join(["%.5f" %pts for pts in np.mean(ptscores[(m,l)],axis=0)]), 139 | print " ".join(["%.5f" %pts for pts in np.std(ptscores[(m,l)],axis=0)]) 140 | 141 | if submit: 142 | # MAKE SUBMISSION 143 | # very complicated way to keep only the latest shopping_pt for each customer just to have everything in one row!!!!!11 144 | test = test[test.shopping_pt == test.reset_index().customer_ID.map(test.reset_index().groupby('customer_ID').shopping_pt.max())] 145 | Xt = test[con+cat+conf+extra] 146 | 147 | # TEST SET PREDICTION 148 | print "now predicting on test set...", 149 | allpreds = rfs.predict(Xt) 150 | test['pG'] = majority_vote(test.G,allpreds[:,selected]); print "done" 151 | 152 | # Fix state law products, then concatenate to string 153 | stateFix(encoders,test,['C','D','pG'],1) 154 | test['plan'] = concat(test,['A','B','C','D','E','F','pG']) 155 | test['plan'].to_csv('submission\\majority_rfs%i_%i.%i_shuffle_GAfix_%iof%iof%i.csv' % ( 156 | n,m,l,NS/2+1,NS,N),header=1) 157 | 158 | # features importances 159 | impf = rfs.impf; impf.sort() 160 | 161 | 162 | -------------------------------------------------------------------------------- /parallel.py: -------------------------------------------------------------------------------- 1 | # Allstate Purchase Prediction Challenge 2 | # Author: Alessandro Mariani 3 | # https://www.kaggle.com/c/allstate-purchase-prediction-challenge 4 | 5 | ''' 6 | RandomForestParallel: is just a "fancy" class which will help 7 | optimize memory usage while fitting several random forest 8 | on the same machine. It implements fit() and predict() as 9 | for the scikit-learn convention. 
10 | ''' 11 | 12 | from time import time 13 | from sklearn import ensemble 14 | 15 | import multiprocessing, operator 16 | import pandas as pd, numpy as np 17 | 18 | ## Pickle FIX for multiprocessing using bound methods 19 | ## http://stackoverflow.com/questions/1816958/cant-pickle-type-instancemethod-when-using-pythons-multiprocessing-pool-ma/ 20 | from copy_reg import pickle 21 | from types import MethodType 22 | 23 | def _pickle_method(method): 24 | func_name = method.im_func.__name__ 25 | obj = method.im_self 26 | cls = method.im_class 27 | return _unpickle_method, (func_name, obj, cls) 28 | 29 | def _unpickle_method(func_name, obj, cls): 30 | for cls in cls.mro(): 31 | try: 32 | func = cls.__dict__[func_name] 33 | except KeyError: 34 | pass 35 | else: 36 | break 37 | return func.__get__(obj, cls) 38 | 39 | class RandomForestsParallel(object): 40 | # class used to fit & predict in parallel minimizing memory usage 41 | rfs = [] 42 | def __init__(self,N,ntree,maxfea,leafsize,N_proc=None): 43 | self.N = N 44 | self.ntree = ntree; self.maxfea = maxfea; self.leafsize = leafsize 45 | self.N_proc = N_proc if N_proc is not None else max(1,multiprocessing.cpu_count()-1) 46 | 47 | # fix pickling when using bound methods in classes 48 | pickle(MethodType, _pickle_method, _pickle_method) 49 | 50 | def _parallel_fit(self, rf): 51 | t = time() 52 | return rf.fit(self.X,self.y,self.w), (time()-t)/60. 53 | 54 | def _parallel_predict(self, rf): 55 | return rf.predict(self.X) 56 | 57 | def fit(self,X,y,w=None): 58 | # fit N random forest in parallel 59 | self.rfs = []; self.X = X; self.y = y 60 | self.w = np.ones(y.shape,dtype=bool) if w is None else w 61 | print "fitting %i RFs using %i processes..." % (self.N,self.N_proc), 62 | 63 | args = [ensemble.RandomForestClassifier( 64 | n_estimators=self.ntree, max_features=self.maxfea, 65 | min_samples_leaf=self.leafsize,random_state=irf, 66 | compute_importances=1) for irf in range(self.N)] 67 | 68 | if self.N_proc > 1: 69 | pool = multiprocessing.Pool(self.N_proc) 70 | for i,(rf,irft) in enumerate(pool.imap(self._parallel_fit,args)): 71 | self.rfs.append(rf); print "rf#%i %.2fm" % (i,irft), 72 | pool.terminate() 73 | else: 74 | for i,rf in enumerate(args): 75 | rf,irft = self._parallel_fit(rf) 76 | self.rfs.append(rf); print "rf#%i %.2fm" % (i,irft), 77 | 78 | del self.X,self.y,self.w 79 | # set the importances of the features 80 | self.impf = self._calculate_impf(X.columns) 81 | 82 | return self 83 | 84 | def predict(self,X,single_process=True): 85 | # predict using all the random forest in self.rfs 86 | # single_process is set by default, as multiprocess predict is not 87 | # memory efficient and sometime time efficient (efficient smaller N) 88 | self.X = X 89 | if (not single_process) & (self.N_proc > 1): 90 | pool = multiprocessing.Pool(self.N_proc) 91 | allpreds = np.array([p for p in pool.imap(self._parallel_predict,self.rfs)]).T 92 | pool.terminate() 93 | else: 94 | allpreds = np.array([self._parallel_predict(rf) for rf in self.rfs]).T 95 | 96 | del self.X 97 | 98 | return allpreds 99 | 100 | def _calculate_impf(self, feature_names): 101 | # private method to calculate the average features importance 102 | return pd.Series(reduce(operator.add,[rf.feature_importances_ for rf in self.rfs]) / self.N, feature_names) 103 | 104 | def __repr__(self): 105 | return "N:%i N_proc:%i ntree:%i maxfea:%i leafsize:%i fitted:%s" % ( 106 | self.N, self.N_proc, self.ntree,self.maxfea, 107 | self.leafsize, 'Yes' if len(self.rfs) > 0 else 'No') 108 | 109 | 
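For reference, a minimal usage sketch of RandomForestsParallel with toy data (the parameter values and the toy frame are illustrative, assuming the Python 2 / scikit-learn 0.14 environment from the README; the __main__ guard matters because the class spawns worker processes):

```python
# Toy usage of RandomForestsParallel: 4 forests, 10 trees each (illustrative values).
import numpy as np, pandas as pd
from parallel import RandomForestsParallel

if __name__ == '__main__':
    X = pd.DataFrame(np.random.rand(200, 5), columns=list('abcde'))
    y = pd.Series(np.random.randint(1, 5, 200))   # fake "G" labels, values 1..4
    rfs = RandomForestsParallel(N=4, ntree=10, maxfea=3, leafsize=5, N_proc=2)
    rfs.fit(X, y)              # fits the 4 forests across 2 worker processes
    allpreds = rfs.predict(X)  # (#samples x #models) matrix of predictions
    print allpreds.shape       # e.g. (200, 4)
    print rfs.impf             # average feature importances across the forests
```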
-------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Allstate Purchase Prediction Challenge 2 | # Author: Alessandro Mariani 3 | # https://www.kaggle.com/c/allstate-purchase-prediction-challenge 4 | 5 | ''' 6 | This module cointains the data preparation and utilities 7 | ''' 8 | 9 | from time import time 10 | from itertools import combinations 11 | from sklearn import preprocessing 12 | 13 | import scipy as sp, numpy as np, pandas as pd 14 | 15 | # Cantor Pairing 16 | def cantor(args): 17 | # Cantor Pairing - recursive call if more than 1 pair 18 | if len(args) > 2: 19 | x2 = cantor(args[1:]) 20 | x1 = args[0] 21 | else: 22 | x1, x2 = args 23 | return int((0.5 * (x1 + x2)*(x1 + x2 + 1) + x2)) 24 | 25 | # Groups all columns of data into combinations of [degree] 26 | def group_data(data, degree=3, hash=hash, NAMES=None): 27 | init = time() 28 | new_data = []; combined_names = [] 29 | m,n = data.shape 30 | for indicies in combinations(range(n), degree): 31 | new_data.append([hash(tuple(v)) for v in data[:,indicies]]) 32 | if NAMES != None: 33 | combined_names.append( '+'.join([NAMES[indicies[i]] for i in range(degree)]) ) 34 | print "DONE! %.2fm" % ((time()-init)/60) 35 | if NAMES != None: 36 | return (np.array(new_data).T, combined_names) 37 | return np.array(new_data).T 38 | 39 | # Return concatenated fields in a dataframe 40 | # [1,2,3,4,5,6] => '123456' 41 | def concat(df, columns): 42 | return np.array([''.join(x) for x in np.array( 43 | [np.array(df[col].values, dtype=str) for col in columns]).T]) 44 | 45 | # Breakfast Pirate Awesome State trick + some additions 46 | def stateFix(encoders,df,c=['C','D','G'],verbose=False): 47 | # GA 48 | iGA = df.state == encoders['state'].transform(['GA'])[0] 49 | ifix = iGA&(df[c[0]]==1); df.ix[ifix,c[0]] = 2; nga1 = np.sum(ifix) #C 50 | ifix = iGA&(df[c[1]]==1); df.ix[ifix,c[1]] = 2; nga2 = np.sum(ifix) #D 51 | # FL 52 | iFL = df.state == encoders['state'].transform(['FL'])[0] 53 | ifix = iFL&(df[c[2]]<=2); df.ix[ifix,c[2]] = 3; nfl1 = np.sum(ifix) #G 54 | # OH 55 | iOH = df.state == encoders['state'].transform(['OH'])[0] 56 | ifix = iOH&(df[c[2]]==1); df.ix[ifix,c[2]] = 2; noh1 = np.sum(ifix) #G 57 | # ND 58 | iND = df.state == encoders['state'].transform(['ND'])[0] 59 | ifix = iND&(df[c[2]]!=2); df.ix[ifix,c[2]] = 2; nnd1 = np.sum(ifix) #G 60 | # SD 61 | iSD = df.state == encoders['state'].transform(['SD'])[0] 62 | ifix = iSD&(df[c[2]]!=2); df.ix[ifix,c[2]] = 2; nsd1 = np.sum(ifix) #G 63 | if verbose: 64 | print "Fixed state law products. 
GA1:%i GA2:%i FL1:%i OH1:%i ND1:%i SD1:%i" %( 65 | nga1, nga2, nfl1, noh1, nnd1, nsd1) 66 | 67 | # Target variable expected value given a categorical feature 68 | def expval(df,col,y,tfilter): 69 | tmp = pd.DataFrame(index=df.index) 70 | pb = df[tfilter][y].mean() # train set mean 71 | tmp['cnt'] = df[col].map(df[tfilter][col].value_counts()).fillna(0) # train set count 72 | tmp['csm'] = df[col].map(df[tfilter].groupby(col)[y].sum()).fillna(pb) # train set sum 73 | tmp.ix[tfilter,'cnt'] -= 1 # reduce count for train set 74 | tmp.ix[tfilter,'csm'] -= df.ix[tfilter,y] # remove current value 75 | tmp['exp'] = ((tmp.csm+ pb*15) / (tmp.cnt+ 15)).fillna(pb) # calculate mean including kn-extra 'average' samples 76 | np.random.seed(1) 77 | tmp.ix[tfilter,'exp'] *= 1+.3*(np.random.rand(len(tmp[tfilter]))-.5) # add some random noise to the train set 78 | return tmp.exp 79 | 80 | def prepare_data(shuffle=True): 81 | alltest = pd.read_csv('data\\test_v2.csv') 82 | test = alltest.set_index('customer_ID') 83 | alldata = pd.read_csv('data\\train.csv').set_index('customer_ID') 84 | 85 | # handy lists of features 86 | con = ['group_size','car_age','age_oldest','age_youngest','duration_previous','cost'] 87 | cat = ['homeowner','car_value','risk_factor','married_couple','C_previous','state', 'location','shopping_pt'] 88 | conf = ['A','B','C','D','E','F','G']; conf_f = [col+'_f' for col in conf] 89 | extra = [] 90 | 91 | final_purchase = alldata[alldata.record_type == 1] # final purchase 92 | data = alldata.join(final_purchase[conf], rsuffix='_f') # creating training dataset with target features 93 | data = data[data.record_type == 0] # removing final purchase 94 | 95 | data['conf'] = concat(data,conf_f) # handy purchase plan 96 | data['conf_init'] = concat(data,conf) # handy last quoted plan 97 | 98 | encoders = dict() 99 | data = data.append(test) 100 | 101 | # Fix NAs 102 | data['C_previous'].fillna(0, inplace=1) 103 | data['duration_previous'].fillna(0, inplace=1) 104 | data.location.fillna(-1, inplace=1); 105 | # Transform data to numerical data 106 | for col in ['car_value','risk_factor','state']: 107 | encoders[col] = preprocessing.LabelEncoder() 108 | data[col] = encoders[col].fit_transform(data[col].fillna(99)) 109 | 110 | print 'Location substitution:', 111 | ## get rid of very location, given the total count from train,cv and test set 112 | x = data[data.shopping_pt==2].location.value_counts() 113 | sub = data.location.map(x).fillna(0) < 5 114 | data.ix[sub,'location'] = data.state[sub]; print '%.5f' % sub.mean() 115 | 116 | # cost per car_age; cost per person; cost per state 117 | data['caCost'] = 1.*data.cost / (data.car_age+1) 118 | data['ppCost'] = 1.*data.cost / data.group_size 119 | data['stCost'] = data.state.map(data.groupby('state')['cost'].mean()) 120 | extra.extend(['caCost','ppCost','stCost']) 121 | 122 | # average quote cost by G values 123 | data['costG'] = data['G'].map(data.groupby('G')['cost'].mean()) 124 | extra.append('costG') 125 | 126 | # average quote cost by G & state values 127 | x = data.groupby(['G','state'])['cost'].mean() 128 | x = x.reset_index().set_index(['G','state']); x.columns = ['costStG'] # covert to DF 129 | data = data.merge(x,left_on=['G','state'],right_index=True,how='left') 130 | extra.append('costStG') 131 | 132 | # two way intersactino between state, G and shopping_pt 133 | print "Grouping few 2-way interactions...", 134 | grpTrn, c2 = group_data(data[['state','G','shopping_pt']].values,2,hash,['state','G','shopping_pt']) 135 | for i,col in enumerate(c2): 
136 |         encoders[col] = preprocessing.LabelEncoder()
137 |         data[col] = encoders[col].fit_transform(grpTrn[:,i])
138 |     extra.extend(c2)
139 | 
140 |     # expected value (arithmetic average) of G by state & location
141 |     for col in ['state','location']:
142 |         extra.append(col+'_exp')
143 |         data[col+'_exp'] = expval(data,col,'G_f',-data.G_f.isnull())
144 | 
145 |     # previous G
146 |     data['prev_G'] = data.G.shift(1); extra.append('prev_G')
147 |     data.ix[data.shopping_pt == 1,'prev_G'] = data.ix[data.shopping_pt==1,'G']
148 | 
149 |     # separating training & test data
150 |     test = data[data.conf.isnull()]; data = data[-data.conf.isnull()]
151 | 
152 |     # SHUFFLE THE DATASET, keeping each customer's transactions in order
153 |     if shuffle:
154 |         print "Shuffling dataset...",
155 |         np.random.seed(9); ids = np.unique(data.index.values)
156 |         rands = pd.Series(np.random.random_sample(len(ids)),index=ids)
157 |         data['rand'] = data.reset_index()['customer_ID'].map(rands).values
158 |         data.sort(['rand','shopping_pt'],inplace=1); print "DONE!"
159 | 
160 |     # convert to int due to empty values in test set
161 |     for col in conf_f: data[col] = np.array(data[col].values,dtype=np.int8)
162 | 
163 |     return data,test,con,cat,extra,conf,conf_f,encoders
164 | 
165 | 
--------------------------------------------------------------------------------
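As a small illustration of what expval() computes (a smoothed, leave-one-out style mean of the target per category, with noise added on the training rows), a toy run might look like the following; the data frame and its values are made up purely for demonstration and are not competition data:

```python
# Toy illustration of expval(): expected value of the target per category,
# computed from the training rows only (hypothetical data).
import numpy as np, pandas as pd
from utils import expval

toy = pd.DataFrame({'state': [0, 0, 0, 1, 1, 1],
                    'G_f':   [1, 2, 2, 3, 3, np.nan]})  # NaN marks a "test" row
train_mask = -toy.G_f.isnull()        # same mask style used in prepare_data()
toy['state_exp'] = expval(toy, 'state', 'G_f', train_mask)
print toy
```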