├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── majorityvote_modelselection.py ├── parallel.py ├── submission ├── majority_rfs50_5.23_shuffle_GAfix_4of7of10.csv └── majority_rfs50_5.23_shuffle_GAfix_5of9of50.csv └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | documentation/ 3 | data/*.csv 4 | *.xlsx 5 | 6 | ################# 7 | ## Eclipse 8 | ################# 9 | 10 | *.pydevproject 11 | .project 12 | .metadata 13 | bin/ 14 | tmp/ 15 | *.tmp 16 | *.bak 17 | *.swp 18 | *~.nib 19 | local.properties 20 | .classpath 21 | .settings/ 22 | .loadpath 23 | 24 | # External tool builders 25 | .externalToolBuilders/ 26 | 27 | # Locally stored "Eclipse launch configurations" 28 | *.launch 29 | 30 | # CDT-specific 31 | .cproject 32 | 33 | # PDT-specific 34 | .buildpath 35 | 36 | 37 | ################# 38 | ## Visual Studio 39 | ################# 40 | 41 | ## Ignore Visual Studio temporary files, build results, and 42 | ## files generated by popular Visual Studio add-ons. 
43 | 44 | # User-specific files 45 | *.suo 46 | *.user 47 | *.sln.docstates 48 | 49 | # Build results 50 | 51 | [Dd]ebug/ 52 | [Rr]elease/ 53 | x64/ 54 | build/ 55 | [Bb]in/ 56 | [Oo]bj/ 57 | 58 | # MSTest test Results 59 | [Tt]est[Rr]esult*/ 60 | [Bb]uild[Ll]og.* 61 | 62 | *_i.c 63 | *_p.c 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.pch 68 | *.pdb 69 | *.pgc 70 | *.pgd 71 | *.rsp 72 | *.sbr 73 | *.tlb 74 | *.tli 75 | *.tlh 76 | *.tmp 77 | *.tmp_proj 78 | *.log 79 | *.vspscc 80 | *.vssscc 81 | .builds 82 | *.pidb 83 | *.log 84 | *.scc 85 | 86 | # Visual C++ cache files 87 | ipch/ 88 | *.aps 89 | *.ncb 90 | *.opensdf 91 | *.sdf 92 | *.cachefile 93 | 94 | # Visual Studio profiler 95 | *.psess 96 | *.vsp 97 | *.vspx 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | 106 | # TeamCity is a build add-in 107 | _TeamCity* 108 | 109 | # DotCover is a Code Coverage Tool 110 | *.dotCover 111 | 112 | # NCrunch 113 | *.ncrunch* 114 | .*crunch*.local.xml 115 | 116 | # Installshield output folder 117 | [Ee]xpress/ 118 | 119 | # DocProject is a documentation generator add-in 120 | DocProject/buildhelp/ 121 | DocProject/Help/*.HxT 122 | DocProject/Help/*.HxC 123 | DocProject/Help/*.hhc 124 | DocProject/Help/*.hhk 125 | DocProject/Help/*.hhp 126 | DocProject/Help/Html2 127 | DocProject/Help/html 128 | 129 | # Click-Once directory 130 | publish/ 131 | 132 | # Publish Web Output 133 | *.Publish.xml 134 | *.pubxml 135 | 136 | # NuGet Packages Directory 137 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 138 | #packages/ 139 | 140 | # Windows Azure Build Output 141 | csx 142 | *.build.csdef 143 | 144 | # Windows Store app package directory 145 | AppPackages/ 146 | 147 | # Others 148 | sql/ 149 | *.Cache 150 | ClientBin/ 151 | [Ss]tyle[Cc]op.* 152 | ~$* 153 | *~ 154 | *.dbmdl 155 | *.[Pp]ublish.xml 156 | *.pfx 157 | *.publishsettings 158 | 159 | # RIA/Silverlight projects 160 | Generated_Code/ 161 | 162 | # Backup & report files from converting an old project file to a newer 163 | # Visual Studio version. Backup files are not needed, because we have git ;-) 164 | _UpgradeReport_Files/ 165 | Backup*/ 166 | UpgradeLog*.XML 167 | UpgradeLog*.htm 168 | 169 | # SQL Server files 170 | App_Data/*.mdf 171 | App_Data/*.ldf 172 | 173 | ############# 174 | ## Windows detritus 175 | ############# 176 | 177 | # Windows image file caches 178 | Thumbs.db 179 | ehthumbs.db 180 | 181 | # Folder config file 182 | Desktop.ini 183 | 184 | # Recycle Bin used on file shares 185 | $RECYCLE.BIN/ 186 | 187 | # Mac crap 188 | .DS_Store 189 | 190 | 191 | ############# 192 | ## Python 193 | ############# 194 | 195 | *.py[co] 196 | 197 | # Packages 198 | *.egg 199 | *.egg-info 200 | dist/ 201 | build/ 202 | eggs/ 203 | parts/ 204 | var/ 205 | sdist/ 206 | develop-eggs/ 207 | .installed.cfg 208 | 209 | # Installer logs 210 | pip-log.txt 211 | 212 | # Unit test / coverage reports 213 | .coverage 214 | .tox 215 | 216 | #Translations 217 | *.mo 218 | 219 | #Mr Developer 220 | .mr.developer.cfg 221 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2014 Alessandro Mariani 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Allstate Purchase Prediction Challenge 2 | 3 | ### Requirements 4 | Python 2.7.5 with Scikit-Learn 0.14a1, Numpy 1.8, Pandas 0.12
5 | Windows 8, Intel i5-3230M @ 2.60GHz, 16GB RAM
6 | Developed on an HP Envy 17 j100tx laptop
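If you want to confirm that your installation matches these pins before launching the full run, a quick check along these lines works (this snippet is illustrative and not part of the repository):

```python
# Quick sanity check of the pinned dependencies (Python 2 syntax, matching the repo).
import sklearn, numpy, pandas
print "scikit-learn:", sklearn.__version__   # expected around 0.14a1
print "numpy:       ", numpy.__version__     # expected around 1.8
print "pandas:      ", pandas.__version__    # expected around 0.12
```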
7 | 
8 | ### How to generate the solution
9 | Run "python majorityvote_modelselection.py" from a command prompt, or
10 | simply double-click the script on Windows. Keep an eye on memory usage,
11 | although with the default settings it should not exceed 8 GB.
12 | 
13 | ### Comments
14 | With the default settings, the script fits the models and creates a
15 | submission that scores 0.53705 on the private LB. This is the setting
16 | which, combined with the Breakfast Pirate ABCEDF combination, scored
17 | 0.53715 on the private LB and 0.54535 on the public LB. On the system
18 | described above this takes approximately 3 hours. If you are impatient,
19 | set N=10 and NS=7: it will score 0.53710 in just 30 minutes! If that is
20 | still too slow, try N=8, NS=6, params=[(30,5,23)]: it runs even faster
21 | and still scores 0.53705, the same as my best submission, although lower
22 | on the public LB. If that is still too slow, get a better computer!
23 | 
24 | The script performs the following steps (see the sketch after this list):
25 | 
26 | 1. Prepare the data (load the files, transform and clean them, and
27 | create the engineered features)
28 | 2. Fit the Random Forests
29 | 3. Predict product G
30 | 4. Select the best Random Forests given the train set accuracy
31 | 5. Do a majority vote using all N models and print the score on the
32 | cross-validation set
33 | 6. Do a majority vote using the NS selected models and print the score
34 | on the cross-validation set
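Steps 5 and 6 boil down to a per-customer vote over the class predicted by each forest, falling back to the last quoted plan when no clear majority exists. A minimal, self-contained sketch (the names and toy values here are made up for illustration; the real logic lives in majority_vote() inside majorityvote_modelselection.py):

```python
# Minimal sketch of the majority-vote step (illustrative only).
import numpy as np

def sketch_majority_vote(baseline, model_predictions):
    # model_predictions: (#samples x #models) matrix of predicted G values (1..4)
    counts = np.vstack([np.bincount(p, minlength=5) for p in model_predictions])
    majority = np.max(counts, axis=1) >= 1 + model_predictions.shape[1] // 2
    preds = baseline.copy()
    preds[majority] = np.argmax(counts[majority], axis=1)
    return preds

baseline = np.array([1, 2, 3])                       # last quoted G per customer
votes = np.array([[1, 1, 4], [2, 3, 3], [1, 2, 4]])  # three models' predictions
print sketch_majority_vote(baseline, votes)          # -> [1 3 3]
```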
35 | 
36 | Then:
37 | a. if submit is set to False, the script records the performance of each
38 | fold and continues the k-fold loop;
39 | b. if submit is set to True, it exits the loop after the first fold,
40 | makes the prediction on the test set, does a majority vote using the
41 | selected models, fixes the products according to the state rules and
42 | creates the submission file.
43 | 
44 | ### License
45 | Please refer to the LICENSE.txt file.
46 | 
--------------------------------------------------------------------------------
/majorityvote_modelselection.py:
--------------------------------------------------------------------------------
1 | # Allstate Purchase Prediction Challenge
2 | # Author: Alessandro Mariani
3 | # https://www.kaggle.com/c/allstate-purchase-prediction-challenge
4 | 
5 | '''
6 | This module trains the models, cross-validates them and makes the final prediction.
7 | '''
8 | 
9 | import pandas as pd, numpy as np
10 | import matplotlib.pyplot as plt
11 | import operator
12 | 
13 | from sklearn import cross_validation, ensemble
14 | from utils import prepare_data, concat, expval, stateFix
15 | from parallel import RandomForestsParallel
16 | from time import time
17 | 
18 | def majority_vote(baseline,model_predictions):
19 |     # given a baseline and a matrix of predictions (#samples x #models),
20 |     # return the majority prediction where at least 1+#models/2 models
21 |     # agree on the same product, otherwise return the baseline
22 |     prcnt = np.vstack([np.bincount(p,minlength=5) for p in model_predictions])
23 |     prmax = np.max(prcnt,axis=1) >= (1+(model_predictions.shape[1]/2))
24 |     preds = baseline+0; preds[prmax] = np.argmax(prcnt[prmax],axis=1)
25 |     return preds
26 | 
27 | def make_ptscores(y_true,y_pred,y_base,pt,vmask):
28 |     # measure the increase of "plan" accuracy given a prediction for the product (G)
29 |     return [np.mean(vmask[pt==ipt]&(y_true[pt==ipt] == y_pred[pt==ipt])) - np.mean(vmask[pt==ipt]&(y_true[pt==ipt] == y_base[pt==ipt])) for ipt in range(1,11)]
30 | 
31 | if __name__ == '__main__':
32 |     ############################################################################
33 |     ## SETTINGS ################################################################
34 |     # submit: if True, create a submission file and train the models for submission
35 |     # N: number of models to build
36 |     # NS: number of models to select for the majority vote
37 |     # kfold: number of k-folds to perform, if not submitting
38 |     # N_proc: number of processes to spawn, default #CPU(s)-1
39 |     # include_from_pt: minimum shopping_pt included in the data set
40 |     # verbose_selection: print all details while selecting the models
41 |     # tn: test set distribution of shopping_pt (#10-11 merged)
42 |     ############################################################################
43 |     submit = True; N = 50; NS = 9; kfold = 3; N_proc = None;
44 |     include_from_pt = 1; verbose_selection = False
45 |     tn = np.array([18943,13298,9251,6528,4203,2175,959,281,78])
46 |     ############################################################################
47 |     # Random Forest settings ###################################################
48 |     # Must be a list containing tuples of (ntree,maxfea,leafsize)
49 |     params = [(50,5,23)]
50 |     # ex. 
[(x,5,23) for x in [35,50,75]] # [(50,x,23) for x in range(4,12)] 51 | # anything you'd like to try, here is the place for the modifications 52 | ############################################################################ 53 | 54 | print "Majority vote using %i models, selecting %i\n" % (N,NS) 55 | # initialize data 56 | data,test,con,cat,extra,conf,conf_f,encoders = prepare_data() 57 | data = data[data.shopping_pt >=include_from_pt]; print "Including from shopping_pt #%i\n" % data.shopping_pt.min(), 58 | # features, target, weights (not used) 59 | X = data[con+cat+conf+extra]; y = data['G_f'] ; w = np.ones(y.shape) 60 | 61 | vmask = reduce(operator.and_,data[conf[:-1]].values.T==data[conf_f[:-1]].values.T) 62 | scores,imp,ptscores = {},{},{} 63 | for n,m,l in params: 64 | t = time(); 65 | scores[(m,l)],imp[(m,l)],ptscores[(m,l)] = [],[],[] 66 | col_trscores,col_cvscores = [],[] 67 | 68 | # initialize the ensemble of forests to run in parallel 69 | # class is also structured to handle single-process 70 | rfs = RandomForestsParallel(N, n, m, l, N_proc) 71 | 72 | # cross validation is use to find the best parameters 73 | for ifold,(itr,icv) in enumerate(cross_validation.KFold(len(y),kfold,indices=False)): 74 | if submit: 75 | # just a lame way to re-using the same code for fitting & selecting when submitting :) 76 | itr = np.ones(y.shape,dtype=bool); icv = -itr 77 | print "\nHEY! CREATING SUBMISSION!\n" 78 | else: 79 | # redo expected value for the current training & cv set 80 | for c in [x for x in X.columns if x[-4:] == '_exp']: 81 | X[c] = expval(data,c[:-4],'G_f',itr) 82 | 83 | # fits the random forests at the same time 84 | rfs.fit(X[itr],y[itr],w[itr]) 85 | 86 | print "predicting...", 87 | allpreds = rfs.predict(X) 88 | rftscores = [] 89 | print "selecting models..." 90 | for irf in range(len(rfs.rfs)): 91 | # SELECTION of the best random forest, even though probably 92 | # is just getting rid of very unlucky seeds ... 93 | pG = allpreds[:,irf]; ipt2 = data.shopping_pt > 1 94 | ptscore = make_ptscores(y[icv],pG[icv],data.G[icv],data.shopping_pt[icv],vmask[icv]) 95 | tptscore = make_ptscores(y[itr],pG[itr],data.G[itr],data.shopping_pt[itr],vmask[itr]) 96 | rftscores.append((tn.dot(tptscore[1:]),irf)) 97 | print "%i,%i %.5f %.5f %.5f %.5f" % ( 98 | ifold,irf, 99 | np.mean(pG[itr]==y[itr]),np.mean(vmask[itr]&(pG[itr]==y[itr])), 100 | np.mean(pG[ipt2&itr]==y[ipt2&itr]),np.mean(vmask[ipt2&itr]&(pG[ipt2&itr]==y[ipt2&itr]))), 101 | if verbose_selection: 102 | print " ".join(["%.5f" %pts for pts in ptscore]), 103 | print " ".join(["%.5f" %pts for pts in tptscore]), 104 | print "%.2f %.2f" %(tn.dot(tptscore[1:]),tn.dot(ptscore[1:])) 105 | 106 | # select the best models for the majority vote 107 | rftscores.sort(reverse=1); selected = [x[1] for x in rftscores[:NS]] 108 | 109 | print "counting votes..." 
110 | # print also the score using all the models 111 | pG = majority_vote(data.G,allpreds) 112 | ptscore = make_ptscores(y[icv],pG[icv],data.G[icv],data.shopping_pt[icv],vmask[icv]) 113 | # ifold,a : majority vote score using all models 114 | print str(ifold)+",a "+" ".join(["%.5f" %pts for pts in ptscore])+" %.2f" % tn.dot(ptscore[1:]) 115 | 116 | # results for selected models 117 | pG = majority_vote(data.G,allpreds[:,selected]) 118 | ptscore = make_ptscores(y[icv],pG[icv],data.G[icv],data.shopping_pt[icv],vmask[icv]) 119 | # ifold,s : majority vote score using selected models 120 | print str(ifold)+",s "+" ".join(["%.5f" %pts for pts in ptscore])+" %.2f" % tn.dot(ptscore[1:]) 121 | 122 | # append features importances & scores 123 | col_trscores.append(np.mean(pG[itr]==y[itr])) # append train score 124 | col_cvscores.append(np.mean(pG[icv]==y[icv])) # append cv score 125 | imp[(m,l)].append(rfs.impf) 126 | scores[(m,l)].append(tn.dot(ptscore[1:])) 127 | ptscores[(m,l)].append(ptscore) 128 | 129 | # skip any following fold if we're submitting 130 | if submit: break 131 | 132 | print "%i %i %i\t %.2f %.2f %.4f %.4f %.2f - %.2fm" % ( 133 | n,m,l, 134 | np.mean(scores[(m,l)]), np.std(scores[(m,l)]), # for best params & variance 135 | np.mean(col_trscores), np.mean(col_cvscores), # use x diagnostic training set overfit 136 | tn.dot(np.mean(ptscores[(m,l)],axis=0)[1:]), # score 137 | (time()-t)/60), # k-fold time 138 | print " ".join(["%.5f" %pts for pts in np.mean(ptscores[(m,l)],axis=0)]), 139 | print " ".join(["%.5f" %pts for pts in np.std(ptscores[(m,l)],axis=0)]) 140 | 141 | if submit: 142 | # MAKE SUBMISSION 143 | # very complicated way to keep only the latest shopping_pt for each customer just to have everything in one row!!!!!11 144 | test = test[test.shopping_pt == test.reset_index().customer_ID.map(test.reset_index().groupby('customer_ID').shopping_pt.max())] 145 | Xt = test[con+cat+conf+extra] 146 | 147 | # TEST SET PREDICTION 148 | print "now predicting on test set...", 149 | allpreds = rfs.predict(Xt) 150 | test['pG'] = majority_vote(test.G,allpreds[:,selected]); print "done" 151 | 152 | # Fix state law products, then concatenate to string 153 | stateFix(encoders,test,['C','D','pG'],1) 154 | test['plan'] = concat(test,['A','B','C','D','E','F','pG']) 155 | test['plan'].to_csv('submission\\majority_rfs%i_%i.%i_shuffle_GAfix_%iof%iof%i.csv' % ( 156 | n,m,l,NS/2+1,NS,N),header=1) 157 | 158 | # features importances 159 | impf = rfs.impf; impf.sort() 160 | 161 | 162 | -------------------------------------------------------------------------------- /parallel.py: -------------------------------------------------------------------------------- 1 | # Allstate Purchase Prediction Challenge 2 | # Author: Alessandro Mariani 3 | # https://www.kaggle.com/c/allstate-purchase-prediction-challenge 4 | 5 | ''' 6 | RandomForestParallel: is just a "fancy" class which will help 7 | optimize memory usage while fitting several random forest 8 | on the same machine. It implements fit() and predict() as 9 | for the scikit-learn convention. 
10 | ''' 11 | 12 | from time import time 13 | from sklearn import ensemble 14 | 15 | import multiprocessing, operator 16 | import pandas as pd, numpy as np 17 | 18 | ## Pickle FIX for multiprocessing using bound methods 19 | ## http://stackoverflow.com/questions/1816958/cant-pickle-type-instancemethod-when-using-pythons-multiprocessing-pool-ma/ 20 | from copy_reg import pickle 21 | from types import MethodType 22 | 23 | def _pickle_method(method): 24 | func_name = method.im_func.__name__ 25 | obj = method.im_self 26 | cls = method.im_class 27 | return _unpickle_method, (func_name, obj, cls) 28 | 29 | def _unpickle_method(func_name, obj, cls): 30 | for cls in cls.mro(): 31 | try: 32 | func = cls.__dict__[func_name] 33 | except KeyError: 34 | pass 35 | else: 36 | break 37 | return func.__get__(obj, cls) 38 | 39 | class RandomForestsParallel(object): 40 | # class used to fit & predict in parallel minimizing memory usage 41 | rfs = [] 42 | def __init__(self,N,ntree,maxfea,leafsize,N_proc=None): 43 | self.N = N 44 | self.ntree = ntree; self.maxfea = maxfea; self.leafsize = leafsize 45 | self.N_proc = N_proc if N_proc is not None else max(1,multiprocessing.cpu_count()-1) 46 | 47 | # fix pickling when using bound methods in classes 48 | pickle(MethodType, _pickle_method, _pickle_method) 49 | 50 | def _parallel_fit(self, rf): 51 | t = time() 52 | return rf.fit(self.X,self.y,self.w), (time()-t)/60. 53 | 54 | def _parallel_predict(self, rf): 55 | return rf.predict(self.X) 56 | 57 | def fit(self,X,y,w=None): 58 | # fit N random forest in parallel 59 | self.rfs = []; self.X = X; self.y = y 60 | self.w = np.ones(y.shape,dtype=bool) if w is None else w 61 | print "fitting %i RFs using %i processes..." % (self.N,self.N_proc), 62 | 63 | args = [ensemble.RandomForestClassifier( 64 | n_estimators=self.ntree, max_features=self.maxfea, 65 | min_samples_leaf=self.leafsize,random_state=irf, 66 | compute_importances=1) for irf in range(self.N)] 67 | 68 | if self.N_proc > 1: 69 | pool = multiprocessing.Pool(self.N_proc) 70 | for i,(rf,irft) in enumerate(pool.imap(self._parallel_fit,args)): 71 | self.rfs.append(rf); print "rf#%i %.2fm" % (i,irft), 72 | pool.terminate() 73 | else: 74 | for i,rf in enumerate(args): 75 | rf,irft = self._parallel_fit(rf) 76 | self.rfs.append(rf); print "rf#%i %.2fm" % (i,irft), 77 | 78 | del self.X,self.y,self.w 79 | # set the importances of the features 80 | self.impf = self._calculate_impf(X.columns) 81 | 82 | return self 83 | 84 | def predict(self,X,single_process=True): 85 | # predict using all the random forest in self.rfs 86 | # single_process is set by default, as multiprocess predict is not 87 | # memory efficient and sometime time efficient (efficient smaller N) 88 | self.X = X 89 | if (not single_process) & (self.N_proc > 1): 90 | pool = multiprocessing.Pool(self.N_proc) 91 | allpreds = np.array([p for p in pool.imap(self._parallel_predict,self.rfs)]).T 92 | pool.terminate() 93 | else: 94 | allpreds = np.array([self._parallel_predict(rf) for rf in self.rfs]).T 95 | 96 | del self.X 97 | 98 | return allpreds 99 | 100 | def _calculate_impf(self, feature_names): 101 | # private method to calculate the average features importance 102 | return pd.Series(reduce(operator.add,[rf.feature_importances_ for rf in self.rfs]) / self.N, feature_names) 103 | 104 | def __repr__(self): 105 | return "N:%i N_proc:%i ntree:%i maxfea:%i leafsize:%i fitted:%s" % ( 106 | self.N, self.N_proc, self.ntree,self.maxfea, 107 | self.leafsize, 'Yes' if len(self.rfs) > 0 else 'No') 108 | 109 | 
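For reference, a minimal usage sketch of RandomForestsParallel with toy data (the parameter values and the toy frame are illustrative, assuming the Python 2 / scikit-learn 0.14 environment from the README; the __main__ guard matters because the class spawns worker processes):

```python
# Toy usage of RandomForestsParallel: 4 forests, 10 trees each (illustrative values).
import numpy as np, pandas as pd
from parallel import RandomForestsParallel

if __name__ == '__main__':
    X = pd.DataFrame(np.random.rand(200, 5), columns=list('abcde'))
    y = pd.Series(np.random.randint(1, 5, 200))   # fake "G" labels, values 1..4
    rfs = RandomForestsParallel(N=4, ntree=10, maxfea=3, leafsize=5, N_proc=2)
    rfs.fit(X, y)              # fits the 4 forests across 2 worker processes
    allpreds = rfs.predict(X)  # (#samples x #models) matrix of predictions
    print allpreds.shape       # e.g. (200, 4)
    print rfs.impf             # average feature importances across the forests
```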
-------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Allstate Purchase Prediction Challenge 2 | # Author: Alessandro Mariani 3 | # https://www.kaggle.com/c/allstate-purchase-prediction-challenge 4 | 5 | ''' 6 | This module cointains the data preparation and utilities 7 | ''' 8 | 9 | from time import time 10 | from itertools import combinations 11 | from sklearn import preprocessing 12 | 13 | import scipy as sp, numpy as np, pandas as pd 14 | 15 | # Cantor Pairing 16 | def cantor(args): 17 | # Cantor Pairing - recursive call if more than 1 pair 18 | if len(args) > 2: 19 | x2 = cantor(args[1:]) 20 | x1 = args[0] 21 | else: 22 | x1, x2 = args 23 | return int((0.5 * (x1 + x2)*(x1 + x2 + 1) + x2)) 24 | 25 | # Groups all columns of data into combinations of [degree] 26 | def group_data(data, degree=3, hash=hash, NAMES=None): 27 | init = time() 28 | new_data = []; combined_names = [] 29 | m,n = data.shape 30 | for indicies in combinations(range(n), degree): 31 | new_data.append([hash(tuple(v)) for v in data[:,indicies]]) 32 | if NAMES != None: 33 | combined_names.append( '+'.join([NAMES[indicies[i]] for i in range(degree)]) ) 34 | print "DONE! %.2fm" % ((time()-init)/60) 35 | if NAMES != None: 36 | return (np.array(new_data).T, combined_names) 37 | return np.array(new_data).T 38 | 39 | # Return concatenated fields in a dataframe 40 | # [1,2,3,4,5,6] => '123456' 41 | def concat(df, columns): 42 | return np.array([''.join(x) for x in np.array( 43 | [np.array(df[col].values, dtype=str) for col in columns]).T]) 44 | 45 | # Breakfast Pirate Awesome State trick + some additions 46 | def stateFix(encoders,df,c=['C','D','G'],verbose=False): 47 | # GA 48 | iGA = df.state == encoders['state'].transform(['GA'])[0] 49 | ifix = iGA&(df[c[0]]==1); df.ix[ifix,c[0]] = 2; nga1 = np.sum(ifix) #C 50 | ifix = iGA&(df[c[1]]==1); df.ix[ifix,c[1]] = 2; nga2 = np.sum(ifix) #D 51 | # FL 52 | iFL = df.state == encoders['state'].transform(['FL'])[0] 53 | ifix = iFL&(df[c[2]]<=2); df.ix[ifix,c[2]] = 3; nfl1 = np.sum(ifix) #G 54 | # OH 55 | iOH = df.state == encoders['state'].transform(['OH'])[0] 56 | ifix = iOH&(df[c[2]]==1); df.ix[ifix,c[2]] = 2; noh1 = np.sum(ifix) #G 57 | # ND 58 | iND = df.state == encoders['state'].transform(['ND'])[0] 59 | ifix = iND&(df[c[2]]!=2); df.ix[ifix,c[2]] = 2; nnd1 = np.sum(ifix) #G 60 | # SD 61 | iSD = df.state == encoders['state'].transform(['SD'])[0] 62 | ifix = iSD&(df[c[2]]!=2); df.ix[ifix,c[2]] = 2; nsd1 = np.sum(ifix) #G 63 | if verbose: 64 | print "Fixed state law products. 
GA1:%i GA2:%i FL1:%i OH1:%i ND1:%i SD1:%i" %( 65 | nga1, nga2, nfl1, noh1, nnd1, nsd1) 66 | 67 | # Target variable expected value given a categorical feature 68 | def expval(df,col,y,tfilter): 69 | tmp = pd.DataFrame(index=df.index) 70 | pb = df[tfilter][y].mean() # train set mean 71 | tmp['cnt'] = df[col].map(df[tfilter][col].value_counts()).fillna(0) # train set count 72 | tmp['csm'] = df[col].map(df[tfilter].groupby(col)[y].sum()).fillna(pb) # train set sum 73 | tmp.ix[tfilter,'cnt'] -= 1 # reduce count for train set 74 | tmp.ix[tfilter,'csm'] -= df.ix[tfilter,y] # remove current value 75 | tmp['exp'] = ((tmp.csm+ pb*15) / (tmp.cnt+ 15)).fillna(pb) # calculate mean including kn-extra 'average' samples 76 | np.random.seed(1) 77 | tmp.ix[tfilter,'exp'] *= 1+.3*(np.random.rand(len(tmp[tfilter]))-.5) # add some random noise to the train set 78 | return tmp.exp 79 | 80 | def prepare_data(shuffle=True): 81 | alltest = pd.read_csv('data\\test_v2.csv') 82 | test = alltest.set_index('customer_ID') 83 | alldata = pd.read_csv('data\\train.csv').set_index('customer_ID') 84 | 85 | # handy lists of features 86 | con = ['group_size','car_age','age_oldest','age_youngest','duration_previous','cost'] 87 | cat = ['homeowner','car_value','risk_factor','married_couple','C_previous','state', 'location','shopping_pt'] 88 | conf = ['A','B','C','D','E','F','G']; conf_f = [col+'_f' for col in conf] 89 | extra = [] 90 | 91 | final_purchase = alldata[alldata.record_type == 1] # final purchase 92 | data = alldata.join(final_purchase[conf], rsuffix='_f') # creating training dataset with target features 93 | data = data[data.record_type == 0] # removing final purchase 94 | 95 | data['conf'] = concat(data,conf_f) # handy purchase plan 96 | data['conf_init'] = concat(data,conf) # handy last quoted plan 97 | 98 | encoders = dict() 99 | data = data.append(test) 100 | 101 | # Fix NAs 102 | data['C_previous'].fillna(0, inplace=1) 103 | data['duration_previous'].fillna(0, inplace=1) 104 | data.location.fillna(-1, inplace=1); 105 | # Transform data to numerical data 106 | for col in ['car_value','risk_factor','state']: 107 | encoders[col] = preprocessing.LabelEncoder() 108 | data[col] = encoders[col].fit_transform(data[col].fillna(99)) 109 | 110 | print 'Location substitution:', 111 | ## get rid of very location, given the total count from train,cv and test set 112 | x = data[data.shopping_pt==2].location.value_counts() 113 | sub = data.location.map(x).fillna(0) < 5 114 | data.ix[sub,'location'] = data.state[sub]; print '%.5f' % sub.mean() 115 | 116 | # cost per car_age; cost per person; cost per state 117 | data['caCost'] = 1.*data.cost / (data.car_age+1) 118 | data['ppCost'] = 1.*data.cost / data.group_size 119 | data['stCost'] = data.state.map(data.groupby('state')['cost'].mean()) 120 | extra.extend(['caCost','ppCost','stCost']) 121 | 122 | # average quote cost by G values 123 | data['costG'] = data['G'].map(data.groupby('G')['cost'].mean()) 124 | extra.append('costG') 125 | 126 | # average quote cost by G & state values 127 | x = data.groupby(['G','state'])['cost'].mean() 128 | x = x.reset_index().set_index(['G','state']); x.columns = ['costStG'] # covert to DF 129 | data = data.merge(x,left_on=['G','state'],right_index=True,how='left') 130 | extra.append('costStG') 131 | 132 | # two way intersactino between state, G and shopping_pt 133 | print "Grouping few 2-way interactions...", 134 | grpTrn, c2 = group_data(data[['state','G','shopping_pt']].values,2,hash,['state','G','shopping_pt']) 135 | for i,col in enumerate(c2): 
136 |         encoders[col] = preprocessing.LabelEncoder()
137 |         data[col] = encoders[col].fit_transform(grpTrn[:,i])
138 |     extra.extend(c2)
139 | 
140 |     # expected value (arithmetic average) of G by state & location
141 |     for col in ['state','location']:
142 |         extra.append(col+'_exp')
143 |         data[col+'_exp'] = expval(data,col,'G_f',-data.G_f.isnull())
144 | 
145 |     # previous G
146 |     data['prev_G'] = data.G.shift(1); extra.append('prev_G')
147 |     data.ix[data.shopping_pt == 1,'prev_G'] = data.ix[data.shopping_pt==1,'G']
148 | 
149 |     # separating training & test data
150 |     test = data[data.conf.isnull()]; data = data[-data.conf.isnull()]
151 | 
152 |     # SHUFFLE THE DATASET, keeping each customer's transactions in order
153 |     if shuffle:
154 |         print "Shuffling dataset...",
155 |         np.random.seed(9); ids = np.unique(data.index.values)
156 |         rands = pd.Series(np.random.random_sample(len(ids)),index=ids)
157 |         data['rand'] = data.reset_index()['customer_ID'].map(rands).values
158 |         data.sort(['rand','shopping_pt'],inplace=1); print "DONE!"
159 | 
160 |     # convert to int due to empty values in test set
161 |     for col in conf_f: data[col] = np.array(data[col].values,dtype=np.int8)
162 | 
163 |     return data,test,con,cat,extra,conf,conf_f,encoders
164 | 
165 | 
--------------------------------------------------------------------------------
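As a small illustration of what expval() computes (a smoothed, leave-one-out style mean of the target per category, with noise added on the training rows), a toy run might look like the following; the data frame and its values are made up purely for demonstration and are not competition data:

```python
# Toy illustration of expval(): expected value of the target per category,
# computed from the training rows only (hypothetical data).
import numpy as np, pandas as pd
from utils import expval

toy = pd.DataFrame({'state': [0, 0, 0, 1, 1, 1],
                    'G_f':   [1, 2, 2, 3, 3, np.nan]})  # NaN marks a "test" row
train_mask = -toy.G_f.isnull()        # same mask style used in prepare_data()
toy['state_exp'] = expval(toy, 'state', 'G_f', train_mask)
print toy
```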