├── .gitignore
├── README.md
├── model.py
├── preprocess.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
data/
drivers/
logs/
submissions/
.idea
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## AXA - Driver Telematics Analysis

### Ideas
Train one random forest model per driver in the data set.

* Rather than treating each trip as a single instance, divide the trip
  into equal parts (default=4), then average the per-part probabilities
  for each trip at the very end.
* Create features over several diff windows.
* Each model is trained on all of the driver's trips plus roughly five
  times as many trips randomly sampled from all other drivers (see the
  sketch below).

### How to generate the solution
Just run "python model.py" and you're good to go! First you'll need to
unpack the drivers zip into a /drivers folder and create /data and
/submissions folders so the script doesn't fail.

### Settings in __main__
* n_jobs: number of worker processes to spawn
* use_cache: reuse a previously preprocessed data file
* n_drivers: number of drivers to process and save to file
* windows: diff windows used to calculate features (1, 15, 30, 60 seconds)
* part: number of parts to split each trip into
* n_quantiles: number of quantiles to compute per statistic
* size: if not None, split trips into fixed-size parts rather than equal parts

--------------------------------------------------------------------------------
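A minimal sketch of the per-driver setup described above. This is a hypothetical helper, not part of the repo; it mirrors the sampling done in model.py's make_driver_prediction:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def train_for_driver(data, driverid, seed=0):
    """Hypothetical helper: one binary classifier for one driver."""
    np.random.seed(seed)
    dmask = data['driverid'].values == driverid
    # keep each other-driver row with probability ~5*positives/len(data),
    # so negatives outnumber positives roughly 5:1 in expectation
    mask = dmask | (np.random.sample(len(data)) < dmask.sum() * 5. / len(data))
    X = data[mask].drop(['driverid', 'tripid'], axis=1)
    y = data.loc[mask, 'driverid'] == driverid
    rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=3, random_state=1)
    return rf.fit(X, y)
```
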
/model.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn import preprocessing, ensemble, cross_validation, linear_model, metrics, decomposition, cluster
from preprocess import prepare_data
from multiprocessing import Pool, Manager
from utils import logger, kf_score, kf_score_impf, feature_selection, feature_removal, remove_outliers, gzip_submission
from utils import MyRidge, MyRidgeCV, introduce_outliers, remove_worst, parse_submission, low_memory_read_csv
from time import time
import datetime
import sys

""" multiprocessing handler: assumes globals data, part, qlog """
def make_driver_prediction(driverid):
    # mask: the driver's own trips plus ~5x as many random trips from other drivers
    np.random.seed(driverid); dmask = (data['driverid'].values==driverid)
    mask = dmask | (np.random.sample(len(data)) < (dmask.sum()*5./len(data)))
    X = data[mask].copy(); X['target'] = X['driverid'] == driverid
    X = X.set_index(['driverid','tripid']); y = X['target']; X = X.drop('target',1)

    # initialize model
    rf = ensemble.RandomForestClassifier(random_state=1, n_estimators=500, n_jobs=1, min_samples_leaf=3)

    selected_feas = X.columns; X = X[selected_feas]
    y_original = y.copy(); X_original = X.copy()
    # logging
    qlog.put('%i,"%s",%i,%i' %(driverid,selected_feas,rf.n_estimators,rf.min_samples_leaf))

    predict_with_kfold = False
    if predict_with_kfold:
        # out-of-fold probabilities for the driver's own trips
        proba = y_original[y_original].copy(); proba.name = 'prob'
        for itr,icv in cross_validation.KFold(len(X),10,shuffle=True,random_state=7):
            Xitr, Xicv = X.iloc[itr], X.iloc[icv]
            yitr, yicv, yicv_original = y.iloc[itr], y.iloc[icv], y_original.iloc[icv]
            proba[yicv[yicv_original].index.values] = rf.fit(Xitr,yitr).predict_proba(Xicv[yicv_original.values])[:,1]
    else:
        # fit on everything, then predict the driver's own trips
        proba = pd.DataFrame(rf.fit(X,y).predict_proba(X_original[y_original])[:,1],index=X_original[y_original].index,columns=['prob'])

    return proba.reset_index()

logfile = datetime.datetime.now().strftime("logs/%Y%m%d_%H%M")+".log"

if __name__ == '__main__':
    # NB: the flag is called use_logger so it does not shadow utils.logger
    n_jobs = 4; use_cache = False; use_logger = False

    if not use_cache:
        n_drivers = 10000
        n_jobs = 4; windows = [1,15,30,60]
        part = 4; n_quantiles = 15
        size = None
        fname = "data/processed_part%i_q%s_%s.csv"%(part,n_quantiles,'w'.join([str(w) for w in ['']+windows]))
        data = prepare_data(n_drivers,windows,n_quantiles,part,size,n_jobs)
        data.to_csv(fname)
    else:
        # use cache
        t = time(); print "Loading cache...",
        data = pd.DataFrame.from_csv("data/processed.csv")
        print "DONE! %.2fm" % ((time()-t)/60.)

    # iterations at which to print an estimated time of arrival
    eta_iteration = (np.array([2,3,4,5,10,50,100])*n_jobs).tolist() + (np.array(range(200,3000,100))*n_jobs).tolist()
    qlog = Manager().Queue()  # created up front so single-process runs can log too
    probas = []; t = time(); print "Predicting... estimated time:",
    if n_jobs > 1:
        # initialize pool (plus one slot for the logger process, if used)
        pool = Pool(n_jobs+(1 if use_logger else 0))
        if use_logger:
            rlog = pool.apply_async(logger, [(logfile,qlog)]); qlog.put("driverid,feas,ntree,nleaf")
        for i,proba in enumerate(pool.imap(make_driver_prediction,data['driverid'].unique())):
            probas.append(proba)
            if i in eta_iteration[4:]:
                print "%.2fm (%.2fm)," % (((time()-t) / i * (data.driverid.nunique()-i+1) / 60.),(time()-t)/60.),
                sys.stdout.flush()
        qlog.put('kill'); pool.terminate()
    else:
        for i,driverid in enumerate(data['driverid'].unique()):
            probas.append(make_driver_prediction(driverid))
            if i in eta_iteration:
                print "%.2fm (%.2fm)," % (((time()-t) / i * (data.driverid.nunique()-i+1) / 60.),(time()-t)/60.),
                sys.stdout.flush()
    print "DONE! %.2fm" % ((time()-t)/60.)

    print "Creating file...",
    submission_name = "submissions/%s.csv" % datetime.datetime.now().strftime("%Y%m%d_%H%M")
    submission = pd.concat(probas)[['driverid','tripid','prob']]
    submission['driver_trip'] = submission.apply(lambda x: "%i_%i"%(x['driverid'],x['tripid']),1)
    # each trip appears `part` times; averaging collapses them to one probability
    submission.groupby('driver_trip')[['prob']].mean().to_csv(submission_name, header=True)
    print "compressing..."
    gzip_submission(submission_name)
    print "DONE! %.2fm" % ((time()-t)/60.)
--------------------------------------------------------------------------------
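Because each trip is split into `part` chunks, the concatenated predictions contain several rows per driver_trip; the final groupby-mean collapses them into one probability per trip. A toy illustration with made-up numbers:

```python
import pandas as pd

# two trips, each predicted in two parts (made-up probabilities)
sub = pd.DataFrame({'driver_trip': ['1_1', '1_1', '1_2', '1_2'],
                    'prob':        [0.9,   0.7,   0.2,   0.4]})
print(sub.groupby('driver_trip')[['prob']].mean())
# driver_trip 1_1 -> 0.8, 1_2 -> 0.3
```
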
/preprocess.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import random
from time import time
from sklearn.linear_model import Ridge
from sklearn import feature_selection, preprocessing, ensemble, cross_validation, linear_model, metrics
from pylab import cm
from multiprocessing import Pool
import sys

driverids = os.walk('drivers').next()[1]

def driver_trips(driverid,windows=[1],n_quantiles=5,part=1,size=None):
    tripfeas = []
    windows = windows if isinstance(windows,list) else [windows]
    for i,tripid in enumerate(os.listdir('drivers/%s' % driverid)):
        trip = pd.read_csv('drivers/%s/%s' % (driverid,tripid))
        tripfea = pd.concat([trip_feature(trip,window,n_quantiles,part,size) for window in windows],axis=1)
        tripfea = tripfea.astype(np.float32)
        tripfea['driverid'] = int(driverid); tripfea['tripid'] = int(tripid.replace('.csv',''))
        tripfea['len'] = len(trip)
        tripfeas.append(tripfea.fillna(0))
    return pd.concat(tripfeas)

def trip_feature(trip,window=1,n_quantiles=5,part=1,size=None):
    diff = trip.diff(window)
    distance = (diff**2).sum(axis=1)**0.5
    acceleration = distance.diff()
    heading = np.arctan2(diff.x,diff.y).diff().abs()
    cacceleration = heading*acceleration
    cvelocity = heading*distance
    bounds = [((len(trip)/part)*(ip-1),(len(trip)/part)*ip) for ip in range(1,part+1)]
    if size is not None:
        size = min(len(trip)-window,size)
        # override the equal-parts bounds with fixed-size windows
        bounds = [(window+i*size,window+(i+1)*size) for i in range((len(trip)-window)/size)]
    rows = []
    for ilower,iupper in bounds:
        # speeds
        tspeeds = [distance[ilower:iupper].quantile(q) for q in np.linspace(0,1,n_quantiles)]
        name_tspeeds = ['speed_q%.2f'%q for q in np.linspace(0,1,n_quantiles)]
        # acceleration
        tacc = [acceleration[ilower:iupper].quantile(q) for q in np.linspace(0,1,n_quantiles)]
        name_tacc = ['acc_q%.2f'%q for q in np.linspace(0,1,n_quantiles)]
        # heading change
        headings = [heading[ilower:iupper].quantile(q) for q in np.linspace(0,1,n_quantiles)]
        name_headings = ['headchange_q%.2f'%q for q in np.linspace(0,1,n_quantiles)]
        # centripetal acceleration (heading change * acceleration)
        caccelerations = [cacceleration[ilower:iupper].quantile(q) for q in np.linspace(0,1,n_quantiles)]
        name_caccelerations = ['headacc_q%.2f'%q for q in np.linspace(0,1,n_quantiles)]
        # as above, but with velocity
        cvelocitys = [cvelocity[ilower:iupper].quantile(q) for q in np.linspace(0,1,n_quantiles)]
        name_cvelocitys = ['headvel_q%.2f'%q for q in np.linspace(0,1,n_quantiles)]
        #TODO stops (inside acceleration? - (distance<1).mean())
        #TODO include standard deviation in the statistics?
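        # each (window, part) pair yields 5*n_quantiles features; with the
        # defaults in model.py's __main__ (windows=[1,15,30,60], n_quantiles=15)
        # every part of a trip is described by 4*5*15 = 300 quantile columns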
        # append one row of features for this part
        rows.append(tspeeds + tacc + headings + caccelerations + cvelocitys)

    tripfea = pd.DataFrame(
        np.vstack(rows),
        columns = name_tspeeds + name_tacc + name_headings + name_caccelerations + name_cvelocitys
    ); tripfea.columns = ['w%i_'%window+col for col in tripfea.columns]
    return tripfea

def pool_driver_trips_handler(args):
    driverid,windows,n_quantiles,part,size = args
    return driver_trips(driverid,windows,n_quantiles,part,size)

def prepare_data(n_drivers, windows=[1], n_quantiles=9, part=1, size=None, n_jobs=1, seed=7):
    t = time(); random.seed(seed); n_drivers = min(n_drivers,len(driverids))
    print "Processing data | window %s | quantiles %i | (%i jobs) | ETA:" % (windows,n_quantiles,n_jobs),
    # draw a random sample of drivers (when subsampling)
    sample = [ driverids[i] for i in random.sample(xrange(len(driverids)), min(n_drivers,len(driverids))) ]
    # iterations at which to print an estimated time of arrival
    eta_iteration = (np.array([2,3,4,5,10,50,100])*n_jobs).tolist() + (np.array(range(200,3000,100))*n_jobs).tolist()
    # multiprocess support
    if n_jobs > 1:
        pool = Pool(n_jobs); dfs = []; t = time()
        for i,df in enumerate(pool.imap(pool_driver_trips_handler, [(driverid,windows,n_quantiles,part,size) for driverid in sample])):
            dfs.append(df)
            if i in eta_iteration[4:]:
                print "%.2fm (%.2fm)," % (((time()-t) / i * (len(sample)-i+1) / 60.),(time()-t)/60.),
                sys.stdout.flush()
        # terminate pool
        pool.terminate()
    else:
        dfs = map(lambda x: driver_trips(x,windows,n_quantiles,part,size), sample)
    print "DONE! %.2fm" % ((time()-t)/60.)
    return pd.concat(dfs)

if __name__ == '__main__':
    # settings
    n_drivers = 10000
    n_jobs = 4; windows = [1,15,30,60] # [1,5,10,15,30,45,60]
    part = 6; n_quantiles = 15
    size = None

    fname = "data/processed_part%i_q%s_%s.csv"%(part,n_quantiles,'w'.join([str(w) for w in ['']+windows]))
    if size is not None:
        print "Using 'size'!"
        fname = "data/processed_size%i_q%s_%s.csv"%(size,n_quantiles,'w'.join([str(w) for w in ['']+windows]))

    prepare_data(n_drivers,windows,n_quantiles,part,size,n_jobs).to_csv(fname)
--------------------------------------------------------------------------------
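How trip_feature slices a trip, worked through for a hypothetical 800-point trip (`//` stands in for the integer division the Python 2 source gets from `/`):

```python
n, part, window = 800, 4, 1  # hypothetical trip length, default settings

# equal parts: four slices of 200 points each
bounds = [((n // part) * (ip - 1), (n // part) * ip) for ip in range(1, part + 1)]
assert bounds == [(0, 200), (200, 400), (400, 600), (600, 800)]

# fixed-size parts (size=300): as many full 300-point slices as fit
size = min(n - window, 300)
bounds = [(window + i * size, window + (i + 1) * size) for i in range((n - window) // size)]
assert bounds == [(1, 301), (301, 601)]
```
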
/utils.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np

from time import sleep
from sklearn import cross_validation,metrics,cluster,decomposition,preprocessing,linear_model
from sklearn.feature_selection import f_classif
import gzip

def introduce_outliers(y,n=1,seed=10):
    # flip n random negative labels to True (to test robustness to label noise)
    y_copy = y.copy().reset_index(); np.random.seed(seed)
    y_copy['r'] = np.random.sample(len(y)); y_copy = y_copy.sort('r')
    y.loc[y_copy[~y_copy['target']][:n].set_index(['driverid','tripid']).index] = True
    return y

def low_memory_read_csv(path_fname):
    # low-memory dataframe: force numerical fields to 32 rather than 64 bits
    data_specs = pd.read_csv(path_fname,nrows=1000).dtypes.to_dict()
    for key in data_specs.keys():
        if data_specs[key] == np.int64:
            data_specs[key] = np.int32
        if data_specs[key] == np.float64:
            data_specs[key] = np.float32
    data = pd.read_csv(path_fname,dtype=data_specs)
    # remove the index column if present from pd.DataFrame.to_csv()
    if 'Unnamed: 0' in data.columns: data = data.drop('Unnamed: 0',1)
    return data

def remove_worst(submission,X,y,driverid,n_instances):
    # drop the driver's n_instances lowest-probability trips from X and y
    to_remove = submission.query('driverid==%i'%driverid)[['prob']]
    index_to_remove = to_remove.sort('prob').head(n_instances).index
    return X.drop(index_to_remove),y.drop(index_to_remove)

def parse_submission(nsubmission):
    submission = pd.DataFrame.from_csv('submissions/'+nsubmission).reset_index()
    submission['driverid'] = submission['driver_trip'].apply(lambda x: int(x.split('_')[0])).values
    submission['tripid'] = submission['driver_trip'].apply(lambda x: int(x.split('_')[1])).values
    return submission.drop(['driver_trip'],1).set_index(['driverid','tripid'])

def kf_score_impf(model,X,y,n_fold=3,mask=None,seed=7):
    # k-fold train/cv AUC plus feature importances; mask filters training rows only
    trscore,cvscore,impf = [],[],[]
    mask = np.array([True]*len(X)) if mask is None else mask
    for itr,icv in cross_validation.KFold(len(X),n_fold,shuffle=True,random_state=seed):
        Xitr, Xicv = X.values[itr[mask[itr]]], X.values[icv]
        yitr, yicv = y.values[itr[mask[itr]]], y.values[icv]
        model = model.fit(Xitr,yitr)
        trscore.append( metrics.roc_auc_score(yitr, model.predict_proba(Xitr)[:,1]) )
        cvscore.append( metrics.roc_auc_score(yicv, model.predict_proba(Xicv)[:,1]) )
        impf.append( model.feature_importances_ )
    return trscore, cvscore, impf

def kf_score(model,X,y,n_fold=3,mask=None,seed=7):
    # as kf_score_impf, but for models without feature_importances_
    trscore,cvscore = [],[]
    mask = np.array([True]*len(X)) if mask is None else mask
    for itr,icv in cross_validation.KFold(len(X),n_fold,shuffle=True,random_state=seed):
        Xitr, Xicv = X.values[itr[mask[itr]]], X.values[icv]
        yitr, yicv = y.values[itr[mask[itr]]], y.values[icv]
        model = model.fit(Xitr,yitr)
        trscore.append( metrics.roc_auc_score(yitr, model.predict_proba(Xitr)[:,1]) )
        cvscore.append( metrics.roc_auc_score(yicv, model.predict_proba(Xicv)[:,1]) )
    return trscore, cvscore
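# Usage sketch (assuming some classifier `clf` and X, y built as in model.py):
#   trscore, cvscore = kf_score(clf, X, y, n_fold=3)
#   print np.mean(trscore), np.mean(cvscore)
# feature_removal/feature_selection below use the mean cv AUC as their
# greedy criterion, stopping once an iteration improves it by < 0.0005.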
def feature_removal(model,X,y,n_fold=3,maxiter=10,verbose=True,seed=6):
    # initial benchmark
    scoretr, scorecv, impf = kf_score_impf(model,X,y,n_fold,mask=None,seed=seed)
    cvscore_to_beat = np.mean(scorecv)
    for i in range(maxiter):
        # feature importances
        impf = pd.Series(np.mean(impf,axis=0),X.columns); impf.sort()
        # feature F-scores: f_classif returns (F, pval); the smallest F is the least discriminative
        fval = pd.Series(f_classif(X,y)[0],X.columns); fval.sort()
        # pick a candidate from each criterion and score the set without it
        impf_candidate, fval_candidate = impf.index[0], fval.index[0]
        scoretr_impf, scorecv_impf, impf_impf = kf_score_impf(model,X.drop(impf_candidate,1),y,n_fold,mask=None,seed=seed)
        scoretr_fval, scorecv_fval, impf_fval = kf_score_impf(model,X.drop(fval_candidate,1),y,n_fold,mask=None,seed=seed)
        scorecv_impf, scorecv_fval = np.mean(scorecv_impf), np.mean(scorecv_fval)
        best_cvscore = max(scorecv_impf, scorecv_fval)

        if (best_cvscore - cvscore_to_beat) < 0.0005: break
        else:
            use_impf = (best_cvscore == scorecv_impf)
            candidate = impf_candidate if use_impf else fval_candidate
            if verbose:
                print "removing '%s' | previous %.4f | new %.4f | improvement %.4f" % (
                    candidate, cvscore_to_beat, best_cvscore, best_cvscore - cvscore_to_beat)
            cvscore_to_beat = best_cvscore
            X = X.drop(candidate,1)
            # carry over the per-fold importances from the refit without the
            # dropped feature, so they stay aligned with the new X.columns
            impf = impf_impf if use_impf else impf_fval
    return X.columns

def feature_selection(model,X,y,n_fold=3,verbose=True,seed=11):
    # greedy forward selection on mean cv AUC
    feas = X.columns.tolist()
    candidates = []; fea_scores = {}; cvscore_to_beat = 0
    while True:
        # score each remaining feature added to the current set, keep the best
        for fea in feas:
            fea_scores[fea] = np.mean(kf_score(model,X[candidates+[fea]],y,n_fold,mask=None,seed=seed)[1])
        best_fea = fea_scores.keys()[np.argmax(fea_scores.values())]
        best_cvscore = fea_scores[best_fea]

        # include the feature only if the improvement beats the tolerance
        if (best_cvscore - cvscore_to_beat) < 0.0005: break
        else:
            if verbose:
                print "adding '%s' | previous %.4f | new %.4f | improvement %.4f" % (
                    best_fea, cvscore_to_beat, best_cvscore, best_cvscore - cvscore_to_beat)
            candidates.append(best_fea)
            feas.remove(best_fea)
            cvscore_to_beat = best_cvscore
    return candidates

def remove_outliers(X,y,outlier_ratio=0.99,retained_var=0.99,seed=3):
    # scale and fit a PCA using only the driver's own data points
    X_scaled = preprocessing.scale(X[y.values]); pca = decomposition.PCA().fit(X_scaled)
    # keep enough components for the selected variance, then refit
    pca.n_components = np.argmax(pca.explained_variance_ratio_.cumsum() > retained_var) + 1
    # cluster with k-means (~20 clusters for the usual 200 trips per driver)
    n_clusters = y.sum()/10; km = cluster.KMeans(n_clusters=n_clusters, random_state=seed)
    clusters = pd.Series(km.fit_predict(pca.fit_transform(X_scaled)))
    # keep the largest clusters up to the desired ratio of points; the small tail is flagged as outliers
    clusters_map = clusters.value_counts().cumsum() <= (len(X_scaled)*outlier_ratio)
    y_pruned = y.copy(); y_pruned[y] = clusters.map(clusters_map).values
    return y_pruned

def logger((logfile,q)):
    # Python 2 tuple-parameter signature; run in the pool via apply_async(logger, [(logfile, qlog)])
    f = open(logfile, 'wb')
    while True:
        m = q.get()
        if m == 'kill':
            f.write('killed')
            break
        f.write(str(m) + '\n'); f.flush()
        sleep(30)
    f.close()

def create_config(logfile):
    log = pd.read_csv(logfile)
    log['feas'] = log['feas'].apply(lambda x: x.replace('[','').replace(']','').replace(' ','').replace("'",'').split(','))
    return log.set_index('driverid')
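# logger() and create_config() form a small multiprocessing log pipeline:
# workers put CSV rows on a shared Manager().Queue, logger() appends them to
# the log file until it sees the 'kill' sentinel, and create_config() reads
# the file back into a per-driver frame (re-parsing the stringified feature
# lists).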

def gzip_submission(submission_pathfname):
    f_in = open(submission_pathfname, 'rb')
    f_out = gzip.open(submission_pathfname+'.gz', 'wb')
    f_out.writelines(f_in)
    f_out.close(); f_in.close()

class MyRidge(linear_model.RidgeClassifier):
    # RidgeClassifier has no predict_proba; expose the logistic mapping so the
    # model can be swapped in wherever predict_proba()[:,1] is expected
    def predict_proba(self,X):
        return self._predict_proba_lr(X)

class MyRidgeCV(linear_model.RidgeClassifierCV):
    def predict_proba(self,X):
        return self._predict_proba_lr(X)
--------------------------------------------------------------------------------
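A quick sanity check of the ridge wrappers on toy data (assumes utils.py is importable under the repo's scikit-learn version):

```python
import numpy as np
from utils import MyRidge

X = np.random.randn(100, 5)
y = np.random.rand(100) > 0.5
proba = MyRidge().fit(X, y).predict_proba(X)
print(proba.shape)  # (100, 2); column 1 is P(y=True), as used by kf_score
```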