├── AvazuModelDocumentation.pdf ├── LICENSE.txt ├── READ.me ├── _0_run_me.sh ├── _3b_gbdt.py ├── _4_post_processing.py ├── _2b_generate_dataset_for_vw_fm.py ├── _3a_rf.py ├── _2c_generate_fm_features.py ├── _3d_fm.py ├── _3c_vw.py ├── _1_encode_cat_features.py └── utils.py /AvazuModelDocumentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/owenzhang/kaggle-avazu/HEAD/AvazuModelDocumentation.pdf -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2015 Zhonghua Zhang 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use 4 | this file except in compliance with the License. You may obtain a copy of the 5 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | applicable law or agreed to in writing, software distributed under the License 7 | is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 8 | KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | -------------------------------------------------------------------------------- /READ.me: -------------------------------------------------------------------------------- 1 | To reproduce the result: 2 | 3 | 1. Clone the github repo, go to the folder 4 | 2. Set paths in the utils.py file 5 | 3. sh ./_0_run_me.sh 6 | 7 | 8 | The shell script will run the model 3 times: 9 | 10 | 1. a small sample run using day 30 as validation -- should take about 1-2 hours and generate .393-.394 logloss 11 | 2. a small sample run using day 31 as test -- should get an LB score of about .391-.392 12 | 3. a full run using day 31 as test -- should get an LB score that ranks 2nd. 13 | 14 | Please note that the full run will take about 2 days and requires 130GB of temporary storage space, so it is highly recommended to run it over the weekend, after steps 1 and 2 are successful. 15 | 16 | This process CAN be made much more efficient with a few hours' work by a REAL software engineer.
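For step 2, the paths are module-level variables near the top of utils.py (the variable names below are the ones that file defines); this is only a sketch with placeholder locations -- substitute your own directories and binaries:

    raw_data_path = '/path/to/avazu/raw/'   # folder holding train, test, and sampleSubmission
    tmp_data_path = './tmp_data/'           # intermediate files; the full run needs ~130GB here
    fm_path  = '/path/to/fm'                # factorization machine binary
    xgb_path = '/path/to/xgboost/wrapper'   # xgboost python wrapper directory (added to sys.path)
    vw_path  = '/path/to/vw '               # vowpal wabbit binary; keep the trailing space, it is concatenated into shell commands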
17 | 18 | -------------------------------------------------------------------------------- /_0_run_me.sh: -------------------------------------------------------------------------------- 1 | #small test run using day 30 as validation 2 | python utils.py -set_params Y 1.0 3 | python _1_encode_cat_features.py 4 | python _2b_generate_dataset_for_vw_fm.py 5 | python _2c_generate_fm_features.py 6 | python _3a_rf.py 7 | python _3b_gbdt.py 8 | python _3c_vw.py -rseed 1 9 | python _3c_vw.py -rseed 2 10 | python _3c_vw.py -rseed 3 11 | python _3c_vw.py -rseed 4 12 | python _3d_fm.py -rseed 51 13 | python _3d_fm.py -rseed 52 14 | python _3d_fm.py -rseed 53 15 | python _3d_fm.py -rseed 54 16 | #should generate logloss ~= 0.3937 17 | python _4_post_processing.py 18 | 19 | exit 20 | #try a quick submission using small sample data 21 | python utils -set_params Y 0.05 22 | python _3a_rf.py 23 | python _3b_gbdt.py 24 | python _3c_vw.py -rseed 1 25 | python _3c_vw.py -rseed 2 26 | python _3c_vw.py -rseed 3 27 | python _3c_vw.py -rseed 4 28 | python _3d_fm.py -rseed 51 29 | python _3d_fm.py -rseed 52 30 | python _3d_fm.py -rseed 53 31 | python _3d_fm.py -rseed 54 32 | #should generate a submission with score (Public LB) .3936: (Private LB): .3917 33 | python _4_post_processing.py 34 | 35 | #run the whole thing, will take about 2 days 36 | python utils -set_params Y 1.0 37 | python _1_encode_cat_features.py 38 | python _2b_generate_dataset_for_vw_fm.py 39 | python _2c_generate_fm_features.py 40 | python _3a_rf.py 41 | python _3b_gbdt.py 42 | python _3c_vw.py -rseed 1 43 | python _3c_vw.py -rseed 2 44 | python _3c_vw.py -rseed 3 45 | python _3c_vw.py -rseed 4 46 | python _3d_fm.py -rseed 51 47 | python _3d_fm.py -rseed 52 48 | python _3d_fm.py -rseed 53 49 | python _3d_fm.py -rseed 54 50 | #should generate a submission with score (Private LB) ~ .3805 51 | python _4_post_processing.py 52 | 53 | 54 | -------------------------------------------------------------------------------- /_3b_gbdt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | from joblib import dump, load, Parallel, delayed 10 | import utils 11 | from utils import * 12 | 13 | sys.path.append(utils.xgb_path) 14 | import xgboost as xgb 15 | 16 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx3.joblib_dat') 17 | t0tv_mx3 = t0tv_mx_save['t0tv_mx'] 18 | click_values = t0tv_mx_save['click'] 19 | day_values = t0tv_mx_save['day'] 20 | print "t0tv_mx3 loaded with shape", t0tv_mx3.shape 21 | 22 | 23 | n_trees = utils.xgb_n_trees 24 | day_test = 30 25 | if utils.tvh == 'Y': 26 | day_test = 31 27 | 28 | param = {'max_depth':15, 'eta':.02, 'objective':'binary:logistic', 'verbose':0, 29 | 'subsample':1.0, 'min_child_weight':50, 'gamma':0, 30 | 'nthread': 16, 'colsample_bytree':.5, 'base_score':0.16, 'seed': 999} 31 | 32 | nn = t0tv_mx3.shape[0] 33 | np.random.seed(999) 34 | sample_idx = np.random.random_integers(0, 3, nn) 35 | 36 | predv_xgb = 0 37 | ctr = 0 38 | for idx in [0, 1, 2, 3]: 39 | filter1 = np.logical_and(np.logical_and(day_values >= 22, day_values < day_test), np.logical_and(sample_idx== idx , True)) 40 | filter_v1 = day_values == day_test 41 | 42 | xt1 = t0tv_mx3[filter1, :] 43 | yt1 = click_values[filter1] 44 | if xt1.shape[0] <=0 or xt1.shape[0] != yt1.shape[0]: 45 | print xt1.shape, yt1.shape 46 | raise 
ValueError('wrong shape!') 47 | dtrain = xgb.DMatrix(xt1, label=yt1) 48 | dvalid = xgb.DMatrix(t0tv_mx3[filter_v1], label=click_values[filter_v1]) 49 | watchlist = [(dtrain, 'train'), (dvalid, 'valid')] 50 | print xt1.shape, yt1.shape 51 | 52 | plst = list(param.items()) + [('eval_metric', 'logloss')] 53 | xgb1 = xgb.train(plst, dtrain, n_trees, watchlist) 54 | #xgb_pred[rseed] = xgb1.predict(dtv3) 55 | #xgb_list[rseed] = xgb1 56 | 57 | ctr += 1 58 | predv_xgb += xgb1.predict(dvalid) 59 | print '-'*30, ctr, logloss(predv_xgb / ctr, click_values[filter_v1]) 60 | 61 | print "to save validation predictions ..." 62 | dump(predv_xgb / ctr, utils.tmp_data_path + 'xgb_pred_v.joblib_dat') 63 | 64 | -------------------------------------------------------------------------------- /_4_post_processing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | import utils 10 | from utils import * 11 | import os 12 | 13 | from joblib import dump, load 14 | 15 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx.joblib_dat') 16 | click_values = t0tv_mx_save['click'] 17 | day_values = t0tv_mx_save['day'] 18 | site_id_values= t0tv_mx_save['site_id'] 19 | print "t0tv_mx loaded" 20 | 21 | 22 | day_test = 30 23 | if utils.tvh == 'Y': 24 | day_test = 31 25 | 26 | #RandomForest model output 27 | rf_pred = load(utils.tmp_data_path + 'rf_pred_v.joblib_dat') 28 | print "RF prediction loaded with shape", rf_pred.shape 29 | 30 | #GBDT (xgboost) model output 31 | xgb_pred = load(utils.tmp_data_path + 'xgb_pred_v.joblib_dat') 32 | print "xgb prediction loaded with shape", xgb_pred.shape 33 | 34 | #Vowpal Wabbit model output 35 | ctr = 0 36 | vw_pred = 0 37 | for i in [1, 2, 3, 4]: 38 | vw_pred += 1 / (1+ np.exp(-pd.read_csv(open(utils.tmp_data_path + 'vwV12__r%d_test.txt_pred.txt'%i, 'r'), header=None).ix[:,0].values)) 39 | ctr += 1 40 | vw_pred /= ctr 41 | print "VW prediction loaded with shape", vw_pred.shape 42 | 43 | #factorization machine model output 44 | ctr = 0 45 | fm_pred = 0 46 | for i in [51, 52, 53, 54]: 47 | fm_pred += pd.read_csv(open(utils.tmp_data_path + 'fm__r%d_v.txt.out'%i, 'r'), header=None).ix[:,0].values 48 | ctr += 1 49 | fm_pred /= ctr 50 | print "FM prediction loaded with shape", fm_pred.shape 51 | 52 | 53 | blending_w = {'rf': .075, 'xgb': .175, 'vw': .225, 'fm': .525} 54 | 55 | total_w = 0 56 | pred = 0 57 | 58 | pred += rf_pred * blending_w['rf'] 59 | total_w += blending_w['rf'] 60 | pred += xgb_pred * blending_w['xgb'] 61 | total_w += blending_w['xgb'] 62 | pred += vw_pred * blending_w['vw'] 63 | total_w += blending_w['vw'] 64 | pred += fm_pred * blending_w['fm'] 65 | total_w += blending_w['fm'] 66 | 67 | pred /= total_w 68 | 69 | if utils.tvh == 'Y': 70 | #create submission 71 | predh_raw_avg = pred 72 | site_ids_h = site_id_values[day_values == 31] 73 | tmp_f1 = site_ids_h == '17d1b03f' 74 | predh_raw_avg[tmp_f1] *= .13 / predh_raw_avg[tmp_f1].mean() 75 | predh_raw_avg *= .161 / predh_raw_avg.mean() 76 | 77 | sub0 = pd.read_csv(open(utils.raw_data_path + 'sampleSubmission', 'r')) 78 | pred_h_str = ["%.4f" % x for x in predh_raw_avg] 79 | sub0['click'] = pred_h_str 80 | fn_sub = utils.tmp_data_path + 'sub_sample' + str(utils.sample_pct) + '.csv.gz' 81 | import gzip 82 | sub0.to_csv(gzip.open(fn_sub, 'w'), index=False) 83 | print "=" * 80 84 | print "Training 
complted and submission file " + fn_sub + " created." 85 | print "=" * 80 86 | else: 87 | #validate using day30 88 | print "Training completed!" 89 | print "=" * 80 90 | print "logloss of blended prediction:", logloss(pred, click_values[day_values==day_test]) 91 | print "=" * 80 92 | -------------------------------------------------------------------------------- /_2b_generate_dataset_for_vw_fm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | import utils 10 | from utils import * 11 | import os 12 | 13 | from joblib import dump, load 14 | 15 | t0 = load(utils.tmp_data_path + 't0.joblib_dat') 16 | print "t0 loaded with shape", t0.shape 17 | 18 | t0['dev_id_cnt2'] = np.minimum(t0.cnt_dev_id.astype('int32').values, 300) 19 | t0['dev_ip_cnt2'] = np.minimum(t0.cnt_dev_ip.astype('int32').values, 300) 20 | 21 | t0['dev_id2plus'] = t0.device_id.values 22 | t0.ix[t0.cnt_dev_id.values == 1, 'dev_id2plus'] = '___only1' 23 | t0['dev_ip2plus'] = t0.device_ip.values 24 | t0.ix[t0.cnt_dev_ip.values == 1, 'dev_ip2plus'] = '___only1' 25 | 26 | t0['device_ip_only_hour_for_day'] = t0.cnt_device_ip_day_hour.values == t0.cnt_device_ip_pday.values 27 | 28 | vns0 = ['app_or_web', 'banner_pos', 'C1', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'] 29 | for vn in vns0 + ['C14']: 30 | print vn 31 | vn2 = '_A_' + vn 32 | t0[vn2] = np.add(t0['app_site_id'].values, t0[vn].astype('string').values) 33 | t0[vn2] = t0[vn2].astype('category') 34 | 35 | t3 = t0 36 | vns1 = vns0 + ['hour1'] + ['_A_' + vn for vn in vns0] + \ 37 | ['device_model', 'device_type', 'device_conn_type', 'app_site_id', 'as_domain', 'as_category', 38 | 'cnt_device_ip_day_hour', 'cnt_device_ip_day_hour_prev', 'cnt_device_ip_day_hour_next', 'cnt_device_ip_pday', 39 | 'cnt_diff_device_ip_day_pday', 'as_model'] + \ 40 | [ 'dev_id_cnt2', 'dev_ip_cnt2', 'C14', '_A_C14', 'dev_ip2plus', 'dev_id2plus'] 41 | 42 | #'cnt_device_ip_day', 'device_ip_only_hour_for_day' 43 | 44 | t3a = t3.ix[:, ['click']].copy() 45 | idx_base = 3000 46 | for vn in vns1: 47 | if vn in ['cnt_device_ip_day_hour', 'cnt_device_ip_day_hour_prev', 'cnt_device_ip_day_hour_next', 'cnt_device_ip_pday', 48 | 'cnt_diff_device_ip_day_pday', 'cnt_device_ip_day', 'cnt_device_ip_pday']: 49 | _cat = pd.Series(np.maximum(-100, np.minimum(200, t3[vn].values))).astype('category').values.codes 50 | elif vn in ['as_domain']: 51 | _cat = pd.Series(np.add(t3['app_domain'].values, t3['site_domain'].values)).astype('category').values.codes 52 | elif vn in ['as_category']: 53 | _cat = pd.Series(np.add(t3['app_category'].values, t3['site_category'].values)).astype('category').values.codes 54 | elif vn in ['as_model']: 55 | _cat = pd.Series(np.add(t3['app_site_id'].values, t3['device_model'].values)).astype('category').values.codes 56 | else: 57 | _cat = t3[vn].astype('category').values.codes 58 | _cat = np.asarray(_cat, dtype='int32') 59 | _cat1 = _cat + idx_base 60 | t3a[vn] = _cat1 61 | print vn, idx_base, _cat1.min(), _cat1.max(), np.unique(_cat).size 62 | idx_base += _cat.max() + 1 63 | 64 | print "to save t3a ..." 
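# The loop above packs all of the selected columns into one shared index space: each
# column's category codes are shifted by idx_base so no two columns overlap, which is
# the one-big-sparse-id layout the downstream VW / FM inputs expect. A minimal sketch
# of the same idea on a hypothetical toy column (not part of the pipeline), using the
# same old-pandas .values.codes access as above:
#   _cat = pd.Series(['a', 'b', 'a']).astype('category').values.codes   # -> 0, 1, 0
#   t3a['toy'] = _cat + idx_base                                        # shifted into its own id range
#   idx_base += _cat.max() + 1                                          # next column starts just past it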
65 | t3a_save = {} 66 | t3a_save['t3a'] = t3a 67 | t3a_save['idx_base'] = idx_base 68 | dump(t3a_save, utils.tmp_data_path + 't3a.joblib_dat') 69 | -------------------------------------------------------------------------------- /_3a_rf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | import utils 10 | from utils import * 11 | import os 12 | 13 | from joblib import dump, load 14 | 15 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx3.joblib_dat') 16 | t0tv_mx3 = t0tv_mx_save['t0tv_mx'] 17 | click_values = t0tv_mx_save['click'] 18 | day_values = t0tv_mx_save['day'] 19 | print "t0tv_mx3 loaded with shape", t0tv_mx3.shape 20 | 21 | 22 | from sklearn.ensemble import RandomForestClassifier 23 | 24 | day_test = 30 25 | if utils.tvh == 'Y': 26 | day_test = 31 27 | 28 | print "to create Random Forest using day", day_test, " as validation" 29 | 30 | clf = RandomForestClassifier(n_estimators=32, max_depth=40, min_samples_split=100, min_samples_leaf=10, random_state=0, criterion='entropy', 31 | max_features=8, verbose = 1, n_jobs=-1, bootstrap=False) 32 | 33 | _start_day = 22 34 | 35 | 36 | predv = 0 37 | ctr = 0 38 | xv = t0tv_mx3[day_values==day_test, :] 39 | yv = click_values[day_values==day_test] 40 | nn = t0tv_mx3.shape[0] 41 | 42 | 43 | 44 | for i1 in xrange(8): 45 | clf.random_state = i1 46 | np.random.seed(i1) 47 | r1 = np.random.uniform(0, 1, nn) 48 | filter1 = np.logical_and(np.logical_and(day_values >= _start_day, day_values < day_test), np.logical_and(r1 < .3, True)) 49 | xt1 = t0tv_mx3[filter1, :] 50 | yt1 = click_values[filter1] 51 | rf1 = clf.fit(xt1, yt1) 52 | y_hat = rf1.predict_proba(xv)[:, 1] 53 | predv += y_hat 54 | ctr += 1 55 | ll = logloss(predv/ctr, yv) 56 | print "iter", i1, ", logloss = ", ll 57 | sys.stdout.flush() 58 | 59 | list_param = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'banner_pos', 'device_type', 'device_conn_type'] 60 | feature_list = list_param + \ 61 | ['exptv_' + vn for vn in ['app_site_id', 'as_domain', 62 | 'C14','C17', 'C21', 'device_model', 'device_ip', 'device_id', 'dev_ip_aw', 63 | 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw']] + \ 64 | ['cnt_diff_device_ip_day_pday', 65 | 'app_cnt_by_dev_ip', 'cnt_device_ip_day_hour', 'app_or_web', 66 | 'rank_dev_ip', 'rank_day_dev_ip', 'rank_app_dev_ip', 67 | 'diff_cnt_dev_ip_hour_phour_aw2_prev', 'diff_cnt_dev_ip_hour_phour_aw2_next', 68 | 'exp2_device_ip', 'exp2_app_site_id', 'exp2_device_model', 'exp2_app_site_model', 69 | 'exp2_app_site_model_aw', 'exp2_dev_ip_app_site', 70 | 'cnt_dev_ip', 'cnt_dev_id', 'hour1_web'] + \ 71 | ['all_withid', 'all_noid', 'all_but_ip', 'fm_5vars'] 72 | 73 | rf1_imp = pd.DataFrame({'feature':feature_list, 'impt': clf.feature_importances_}) 74 | print rf1_imp.sort('impt') 75 | 76 | print "to save validation predictions ..." 
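# In outline, the loop above is simple bagging: 8 forests, each fit on an independent
# ~30% random subset of the day-22..(day_test-1) rows, with their predicted click
# probabilities on the held-out day accumulated and averaged, so the logloss printed
# at each iteration is that of the running ensemble (predv / ctr) rather than of any
# single forest.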
77 | dump(predv / ctr, utils.tmp_data_path + 'rf_pred_v.joblib_dat') 78 | 79 | -------------------------------------------------------------------------------- /_2c_generate_fm_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | import utils 10 | from utils import * 11 | import os 12 | 13 | from joblib import dump, load 14 | 15 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx.joblib_dat') 16 | t0tv_mx = t0tv_mx_save['t0tv_mx'] 17 | click_values = t0tv_mx_save['click'] 18 | day_values = t0tv_mx_save['day'] 19 | print "t0tv_mx loaded with shape", t0tv_mx.shape 20 | 21 | t0 = load(utils.tmp_data_path + 't3a.joblib_dat')['t3a'] 22 | print "t0 loaded with shape", t0.shape 23 | 24 | 25 | vns={} 26 | _vns1 = ['app_or_web', 'banner_pos', 'C1', 'C15', 'C16', 'C20', 'C14', 27 | 'cnt_device_ip_day_hour', 'cnt_device_ip_day_hour_prev', 28 | 'cnt_device_ip_pday', 'dev_ip_cnt2', 'app_site_id', 'device_model'] 29 | 30 | vns['all_noid'] = _vns1 31 | vns['all_withid'] = _vns1 + ['dev_id2plus'] 32 | vns['fm_5vars'] = ['app_or_web', 'banner_pos', 'C1', 'C15', 'device_model'] 33 | vns['all_but_ip'] = ['app_or_web', 'device_conn_type', 'C18', 'device_type', 34 | 'banner_pos', 'C1', 'C15', 'C16', 'hour1', 'as_category', 'C21', 35 | 'C19', 'C20', 'cnt_device_ip_day_hour', 'cnt_device_ip_pday', 36 | 'cnt_device_ip_day_hour_prev', 'cnt_device_ip_day_hour_next', 37 | 'dev_id_cnt2', 'dev_ip_cnt2', 'cnt_diff_device_ip_day_pday', 'C17', 38 | 'C14', 'device_model', 'as_domain', 'app_site_id', '_A_app_or_web', 39 | '_A_C1', '_A_banner_pos', '_A_C16', '_A_C15', '_A_C18', '_A_C19', 40 | '_A_C21', '_A_C20', '_A_C17', '_A_C14', 'as_model', 'dev_id2plus'] 41 | 42 | cmd_str = utils.fm_path + ' -t 4 -s 8 -l 1e-5 /dev/shm/_tmp_2way_v.txt /dev/shm/_tmp_2way_t.txt' 43 | 44 | day_bgn = 22 45 | day_end = 32 46 | 47 | fm_vecs = {} 48 | for day_v in xrange(day_bgn, day_end): 49 | fm_vecs[day_v] = {} 50 | for vns_name in vns.keys(): 51 | vns2 = vns[vns_name] 52 | 53 | print day_v, vns_name 54 | t1 = t0.ix[:, ['click']].copy() 55 | 56 | idx_base = 0 57 | 58 | for vn in vns2: 59 | t1[vn] = t0[vn].values 60 | t1[vn] = np.asarray(t1[vn].astype('category').values.codes, np.int32) + idx_base 61 | idx_base = t1[vn].values.max() + 1 62 | #print '-'* 5, vn, idx_base 63 | 64 | path1 = '/dev/shm/' 65 | fn_t = path1 + '_tmp_2way_t.txt' 66 | fn_v = path1 + '_tmp_2way_v.txt' 67 | 68 | print "to write data files ..." 69 | 70 | t1.ix[np.logical_and(day_values>=21, day_values < day_v),:].to_csv(open(fn_t, 'w'), sep='\t', header=False, index=False) 71 | t1.ix[day_values==day_v,:].to_csv(open(fn_v, 'w'), sep='\t', header=False, index=False) 72 | 73 | 74 | print cmd_str 75 | os.system(cmd_str) 76 | 77 | print "load results ..." 78 | fm_predv = pd.read_csv(open(path1 + '_tmp_2way_v.txt.out', 'r'), header=None).ix[:,0].values 79 | 80 | print "--- gini_norm:", gini_norm(fm_predv, click_values[day_values==day_v], None) 81 | 82 | fm_vecs[day_v][vns_name] = fm_predv 83 | print '='*60 84 | 85 | 86 | t2 = t0.ix[:, ['click']].copy() 87 | 88 | nn = t2.shape[0] 89 | for vns_name in vns.keys(): 90 | t2[vns_name] = np.zeros(nn) 91 | for day_v in xrange(day_bgn, day_end): 92 | print day_v, vns_name 93 | t2.ix[day_values==day_v, vns_name] = fm_vecs[day_v][vns_name] 94 | 95 | print "to save FM features ..." 
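# t2 now holds, for every row of day d in 22..31, four FM predictions (all_noid,
# all_withid, fm_5vars, all_but_ip), each produced by a model trained only on days
# 21..d-1, so no row is ever scored by an FM that saw its own label. These columns are
# appended to the numeric matrix below (t0tv_mx3) and become extra input features for
# the RF (_3a_rf.py) and GBDT (_3b_gbdt.py) models.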
96 | dump(t2, tmp_data_path + 't2.joblib_dat') 97 | 98 | t0tv_mx3 = np.concatenate([t0tv_mx[:, :43], t2.ix[:, 1:].as_matrix()], axis=1) 99 | print "t0tv_mx3 generated with shape", t0tv_mx3.shape 100 | 101 | t0tv_mx_save = {} 102 | t0tv_mx_save['t0tv_mx'] = t0tv_mx3 103 | t0tv_mx_save['click'] = click_values 104 | t0tv_mx_save['day'] = day_values 105 | dump(t0tv_mx_save, utils.tmp_data_path + '/t0tv_mx3.joblib_dat') 106 | -------------------------------------------------------------------------------- /_3d_fm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | import pylab 6 | import sys 7 | import time 8 | import os 9 | import utils 10 | from utils import * 11 | from joblib import dump, load, Parallel, delayed 12 | 13 | sys.path.append(utils.xgb_path) 14 | import xgboost as xgb 15 | 16 | 17 | rseed = 0 18 | xgb_eta = .3 19 | tvh = utils.tvh 20 | n_passes = 5 21 | n_trees = 40 22 | n_iter = 7 23 | n_threads = 8 24 | nr_factor = 4 25 | 26 | i = 1 27 | while i < len(sys.argv): 28 | if sys.argv[i] == '-rseed': 29 | i += 1 30 | rseed = int(sys.argv[i]) 31 | elif sys.argv[i] == '-passes': 32 | i += 1 33 | n_passes = int(sys.argv[i]) 34 | else: 35 | raise ValueError("unrecognized parameter [" + sys.argv[i] + "]") 36 | 37 | i += 1 38 | 39 | learning_rate = .1 40 | 41 | path1 = utils.tmp_data_path 42 | param_names = '_r' + str(rseed) 43 | 44 | fn_t = path1 + 'fm_' + param_names + '_t.txt' 45 | fn_v = path1 + 'fm_' + param_names + '_v.txt' 46 | 47 | test_day = 30 48 | if tvh == 'Y': 49 | test_day = 31 50 | 51 | def build_data(): 52 | 53 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx.joblib_dat') 54 | 55 | t0tv_mx = t0tv_mx_save['t0tv_mx'] 56 | click_values = t0tv_mx_save['click'] 57 | day_values = t0tv_mx_save['day'] 58 | 59 | print "t0tv_mx loaded with shape ", t0tv_mx.shape 60 | 61 | np.random.seed(rseed) 62 | nn = t0tv_mx.shape[0] 63 | r1 = np.random.uniform(0, 1, nn) 64 | 65 | 66 | filter1 = np.logical_and(np.logical_and(day_values >= 22, day_values < test_day), np.logical_and(r1 < 0.25, True)) 67 | filter_v1 = day_values == test_day 68 | 69 | xt1 = t0tv_mx[filter1, :] 70 | yt1 = click_values[filter1] 71 | if xt1.shape[0] <=0 or xt1.shape[0] != yt1.shape[0]: 72 | print xt1.shape, yt1.shape 73 | raise ValueError('wrong shape!') 74 | dtrain = xgb.DMatrix(xt1, label=yt1) 75 | dvalid = xgb.DMatrix(t0tv_mx[filter_v1], label=click_values[filter_v1]) 76 | watchlist = [(dtrain, 'train'), (dvalid, 'valid')] 77 | print xt1.shape, yt1.shape 78 | 79 | 80 | param = {'max_depth':6, 'eta':.5, 'objective':'binary:logistic', 'verbose':0, 81 | 'subsample':1.0, 'min_child_weight':50, 'gamma':0, 82 | 'nthread': 16, 'colsample_bytree':.5, 'base_score':0.16, 'seed': rseed} 83 | 84 | plst = list(param.items()) + [('eval_metric', 'logloss')] 85 | xgb_test_basis_d6 = xgb.train(plst, dtrain, n_trees, watchlist) 86 | 87 | 88 | dtv = xgb.DMatrix(t0tv_mx) 89 | xgb_leaves = xgb_test_basis_d6.predict(dtv, pred_leaf = True) 90 | 91 | t0 = pd.DataFrame({'click': click_values}) 92 | print xgb_leaves.shape 93 | for i in xrange(n_trees): 94 | pred2 = xgb_leaves[:, i] 95 | print i, np.unique(pred2).size 96 | t0['xgb_basis'+str(i)] = pred2 97 | 98 | t3a_save = load(utils.tmp_data_path + 't3a.joblib_dat') 99 | t3a = t3a_save['t3a'] 100 | 101 | idx_base = 0 102 | for vn in ['xgb_basis' + str(i) for i in xrange(n_trees)]: 103 | _cat = np.asarray(t0[vn].astype('category').values.codes, 
dtype='int32') 104 | _cat1 = _cat + idx_base 105 | print vn, idx_base, _cat1.min(), _cat1.max(), np.unique(_cat).size 106 | t3a[vn] = _cat1 107 | idx_base += _cat.max() + 1 108 | 109 | 110 | 111 | t3a.ix[np.logical_and(np.logical_and(day_values < test_day, day_values >= 22), True),:].to_csv(open(fn_t, 'w'), sep='\t', header=False, index=False) 112 | t3a.ix[day_values==test_day,:].to_csv(open(fn_v, 'w'), sep='\t', header=False, index=False) 113 | 114 | 115 | build_data() 116 | import gc 117 | gc.collect() 118 | 119 | 120 | import os 121 | fm_cmd = utils.fm_path + ' -k ' + str(nr_factor) + ' -t ' + str(n_iter) + ' -s '+ str(n_threads) + ' ' 122 | fm_cmd += ' -d ' + str(rseed) + ' -r ' + str(learning_rate) + ' ' + fn_v + ' ' + fn_t 123 | 124 | print fm_cmd 125 | os.system(fm_cmd) 126 | 127 | os.system("rm " + fn_t) 128 | os.system("rm " + fn_v) 129 | 130 | -------------------------------------------------------------------------------- /_3c_vw.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | import pylab 6 | import sys 7 | import time 8 | import os 9 | import utils 10 | from utils import * 11 | from joblib import dump, load, Parallel, delayed 12 | 13 | sys.path.append(utils.xgb_path) 14 | import xgboost as xgb 15 | 16 | rseed = 0 17 | xgb_eta = .3 18 | tvh = utils.tvh 19 | n_passes = 4 20 | 21 | i = 1 22 | while i < len(sys.argv): 23 | if sys.argv[i] == '-rseed': 24 | i += 1 25 | rseed = int(sys.argv[i]) 26 | else: 27 | raise ValueError("unrecognized parameter [" + sys.argv[i] + "]") 28 | 29 | i += 1 30 | 31 | 32 | file_name1 = '_r' + str(rseed) 33 | 34 | path1 = utils.tmp_data_path 35 | fn_t = path1 + 'vwV12_' + file_name1 + '_train.txt' 36 | fn_v = path1 + 'vwV12_' + file_name1 + '_test.txt' 37 | 38 | 39 | def build_data(): 40 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx.joblib_dat') 41 | 42 | t0tv_mx = t0tv_mx_save['t0tv_mx'] 43 | click_values = t0tv_mx_save['click'] 44 | day_values = t0tv_mx_save['day'] 45 | 46 | print "t0tv_mx loaded with shape ", t0tv_mx.shape 47 | 48 | test_day = 30 49 | if tvh == 'Y': 50 | test_day = 31 51 | 52 | np.random.seed(rseed) 53 | nn = t0tv_mx.shape[0] 54 | r1 = np.random.uniform(0, 1, nn) 55 | filter1 = np.logical_and(np.logical_and(day_values >= 22, day_values < test_day), np.logical_and(r1 < .25, True)) 56 | filter_v1 = day_values == test_day 57 | 58 | xt1 = t0tv_mx[filter1, :] 59 | yt1 = click_values[filter1] 60 | if xt1.shape[0] <=0 or xt1.shape[0] != yt1.shape[0]: 61 | print xt1.shape, yt1.shape 62 | raise ValueError('wrong shape!') 63 | dtrain = xgb.DMatrix(xt1, label=yt1) 64 | dvalid = xgb.DMatrix(t0tv_mx[filter_v1], label=click_values[filter_v1]) 65 | watchlist = [(dtrain, 'train'), (dvalid, 'valid')] 66 | print xt1.shape, yt1.shape 67 | 68 | 69 | n_trees = 30 70 | n_parallel_tree = 1 71 | 72 | param = {'max_depth':6, 'eta':xgb_eta, 'objective':'binary:logistic', 'verbose':1, 73 | 'subsample':1.0, 'min_child_weight':50, 'gamma':0, 74 | 'nthread': 16, 'colsample_bytree':.5, 'base_score':0.16, 'seed': rseed, 75 | 'num_parallel_tree': n_parallel_tree} 76 | 77 | plst = list(param.items()) + [('eval_metric', 'logloss')] 78 | xgb_test_basis_d6 = xgb.train(plst, dtrain, n_trees, watchlist) 79 | 80 | print "to score gbdt ..." 
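# The block below converts the fitted GBDT into categorical features for VW:
# predict(..., pred_leaf=True) returns, for every row, the index of the leaf it lands
# in within each of the 30 trees (one column per tree), and each tree's leaf ids are
# then re-coded into the shared idx_base index space exactly like the raw categorical
# columns of t3a. A minimal sketch of the leaf-index call (hypothetical names, same
# xgboost API as used here):
#   leaves = bst.predict(xgb.DMatrix(X), pred_leaf=True)   # shape: (n_rows, n_trees)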
81 | 82 | dtv = xgb.DMatrix(t0tv_mx) 83 | xgb_leaves = xgb_test_basis_d6.predict(dtv, pred_leaf = True) 84 | 85 | t0 = pd.DataFrame({'click': click_values}) 86 | print xgb_leaves.shape 87 | for i in xrange(n_trees * n_parallel_tree): 88 | pred2 = xgb_leaves[:, i] 89 | #print pred2[:10] 90 | #print pred_raw_diff[:10] 91 | print i, np.unique(pred2).size 92 | t0['xgb_basis'+str(i)] = pred2 93 | 94 | 95 | t3a_save = load(utils.tmp_data_path + 't3a.joblib_dat') 96 | 97 | t3a = t3a_save['t3a'] 98 | idx_base = 0 99 | for vn in ['xgb_basis' + str(i) for i in xrange(30 * n_parallel_tree)]: 100 | _cat = np.asarray(t0[vn].astype('category').values.codes, dtype='int32') 101 | _cat1 = _cat + idx_base 102 | print vn, idx_base, _cat1.min(), _cat1.max(), np.unique(_cat).size 103 | t3a[vn] = _cat1 104 | idx_base += _cat.max() + 1 105 | 106 | t3a['click1'] = t3a.click.values * 2 - 1 107 | t3a['ns_C']='|C' 108 | t3a['ns_D']='|D' 109 | t3a['ns_M']='|M' 110 | t3a['ns_S']='|S' 111 | t3a['ns_W']='|W' 112 | t3a['ns_N']='|N' 113 | t3a['ns_X']='|X' 114 | t3a['ns_Y']='|Y' 115 | t3a['ns_Z']='|Z' 116 | 117 | field_list = ['click1'] 118 | field_list += ['ns_C', 'banner_pos', 'C1'] + ['C' + str(x) for x in xrange(14, 22)] 119 | field_list += ['ns_D', 'dev_ip2plus', 'dev_id2plus'] 120 | field_list += ['ns_M', 'device_model', 'device_type', 'device_cnn_type'] 121 | field_list += ['ns_S', 'app_site_id', 'as_domain', 'as_category'] 122 | field_list += ['ns_W', 'app_or_web'] 123 | field_list += ['ns_N', 'cnt_device_ip_day_hour', 'cnt_device_ip_pday', 124 | 'cnt_diff_device_ip_day_pday', 'dev_id_cnt2', 'dev_ip_cnt2', 125 | 'cnt_device_ip_day_hour_prev', 'cnt_device_ip_day_hour_next'] 126 | field_list += ['ns_X'] + ['xgb_basis'+str(i) for i in xrange(0, 10)] 127 | field_list += ['ns_Y'] + ['xgb_basis'+str(i) for i in xrange(10, 20)] 128 | field_list += ['ns_Z'] + ['xgb_basis'+str(i) for i in xrange(20, 30)] 129 | 130 | 131 | if tvh == 'Y': 132 | row_idx = np.logical_and(day_values >= 22, day_values <= 30) 133 | print row_idx.shape, row_idx.sum() 134 | else: 135 | row_idx = np.zeros(t3a.shape[0]) 136 | 137 | pre_t_lmt = (day_values < 22).sum() 138 | t_lmt = (day_values < 30).sum() 139 | v_lmt = (day_values < 31).sum() 140 | 141 | t_cnt = t_lmt - pre_t_lmt 142 | v_cnt = v_lmt - t_lmt 143 | 144 | t_idx = np.random.permutation(t_cnt) + pre_t_lmt 145 | v_idx = np.random.permutation(v_cnt) + t_lmt 146 | 147 | 148 | i = 0 149 | i_t = 0 150 | i_v = 0 151 | while True: 152 | if i % 7 == 6: 153 | row_idx[i] = v_idx[i_v] 154 | i_v += 1 155 | if i_v >= v_cnt: 156 | i_v = 0 157 | else: 158 | #training 159 | row_idx[i] = t_idx[i_t] 160 | i_t += 1 161 | if i_t >= t_cnt: 162 | break 163 | i+= 1 164 | 165 | row_idx = row_idx[:i] 166 | print t3a.shape, t_cnt, v_cnt, row_idx.shape 167 | 168 | t3a['idx'] = np.arange(t3a.shape[0]) 169 | t3a.set_index('idx', inplace=True) 170 | 171 | print "to write training file, this may take a long time" 172 | import gzip 173 | t3a.ix[row_idx, field_list].to_csv(open(fn_t, 'w'), sep=' ', header=False, index=False) 174 | 175 | os.system("gzip -f "+fn_t) 176 | 177 | print "to write test file, this shouldn't take too long" 178 | if tvh == 'Y': 179 | t3a.ix[day_values==31, field_list].to_csv(open(fn_v, 'w'), sep=' ', header=False, index=False) 180 | else: 181 | t3a.ix[day_values==30, field_list].to_csv(open(fn_v, 'w'), sep=' ', header=False, index=False) 182 | 183 | os.system("gzip -f "+fn_v) 184 | 185 | 186 | build_data() 187 | 188 | if tvh == 'Y': 189 | holdout_str = " --holdout_off " 190 | else: 191 | holdout_str 
= " --holdout_period 7 " 192 | 193 | mdl_name = 'vw' + file_name1 + ".mdl" 194 | vw_cmd_str = utils.vw_path + fn_t + ".gz --random_seed " + str(rseed) + " " + \ 195 | "--passes " + str(n_passes) + " -c --progress 1000000 --loss_function logistic -b 25 " + holdout_str + \ 196 | "--l2 1e-7 -q CS -q CM -q MS -l .1 --power_t .5 -q NM -q NS --decay_learning_rate .75 --hash all " + \ 197 | " -q SX -q MX -q SY -q MY -q SZ -q MZ -q NV -q MV -q VX -q VY -q VZ" + \ 198 | " --ignore H -f " + mdl_name + " -k --compressed" 199 | print vw_cmd_str 200 | os.system(vw_cmd_str) 201 | 202 | vw_cmd_str = utils.vw_path + fn_v + ".gz --hash all " + \ 203 | "-i " + mdl_name + " -p " + fn_v + "_pred.txt -t --loss_function logistic --progress 200000" 204 | print vw_cmd_str 205 | os.system(vw_cmd_str) 206 | 207 | -------------------------------------------------------------------------------- /_1_encode_cat_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | sys.path.append('/home/zzhang/Downloads/xgboost/wrapper') 10 | import xgboost as xgb 11 | from joblib import dump, load, Parallel, delayed 12 | import utils 13 | from utils import * 14 | 15 | 16 | raw_data_path = utils.raw_data_path 17 | tmp_data_path = utils.tmp_data_path 18 | 19 | 20 | t0org0 = pd.read_csv(open(raw_data_path + "train", "ra")) 21 | h0org = pd.read_csv(open(raw_data_path + "test", "ra")) 22 | 23 | 24 | if utils.sample_pct < 1.0: 25 | np.random.seed(999) 26 | r1 = np.random.uniform(0, 1, t0org0.shape[0]) 27 | t0org0 = t0org0.ix[r1 < utils.sample_pct, :] 28 | print "testing with small sample of training data, ", t0org0.shape 29 | 30 | 31 | h0org['click'] = 0 32 | t0org = pd.concat([t0org0, h0org]) 33 | print "finished loading raw data, ", t0org.shape 34 | 35 | print "to add some basic features ..." 
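# The raw 'hour' column in the Avazu data is an integer of the form YYMMDDHH, so the
# derived fields below are day = (hour % 10000) // 100 (day of month) and
# hour1 = hour % 100 (hour of day); e.g. a hypothetical value 14103123 gives day 31,
# hour1 23. day_hour then numbers hours consecutively starting from day 21, giving
# each row a single running hour index that the prev/next-hour counts can key on.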
36 | t0org['day']=np.round(t0org.hour % 10000 / 100) 37 | t0org['hour1'] = np.round(t0org.hour % 100) 38 | t0org['day_hour'] = (t0org.day.values - 21) * 24 + t0org.hour1.values 39 | t0org['day_hour_prev'] = t0org['day_hour'] - 1 40 | t0org['day_hour_next'] = t0org['day_hour'] + 1 41 | t0org['app_or_web'] = 0 42 | t0org.ix[t0org.app_id.values=='ecad2386', 'app_or_web'] = 1 43 | 44 | t0 = t0org 45 | 46 | t0['app_site_id'] = np.add(t0.app_id.values, t0.site_id.values) 47 | 48 | print "to encode categorical features using mean responses from earlier days -- univariate" 49 | sys.stdout.flush() 50 | 51 | calc_exptv(t0, ['app_or_web']) 52 | 53 | exptv_vn_list = ['app_site_id', 'as_domain', 'C14','C17', 'C21', 'device_model', 'device_ip', 'device_id', 'dev_ip_aw', 54 | 'app_site_model', 'site_model','app_model', 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw'] 55 | 56 | calc_exptv(t0, exptv_vn_list) 57 | 58 | calc_exptv(t0, ['app_site_id'], add_count=True) 59 | 60 | 61 | print "to encode categorical features using mean responses from earlier days -- multivariate" 62 | vns = ['app_or_web', 'device_ip', 'app_site_id', 'device_model', 'app_site_model', 'C1', 'C14', 'C17', 'C21', 63 | 'device_type', 'device_conn_type','app_site_model_aw', 'dev_ip_app_site'] 64 | dftv = t0.ix[np.logical_and(t0.day.values >= 21, t0.day.values < 32), ['click', 'day', 'id'] + vns].copy() 65 | 66 | dftv['app_site_model'] = np.add(dftv.device_model.values, dftv.app_site_id.values) 67 | dftv['app_site_model_aw'] = np.add(dftv.app_site_model.values, dftv.app_or_web.astype('string').values) 68 | dftv['dev_ip_app_site'] = np.add(dftv.device_ip.values, dftv.app_site_id.values) 69 | for vn in vns: 70 | dftv[vn] = dftv[vn].astype('category') 71 | print vn 72 | 73 | n_ks = {'app_or_web': 100, 'app_site_id': 100, 'device_ip': 10, 'C14': 50, 'app_site_model': 50, 'device_model': 100, 'device_id': 50, 74 | 'C17': 100, 'C21': 100, 'C1': 100, 'device_type': 100, 'device_conn_type': 100, 'banner_pos': 100, 75 | 'app_site_model_aw': 100, 'dev_ip_app_site': 10 , 'device_model': 500} 76 | 77 | exp2_dict = {} 78 | for vn in vns: 79 | exp2_dict[vn] = np.zeros(dftv.shape[0]) 80 | 81 | days_npa = dftv.day.values 82 | 83 | for day_v in xrange(22, 32): 84 | df1 = dftv.ix[np.logical_and(dftv.day.values < day_v, dftv.day.values < 31), :].copy() 85 | df2 = dftv.ix[dftv.day.values == day_v, :] 86 | print "Validation day:", day_v, ", train data shape:", df1.shape, ", validation data shape:", df2.shape 87 | pred_prev = df1.click.values.mean() * np.ones(df1.shape[0]) 88 | for vn in vns: 89 | if 'exp2_'+vn in df1.columns: 90 | df1.drop('exp2_'+vn, inplace=True, axis=1) 91 | for i in xrange(3): 92 | for vn in vns: 93 | p1 = calcLeaveOneOut2(df1, vn, 'click', n_ks[vn], 0, 0.25, mean0=pred_prev) 94 | pred = pred_prev * p1 95 | print day_v, i, vn, "change = ", ((pred - pred_prev)**2).mean() 96 | pred_prev = pred 97 | 98 | pred1 = df1.click.values.mean() 99 | for vn in vns: 100 | print "="*20, "merge", day_v, vn 101 | diff1 = mergeLeaveOneOut2(df1, df2, vn) 102 | pred1 *= diff1 103 | exp2_dict[vn][days_npa == day_v] = diff1 104 | 105 | pred1 *= df1.click.values.mean() / pred1.mean() 106 | print "logloss = ", logloss(pred1, df2.click.values) 107 | #print my_lift(pred1, None, df2.click.values, None, 20, fig_size=(10, 5)) 108 | #plt.show() 109 | 110 | for vn in vns: 111 | t0['exp2_'+vn] = exp2_dict[vn] 112 | 113 | 114 | print "to count prev/current/next hour by ip ..." 
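# The exp2_* columns created by the loop above are the "multivariate" leave-one-out
# encodings from calcLeaveOneOut2 / mergeLeaveOneOut2 in utils.py. In outline (a
# hedged paraphrase of that code, not a new formula): for each day d only earlier
# days are used, a row's own click is removed from its group's sum/count, the group
# rate is shrunk toward the running prior with n_ks[vn] pseudo-counts, roughly
#   adj = ( ((sum - y_i) + k*prior) / ((cnt - 1) + k) / prior ) ** 0.25
# and the per-variable adjustments are multiplied together over three passes.
# The cntDualKey calls below then count, for each row, how many impressions the same
# device_ip had in the previous / current / next hour (keys day_hour-1, day_hour,
# day_hour+1); these feed the diff_cnt_dev_ip_hour_phour_* features further down.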
115 | cntDualKey(t0, 'device_ip', None, 'day_hour', 'day_hour_prev', fill_na=0) 116 | cntDualKey(t0, 'device_ip', None, 'day_hour', 'day_hour', fill_na=0) 117 | cntDualKey(t0, 'device_ip', None, 'day_hour', 'day_hour_next', fill_na=0) 118 | 119 | print "to create day diffs" 120 | t0['pday'] = t0.day - 1 121 | calcDualKey(t0, 'device_ip', None, 'day', 'pday', 'click', 10, None, True, True) 122 | t0['cnt_diff_device_ip_day_pday'] = t0.cnt_device_ip_day.values - t0.cnt_device_ip_pday.values 123 | t0['hour1_web'] = t0.hour1.values 124 | t0.ix[t0.app_or_web.values==0, 'hour1_web'] = -1 125 | t0['app_cnt_by_dev_ip'] = my_grp_cnt(t0.device_ip.values.astype('string'), t0.app_id.values.astype('string')) 126 | 127 | 128 | t0['hour1'] = np.round(t0.hour.values % 100) 129 | t0['cnt_diff_device_ip_day_pday'] = t0.cnt_device_ip_day.values - t0.cnt_device_ip_pday.values 130 | 131 | t0['rank_dev_ip'] = my_grp_idx(t0.device_ip.values.astype('string'), t0.id.values.astype('string')) 132 | t0['rank_day_dev_ip'] = my_grp_idx(np.add(t0.device_ip.values, t0.day.astype('string').values).astype('string'), t0.id.values.astype('string')) 133 | t0['rank_app_dev_ip'] = my_grp_idx(np.add(t0.device_ip.values, t0.app_id.values).astype('string'), t0.id.values.astype('string')) 134 | 135 | 136 | t0['cnt_dev_ip'] = get_agg(t0.device_ip.values, t0.id, np.size) 137 | t0['cnt_dev_id'] = get_agg(t0.device_id.values, t0.id, np.size) 138 | 139 | t0['dev_id_cnt2'] = np.minimum(t0.cnt_dev_id.astype('int32').values, 300) 140 | t0['dev_ip_cnt2'] = np.minimum(t0.cnt_dev_ip.astype('int32').values, 300) 141 | 142 | t0['dev_id2plus'] = t0.device_id.values 143 | t0.ix[t0.cnt_dev_id.values == 1, 'dev_id2plus'] = '___only1' 144 | t0['dev_ip2plus'] = t0.device_ip.values 145 | t0.ix[t0.cnt_dev_ip.values == 1, 'dev_ip2plus'] = '___only1' 146 | 147 | t0['diff_cnt_dev_ip_hour_phour_aw2_prev'] = (t0.cnt_device_ip_day_hour.values - t0.cnt_device_ip_day_hour_prev.values) * ((t0.app_or_web * 2 - 1)) 148 | t0['diff_cnt_dev_ip_hour_phour_aw2_next'] = (t0.cnt_device_ip_day_hour.values - t0.cnt_device_ip_day_hour_next.values) * ((t0.app_or_web * 2 - 1)) 149 | 150 | 151 | print "to save t0 ..." 152 | 153 | dump(t0, tmp_data_path + 't0.joblib_dat') 154 | 155 | 156 | print "to generate t0tv_mx .. 
" 157 | app_or_web = None 158 | _start_day = 22 159 | list_param = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'banner_pos', 'device_type', 'device_conn_type'] 160 | feature_list_dict = {} 161 | 162 | feature_list_name = 'tvexp3' 163 | feature_list_dict[feature_list_name] = list_param + \ 164 | ['exptv_' + vn for vn in ['app_site_id', 'as_domain', 165 | 'C14','C17', 'C21', 'device_model', 'device_ip', 'device_id', 'dev_ip_aw', 166 | 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw']] + \ 167 | ['cnt_diff_device_ip_day_pday', 168 | 'app_cnt_by_dev_ip', 'cnt_device_ip_day_hour', 'app_or_web', 169 | 'rank_dev_ip', 'rank_day_dev_ip', 'rank_app_dev_ip', 170 | 'diff_cnt_dev_ip_hour_phour_aw2_prev', 'diff_cnt_dev_ip_hour_phour_aw2_next', 171 | 'exp2_device_ip', 'exp2_app_site_id', 'exp2_device_model', 'exp2_app_site_model', 172 | 'exp2_app_site_model_aw', 'exp2_dev_ip_app_site', 173 | 'cnt_dev_ip', 'cnt_dev_id', 'hour1_web'] 174 | 175 | filter_tv = np.logical_and(t0.day.values >= _start_day, t0.day.values < 31) 176 | filter_t1 = np.logical_and(t0.day.values < 30, filter_tv) 177 | filter_v1 = np.logical_and(~filter_t1, filter_tv) 178 | 179 | print filter_tv.sum() 180 | 181 | 182 | for vn in feature_list_dict[feature_list_name] : 183 | if vn not in t0.columns: 184 | print "="*60 + vn 185 | 186 | yv = t0.click.values[filter_v1] 187 | 188 | t0tv_mx = t0.as_matrix(feature_list_dict[feature_list_name]) 189 | 190 | print t0tv_mx.shape 191 | 192 | 193 | print "to save t0tv_mx ..." 194 | 195 | t0tv_mx_save = {} 196 | t0tv_mx_save['t0tv_mx'] = t0tv_mx 197 | t0tv_mx_save['click'] = t0.click.values 198 | t0tv_mx_save['day'] = t0.day.values 199 | t0tv_mx_save['site_id'] = t0.site_id.values 200 | dump(t0tv_mx_save, tmp_data_path + 't0tv_mx.joblib_dat') 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.utils import check_random_state 4 | import time 5 | import sys 6 | from joblib import dump, load 7 | 8 | sample_pct = .05 9 | tvh = 'N' 10 | xgb_n_trees = 300 11 | 12 | 13 | #Please set following path accordingly 14 | 15 | #where we can find training, test, and sampleSubmission.csv 16 | raw_data_path = '/home/fast/2014_mobilectr/' 17 | #where we store results -- require about 130GB 18 | tmp_data_path = './tmp_data/' 19 | 20 | #path to external binaries. 
Please see dependencies in the .pdf document 21 | fm_path = ' ~/Downloads/guestwalk/kaggle-2014-criteo/fm' 22 | xgb_path = '/home/zzhang/Downloads/xgboost/wrapper' 23 | vw_path = '~/vowpal_wabbit/vowpalwabbit/vw ' 24 | 25 | 26 | try: 27 | params=load(tmp_data_path + '_params.joblib_dat') 28 | sample_pct = params['pct'] 29 | tvh = params['tvh'] 30 | except: 31 | pass 32 | 33 | 34 | def print_help(): 35 | print "usage: python utils -set_params [tvh=Y|N], [sample_pct]" 36 | print "for example: python utils -set_params N 0.05" 37 | 38 | def main(): 39 | if sys.argv[1] == '-set_params' and len(sys.argv) == 4: 40 | try: 41 | tvh = sys.argv[2] 42 | sample_pct = float(sys.argv[3]) 43 | dump({'pct': sample_pct, 'tvh':tvh}, tmp_data_path + '_params.joblib_dat') 44 | except: 45 | print_help() 46 | else: 47 | print_help() 48 | 49 | if __name__ == "__main__": 50 | main() 51 | 52 | def get_agg(group_by, value, func): 53 | g1 = pd.Series(value).groupby(group_by) 54 | agg1 = g1.aggregate(func) 55 | #print agg1 56 | r1 = agg1[group_by].values 57 | return r1 58 | 59 | def calcLeaveOneOut2(df, vn, vn_y, cred_k, r_k, power, mean0=None, add_count=False): 60 | if mean0 is None: 61 | mean0 = df_yt[vn_y].mean() * np.ones(df.shape[0]) 62 | _key_codes = df[vn].values.codes 63 | grp1 = df[vn_y].groupby(_key_codes) 64 | grp_mean = pd.Series(mean0).groupby(_key_codes) 65 | mean1 = grp_mean.aggregate(np.mean) 66 | sum1 = grp1.aggregate(np.sum) 67 | cnt1 = grp1.aggregate(np.size) 68 | 69 | #print sum1 70 | #print cnt1 71 | vn_sum = 'sum_' + vn 72 | vn_cnt = 'cnt_' + vn 73 | _sum = sum1[_key_codes].values 74 | _cnt = cnt1[_key_codes].values 75 | _mean = mean1[_key_codes].values 76 | #print _sum[:10] 77 | #print _cnt[:10] 78 | #print _mean[:10] 79 | #print _cnt[:10] 80 | _mean[np.isnan(_sum)] = mean0.mean() 81 | _cnt[np.isnan(_sum)] = 0 82 | _sum[np.isnan(_sum)] = 0 83 | #print _cnt[:10] 84 | _sum -= df[vn_y].values 85 | _cnt -= 1 86 | #print _cnt[:10] 87 | vn_yexp = 'exp2_'+vn 88 | # df[vn_yexp] = (_sum + cred_k * mean0)/(_cnt + cred_k) 89 | diff = np.power((_sum + cred_k * _mean)/(_cnt + cred_k) / _mean, power) 90 | if vn_yexp in df.columns: 91 | df[vn_yexp] *= diff 92 | else: 93 | df[vn_yexp] = diff 94 | if r_k > 0: 95 | df[vn_yexp] *= np.exp((np.random.rand(np.sum(filter_train))-.5) * r_k) 96 | if add_count: 97 | df[vn_cnt] = _cnt 98 | return diff 99 | 100 | 101 | def my_lift(order_by, p, y, w, n_rank, dual_axis=False, random_state=0, dither=1e-5, fig_size=None): 102 | gen = check_random_state(random_state) 103 | if w is None: 104 | w = np.ones(order_by.shape[0]) 105 | if p is None: 106 | p = order_by 107 | ord_idx = np.argsort(order_by + dither*np.random.uniform(-1.0, 1.0, order_by.size)) 108 | p2 = p[ord_idx] 109 | y2 = y[ord_idx] 110 | w2 = w[ord_idx] 111 | 112 | cumm_w = np.cumsum(w2) 113 | total_w = cumm_w[-1] 114 | r1 = np.minimum(n_rank, np.maximum(1, 115 | np.round(cumm_w * n_rank / total_w + .4999999))) 116 | 117 | df1 = pd.DataFrame({'r': r1, 'pw': p2 * w2, 'yw': y2 * w2, 'w': w2}) 118 | grp1 = df1.groupby('r') 119 | 120 | sum_w = grp1['w'].aggregate(np.sum) 121 | avg_p = grp1['pw'].aggregate(np.sum) / sum_w 122 | avg_y = grp1['yw'].aggregate(np.sum) / sum_w 123 | 124 | xs = xrange(1, n_rank+1) 125 | 126 | fig, ax1 = plt.subplots() 127 | if fig_size is None: 128 | fig.set_size_inches(20, 15) 129 | else: 130 | fig.set_size_inches(fig_size) 131 | ax1.plot(xs, avg_p, 'b--') 132 | if dual_axis: 133 | ax2 = ax1.twinx() 134 | ax2.plot(xs, avg_y, 'r') 135 | else: 136 | ax1.plot(xs, avg_y, 'r') 137 | 138 | #print 
"logloss: ", logloss(p, y, w) 139 | 140 | return gini_norm(order_by, y, w) 141 | 142 | def logloss(pred, y, weight=None): 143 | if weight is None: 144 | weight = np.ones(y.size) 145 | 146 | pred = np.maximum(1e-7, np.minimum(1 - 1e-7, pred)) 147 | return - np.sum(weight * (y * np.log(pred) + (1 - y) * np.log(1 - pred))) / np.sum(weight) 148 | 149 | def gini_norm(pred, y, weight=None): 150 | 151 | #equal weight by default 152 | if weight == None: 153 | weight = np.ones(y.size) 154 | 155 | #sort actual by prediction 156 | ord = np.argsort(pred) 157 | y2 = y[ord] 158 | w2 = weight[ord] 159 | #gini by pred 160 | cumm_y = np.cumsum(y2) 161 | total_y = cumm_y[-1] 162 | total_w = np.sum(w2) 163 | g1 = 1 - 2 * sum(cumm_y * w2) / (total_y * total_w) 164 | 165 | #sort actual by actual 166 | ord = np.argsort(y) 167 | y2 = y[ord] 168 | w2 = weight[ord] 169 | #gini by actual 170 | cumm_y = np.cumsum(y2) 171 | g0 = 1 - 2 * sum(cumm_y * w2) / (total_y * total_w) 172 | 173 | return g1/g0 174 | 175 | def mergeLeaveOneOut2(df, dfv, vn): 176 | _key_codes = df[vn].values.codes 177 | vn_yexp = 'exp2_'+vn 178 | grp1 = df[vn_yexp].groupby(_key_codes) 179 | _mean1 = grp1.aggregate(np.mean) 180 | 181 | _mean = _mean1[dfv[vn].values.codes].values 182 | 183 | _mean[np.isnan(_mean)] = _mean1.mean() 184 | 185 | return _mean 186 | 187 | 188 | def calcTVTransform(df, vn, vn_y, cred_k, filter_train, mean0=None): 189 | if mean0 is None: 190 | mean0 = df.ix[filter_train, vn_y].mean() 191 | print "mean0:", mean0 192 | else: 193 | mean0 = mean0[~filter_train] 194 | 195 | df['_key1'] = df[vn].astype('category').values.codes 196 | df_yt = df.ix[filter_train, ['_key1', vn_y]] 197 | #df_y.set_index([')key1']) 198 | grp1 = df_yt.groupby(['_key1']) 199 | sum1 = grp1[vn_y].aggregate(np.sum) 200 | cnt1 = grp1[vn_y].aggregate(np.size) 201 | vn_sum = 'sum_' + vn 202 | vn_cnt = 'cnt_' + vn 203 | v_codes = df.ix[~filter_train, '_key1'] 204 | _sum = sum1[v_codes].values 205 | _cnt = cnt1[v_codes].values 206 | _cnt[np.isnan(_sum)] = 0 207 | _sum[np.isnan(_sum)] = 0 208 | 209 | r = {} 210 | r['exp'] = (_sum + cred_k * mean0)/(_cnt + cred_k) 211 | r['cnt'] = _cnt 212 | return r 213 | 214 | def cntDualKey(df, vn, vn2, key_src, key_tgt, fill_na=False): 215 | 216 | print "build src key" 217 | _key_src = np.add(df[key_src].astype('string').values, df[vn].astype('string').values) 218 | print "build tgt key" 219 | _key_tgt = np.add(df[key_tgt].astype('string').values, df[vn].astype('string').values) 220 | 221 | if vn2 is not None: 222 | _key_src = np.add(_key_src, df[vn2].astype('string').values) 223 | _key_tgt = np.add(_key_tgt, df[vn2].astype('string').values) 224 | 225 | print "aggreate by src key" 226 | grp1 = df.groupby(_key_src) 227 | cnt1 = grp1[vn].aggregate(np.size) 228 | 229 | print "map to tgt key" 230 | vn_sum = 'sum_' + vn + '_' + key_src + '_' + key_tgt 231 | _cnt = cnt1[_key_tgt].values 232 | 233 | if fill_na is not None: 234 | print "fill in na" 235 | _cnt[np.isnan(_cnt)] = fill_na 236 | 237 | vn_cnt_tgt = 'cnt_' + vn + '_' + key_tgt 238 | if vn2 is not None: 239 | vn_cnt_tgt += '_' + vn2 240 | df[vn_cnt_tgt] = _cnt 241 | 242 | def my_grp_cnt(group_by, count_by): 243 | _ts = time.time() 244 | _ord = np.lexsort((count_by, group_by)) 245 | print time.time() - _ts 246 | _ts = time.time() 247 | _ones = pd.Series(np.ones(group_by.size)) 248 | print time.time() - _ts 249 | _ts = time.time() 250 | #_cs1 = _ones.groupby(group_by[_ord]).cumsum().values 251 | _cs1 = np.zeros(group_by.size) 252 | _prev_grp = '___' 253 | runnting_cnt = 0 254 
| for i in xrange(1, group_by.size): 255 | i0 = _ord[i] 256 | if _prev_grp == group_by[i0]: 257 | if count_by[_ord[i-1]] != count_by[i0]: 258 | running_cnt += 1 259 | else: 260 | running_cnt = 1 261 | _prev_grp = group_by[i0] 262 | if i == group_by.size - 1 or group_by[i0] != group_by[_ord[i+1]]: 263 | j = i 264 | while True: 265 | j0 = _ord[j] 266 | _cs1[j0] = running_cnt 267 | if j == 0 or group_by[_ord[j-1]] != group_by[j0]: 268 | break 269 | j -= 1 270 | 271 | print time.time() - _ts 272 | if True: 273 | return _cs1 274 | else: 275 | _ts = time.time() 276 | 277 | org_idx = np.zeros(group_by.size, dtype=np.int) 278 | print time.time() - _ts 279 | _ts = time.time() 280 | org_idx[_ord] = np.asarray(xrange(group_by.size)) 281 | print time.time() - _ts 282 | _ts = time.time() 283 | 284 | return _cs1[org_idx] 285 | 286 | def my_cnt(group_by): 287 | _ts = time.time() 288 | _ord = np.argsort(group_by) 289 | print time.time() - _ts 290 | _ts = time.time() 291 | #_cs1 = _ones.groupby(group_by[_ord]).cumsum().values 292 | _cs1 = np.zeros(group_by.size) 293 | _prev_grp = '___' 294 | runnting_cnt = 0 295 | for i in xrange(1, group_by.size): 296 | i0 = _ord[i] 297 | if _prev_grp == group_by[i0]: 298 | running_cnt += 1 299 | else: 300 | running_cnt = 1 301 | _prev_grp = group_by[i0] 302 | if i == group_by.size - 1 or group_by[i0] != group_by[_ord[i+1]]: 303 | j = i 304 | while True: 305 | j0 = _ord[j] 306 | _cs1[j0] = running_cnt 307 | if j == 0 or group_by[_ord[j-1]] != group_by[j0]: 308 | break 309 | j -= 1 310 | 311 | print time.time() - _ts 312 | return _cs1 313 | 314 | def my_grp_value_diff(group_by, order_by, value): 315 | _ts = time.time() 316 | _ord = np.lexsort((order_by, group_by)) 317 | print time.time() - _ts 318 | _ts = time.time() 319 | _ones = pd.Series(np.ones(group_by.size)) 320 | print time.time() - _ts 321 | _ts = time.time() 322 | #_cs1 = _ones.groupby(group_by[_ord]).cumsum().values 323 | _cs1 = np.zeros(group_by.size) 324 | _prev_grp = '___' 325 | for i in xrange(1, group_by.size): 326 | i0 = _ord[i] 327 | if _prev_grp == group_by[i0]: 328 | _cs1[i0] = value[_ord[i]] - value[_ord[i-1]] 329 | else: 330 | _cs1[i0] = 1e7 331 | _prev_grp = group_by[i0] 332 | print time.time() - _ts 333 | 334 | return np.minimum(_cs1, 1e7) 335 | 336 | def my_grp_idx(group_by, order_by): 337 | _ts = time.time() 338 | _ord = np.lexsort((order_by, group_by)) 339 | print time.time() - _ts 340 | _ts = time.time() 341 | _ones = pd.Series(np.ones(group_by.size)) 342 | print time.time() - _ts 343 | _ts = time.time() 344 | #_cs1 = _ones.groupby(group_by[_ord]).cumsum().values 345 | _cs1 = np.zeros(group_by.size) 346 | _prev_grp = '___' 347 | for i in xrange(1, group_by.size): 348 | i0 = _ord[i] 349 | if _prev_grp == group_by[i0]: 350 | _cs1[i] = _cs1[i - 1] + 1 351 | else: 352 | _cs1[i] = 1 353 | _prev_grp = group_by[i0] 354 | print time.time() - _ts 355 | _ts = time.time() 356 | 357 | org_idx = np.zeros(group_by.size, dtype=np.int) 358 | print time.time() - _ts 359 | _ts = time.time() 360 | org_idx[_ord] = np.asarray(xrange(group_by.size)) 361 | print time.time() - _ts 362 | _ts = time.time() 363 | 364 | return _cs1[org_idx] 365 | 366 | def calcDualKey(df, vn, vn2, key_src, key_tgt, vn_y, cred_k, mean0=None, add_count=False, fill_na=False): 367 | if mean0 is None: 368 | mean0 = df[vn_y].mean() 369 | 370 | print "build src key" 371 | _key_src = np.add(df[key_src].astype('string').values, df[vn].astype('string').values) 372 | print "build tgt key" 373 | _key_tgt = np.add(df[key_tgt].astype('string').values, 
df[vn].astype('string').values) 374 | 375 | if vn2 is not None: 376 | _key_src = np.add(_key_src, df[vn2].astype('string').values) 377 | _key_tgt = np.add(_key_tgt, df[vn2].astype('string').values) 378 | 379 | print "aggreate by src key" 380 | grp1 = df.groupby(_key_src) 381 | sum1 = grp1[vn_y].aggregate(np.sum) 382 | cnt1 = grp1[vn_y].aggregate(np.size) 383 | 384 | print "map to tgt key" 385 | vn_sum = 'sum_' + vn + '_' + key_src + '_' + key_tgt 386 | _sum = sum1[_key_tgt].values 387 | _cnt = cnt1[_key_tgt].values 388 | 389 | if fill_na: 390 | print "fill in na" 391 | _cnt[np.isnan(_sum)] = 0 392 | _sum[np.isnan(_sum)] = 0 393 | 394 | print "calc exp" 395 | if vn2 is not None: 396 | vn_yexp = 'exp_' + vn + '_' + vn2 + '_' + key_src + '_' + key_tgt 397 | else: 398 | vn_yexp = 'exp_' + vn + '_' + key_src + '_' + key_tgt 399 | df[vn_yexp] = (_sum + cred_k * mean0)/(_cnt + cred_k) 400 | 401 | if add_count: 402 | print "add counts" 403 | vn_cnt_src = 'cnt_' + vn + '_' + key_src 404 | df[vn_cnt_src] = _cnt 405 | grp2 = df.groupby(_key_tgt) 406 | cnt2 = grp2[vn_y].aggregate(np.size) 407 | _cnt2 = cnt2[_key_tgt].values 408 | vn_cnt_tgt = 'cnt_' + vn + '_' + key_tgt 409 | df[vn_cnt_tgt] = _cnt2 410 | 411 | def get_set_diff(df, vn, f1, f2): 412 | #print(df[vn].values.sum()) 413 | set1 = set(np.unique(df[vn].values[f1])) 414 | set2 = set(np.unique(df[vn].values[f2])) 415 | set2_1 = set2 - set1 416 | print vn, '\t', len(set1), '\t', len(set2), '\t', len(set2_1) 417 | return len(set2_1) * 1.0 / len(set2) 418 | 419 | 420 | def calc_exptv(t0, vn_list, last_day_only=False, add_count=False): 421 | t0a = t0.ix[:, ['day', 'click']].copy() 422 | day_exps = {} 423 | 424 | for vn in vn_list: 425 | if vn == 'dev_id_ip': 426 | t0a[vn] = pd.Series(np.add(t0.device_id.values , t0.device_ip.values)).astype('category').values.codes 427 | elif vn == 'dev_ip_aw': 428 | t0a[vn] = pd.Series(np.add(t0.device_ip.values , t0.app_or_web.astype('string').values)).astype('category').values.codes 429 | elif vn == 'C14_aw': 430 | t0a[vn] = pd.Series(np.add(t0.C14.astype('string').values , t0.app_or_web.astype('string').values)).astype('category').values.codes 431 | elif vn == 'C17_aw': 432 | t0a[vn] = pd.Series(np.add(t0.C17.astype('string').values , t0.app_or_web.astype('string').values)).astype('category').values.codes 433 | elif vn == 'C21_aw': 434 | t0a[vn] = pd.Series(np.add(t0.C21.astype('string').values , t0.app_or_web.astype('string').values)).astype('category').values.codes 435 | elif vn == 'as_domain': 436 | t0a[vn] = pd.Series(np.add(t0.app_domain.values , t0.site_domain.values)).astype('category').values.codes 437 | elif vn == 'site_app_id': 438 | t0a[vn] = pd.Series(np.add(t0.site_id.values , t0.app_id.values)).astype('category').values.codes 439 | elif vn == 'app_model': 440 | t0a[vn] = pd.Series(np.add(t0.app_id.values , t0.device_model.values)).astype('category').values.codes 441 | elif vn == 'app_site_model': 442 | t0a[vn] = pd.Series(np.add(t0.app_id.values , np.add(t0.site_id.values , t0.device_model.values))).astype('category').values.codes 443 | elif vn == 'site_model': 444 | t0a[vn] = pd.Series(np.add(t0.site_id.values , t0.device_model.values)).astype('category').values.codes 445 | elif vn == 'app_site': 446 | t0a[vn] = pd.Series(np.add(t0.app_id.values , t0.site_id.values)).astype('category').values.codes 447 | elif vn == 'site_ip': 448 | t0a[vn] = pd.Series(np.add(t0.site_id.values , t0.device_ip.values)).astype('category').values.codes 449 | elif vn == 'app_ip': 450 | t0a[vn] = 
pd.Series(np.add(t0.site_id.values , t0.device_ip.values)).astype('category').values.codes 451 | elif vn == 'site_id_domain': 452 | t0a[vn] = pd.Series(np.add(t0.site_id.values , t0.site_domain.values)).astype('category').values.codes 453 | elif vn == 'site_hour': 454 | t0a[vn] = pd.Series(np.add(t0.site_domain.values , (t0.hour.values % 100).astype('string'))).astype('category').values.codes 455 | else: 456 | t0a[vn] = t0[vn] 457 | 458 | for day_v in xrange(22, 32): 459 | cred_k = 10 460 | if day_v not in day_exps: 461 | day_exps[day_v] = {} 462 | 463 | vn_key = vn 464 | 465 | import time 466 | _tstart = time.time() 467 | 468 | day1 = 20 469 | if last_day_only: 470 | day1 = day_v - 2 471 | filter_t = np.logical_and(t0.day.values > day1, t0.day.values <= day_v) 472 | vn_key = vn 473 | t1 = t0a.ix[filter_t, :].copy() 474 | filter_t2 = np.logical_and(t1.day.values != day_v, t1.day.values < 31) 475 | 476 | if vn == 'app_or_web': 477 | day_exps[day_v][vn_key] = calcTVTransform(t1, vn, 'click', cred_k, filter_t2) 478 | else: 479 | if last_day_only: 480 | day_exps[day_v][vn_key] = calcTVTransform(t1, vn, 'click', cred_k, filter_t2, mean0=t0.expld_app_or_web.values) 481 | else: 482 | day_exps[day_v][vn_key] = calcTVTransform(t1, vn, 'click', cred_k, filter_t2, mean0=t0.exptv_app_or_web.values) 483 | 484 | print vn, vn_key, " ", day_v, " done in ", time.time() - _tstart 485 | t0a.drop(vn, inplace=True, axis=1) 486 | 487 | for vn in vn_list: 488 | vn_key = vn 489 | 490 | vn_exp = 'exptv_'+vn_key 491 | if last_day_only: 492 | vn_exp='expld_'+vn_key 493 | 494 | t0[vn_exp] = np.zeros(t0.shape[0]) 495 | if add_count: 496 | t0['cnttv_'+vn_key] = np.zeros(t0.shape[0]) 497 | for day_v in xrange(22, 32): 498 | print vn, vn_key, day_v, t0.ix[t0.day.values == day_v, vn_exp].values.size, day_exps[day_v][vn_key]['exp'].size 499 | t0.loc[t0.day.values == day_v, vn_exp]=day_exps[day_v][vn_key]['exp'] 500 | if add_count: 501 | t0.loc[t0.day.values == day_v, 'cnttv_'+vn_key]=day_exps[day_v][vn_key]['cnt'] 502 | 503 | --------------------------------------------------------------------------------