├── AvazuModelDocumentation.pdf ├── LICENSE.txt ├── READ.me ├── _0_run_me.sh ├── _3b_gbdt.py ├── _4_post_processing.py ├── _2b_generate_dataset_for_vw_fm.py ├── _3a_rf.py ├── _2c_generate_fm_features.py ├── _3d_fm.py ├── _3c_vw.py ├── _1_encode_cat_features.py └── utils.py /AvazuModelDocumentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/owenzhang/kaggle-avazu/HEAD/AvazuModelDocumentation.pdf -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2015 Zhonghua Zhang 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use 4 | this file except in compliance with the License. You may obtain a copy of the 5 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | applicable law or agreed to in writing, software distributed under the License 7 | is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 8 | KIND, either express or implied. See the License for the specific language 9 | governing permissions and limitations under the License. 10 | -------------------------------------------------------------------------------- /READ.me: -------------------------------------------------------------------------------- 1 | To reproduce the result: 2 | 3 | 1. Clone the github repo, go to the folder 4 | 2. Set paths in the utils.py file 5 | 3. sh ./_0_run_me.sh 6 | 7 | 8 | The shell script will run the model 3 times: 9 | 10 | 1. a small sample run using day 30 as validation -- should take about 1-2 hours and generate .393-.394 logloss 11 | 2. a small sample run using day 31 as test -- should get an LB score of about .391-.392 12 | 3. a full run using day 31 as test -- should get an LB score that ranks 2nd. 13 | 14 | Please note that the full run will take about 2 days and requires 130GB of temporary storage space, so it is highly recommended to run it over the weekend, after steps 1 and 2 are successful. 15 | 16 | This process CAN be made much more efficient with a few hours' work by a REAL software engineer.
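For step 2, the paths are module-level variables near the top of utils.py (the variable names below are the ones that file defines); this is only a sketch with placeholder locations -- substitute your own directories and binaries:

    raw_data_path = '/path/to/avazu/raw/'   # folder holding train, test, and sampleSubmission
    tmp_data_path = './tmp_data/'           # intermediate files; the full run needs ~130GB here
    fm_path  = '/path/to/fm'                # factorization machine binary
    xgb_path = '/path/to/xgboost/wrapper'   # xgboost python wrapper directory (added to sys.path)
    vw_path  = '/path/to/vw '               # vowpal wabbit binary; keep the trailing space, it is concatenated into shell commands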
17 | 18 | -------------------------------------------------------------------------------- /_0_run_me.sh: -------------------------------------------------------------------------------- 1 | #small test run using day 30 as validation 2 | python utils.py -set_params Y 1.0 3 | python _1_encode_cat_features.py 4 | python _2b_generate_dataset_for_vw_fm.py 5 | python _2c_generate_fm_features.py 6 | python _3a_rf.py 7 | python _3b_gbdt.py 8 | python _3c_vw.py -rseed 1 9 | python _3c_vw.py -rseed 2 10 | python _3c_vw.py -rseed 3 11 | python _3c_vw.py -rseed 4 12 | python _3d_fm.py -rseed 51 13 | python _3d_fm.py -rseed 52 14 | python _3d_fm.py -rseed 53 15 | python _3d_fm.py -rseed 54 16 | #should generate logloss ~= 0.3937 17 | python _4_post_processing.py 18 | 19 | exit 20 | #try a quick submission using small sample data 21 | python utils -set_params Y 0.05 22 | python _3a_rf.py 23 | python _3b_gbdt.py 24 | python _3c_vw.py -rseed 1 25 | python _3c_vw.py -rseed 2 26 | python _3c_vw.py -rseed 3 27 | python _3c_vw.py -rseed 4 28 | python _3d_fm.py -rseed 51 29 | python _3d_fm.py -rseed 52 30 | python _3d_fm.py -rseed 53 31 | python _3d_fm.py -rseed 54 32 | #should generate a submission with score (Public LB) .3936: (Private LB): .3917 33 | python _4_post_processing.py 34 | 35 | #run the whole thing, will take about 2 days 36 | python utils -set_params Y 1.0 37 | python _1_encode_cat_features.py 38 | python _2b_generate_dataset_for_vw_fm.py 39 | python _2c_generate_fm_features.py 40 | python _3a_rf.py 41 | python _3b_gbdt.py 42 | python _3c_vw.py -rseed 1 43 | python _3c_vw.py -rseed 2 44 | python _3c_vw.py -rseed 3 45 | python _3c_vw.py -rseed 4 46 | python _3d_fm.py -rseed 51 47 | python _3d_fm.py -rseed 52 48 | python _3d_fm.py -rseed 53 49 | python _3d_fm.py -rseed 54 50 | #should generate a submission with score (Private LB) ~ .3805 51 | python _4_post_processing.py 52 | 53 | 54 | -------------------------------------------------------------------------------- /_3b_gbdt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | from joblib import dump, load, Parallel, delayed 10 | import utils 11 | from utils import * 12 | 13 | sys.path.append(utils.xgb_path) 14 | import xgboost as xgb 15 | 16 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx3.joblib_dat') 17 | t0tv_mx3 = t0tv_mx_save['t0tv_mx'] 18 | click_values = t0tv_mx_save['click'] 19 | day_values = t0tv_mx_save['day'] 20 | print "t0tv_mx3 loaded with shape", t0tv_mx3.shape 21 | 22 | 23 | n_trees = utils.xgb_n_trees 24 | day_test = 30 25 | if utils.tvh == 'Y': 26 | day_test = 31 27 | 28 | param = {'max_depth':15, 'eta':.02, 'objective':'binary:logistic', 'verbose':0, 29 | 'subsample':1.0, 'min_child_weight':50, 'gamma':0, 30 | 'nthread': 16, 'colsample_bytree':.5, 'base_score':0.16, 'seed': 999} 31 | 32 | nn = t0tv_mx3.shape[0] 33 | np.random.seed(999) 34 | sample_idx = np.random.random_integers(0, 3, nn) 35 | 36 | predv_xgb = 0 37 | ctr = 0 38 | for idx in [0, 1, 2, 3]: 39 | filter1 = np.logical_and(np.logical_and(day_values >= 22, day_values < day_test), np.logical_and(sample_idx== idx , True)) 40 | filter_v1 = day_values == day_test 41 | 42 | xt1 = t0tv_mx3[filter1, :] 43 | yt1 = click_values[filter1] 44 | if xt1.shape[0] <=0 or xt1.shape[0] != yt1.shape[0]: 45 | print xt1.shape, yt1.shape 46 | raise 
ValueError('wrong shape!') 47 | dtrain = xgb.DMatrix(xt1, label=yt1) 48 | dvalid = xgb.DMatrix(t0tv_mx3[filter_v1], label=click_values[filter_v1]) 49 | watchlist = [(dtrain, 'train'), (dvalid, 'valid')] 50 | print xt1.shape, yt1.shape 51 | 52 | plst = list(param.items()) + [('eval_metric', 'logloss')] 53 | xgb1 = xgb.train(plst, dtrain, n_trees, watchlist) 54 | #xgb_pred[rseed] = xgb1.predict(dtv3) 55 | #xgb_list[rseed] = xgb1 56 | 57 | ctr += 1 58 | predv_xgb += xgb1.predict(dvalid) 59 | print '-'*30, ctr, logloss(predv_xgb / ctr, click_values[filter_v1]) 60 | 61 | print "to save validation predictions ..." 62 | dump(predv_xgb / ctr, utils.tmp_data_path + 'xgb_pred_v.joblib_dat') 63 | 64 | -------------------------------------------------------------------------------- /_4_post_processing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | import utils 10 | from utils import * 11 | import os 12 | 13 | from joblib import dump, load 14 | 15 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx.joblib_dat') 16 | click_values = t0tv_mx_save['click'] 17 | day_values = t0tv_mx_save['day'] 18 | site_id_values= t0tv_mx_save['site_id'] 19 | print "t0tv_mx loaded" 20 | 21 | 22 | day_test = 30 23 | if utils.tvh == 'Y': 24 | day_test = 31 25 | 26 | #RandomForest model output 27 | rf_pred = load(utils.tmp_data_path + 'rf_pred_v.joblib_dat') 28 | print "RF prediction loaded with shape", rf_pred.shape 29 | 30 | #GBDT (xgboost) model output 31 | xgb_pred = load(utils.tmp_data_path + 'xgb_pred_v.joblib_dat') 32 | print "xgb prediction loaded with shape", xgb_pred.shape 33 | 34 | #Vowpal Wabbit model output 35 | ctr = 0 36 | vw_pred = 0 37 | for i in [1, 2, 3, 4]: 38 | vw_pred += 1 / (1+ np.exp(-pd.read_csv(open(utils.tmp_data_path + 'vwV12__r%d_test.txt_pred.txt'%i, 'r'), header=None).ix[:,0].values)) 39 | ctr += 1 40 | vw_pred /= ctr 41 | print "VW prediction loaded with shape", vw_pred.shape 42 | 43 | #factorization machine model output 44 | ctr = 0 45 | fm_pred = 0 46 | for i in [51, 52, 53, 54]: 47 | fm_pred += pd.read_csv(open(utils.tmp_data_path + 'fm__r%d_v.txt.out'%i, 'r'), header=None).ix[:,0].values 48 | ctr += 1 49 | fm_pred /= ctr 50 | print "FM prediction loaded with shape", fm_pred.shape 51 | 52 | 53 | blending_w = {'rf': .075, 'xgb': .175, 'vw': .225, 'fm': .525} 54 | 55 | total_w = 0 56 | pred = 0 57 | 58 | pred += rf_pred * blending_w['rf'] 59 | total_w += blending_w['rf'] 60 | pred += xgb_pred * blending_w['xgb'] 61 | total_w += blending_w['xgb'] 62 | pred += vw_pred * blending_w['vw'] 63 | total_w += blending_w['vw'] 64 | pred += fm_pred * blending_w['fm'] 65 | total_w += blending_w['fm'] 66 | 67 | pred /= total_w 68 | 69 | if utils.tvh == 'Y': 70 | #create submission 71 | predh_raw_avg = pred 72 | site_ids_h = site_id_values[day_values == 31] 73 | tmp_f1 = site_ids_h == '17d1b03f' 74 | predh_raw_avg[tmp_f1] *= .13 / predh_raw_avg[tmp_f1].mean() 75 | predh_raw_avg *= .161 / predh_raw_avg.mean() 76 | 77 | sub0 = pd.read_csv(open(utils.raw_data_path + 'sampleSubmission', 'r')) 78 | pred_h_str = ["%.4f" % x for x in predh_raw_avg] 79 | sub0['click'] = pred_h_str 80 | fn_sub = utils.tmp_data_path + 'sub_sample' + str(utils.sample_pct) + '.csv.gz' 81 | import gzip 82 | sub0.to_csv(gzip.open(fn_sub, 'w'), index=False) 83 | print "=" * 80 84 | print "Training 
complted and submission file " + fn_sub + " created." 85 | print "=" * 80 86 | else: 87 | #validate using day30 88 | print "Training completed!" 89 | print "=" * 80 90 | print "logloss of blended prediction:", logloss(pred, click_values[day_values==day_test]) 91 | print "=" * 80 92 | -------------------------------------------------------------------------------- /_2b_generate_dataset_for_vw_fm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | import utils 10 | from utils import * 11 | import os 12 | 13 | from joblib import dump, load 14 | 15 | t0 = load(utils.tmp_data_path + 't0.joblib_dat') 16 | print "t0 loaded with shape", t0.shape 17 | 18 | t0['dev_id_cnt2'] = np.minimum(t0.cnt_dev_id.astype('int32').values, 300) 19 | t0['dev_ip_cnt2'] = np.minimum(t0.cnt_dev_ip.astype('int32').values, 300) 20 | 21 | t0['dev_id2plus'] = t0.device_id.values 22 | t0.ix[t0.cnt_dev_id.values == 1, 'dev_id2plus'] = '___only1' 23 | t0['dev_ip2plus'] = t0.device_ip.values 24 | t0.ix[t0.cnt_dev_ip.values == 1, 'dev_ip2plus'] = '___only1' 25 | 26 | t0['device_ip_only_hour_for_day'] = t0.cnt_device_ip_day_hour.values == t0.cnt_device_ip_pday.values 27 | 28 | vns0 = ['app_or_web', 'banner_pos', 'C1', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'] 29 | for vn in vns0 + ['C14']: 30 | print vn 31 | vn2 = '_A_' + vn 32 | t0[vn2] = np.add(t0['app_site_id'].values, t0[vn].astype('string').values) 33 | t0[vn2] = t0[vn2].astype('category') 34 | 35 | t3 = t0 36 | vns1 = vns0 + ['hour1'] + ['_A_' + vn for vn in vns0] + \ 37 | ['device_model', 'device_type', 'device_conn_type', 'app_site_id', 'as_domain', 'as_category', 38 | 'cnt_device_ip_day_hour', 'cnt_device_ip_day_hour_prev', 'cnt_device_ip_day_hour_next', 'cnt_device_ip_pday', 39 | 'cnt_diff_device_ip_day_pday', 'as_model'] + \ 40 | [ 'dev_id_cnt2', 'dev_ip_cnt2', 'C14', '_A_C14', 'dev_ip2plus', 'dev_id2plus'] 41 | 42 | #'cnt_device_ip_day', 'device_ip_only_hour_for_day' 43 | 44 | t3a = t3.ix[:, ['click']].copy() 45 | idx_base = 3000 46 | for vn in vns1: 47 | if vn in ['cnt_device_ip_day_hour', 'cnt_device_ip_day_hour_prev', 'cnt_device_ip_day_hour_next', 'cnt_device_ip_pday', 48 | 'cnt_diff_device_ip_day_pday', 'cnt_device_ip_day', 'cnt_device_ip_pday']: 49 | _cat = pd.Series(np.maximum(-100, np.minimum(200, t3[vn].values))).astype('category').values.codes 50 | elif vn in ['as_domain']: 51 | _cat = pd.Series(np.add(t3['app_domain'].values, t3['site_domain'].values)).astype('category').values.codes 52 | elif vn in ['as_category']: 53 | _cat = pd.Series(np.add(t3['app_category'].values, t3['site_category'].values)).astype('category').values.codes 54 | elif vn in ['as_model']: 55 | _cat = pd.Series(np.add(t3['app_site_id'].values, t3['device_model'].values)).astype('category').values.codes 56 | else: 57 | _cat = t3[vn].astype('category').values.codes 58 | _cat = np.asarray(_cat, dtype='int32') 59 | _cat1 = _cat + idx_base 60 | t3a[vn] = _cat1 61 | print vn, idx_base, _cat1.min(), _cat1.max(), np.unique(_cat).size 62 | idx_base += _cat.max() + 1 63 | 64 | print "to save t3a ..." 
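# The loop above packs all of the selected columns into one shared index space: each
# column's category codes are shifted by idx_base so no two columns overlap, which is
# the one-big-sparse-id layout the downstream VW / FM inputs expect. A minimal sketch
# of the same idea on a hypothetical toy column (not part of the pipeline), using the
# same old-pandas .values.codes access as above:
#   _cat = pd.Series(['a', 'b', 'a']).astype('category').values.codes   # -> 0, 1, 0
#   t3a['toy'] = _cat + idx_base                                        # shifted into its own id range
#   idx_base += _cat.max() + 1                                          # next column starts just past it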
65 | t3a_save = {} 66 | t3a_save['t3a'] = t3a 67 | t3a_save['idx_base'] = idx_base 68 | dump(t3a_save, utils.tmp_data_path + 't3a.joblib_dat') 69 | -------------------------------------------------------------------------------- /_3a_rf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | import utils 10 | from utils import * 11 | import os 12 | 13 | from joblib import dump, load 14 | 15 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx3.joblib_dat') 16 | t0tv_mx3 = t0tv_mx_save['t0tv_mx'] 17 | click_values = t0tv_mx_save['click'] 18 | day_values = t0tv_mx_save['day'] 19 | print "t0tv_mx3 loaded with shape", t0tv_mx3.shape 20 | 21 | 22 | from sklearn.ensemble import RandomForestClassifier 23 | 24 | day_test = 30 25 | if utils.tvh == 'Y': 26 | day_test = 31 27 | 28 | print "to create Random Forest using day", day_test, " as validation" 29 | 30 | clf = RandomForestClassifier(n_estimators=32, max_depth=40, min_samples_split=100, min_samples_leaf=10, random_state=0, criterion='entropy', 31 | max_features=8, verbose = 1, n_jobs=-1, bootstrap=False) 32 | 33 | _start_day = 22 34 | 35 | 36 | predv = 0 37 | ctr = 0 38 | xv = t0tv_mx3[day_values==day_test, :] 39 | yv = click_values[day_values==day_test] 40 | nn = t0tv_mx3.shape[0] 41 | 42 | 43 | 44 | for i1 in xrange(8): 45 | clf.random_state = i1 46 | np.random.seed(i1) 47 | r1 = np.random.uniform(0, 1, nn) 48 | filter1 = np.logical_and(np.logical_and(day_values >= _start_day, day_values < day_test), np.logical_and(r1 < .3, True)) 49 | xt1 = t0tv_mx3[filter1, :] 50 | yt1 = click_values[filter1] 51 | rf1 = clf.fit(xt1, yt1) 52 | y_hat = rf1.predict_proba(xv)[:, 1] 53 | predv += y_hat 54 | ctr += 1 55 | ll = logloss(predv/ctr, yv) 56 | print "iter", i1, ", logloss = ", ll 57 | sys.stdout.flush() 58 | 59 | list_param = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'banner_pos', 'device_type', 'device_conn_type'] 60 | feature_list = list_param + \ 61 | ['exptv_' + vn for vn in ['app_site_id', 'as_domain', 62 | 'C14','C17', 'C21', 'device_model', 'device_ip', 'device_id', 'dev_ip_aw', 63 | 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw']] + \ 64 | ['cnt_diff_device_ip_day_pday', 65 | 'app_cnt_by_dev_ip', 'cnt_device_ip_day_hour', 'app_or_web', 66 | 'rank_dev_ip', 'rank_day_dev_ip', 'rank_app_dev_ip', 67 | 'diff_cnt_dev_ip_hour_phour_aw2_prev', 'diff_cnt_dev_ip_hour_phour_aw2_next', 68 | 'exp2_device_ip', 'exp2_app_site_id', 'exp2_device_model', 'exp2_app_site_model', 69 | 'exp2_app_site_model_aw', 'exp2_dev_ip_app_site', 70 | 'cnt_dev_ip', 'cnt_dev_id', 'hour1_web'] + \ 71 | ['all_withid', 'all_noid', 'all_but_ip', 'fm_5vars'] 72 | 73 | rf1_imp = pd.DataFrame({'feature':feature_list, 'impt': clf.feature_importances_}) 74 | print rf1_imp.sort('impt') 75 | 76 | print "to save validation predictions ..." 
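# In outline, the loop above is simple bagging: 8 forests, each fit on an independent
# ~30% random subset of the day-22..(day_test-1) rows, with their predicted click
# probabilities on the held-out day accumulated and averaged, so the logloss printed
# at each iteration is that of the running ensemble (predv / ctr) rather than of any
# single forest.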
77 | dump(predv / ctr, utils.tmp_data_path + 'rf_pred_v.joblib_dat') 78 | 79 | -------------------------------------------------------------------------------- /_2c_generate_fm_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | import utils 10 | from utils import * 11 | import os 12 | 13 | from joblib import dump, load 14 | 15 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx.joblib_dat') 16 | t0tv_mx = t0tv_mx_save['t0tv_mx'] 17 | click_values = t0tv_mx_save['click'] 18 | day_values = t0tv_mx_save['day'] 19 | print "t0tv_mx loaded with shape", t0tv_mx.shape 20 | 21 | t0 = load(utils.tmp_data_path + 't3a.joblib_dat')['t3a'] 22 | print "t0 loaded with shape", t0.shape 23 | 24 | 25 | vns={} 26 | _vns1 = ['app_or_web', 'banner_pos', 'C1', 'C15', 'C16', 'C20', 'C14', 27 | 'cnt_device_ip_day_hour', 'cnt_device_ip_day_hour_prev', 28 | 'cnt_device_ip_pday', 'dev_ip_cnt2', 'app_site_id', 'device_model'] 29 | 30 | vns['all_noid'] = _vns1 31 | vns['all_withid'] = _vns1 + ['dev_id2plus'] 32 | vns['fm_5vars'] = ['app_or_web', 'banner_pos', 'C1', 'C15', 'device_model'] 33 | vns['all_but_ip'] = ['app_or_web', 'device_conn_type', 'C18', 'device_type', 34 | 'banner_pos', 'C1', 'C15', 'C16', 'hour1', 'as_category', 'C21', 35 | 'C19', 'C20', 'cnt_device_ip_day_hour', 'cnt_device_ip_pday', 36 | 'cnt_device_ip_day_hour_prev', 'cnt_device_ip_day_hour_next', 37 | 'dev_id_cnt2', 'dev_ip_cnt2', 'cnt_diff_device_ip_day_pday', 'C17', 38 | 'C14', 'device_model', 'as_domain', 'app_site_id', '_A_app_or_web', 39 | '_A_C1', '_A_banner_pos', '_A_C16', '_A_C15', '_A_C18', '_A_C19', 40 | '_A_C21', '_A_C20', '_A_C17', '_A_C14', 'as_model', 'dev_id2plus'] 41 | 42 | cmd_str = utils.fm_path + ' -t 4 -s 8 -l 1e-5 /dev/shm/_tmp_2way_v.txt /dev/shm/_tmp_2way_t.txt' 43 | 44 | day_bgn = 22 45 | day_end = 32 46 | 47 | fm_vecs = {} 48 | for day_v in xrange(day_bgn, day_end): 49 | fm_vecs[day_v] = {} 50 | for vns_name in vns.keys(): 51 | vns2 = vns[vns_name] 52 | 53 | print day_v, vns_name 54 | t1 = t0.ix[:, ['click']].copy() 55 | 56 | idx_base = 0 57 | 58 | for vn in vns2: 59 | t1[vn] = t0[vn].values 60 | t1[vn] = np.asarray(t1[vn].astype('category').values.codes, np.int32) + idx_base 61 | idx_base = t1[vn].values.max() + 1 62 | #print '-'* 5, vn, idx_base 63 | 64 | path1 = '/dev/shm/' 65 | fn_t = path1 + '_tmp_2way_t.txt' 66 | fn_v = path1 + '_tmp_2way_v.txt' 67 | 68 | print "to write data files ..." 69 | 70 | t1.ix[np.logical_and(day_values>=21, day_values < day_v),:].to_csv(open(fn_t, 'w'), sep='\t', header=False, index=False) 71 | t1.ix[day_values==day_v,:].to_csv(open(fn_v, 'w'), sep='\t', header=False, index=False) 72 | 73 | 74 | print cmd_str 75 | os.system(cmd_str) 76 | 77 | print "load results ..." 78 | fm_predv = pd.read_csv(open(path1 + '_tmp_2way_v.txt.out', 'r'), header=None).ix[:,0].values 79 | 80 | print "--- gini_norm:", gini_norm(fm_predv, click_values[day_values==day_v], None) 81 | 82 | fm_vecs[day_v][vns_name] = fm_predv 83 | print '='*60 84 | 85 | 86 | t2 = t0.ix[:, ['click']].copy() 87 | 88 | nn = t2.shape[0] 89 | for vns_name in vns.keys(): 90 | t2[vns_name] = np.zeros(nn) 91 | for day_v in xrange(day_bgn, day_end): 92 | print day_v, vns_name 93 | t2.ix[day_values==day_v, vns_name] = fm_vecs[day_v][vns_name] 94 | 95 | print "to save FM features ..." 
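# t2 now holds, for every row of day d in 22..31, four FM predictions (all_noid,
# all_withid, fm_5vars, all_but_ip), each produced by a model trained only on days
# 21..d-1, so no row is ever scored by an FM that saw its own label. These columns are
# appended to the numeric matrix below (t0tv_mx3) and become extra input features for
# the RF (_3a_rf.py) and GBDT (_3b_gbdt.py) models.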
96 | dump(t2, tmp_data_path + 't2.joblib_dat') 97 | 98 | t0tv_mx3 = np.concatenate([t0tv_mx[:, :43], t2.ix[:, 1:].as_matrix()], axis=1) 99 | print "t0tv_mx3 generated with shape", t0tv_mx3.shape 100 | 101 | t0tv_mx_save = {} 102 | t0tv_mx_save['t0tv_mx'] = t0tv_mx3 103 | t0tv_mx_save['click'] = click_values 104 | t0tv_mx_save['day'] = day_values 105 | dump(t0tv_mx_save, utils.tmp_data_path + '/t0tv_mx3.joblib_dat') 106 | -------------------------------------------------------------------------------- /_3d_fm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | import pylab 6 | import sys 7 | import time 8 | import os 9 | import utils 10 | from utils import * 11 | from joblib import dump, load, Parallel, delayed 12 | 13 | sys.path.append(utils.xgb_path) 14 | import xgboost as xgb 15 | 16 | 17 | rseed = 0 18 | xgb_eta = .3 19 | tvh = utils.tvh 20 | n_passes = 5 21 | n_trees = 40 22 | n_iter = 7 23 | n_threads = 8 24 | nr_factor = 4 25 | 26 | i = 1 27 | while i < len(sys.argv): 28 | if sys.argv[i] == '-rseed': 29 | i += 1 30 | rseed = int(sys.argv[i]) 31 | elif sys.argv[i] == '-passes': 32 | i += 1 33 | n_passes = int(sys.argv[i]) 34 | else: 35 | raise ValueError("unrecognized parameter [" + sys.argv[i] + "]") 36 | 37 | i += 1 38 | 39 | learning_rate = .1 40 | 41 | path1 = utils.tmp_data_path 42 | param_names = '_r' + str(rseed) 43 | 44 | fn_t = path1 + 'fm_' + param_names + '_t.txt' 45 | fn_v = path1 + 'fm_' + param_names + '_v.txt' 46 | 47 | test_day = 30 48 | if tvh == 'Y': 49 | test_day = 31 50 | 51 | def build_data(): 52 | 53 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx.joblib_dat') 54 | 55 | t0tv_mx = t0tv_mx_save['t0tv_mx'] 56 | click_values = t0tv_mx_save['click'] 57 | day_values = t0tv_mx_save['day'] 58 | 59 | print "t0tv_mx loaded with shape ", t0tv_mx.shape 60 | 61 | np.random.seed(rseed) 62 | nn = t0tv_mx.shape[0] 63 | r1 = np.random.uniform(0, 1, nn) 64 | 65 | 66 | filter1 = np.logical_and(np.logical_and(day_values >= 22, day_values < test_day), np.logical_and(r1 < 0.25, True)) 67 | filter_v1 = day_values == test_day 68 | 69 | xt1 = t0tv_mx[filter1, :] 70 | yt1 = click_values[filter1] 71 | if xt1.shape[0] <=0 or xt1.shape[0] != yt1.shape[0]: 72 | print xt1.shape, yt1.shape 73 | raise ValueError('wrong shape!') 74 | dtrain = xgb.DMatrix(xt1, label=yt1) 75 | dvalid = xgb.DMatrix(t0tv_mx[filter_v1], label=click_values[filter_v1]) 76 | watchlist = [(dtrain, 'train'), (dvalid, 'valid')] 77 | print xt1.shape, yt1.shape 78 | 79 | 80 | param = {'max_depth':6, 'eta':.5, 'objective':'binary:logistic', 'verbose':0, 81 | 'subsample':1.0, 'min_child_weight':50, 'gamma':0, 82 | 'nthread': 16, 'colsample_bytree':.5, 'base_score':0.16, 'seed': rseed} 83 | 84 | plst = list(param.items()) + [('eval_metric', 'logloss')] 85 | xgb_test_basis_d6 = xgb.train(plst, dtrain, n_trees, watchlist) 86 | 87 | 88 | dtv = xgb.DMatrix(t0tv_mx) 89 | xgb_leaves = xgb_test_basis_d6.predict(dtv, pred_leaf = True) 90 | 91 | t0 = pd.DataFrame({'click': click_values}) 92 | print xgb_leaves.shape 93 | for i in xrange(n_trees): 94 | pred2 = xgb_leaves[:, i] 95 | print i, np.unique(pred2).size 96 | t0['xgb_basis'+str(i)] = pred2 97 | 98 | t3a_save = load(utils.tmp_data_path + 't3a.joblib_dat') 99 | t3a = t3a_save['t3a'] 100 | 101 | idx_base = 0 102 | for vn in ['xgb_basis' + str(i) for i in xrange(n_trees)]: 103 | _cat = np.asarray(t0[vn].astype('category').values.codes, 
dtype='int32') 104 | _cat1 = _cat + idx_base 105 | print vn, idx_base, _cat1.min(), _cat1.max(), np.unique(_cat).size 106 | t3a[vn] = _cat1 107 | idx_base += _cat.max() + 1 108 | 109 | 110 | 111 | t3a.ix[np.logical_and(np.logical_and(day_values < test_day, day_values >= 22), True),:].to_csv(open(fn_t, 'w'), sep='\t', header=False, index=False) 112 | t3a.ix[day_values==test_day,:].to_csv(open(fn_v, 'w'), sep='\t', header=False, index=False) 113 | 114 | 115 | build_data() 116 | import gc 117 | gc.collect() 118 | 119 | 120 | import os 121 | fm_cmd = utils.fm_path + ' -k ' + str(nr_factor) + ' -t ' + str(n_iter) + ' -s '+ str(n_threads) + ' ' 122 | fm_cmd += ' -d ' + str(rseed) + ' -r ' + str(learning_rate) + ' ' + fn_v + ' ' + fn_t 123 | 124 | print fm_cmd 125 | os.system(fm_cmd) 126 | 127 | os.system("rm " + fn_t) 128 | os.system("rm " + fn_v) 129 | 130 | -------------------------------------------------------------------------------- /_3c_vw.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | import pylab 6 | import sys 7 | import time 8 | import os 9 | import utils 10 | from utils import * 11 | from joblib import dump, load, Parallel, delayed 12 | 13 | sys.path.append(utils.xgb_path) 14 | import xgboost as xgb 15 | 16 | rseed = 0 17 | xgb_eta = .3 18 | tvh = utils.tvh 19 | n_passes = 4 20 | 21 | i = 1 22 | while i < len(sys.argv): 23 | if sys.argv[i] == '-rseed': 24 | i += 1 25 | rseed = int(sys.argv[i]) 26 | else: 27 | raise ValueError("unrecognized parameter [" + sys.argv[i] + "]") 28 | 29 | i += 1 30 | 31 | 32 | file_name1 = '_r' + str(rseed) 33 | 34 | path1 = utils.tmp_data_path 35 | fn_t = path1 + 'vwV12_' + file_name1 + '_train.txt' 36 | fn_v = path1 + 'vwV12_' + file_name1 + '_test.txt' 37 | 38 | 39 | def build_data(): 40 | t0tv_mx_save = load(utils.tmp_data_path + 't0tv_mx.joblib_dat') 41 | 42 | t0tv_mx = t0tv_mx_save['t0tv_mx'] 43 | click_values = t0tv_mx_save['click'] 44 | day_values = t0tv_mx_save['day'] 45 | 46 | print "t0tv_mx loaded with shape ", t0tv_mx.shape 47 | 48 | test_day = 30 49 | if tvh == 'Y': 50 | test_day = 31 51 | 52 | np.random.seed(rseed) 53 | nn = t0tv_mx.shape[0] 54 | r1 = np.random.uniform(0, 1, nn) 55 | filter1 = np.logical_and(np.logical_and(day_values >= 22, day_values < test_day), np.logical_and(r1 < .25, True)) 56 | filter_v1 = day_values == test_day 57 | 58 | xt1 = t0tv_mx[filter1, :] 59 | yt1 = click_values[filter1] 60 | if xt1.shape[0] <=0 or xt1.shape[0] != yt1.shape[0]: 61 | print xt1.shape, yt1.shape 62 | raise ValueError('wrong shape!') 63 | dtrain = xgb.DMatrix(xt1, label=yt1) 64 | dvalid = xgb.DMatrix(t0tv_mx[filter_v1], label=click_values[filter_v1]) 65 | watchlist = [(dtrain, 'train'), (dvalid, 'valid')] 66 | print xt1.shape, yt1.shape 67 | 68 | 69 | n_trees = 30 70 | n_parallel_tree = 1 71 | 72 | param = {'max_depth':6, 'eta':xgb_eta, 'objective':'binary:logistic', 'verbose':1, 73 | 'subsample':1.0, 'min_child_weight':50, 'gamma':0, 74 | 'nthread': 16, 'colsample_bytree':.5, 'base_score':0.16, 'seed': rseed, 75 | 'num_parallel_tree': n_parallel_tree} 76 | 77 | plst = list(param.items()) + [('eval_metric', 'logloss')] 78 | xgb_test_basis_d6 = xgb.train(plst, dtrain, n_trees, watchlist) 79 | 80 | print "to score gbdt ..." 
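# The block below converts the fitted GBDT into categorical features for VW:
# predict(..., pred_leaf=True) returns, for every row, the index of the leaf it lands
# in within each of the 30 trees (one column per tree), and each tree's leaf ids are
# then re-coded into the shared idx_base index space exactly like the raw categorical
# columns of t3a. A minimal sketch of the leaf-index call (hypothetical names, same
# xgboost API as used here):
#   leaves = bst.predict(xgb.DMatrix(X), pred_leaf=True)   # shape: (n_rows, n_trees)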
81 | 82 | dtv = xgb.DMatrix(t0tv_mx) 83 | xgb_leaves = xgb_test_basis_d6.predict(dtv, pred_leaf = True) 84 | 85 | t0 = pd.DataFrame({'click': click_values}) 86 | print xgb_leaves.shape 87 | for i in xrange(n_trees * n_parallel_tree): 88 | pred2 = xgb_leaves[:, i] 89 | #print pred2[:10] 90 | #print pred_raw_diff[:10] 91 | print i, np.unique(pred2).size 92 | t0['xgb_basis'+str(i)] = pred2 93 | 94 | 95 | t3a_save = load(utils.tmp_data_path + 't3a.joblib_dat') 96 | 97 | t3a = t3a_save['t3a'] 98 | idx_base = 0 99 | for vn in ['xgb_basis' + str(i) for i in xrange(30 * n_parallel_tree)]: 100 | _cat = np.asarray(t0[vn].astype('category').values.codes, dtype='int32') 101 | _cat1 = _cat + idx_base 102 | print vn, idx_base, _cat1.min(), _cat1.max(), np.unique(_cat).size 103 | t3a[vn] = _cat1 104 | idx_base += _cat.max() + 1 105 | 106 | t3a['click1'] = t3a.click.values * 2 - 1 107 | t3a['ns_C']='|C' 108 | t3a['ns_D']='|D' 109 | t3a['ns_M']='|M' 110 | t3a['ns_S']='|S' 111 | t3a['ns_W']='|W' 112 | t3a['ns_N']='|N' 113 | t3a['ns_X']='|X' 114 | t3a['ns_Y']='|Y' 115 | t3a['ns_Z']='|Z' 116 | 117 | field_list = ['click1'] 118 | field_list += ['ns_C', 'banner_pos', 'C1'] + ['C' + str(x) for x in xrange(14, 22)] 119 | field_list += ['ns_D', 'dev_ip2plus', 'dev_id2plus'] 120 | field_list += ['ns_M', 'device_model', 'device_type', 'device_cnn_type'] 121 | field_list += ['ns_S', 'app_site_id', 'as_domain', 'as_category'] 122 | field_list += ['ns_W', 'app_or_web'] 123 | field_list += ['ns_N', 'cnt_device_ip_day_hour', 'cnt_device_ip_pday', 124 | 'cnt_diff_device_ip_day_pday', 'dev_id_cnt2', 'dev_ip_cnt2', 125 | 'cnt_device_ip_day_hour_prev', 'cnt_device_ip_day_hour_next'] 126 | field_list += ['ns_X'] + ['xgb_basis'+str(i) for i in xrange(0, 10)] 127 | field_list += ['ns_Y'] + ['xgb_basis'+str(i) for i in xrange(10, 20)] 128 | field_list += ['ns_Z'] + ['xgb_basis'+str(i) for i in xrange(20, 30)] 129 | 130 | 131 | if tvh == 'Y': 132 | row_idx = np.logical_and(day_values >= 22, day_values <= 30) 133 | print row_idx.shape, row_idx.sum() 134 | else: 135 | row_idx = np.zeros(t3a.shape[0]) 136 | 137 | pre_t_lmt = (day_values < 22).sum() 138 | t_lmt = (day_values < 30).sum() 139 | v_lmt = (day_values < 31).sum() 140 | 141 | t_cnt = t_lmt - pre_t_lmt 142 | v_cnt = v_lmt - t_lmt 143 | 144 | t_idx = np.random.permutation(t_cnt) + pre_t_lmt 145 | v_idx = np.random.permutation(v_cnt) + t_lmt 146 | 147 | 148 | i = 0 149 | i_t = 0 150 | i_v = 0 151 | while True: 152 | if i % 7 == 6: 153 | row_idx[i] = v_idx[i_v] 154 | i_v += 1 155 | if i_v >= v_cnt: 156 | i_v = 0 157 | else: 158 | #training 159 | row_idx[i] = t_idx[i_t] 160 | i_t += 1 161 | if i_t >= t_cnt: 162 | break 163 | i+= 1 164 | 165 | row_idx = row_idx[:i] 166 | print t3a.shape, t_cnt, v_cnt, row_idx.shape 167 | 168 | t3a['idx'] = np.arange(t3a.shape[0]) 169 | t3a.set_index('idx', inplace=True) 170 | 171 | print "to write training file, this may take a long time" 172 | import gzip 173 | t3a.ix[row_idx, field_list].to_csv(open(fn_t, 'w'), sep=' ', header=False, index=False) 174 | 175 | os.system("gzip -f "+fn_t) 176 | 177 | print "to write test file, this shouldn't take too long" 178 | if tvh == 'Y': 179 | t3a.ix[day_values==31, field_list].to_csv(open(fn_v, 'w'), sep=' ', header=False, index=False) 180 | else: 181 | t3a.ix[day_values==30, field_list].to_csv(open(fn_v, 'w'), sep=' ', header=False, index=False) 182 | 183 | os.system("gzip -f "+fn_v) 184 | 185 | 186 | build_data() 187 | 188 | if tvh == 'Y': 189 | holdout_str = " --holdout_off " 190 | else: 191 | holdout_str 
= " --holdout_period 7 " 192 | 193 | mdl_name = 'vw' + file_name1 + ".mdl" 194 | vw_cmd_str = utils.vw_path + fn_t + ".gz --random_seed " + str(rseed) + " " + \ 195 | "--passes " + str(n_passes) + " -c --progress 1000000 --loss_function logistic -b 25 " + holdout_str + \ 196 | "--l2 1e-7 -q CS -q CM -q MS -l .1 --power_t .5 -q NM -q NS --decay_learning_rate .75 --hash all " + \ 197 | " -q SX -q MX -q SY -q MY -q SZ -q MZ -q NV -q MV -q VX -q VY -q VZ" + \ 198 | " --ignore H -f " + mdl_name + " -k --compressed" 199 | print vw_cmd_str 200 | os.system(vw_cmd_str) 201 | 202 | vw_cmd_str = utils.vw_path + fn_v + ".gz --hash all " + \ 203 | "-i " + mdl_name + " -p " + fn_v + "_pred.txt -t --loss_function logistic --progress 200000" 204 | print vw_cmd_str 205 | os.system(vw_cmd_str) 206 | 207 | -------------------------------------------------------------------------------- /_1_encode_cat_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sc 4 | import scipy.sparse as sp 5 | from sklearn.utils import check_random_state 6 | import pylab 7 | import sys 8 | import time 9 | sys.path.append('/home/zzhang/Downloads/xgboost/wrapper') 10 | import xgboost as xgb 11 | from joblib import dump, load, Parallel, delayed 12 | import utils 13 | from utils import * 14 | 15 | 16 | raw_data_path = utils.raw_data_path 17 | tmp_data_path = utils.tmp_data_path 18 | 19 | 20 | t0org0 = pd.read_csv(open(raw_data_path + "train", "ra")) 21 | h0org = pd.read_csv(open(raw_data_path + "test", "ra")) 22 | 23 | 24 | if utils.sample_pct < 1.0: 25 | np.random.seed(999) 26 | r1 = np.random.uniform(0, 1, t0org0.shape[0]) 27 | t0org0 = t0org0.ix[r1 < utils.sample_pct, :] 28 | print "testing with small sample of training data, ", t0org0.shape 29 | 30 | 31 | h0org['click'] = 0 32 | t0org = pd.concat([t0org0, h0org]) 33 | print "finished loading raw data, ", t0org.shape 34 | 35 | print "to add some basic features ..." 
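# The raw 'hour' column in the Avazu data is an integer of the form YYMMDDHH, so the
# derived fields below are day = (hour % 10000) // 100 (day of month) and
# hour1 = hour % 100 (hour of day); e.g. a hypothetical value 14103123 gives day 31,
# hour1 23. day_hour then numbers hours consecutively starting from day 21, giving
# each row a single running hour index that the prev/next-hour counts can key on.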
36 | t0org['day']=np.round(t0org.hour % 10000 / 100) 37 | t0org['hour1'] = np.round(t0org.hour % 100) 38 | t0org['day_hour'] = (t0org.day.values - 21) * 24 + t0org.hour1.values 39 | t0org['day_hour_prev'] = t0org['day_hour'] - 1 40 | t0org['day_hour_next'] = t0org['day_hour'] + 1 41 | t0org['app_or_web'] = 0 42 | t0org.ix[t0org.app_id.values=='ecad2386', 'app_or_web'] = 1 43 | 44 | t0 = t0org 45 | 46 | t0['app_site_id'] = np.add(t0.app_id.values, t0.site_id.values) 47 | 48 | print "to encode categorical features using mean responses from earlier days -- univariate" 49 | sys.stdout.flush() 50 | 51 | calc_exptv(t0, ['app_or_web']) 52 | 53 | exptv_vn_list = ['app_site_id', 'as_domain', 'C14','C17', 'C21', 'device_model', 'device_ip', 'device_id', 'dev_ip_aw', 54 | 'app_site_model', 'site_model','app_model', 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw'] 55 | 56 | calc_exptv(t0, exptv_vn_list) 57 | 58 | calc_exptv(t0, ['app_site_id'], add_count=True) 59 | 60 | 61 | print "to encode categorical features using mean responses from earlier days -- multivariate" 62 | vns = ['app_or_web', 'device_ip', 'app_site_id', 'device_model', 'app_site_model', 'C1', 'C14', 'C17', 'C21', 63 | 'device_type', 'device_conn_type','app_site_model_aw', 'dev_ip_app_site'] 64 | dftv = t0.ix[np.logical_and(t0.day.values >= 21, t0.day.values < 32), ['click', 'day', 'id'] + vns].copy() 65 | 66 | dftv['app_site_model'] = np.add(dftv.device_model.values, dftv.app_site_id.values) 67 | dftv['app_site_model_aw'] = np.add(dftv.app_site_model.values, dftv.app_or_web.astype('string').values) 68 | dftv['dev_ip_app_site'] = np.add(dftv.device_ip.values, dftv.app_site_id.values) 69 | for vn in vns: 70 | dftv[vn] = dftv[vn].astype('category') 71 | print vn 72 | 73 | n_ks = {'app_or_web': 100, 'app_site_id': 100, 'device_ip': 10, 'C14': 50, 'app_site_model': 50, 'device_model': 100, 'device_id': 50, 74 | 'C17': 100, 'C21': 100, 'C1': 100, 'device_type': 100, 'device_conn_type': 100, 'banner_pos': 100, 75 | 'app_site_model_aw': 100, 'dev_ip_app_site': 10 , 'device_model': 500} 76 | 77 | exp2_dict = {} 78 | for vn in vns: 79 | exp2_dict[vn] = np.zeros(dftv.shape[0]) 80 | 81 | days_npa = dftv.day.values 82 | 83 | for day_v in xrange(22, 32): 84 | df1 = dftv.ix[np.logical_and(dftv.day.values < day_v, dftv.day.values < 31), :].copy() 85 | df2 = dftv.ix[dftv.day.values == day_v, :] 86 | print "Validation day:", day_v, ", train data shape:", df1.shape, ", validation data shape:", df2.shape 87 | pred_prev = df1.click.values.mean() * np.ones(df1.shape[0]) 88 | for vn in vns: 89 | if 'exp2_'+vn in df1.columns: 90 | df1.drop('exp2_'+vn, inplace=True, axis=1) 91 | for i in xrange(3): 92 | for vn in vns: 93 | p1 = calcLeaveOneOut2(df1, vn, 'click', n_ks[vn], 0, 0.25, mean0=pred_prev) 94 | pred = pred_prev * p1 95 | print day_v, i, vn, "change = ", ((pred - pred_prev)**2).mean() 96 | pred_prev = pred 97 | 98 | pred1 = df1.click.values.mean() 99 | for vn in vns: 100 | print "="*20, "merge", day_v, vn 101 | diff1 = mergeLeaveOneOut2(df1, df2, vn) 102 | pred1 *= diff1 103 | exp2_dict[vn][days_npa == day_v] = diff1 104 | 105 | pred1 *= df1.click.values.mean() / pred1.mean() 106 | print "logloss = ", logloss(pred1, df2.click.values) 107 | #print my_lift(pred1, None, df2.click.values, None, 20, fig_size=(10, 5)) 108 | #plt.show() 109 | 110 | for vn in vns: 111 | t0['exp2_'+vn] = exp2_dict[vn] 112 | 113 | 114 | print "to count prev/current/next hour by ip ..." 
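# The exp2_* columns created by the loop above are the "multivariate" leave-one-out
# encodings from calcLeaveOneOut2 / mergeLeaveOneOut2 in utils.py. In outline (a
# hedged paraphrase of that code, not a new formula): for each day d only earlier
# days are used, a row's own click is removed from its group's sum/count, the group
# rate is shrunk toward the running prior with n_ks[vn] pseudo-counts, roughly
#   adj = ( ((sum - y_i) + k*prior) / ((cnt - 1) + k) / prior ) ** 0.25
# and the per-variable adjustments are multiplied together over three passes.
# The cntDualKey calls below then count, for each row, how many impressions the same
# device_ip had in the previous / current / next hour (keys day_hour-1, day_hour,
# day_hour+1); these feed the diff_cnt_dev_ip_hour_phour_* features further down.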
115 | cntDualKey(t0, 'device_ip', None, 'day_hour', 'day_hour_prev', fill_na=0) 116 | cntDualKey(t0, 'device_ip', None, 'day_hour', 'day_hour', fill_na=0) 117 | cntDualKey(t0, 'device_ip', None, 'day_hour', 'day_hour_next', fill_na=0) 118 | 119 | print "to create day diffs" 120 | t0['pday'] = t0.day - 1 121 | calcDualKey(t0, 'device_ip', None, 'day', 'pday', 'click', 10, None, True, True) 122 | t0['cnt_diff_device_ip_day_pday'] = t0.cnt_device_ip_day.values - t0.cnt_device_ip_pday.values 123 | t0['hour1_web'] = t0.hour1.values 124 | t0.ix[t0.app_or_web.values==0, 'hour1_web'] = -1 125 | t0['app_cnt_by_dev_ip'] = my_grp_cnt(t0.device_ip.values.astype('string'), t0.app_id.values.astype('string')) 126 | 127 | 128 | t0['hour1'] = np.round(t0.hour.values % 100) 129 | t0['cnt_diff_device_ip_day_pday'] = t0.cnt_device_ip_day.values - t0.cnt_device_ip_pday.values 130 | 131 | t0['rank_dev_ip'] = my_grp_idx(t0.device_ip.values.astype('string'), t0.id.values.astype('string')) 132 | t0['rank_day_dev_ip'] = my_grp_idx(np.add(t0.device_ip.values, t0.day.astype('string').values).astype('string'), t0.id.values.astype('string')) 133 | t0['rank_app_dev_ip'] = my_grp_idx(np.add(t0.device_ip.values, t0.app_id.values).astype('string'), t0.id.values.astype('string')) 134 | 135 | 136 | t0['cnt_dev_ip'] = get_agg(t0.device_ip.values, t0.id, np.size) 137 | t0['cnt_dev_id'] = get_agg(t0.device_id.values, t0.id, np.size) 138 | 139 | t0['dev_id_cnt2'] = np.minimum(t0.cnt_dev_id.astype('int32').values, 300) 140 | t0['dev_ip_cnt2'] = np.minimum(t0.cnt_dev_ip.astype('int32').values, 300) 141 | 142 | t0['dev_id2plus'] = t0.device_id.values 143 | t0.ix[t0.cnt_dev_id.values == 1, 'dev_id2plus'] = '___only1' 144 | t0['dev_ip2plus'] = t0.device_ip.values 145 | t0.ix[t0.cnt_dev_ip.values == 1, 'dev_ip2plus'] = '___only1' 146 | 147 | t0['diff_cnt_dev_ip_hour_phour_aw2_prev'] = (t0.cnt_device_ip_day_hour.values - t0.cnt_device_ip_day_hour_prev.values) * ((t0.app_or_web * 2 - 1)) 148 | t0['diff_cnt_dev_ip_hour_phour_aw2_next'] = (t0.cnt_device_ip_day_hour.values - t0.cnt_device_ip_day_hour_next.values) * ((t0.app_or_web * 2 - 1)) 149 | 150 | 151 | print "to save t0 ..." 152 | 153 | dump(t0, tmp_data_path + 't0.joblib_dat') 154 | 155 | 156 | print "to generate t0tv_mx .. 
" 157 | app_or_web = None 158 | _start_day = 22 159 | list_param = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'banner_pos', 'device_type', 'device_conn_type'] 160 | feature_list_dict = {} 161 | 162 | feature_list_name = 'tvexp3' 163 | feature_list_dict[feature_list_name] = list_param + \ 164 | ['exptv_' + vn for vn in ['app_site_id', 'as_domain', 165 | 'C14','C17', 'C21', 'device_model', 'device_ip', 'device_id', 'dev_ip_aw', 166 | 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw']] + \ 167 | ['cnt_diff_device_ip_day_pday', 168 | 'app_cnt_by_dev_ip', 'cnt_device_ip_day_hour', 'app_or_web', 169 | 'rank_dev_ip', 'rank_day_dev_ip', 'rank_app_dev_ip', 170 | 'diff_cnt_dev_ip_hour_phour_aw2_prev', 'diff_cnt_dev_ip_hour_phour_aw2_next', 171 | 'exp2_device_ip', 'exp2_app_site_id', 'exp2_device_model', 'exp2_app_site_model', 172 | 'exp2_app_site_model_aw', 'exp2_dev_ip_app_site', 173 | 'cnt_dev_ip', 'cnt_dev_id', 'hour1_web'] 174 | 175 | filter_tv = np.logical_and(t0.day.values >= _start_day, t0.day.values < 31) 176 | filter_t1 = np.logical_and(t0.day.values < 30, filter_tv) 177 | filter_v1 = np.logical_and(~filter_t1, filter_tv) 178 | 179 | print filter_tv.sum() 180 | 181 | 182 | for vn in feature_list_dict[feature_list_name] : 183 | if vn not in t0.columns: 184 | print "="*60 + vn 185 | 186 | yv = t0.click.values[filter_v1] 187 | 188 | t0tv_mx = t0.as_matrix(feature_list_dict[feature_list_name]) 189 | 190 | print t0tv_mx.shape 191 | 192 | 193 | print "to save t0tv_mx ..." 194 | 195 | t0tv_mx_save = {} 196 | t0tv_mx_save['t0tv_mx'] = t0tv_mx 197 | t0tv_mx_save['click'] = t0.click.values 198 | t0tv_mx_save['day'] = t0.day.values 199 | t0tv_mx_save['site_id'] = t0.site_id.values 200 | dump(t0tv_mx_save, tmp_data_path + 't0tv_mx.joblib_dat') 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.utils import check_random_state 4 | import time 5 | import sys 6 | from joblib import dump, load 7 | 8 | sample_pct = .05 9 | tvh = 'N' 10 | xgb_n_trees = 300 11 | 12 | 13 | #Please set following path accordingly 14 | 15 | #where we can find training, test, and sampleSubmission.csv 16 | raw_data_path = '/home/fast/2014_mobilectr/' 17 | #where we store results -- require about 130GB 18 | tmp_data_path = './tmp_data/' 19 | 20 | #path to external binaries. 
Please see dependencies in the .pdf document 21 | fm_path = ' ~/Downloads/guestwalk/kaggle-2014-criteo/fm' 22 | xgb_path = '/home/zzhang/Downloads/xgboost/wrapper' 23 | vw_path = '~/vowpal_wabbit/vowpalwabbit/vw ' 24 | 25 | 26 | try: 27 | params=load(tmp_data_path + '_params.joblib_dat') 28 | sample_pct = params['pct'] 29 | tvh = params['tvh'] 30 | except: 31 | pass 32 | 33 | 34 | def print_help(): 35 | print "usage: python utils -set_params [tvh=Y|N], [sample_pct]" 36 | print "for example: python utils -set_params N 0.05" 37 | 38 | def main(): 39 | if sys.argv[1] == '-set_params' and len(sys.argv) == 4: 40 | try: 41 | tvh = sys.argv[2] 42 | sample_pct = float(sys.argv[3]) 43 | dump({'pct': sample_pct, 'tvh':tvh}, tmp_data_path + '_params.joblib_dat') 44 | except: 45 | print_help() 46 | else: 47 | print_help() 48 | 49 | if __name__ == "__main__": 50 | main() 51 | 52 | def get_agg(group_by, value, func): 53 | g1 = pd.Series(value).groupby(group_by) 54 | agg1 = g1.aggregate(func) 55 | #print agg1 56 | r1 = agg1[group_by].values 57 | return r1 58 | 59 | def calcLeaveOneOut2(df, vn, vn_y, cred_k, r_k, power, mean0=None, add_count=False): 60 | if mean0 is None: 61 | mean0 = df_yt[vn_y].mean() * np.ones(df.shape[0]) 62 | _key_codes = df[vn].values.codes 63 | grp1 = df[vn_y].groupby(_key_codes) 64 | grp_mean = pd.Series(mean0).groupby(_key_codes) 65 | mean1 = grp_mean.aggregate(np.mean) 66 | sum1 = grp1.aggregate(np.sum) 67 | cnt1 = grp1.aggregate(np.size) 68 | 69 | #print sum1 70 | #print cnt1 71 | vn_sum = 'sum_' + vn 72 | vn_cnt = 'cnt_' + vn 73 | _sum = sum1[_key_codes].values 74 | _cnt = cnt1[_key_codes].values 75 | _mean = mean1[_key_codes].values 76 | #print _sum[:10] 77 | #print _cnt[:10] 78 | #print _mean[:10] 79 | #print _cnt[:10] 80 | _mean[np.isnan(_sum)] = mean0.mean() 81 | _cnt[np.isnan(_sum)] = 0 82 | _sum[np.isnan(_sum)] = 0 83 | #print _cnt[:10] 84 | _sum -= df[vn_y].values 85 | _cnt -= 1 86 | #print _cnt[:10] 87 | vn_yexp = 'exp2_'+vn 88 | # df[vn_yexp] = (_sum + cred_k * mean0)/(_cnt + cred_k) 89 | diff = np.power((_sum + cred_k * _mean)/(_cnt + cred_k) / _mean, power) 90 | if vn_yexp in df.columns: 91 | df[vn_yexp] *= diff 92 | else: 93 | df[vn_yexp] = diff 94 | if r_k > 0: 95 | df[vn_yexp] *= np.exp((np.random.rand(np.sum(filter_train))-.5) * r_k) 96 | if add_count: 97 | df[vn_cnt] = _cnt 98 | return diff 99 | 100 | 101 | def my_lift(order_by, p, y, w, n_rank, dual_axis=False, random_state=0, dither=1e-5, fig_size=None): 102 | gen = check_random_state(random_state) 103 | if w is None: 104 | w = np.ones(order_by.shape[0]) 105 | if p is None: 106 | p = order_by 107 | ord_idx = np.argsort(order_by + dither*np.random.uniform(-1.0, 1.0, order_by.size)) 108 | p2 = p[ord_idx] 109 | y2 = y[ord_idx] 110 | w2 = w[ord_idx] 111 | 112 | cumm_w = np.cumsum(w2) 113 | total_w = cumm_w[-1] 114 | r1 = np.minimum(n_rank, np.maximum(1, 115 | np.round(cumm_w * n_rank / total_w + .4999999))) 116 | 117 | df1 = pd.DataFrame({'r': r1, 'pw': p2 * w2, 'yw': y2 * w2, 'w': w2}) 118 | grp1 = df1.groupby('r') 119 | 120 | sum_w = grp1['w'].aggregate(np.sum) 121 | avg_p = grp1['pw'].aggregate(np.sum) / sum_w 122 | avg_y = grp1['yw'].aggregate(np.sum) / sum_w 123 | 124 | xs = xrange(1, n_rank+1) 125 | 126 | fig, ax1 = plt.subplots() 127 | if fig_size is None: 128 | fig.set_size_inches(20, 15) 129 | else: 130 | fig.set_size_inches(fig_size) 131 | ax1.plot(xs, avg_p, 'b--') 132 | if dual_axis: 133 | ax2 = ax1.twinx() 134 | ax2.plot(xs, avg_y, 'r') 135 | else: 136 | ax1.plot(xs, avg_y, 'r') 137 | 138 | #print 
"logloss: ", logloss(p, y, w) 139 | 140 | return gini_norm(order_by, y, w) 141 | 142 | def logloss(pred, y, weight=None): 143 | if weight is None: 144 | weight = np.ones(y.size) 145 | 146 | pred = np.maximum(1e-7, np.minimum(1 - 1e-7, pred)) 147 | return - np.sum(weight * (y * np.log(pred) + (1 - y) * np.log(1 - pred))) / np.sum(weight) 148 | 149 | def gini_norm(pred, y, weight=None): 150 | 151 | #equal weight by default 152 | if weight == None: 153 | weight = np.ones(y.size) 154 | 155 | #sort actual by prediction 156 | ord = np.argsort(pred) 157 | y2 = y[ord] 158 | w2 = weight[ord] 159 | #gini by pred 160 | cumm_y = np.cumsum(y2) 161 | total_y = cumm_y[-1] 162 | total_w = np.sum(w2) 163 | g1 = 1 - 2 * sum(cumm_y * w2) / (total_y * total_w) 164 | 165 | #sort actual by actual 166 | ord = np.argsort(y) 167 | y2 = y[ord] 168 | w2 = weight[ord] 169 | #gini by actual 170 | cumm_y = np.cumsum(y2) 171 | g0 = 1 - 2 * sum(cumm_y * w2) / (total_y * total_w) 172 | 173 | return g1/g0 174 | 175 | def mergeLeaveOneOut2(df, dfv, vn): 176 | _key_codes = df[vn].values.codes 177 | vn_yexp = 'exp2_'+vn 178 | grp1 = df[vn_yexp].groupby(_key_codes) 179 | _mean1 = grp1.aggregate(np.mean) 180 | 181 | _mean = _mean1[dfv[vn].values.codes].values 182 | 183 | _mean[np.isnan(_mean)] = _mean1.mean() 184 | 185 | return _mean 186 | 187 | 188 | def calcTVTransform(df, vn, vn_y, cred_k, filter_train, mean0=None): 189 | if mean0 is None: 190 | mean0 = df.ix[filter_train, vn_y].mean() 191 | print "mean0:", mean0 192 | else: 193 | mean0 = mean0[~filter_train] 194 | 195 | df['_key1'] = df[vn].astype('category').values.codes 196 | df_yt = df.ix[filter_train, ['_key1', vn_y]] 197 | #df_y.set_index([')key1']) 198 | grp1 = df_yt.groupby(['_key1']) 199 | sum1 = grp1[vn_y].aggregate(np.sum) 200 | cnt1 = grp1[vn_y].aggregate(np.size) 201 | vn_sum = 'sum_' + vn 202 | vn_cnt = 'cnt_' + vn 203 | v_codes = df.ix[~filter_train, '_key1'] 204 | _sum = sum1[v_codes].values 205 | _cnt = cnt1[v_codes].values 206 | _cnt[np.isnan(_sum)] = 0 207 | _sum[np.isnan(_sum)] = 0 208 | 209 | r = {} 210 | r['exp'] = (_sum + cred_k * mean0)/(_cnt + cred_k) 211 | r['cnt'] = _cnt 212 | return r 213 | 214 | def cntDualKey(df, vn, vn2, key_src, key_tgt, fill_na=False): 215 | 216 | print "build src key" 217 | _key_src = np.add(df[key_src].astype('string').values, df[vn].astype('string').values) 218 | print "build tgt key" 219 | _key_tgt = np.add(df[key_tgt].astype('string').values, df[vn].astype('string').values) 220 | 221 | if vn2 is not None: 222 | _key_src = np.add(_key_src, df[vn2].astype('string').values) 223 | _key_tgt = np.add(_key_tgt, df[vn2].astype('string').values) 224 | 225 | print "aggreate by src key" 226 | grp1 = df.groupby(_key_src) 227 | cnt1 = grp1[vn].aggregate(np.size) 228 | 229 | print "map to tgt key" 230 | vn_sum = 'sum_' + vn + '_' + key_src + '_' + key_tgt 231 | _cnt = cnt1[_key_tgt].values 232 | 233 | if fill_na is not None: 234 | print "fill in na" 235 | _cnt[np.isnan(_cnt)] = fill_na 236 | 237 | vn_cnt_tgt = 'cnt_' + vn + '_' + key_tgt 238 | if vn2 is not None: 239 | vn_cnt_tgt += '_' + vn2 240 | df[vn_cnt_tgt] = _cnt 241 | 242 | def my_grp_cnt(group_by, count_by): 243 | _ts = time.time() 244 | _ord = np.lexsort((count_by, group_by)) 245 | print time.time() - _ts 246 | _ts = time.time() 247 | _ones = pd.Series(np.ones(group_by.size)) 248 | print time.time() - _ts 249 | _ts = time.time() 250 | #_cs1 = _ones.groupby(group_by[_ord]).cumsum().values 251 | _cs1 = np.zeros(group_by.size) 252 | _prev_grp = '___' 253 | runnting_cnt = 0 254 
| for i in xrange(1, group_by.size): 255 | i0 = _ord[i] 256 | if _prev_grp == group_by[i0]: 257 | if count_by[_ord[i-1]] != count_by[i0]: 258 | running_cnt += 1 259 | else: 260 | running_cnt = 1 261 | _prev_grp = group_by[i0] 262 | if i == group_by.size - 1 or group_by[i0] != group_by[_ord[i+1]]: 263 | j = i 264 | while True: 265 | j0 = _ord[j] 266 | _cs1[j0] = running_cnt 267 | if j == 0 or group_by[_ord[j-1]] != group_by[j0]: 268 | break 269 | j -= 1 270 | 271 | print time.time() - _ts 272 | if True: 273 | return _cs1 274 | else: 275 | _ts = time.time() 276 | 277 | org_idx = np.zeros(group_by.size, dtype=np.int) 278 | print time.time() - _ts 279 | _ts = time.time() 280 | org_idx[_ord] = np.asarray(xrange(group_by.size)) 281 | print time.time() - _ts 282 | _ts = time.time() 283 | 284 | return _cs1[org_idx] 285 | 286 | def my_cnt(group_by): 287 | _ts = time.time() 288 | _ord = np.argsort(group_by) 289 | print time.time() - _ts 290 | _ts = time.time() 291 | #_cs1 = _ones.groupby(group_by[_ord]).cumsum().values 292 | _cs1 = np.zeros(group_by.size) 293 | _prev_grp = '___' 294 | runnting_cnt = 0 295 | for i in xrange(1, group_by.size): 296 | i0 = _ord[i] 297 | if _prev_grp == group_by[i0]: 298 | running_cnt += 1 299 | else: 300 | running_cnt = 1 301 | _prev_grp = group_by[i0] 302 | if i == group_by.size - 1 or group_by[i0] != group_by[_ord[i+1]]: 303 | j = i 304 | while True: 305 | j0 = _ord[j] 306 | _cs1[j0] = running_cnt 307 | if j == 0 or group_by[_ord[j-1]] != group_by[j0]: 308 | break 309 | j -= 1 310 | 311 | print time.time() - _ts 312 | return _cs1 313 | 314 | def my_grp_value_diff(group_by, order_by, value): 315 | _ts = time.time() 316 | _ord = np.lexsort((order_by, group_by)) 317 | print time.time() - _ts 318 | _ts = time.time() 319 | _ones = pd.Series(np.ones(group_by.size)) 320 | print time.time() - _ts 321 | _ts = time.time() 322 | #_cs1 = _ones.groupby(group_by[_ord]).cumsum().values 323 | _cs1 = np.zeros(group_by.size) 324 | _prev_grp = '___' 325 | for i in xrange(1, group_by.size): 326 | i0 = _ord[i] 327 | if _prev_grp == group_by[i0]: 328 | _cs1[i0] = value[_ord[i]] - value[_ord[i-1]] 329 | else: 330 | _cs1[i0] = 1e7 331 | _prev_grp = group_by[i0] 332 | print time.time() - _ts 333 | 334 | return np.minimum(_cs1, 1e7) 335 | 336 | def my_grp_idx(group_by, order_by): 337 | _ts = time.time() 338 | _ord = np.lexsort((order_by, group_by)) 339 | print time.time() - _ts 340 | _ts = time.time() 341 | _ones = pd.Series(np.ones(group_by.size)) 342 | print time.time() - _ts 343 | _ts = time.time() 344 | #_cs1 = _ones.groupby(group_by[_ord]).cumsum().values 345 | _cs1 = np.zeros(group_by.size) 346 | _prev_grp = '___' 347 | for i in xrange(1, group_by.size): 348 | i0 = _ord[i] 349 | if _prev_grp == group_by[i0]: 350 | _cs1[i] = _cs1[i - 1] + 1 351 | else: 352 | _cs1[i] = 1 353 | _prev_grp = group_by[i0] 354 | print time.time() - _ts 355 | _ts = time.time() 356 | 357 | org_idx = np.zeros(group_by.size, dtype=np.int) 358 | print time.time() - _ts 359 | _ts = time.time() 360 | org_idx[_ord] = np.asarray(xrange(group_by.size)) 361 | print time.time() - _ts 362 | _ts = time.time() 363 | 364 | return _cs1[org_idx] 365 | 366 | def calcDualKey(df, vn, vn2, key_src, key_tgt, vn_y, cred_k, mean0=None, add_count=False, fill_na=False): 367 | if mean0 is None: 368 | mean0 = df[vn_y].mean() 369 | 370 | print "build src key" 371 | _key_src = np.add(df[key_src].astype('string').values, df[vn].astype('string').values) 372 | print "build tgt key" 373 | _key_tgt = np.add(df[key_tgt].astype('string').values, 
df[vn].astype('string').values) 374 | 375 | if vn2 is not None: 376 | _key_src = np.add(_key_src, df[vn2].astype('string').values) 377 | _key_tgt = np.add(_key_tgt, df[vn2].astype('string').values) 378 | 379 | print "aggreate by src key" 380 | grp1 = df.groupby(_key_src) 381 | sum1 = grp1[vn_y].aggregate(np.sum) 382 | cnt1 = grp1[vn_y].aggregate(np.size) 383 | 384 | print "map to tgt key" 385 | vn_sum = 'sum_' + vn + '_' + key_src + '_' + key_tgt 386 | _sum = sum1[_key_tgt].values 387 | _cnt = cnt1[_key_tgt].values 388 | 389 | if fill_na: 390 | print "fill in na" 391 | _cnt[np.isnan(_sum)] = 0 392 | _sum[np.isnan(_sum)] = 0 393 | 394 | print "calc exp" 395 | if vn2 is not None: 396 | vn_yexp = 'exp_' + vn + '_' + vn2 + '_' + key_src + '_' + key_tgt 397 | else: 398 | vn_yexp = 'exp_' + vn + '_' + key_src + '_' + key_tgt 399 | df[vn_yexp] = (_sum + cred_k * mean0)/(_cnt + cred_k) 400 | 401 | if add_count: 402 | print "add counts" 403 | vn_cnt_src = 'cnt_' + vn + '_' + key_src 404 | df[vn_cnt_src] = _cnt 405 | grp2 = df.groupby(_key_tgt) 406 | cnt2 = grp2[vn_y].aggregate(np.size) 407 | _cnt2 = cnt2[_key_tgt].values 408 | vn_cnt_tgt = 'cnt_' + vn + '_' + key_tgt 409 | df[vn_cnt_tgt] = _cnt2 410 | 411 | def get_set_diff(df, vn, f1, f2): 412 | #print(df[vn].values.sum()) 413 | set1 = set(np.unique(df[vn].values[f1])) 414 | set2 = set(np.unique(df[vn].values[f2])) 415 | set2_1 = set2 - set1 416 | print vn, '\t', len(set1), '\t', len(set2), '\t', len(set2_1) 417 | return len(set2_1) * 1.0 / len(set2) 418 | 419 | 420 | def calc_exptv(t0, vn_list, last_day_only=False, add_count=False): 421 | t0a = t0.ix[:, ['day', 'click']].copy() 422 | day_exps = {} 423 | 424 | for vn in vn_list: 425 | if vn == 'dev_id_ip': 426 | t0a[vn] = pd.Series(np.add(t0.device_id.values , t0.device_ip.values)).astype('category').values.codes 427 | elif vn == 'dev_ip_aw': 428 | t0a[vn] = pd.Series(np.add(t0.device_ip.values , t0.app_or_web.astype('string').values)).astype('category').values.codes 429 | elif vn == 'C14_aw': 430 | t0a[vn] = pd.Series(np.add(t0.C14.astype('string').values , t0.app_or_web.astype('string').values)).astype('category').values.codes 431 | elif vn == 'C17_aw': 432 | t0a[vn] = pd.Series(np.add(t0.C17.astype('string').values , t0.app_or_web.astype('string').values)).astype('category').values.codes 433 | elif vn == 'C21_aw': 434 | t0a[vn] = pd.Series(np.add(t0.C21.astype('string').values , t0.app_or_web.astype('string').values)).astype('category').values.codes 435 | elif vn == 'as_domain': 436 | t0a[vn] = pd.Series(np.add(t0.app_domain.values , t0.site_domain.values)).astype('category').values.codes 437 | elif vn == 'site_app_id': 438 | t0a[vn] = pd.Series(np.add(t0.site_id.values , t0.app_id.values)).astype('category').values.codes 439 | elif vn == 'app_model': 440 | t0a[vn] = pd.Series(np.add(t0.app_id.values , t0.device_model.values)).astype('category').values.codes 441 | elif vn == 'app_site_model': 442 | t0a[vn] = pd.Series(np.add(t0.app_id.values , np.add(t0.site_id.values , t0.device_model.values))).astype('category').values.codes 443 | elif vn == 'site_model': 444 | t0a[vn] = pd.Series(np.add(t0.site_id.values , t0.device_model.values)).astype('category').values.codes 445 | elif vn == 'app_site': 446 | t0a[vn] = pd.Series(np.add(t0.app_id.values , t0.site_id.values)).astype('category').values.codes 447 | elif vn == 'site_ip': 448 | t0a[vn] = pd.Series(np.add(t0.site_id.values , t0.device_ip.values)).astype('category').values.codes 449 | elif vn == 'app_ip': 450 | t0a[vn] = 
pd.Series(np.add(t0.site_id.values , t0.device_ip.values)).astype('category').values.codes 451 | elif vn == 'site_id_domain': 452 | t0a[vn] = pd.Series(np.add(t0.site_id.values , t0.site_domain.values)).astype('category').values.codes 453 | elif vn == 'site_hour': 454 | t0a[vn] = pd.Series(np.add(t0.site_domain.values , (t0.hour.values % 100).astype('string'))).astype('category').values.codes 455 | else: 456 | t0a[vn] = t0[vn] 457 | 458 | for day_v in xrange(22, 32): 459 | cred_k = 10 460 | if day_v not in day_exps: 461 | day_exps[day_v] = {} 462 | 463 | vn_key = vn 464 | 465 | import time 466 | _tstart = time.time() 467 | 468 | day1 = 20 469 | if last_day_only: 470 | day1 = day_v - 2 471 | filter_t = np.logical_and(t0.day.values > day1, t0.day.values <= day_v) 472 | vn_key = vn 473 | t1 = t0a.ix[filter_t, :].copy() 474 | filter_t2 = np.logical_and(t1.day.values != day_v, t1.day.values < 31) 475 | 476 | if vn == 'app_or_web': 477 | day_exps[day_v][vn_key] = calcTVTransform(t1, vn, 'click', cred_k, filter_t2) 478 | else: 479 | if last_day_only: 480 | day_exps[day_v][vn_key] = calcTVTransform(t1, vn, 'click', cred_k, filter_t2, mean0=t0.expld_app_or_web.values) 481 | else: 482 | day_exps[day_v][vn_key] = calcTVTransform(t1, vn, 'click', cred_k, filter_t2, mean0=t0.exptv_app_or_web.values) 483 | 484 | print vn, vn_key, " ", day_v, " done in ", time.time() - _tstart 485 | t0a.drop(vn, inplace=True, axis=1) 486 | 487 | for vn in vn_list: 488 | vn_key = vn 489 | 490 | vn_exp = 'exptv_'+vn_key 491 | if last_day_only: 492 | vn_exp='expld_'+vn_key 493 | 494 | t0[vn_exp] = np.zeros(t0.shape[0]) 495 | if add_count: 496 | t0['cnttv_'+vn_key] = np.zeros(t0.shape[0]) 497 | for day_v in xrange(22, 32): 498 | print vn, vn_key, day_v, t0.ix[t0.day.values == day_v, vn_exp].values.size, day_exps[day_v][vn_key]['exp'].size 499 | t0.loc[t0.day.values == day_v, vn_exp]=day_exps[day_v][vn_key]['exp'] 500 | if add_count: 501 | t0.loc[t0.day.values == day_v, 'cnttv_'+vn_key]=day_exps[day_v][vn_key]['cnt'] 502 | 503 | --------------------------------------------------------------------------------