├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── TencentSocialAds.iml
├── cross features stat.py
├── Other.py
├── model - appID.py
├── feat_label.py
├── model - group_value.py
├── model - LR06.py
├── train_xgb.py
├── train_cv.py
├── README.md
├── model - xgb - cross features.py
├── doFeats_1.py
└── doFeats_2.py
/cross features stat.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import f_pack
4 |
5 | data = pd.read_csv(f_pack.file_train)
6 | dfAd = pd.read_csv(f_pack.file_ad)
7 | data = pd.merge(data, dfAd, how='left', on='creativeID')
8 | '''
9 | g1 = data.groupby(['positionID', 'connectionType']).apply(lambda x: np.mean(x["label"])).reset_index()
10 | g1.columns = ['positionID', 'connectionType', 'mean_pos_conn']
11 | g1.to_csv(f_pack.file_cf_pos_conn)
12 |
13 | g2 = data.groupby(['positionID', 'advertiserID']).apply(lambda x: np.mean(x["label"])).reset_index()
14 | g2.columns = ['positionID', 'advertiserID', 'mean_pos_adv']
15 | g2.to_csv(f_pack.file_cf_pos_adv)
16 | '''
17 |
18 | g3 = data.groupby(['userID']).apply(lambda x: np.mean(x["label"])).reset_index()
19 | g3.columns = ['userID', 'mean_userID']
20 | g3.to_csv(f_pack.file_f_mean_userID)
21 |
--------------------------------------------------------------------------------
/Other.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import math
4 |
5 | def zeroMean(dataMat):
6 | meanVal=np.mean(dataMat,axis=0)
7 | newData=dataMat-meanVal
8 | return newData,meanVal
9 |
10 | newData,meanVal=zeroMean(X_train)  # X_train: a feature DataFrame assumed to be defined elsewhere (exploratory snippet)
11 | res=newData.corr(method='pearson')
12 |
13 | def f(x):
14 | res=1/(1+np.e**(-x))
15 | return res
16 |
17 | def f_ver(x):
18 | res=np.log(x/(1-x))
19 | return res
20 |
21 |
22 | df=pd.read_csv(r'a.csv')
23 | print(df.prob.mean())
24 | avg=0.027232030146226882  # target mean pCVR (~0.0273)
25 | b=[-2,2]  # bisection bounds for the constant added in logit space
26 | df['prob1']=df['prob']
27 | while abs(np.mean(df['prob1'])-avg)>0.00001:
28 |     mid=(b[0]+b[1])/2.0
29 |     df['prob1']=df['prob'].apply(lambda x:math.log(x/(1-x)))
30 |     df['prob1']=df['prob1'].apply(lambda x:x+mid)
31 |     df['prob1']=df['prob1'].apply(lambda x:1/(1+math.exp(-x)))
32 |     if np.mean(df['prob1'])>avg:
33 |         b[1]=mid
34 |     else:
35 |         b[0]=mid
36 | df['prob']=df['prob1']
37 | del df['prob1']
38 | df.to_csv(r'submission.csv',index=False)
--------------------------------------------------------------------------------
/model - appID.py:
--------------------------------------------------------------------------------
1 | """
2 | baseline 1: history pCVR of creativeID/adID/camgaignID/advertiserID/appID/appPlatform
3 | """
4 |
5 | # res: 0.1 appID is a great feature
6 |
7 | import zipfile
8 | import numpy as np
9 | import pandas as pd
10 | import f_pack
11 |
12 | # load data
13 | dfTrain = pd.read_csv(f_pack.file_train)
14 | dfTest = pd.read_csv(f_pack.file_test)
15 | dfAd = pd.read_csv(f_pack.file_ad)
16 |
17 | # process data
18 | dfTrain = pd.merge(dfTrain, dfAd, on="creativeID")
19 | dfTest = pd.merge(dfTest, dfAd, on="creativeID")
20 | y_train = dfTrain["label"].values
21 |
22 | # model building
23 | key = "appID"
24 | dfCvr = dfTrain.groupby(key).apply(lambda df: np.mean(df["label"])).reset_index()
25 | dfCvr.columns = [key, "avg_cvr"]
26 | dfCvr.to_csv(f_pack.file_appID_score)
27 | dfTest = pd.merge(dfTest, dfCvr, how="left", on=key)
28 | dfTest["avg_cvr"].fillna(np.mean(dfTrain["label"]), inplace=True)
29 | proba_test = dfTest["avg_cvr"].values
30 |
31 | # submission
32 | df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": proba_test})
33 | df.sort_values("instanceID", inplace=True)
34 | df.to_csv(f_pack.file_submission, index=False)
35 |
--------------------------------------------------------------------------------
/feat_label.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import f_pack
3 |
4 |
5 | def stat(group):  # flag clicks that are followed by another click from the same user within ~3 minutes
6 | group = group.sort_values('minute')
7 | group = group.reset_index()
8 | group['h_potential'] = 0
9 | group['h_potential02'] = 0
10 |     length = len(group)
11 | if length < 2:
12 | return group
13 |
14 | minute = group['minute']
15 |
16 | if length == 2:
17 | if minute[1] - minute[0] <= 3 or (0 <= minute[1] <= 1 and 58 <= minute[0] < 60):
18 | group.loc[0, 'h_potential'] = 1
19 | return group
20 |
21 | count = 0
22 | for i in range(len(minute) - 1):
23 | if minute[i + 1] - minute[i] <= 3 or (0 <= minute[i + 1] <= 1 and 58 <= minute[i] < 60):
24 | count = 1
25 | group.loc[i, 'h_potential'] = 1
26 | if i > 0 and group.loc[i - 1, 'h_potential02'] == 1:
27 | group.loc[i, 'h_potential'] = 0
28 | if count == 1 and i < len(minute) - 2:
29 | if minute[i + 2] - minute[i + 1] <= 3 or (0 <= minute[i + 2] <= 1 and 58 <= minute[i + 1] < 60):
30 | group.loc[i, 'h_potential02'] = 1
31 | count = 0
32 | return group
33 |
34 |
35 | data = pd.read_csv(f_pack.file_train)
36 | print(data.shape)
37 | data['day'] = data['clickTime'].map(lambda x: x // 10000)
38 | days = data['day']
39 | days = days.unique()
40 | save_data = pd.DataFrame()
41 | for day in days:
42 | test_data = data[data['day'] == day]
43 | print('test_data num is %d' % len(test_data))
44 | sub_data = data[data['day'] == day].copy()
45 | print('sub_data num is %d' % len(sub_data))
46 | sub_data['minute'] = sub_data['clickTime'] % 100
47 | sub_data = sub_data.groupby('userID', as_index=False).apply(stat).reset_index()
48 | sub_data = sub_data.drop(['level_0', 'level_1', 'index'], axis=1)
49 | print('sub_data num is %d' % len(sub_data))
50 | save_data = pd.concat([save_data, sub_data])
51 | print(save_data.shape)
52 |
53 | save_data.to_csv(f_pack.file_train_s, index=False)
54 | data = pd.read_csv(f_pack.file_train_s)
55 | print(data.shape)
56 |
--------------------------------------------------------------------------------
/model - group_value.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from scipy import sparse
4 | from sklearn.preprocessing import OneHotEncoder
5 | from sklearn.linear_model import LogisticRegression
6 | import f_pack
7 |
8 | # load data
9 | dfTrain = pd.read_csv(f_pack.file_train)
10 | dfTest = pd.read_csv(f_pack.file_test)
11 | dfAd = pd.read_csv(f_pack.file_ad)
12 | dfUser = pd.read_csv(f_pack.file_user)
13 | dfUser['age'] = dfUser['age'].map(lambda x: (x // 5) + 1 if x > 0 else 0)
14 | dfUser['haveBaby'] = dfUser['haveBaby'].map(lambda x: 3 if x >= 3 else x)
15 | dfAppCate = pd.read_csv(f_pack.file_app_categories)
16 | dfPos = pd.read_csv(f_pack.file_position)
17 | dfAppScore = pd.read_csv(f_pack.file_appID_score)
18 | # process data
19 | print(dfTrain.shape)
20 | print(dfTest.shape)
21 | dfTrain = pd.merge(dfTrain, dfAd, on="creativeID")
22 | dfTest = pd.merge(dfTest, dfAd, on="creativeID")
23 | dfTrain = pd.merge(dfTrain, dfUser, on='userID')
24 | dfTest = pd.merge(dfTest, dfUser, on='userID')
25 | dfTrain = pd.merge(dfTrain, dfAppCate, on='appID')
26 | dfTest = pd.merge(dfTest, dfAppCate, on='appID')
27 | dfTrain = pd.merge(dfTrain, dfPos, on='positionID')
28 | dfTest = pd.merge(dfTest, dfPos, on='positionID')
29 | y_train = dfTrain["label"].values
30 | print(dfTrain.shape)
31 | print(dfTest.shape)
32 |
33 | # feature engineering/encoding
34 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "camgaignID", "advertiserID",
35 | "appPlatform", 'age', 'gender', 'education', 'marriageStatus', 'haveBaby', 'residence', 'appID',
36 | 'positionType', 'sitesetID', 'hometown', 'appCategory']
37 |
38 | for i, feat in enumerate(feats):
39 | dfCvr = dfTrain.groupby(feat).apply(lambda df: np.mean(df["label"])).reset_index()
40 | dfCvr.columns = [feat, feat + "_avg_cvr"]
41 | dfTrain = pd.merge(dfTrain, dfCvr, how="left", on=feat)
42 | dfTest = pd.merge(dfTest, dfCvr, how="left", on=feat)
43 | dfTest.fillna(0, inplace=True)
44 |
45 | filt = '.*_avg_cvr'
46 | X_train = dfTrain.filter(regex=filt)
47 | X_test = dfTest.filter(regex=filt)
48 |
49 | # model building
50 | lr = LogisticRegression(penalty='l1', solver='liblinear')  # liblinear supports the l1 penalty
51 | lr.fit(X_train, y_train)
52 | proba_test = lr.predict_proba(X_test)[:, 1]
53 | param=pd.DataFrame({"columns":list(X_train.columns), "coef":list(lr.coef_.T)})
54 | print(param)
55 |
56 | # submission
57 | df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": proba_test})
58 | df.sort_values("instanceID", inplace=True)
59 | df.to_csv(f_pack.file_submission, index=False)
60 |
--------------------------------------------------------------------------------
/model - LR06.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pandas as pd
3 | from scipy import sparse
4 | from sklearn.preprocessing import OneHotEncoder
5 | from sklearn.linear_model import LogisticRegression
6 | import f_pack
7 | from sklearn.model_selection import train_test_split
8 | import numpy as np
9 |
10 |
11 | def format_cross_features(dfTrain, dfTest, feat1, feat2):  # encode a feature pair as one numeric code: "feat1.feat2" as a string, cast to float, then scaled
12 | feat = feat1 + '-' + feat2
13 | dfTrain[feat] = dfTrain[feat1] + '.' + dfTrain[feat2]
14 | dfTrain[feat] = dfTrain[feat].astype(float)
15 | dfTrain[feat] = dfTrain[feat] * 10000
16 | dfTest[feat] = dfTest[feat1] + '.' + dfTest[feat2]
17 | dfTest[feat] = dfTest[feat].astype(float)
18 | dfTest[feat] = dfTest[feat] * 10000
19 | return dfTrain, dfTest
20 |
21 |
22 | # load data
23 | dfTrain, dfTest, y_label = f_pack.load_data()
24 | # cross feature
25 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "camgaignID", "advertiserID",
26 | "appPlatform", 'gender', 'marriageStatus', 'haveBaby', 'residence', 'age', 'education', 'appID']
27 | for feat in feats:
28 | dfTrain[feat], dfTest[feat] = dfTrain[feat].astype(str), dfTest[feat].astype(str)
29 |
30 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'age', 'education')
31 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'appID')
32 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'appPlatform')
33 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'connectionType')
34 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'advertiserID')
35 |
36 | dfTrain = dfTrain.fillna(0)
37 | dfTrain = dfTrain.replace(np.inf, 0)
38 | dfTest = dfTest.replace(np.inf, 0)
39 | dfTest = dfTest.fillna(0)
40 |
41 | enc = OneHotEncoder()
42 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "camgaignID", "advertiserID",
43 | "appPlatform", 'gender', 'marriageStatus', 'haveBaby', 'residence', 'age', 'education', 'positionID-appID',
44 | 'positionID-appPlatform', 'positionID-connectionType']
45 |
46 | for i, feat in enumerate(feats):
47 | x_train = enc.fit_transform(dfTrain[feat].values.reshape(-1, 1))
48 | x_test = enc.transform(dfTest[feat].values.reshape(-1, 1))
49 | if i == 0:
50 | X_train, X_test = x_train, x_test
51 | else:
52 | X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test))
53 |
54 | feats = ['download_num', 'avg_cvr', 'user_install_num', 'app_install_num']
55 | for feat in feats:
56 | X_train = sparse.hstack((X_train, dfTrain[feat].values.reshape(-1, 1)))
57 | X_test = sparse.hstack((X_test, dfTest[feat].values.reshape(-1, 1)))
58 |
59 | X_train, test_set, y_train, y_test_set = train_test_split(X_train, y_label, test_size=0.2, random_state=0)
60 | # model training
61 | print("start modeling")
62 | lr = LogisticRegression(penalty='l1', solver='liblinear')  # liblinear supports the l1 penalty
63 | lr.fit(X_train, y_train)
64 | proba_test = lr.predict_proba(X_test)[:, 1]
65 |
66 | # submission
67 | df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": proba_test})
68 | df.sort_values("instanceID", inplace=True)
69 | df.to_csv(f_pack.file_submission, index=False)
70 |
71 | # metric
72 | predictions = lr.predict(test_set)
73 | f_pack.print_metrics(y_test_set, predictions)
74 |
--------------------------------------------------------------------------------
/train_xgb.py:
--------------------------------------------------------------------------------
1 | from feature_set import make_train_set
2 | from feature_set import make_test_set
3 | from sklearn.model_selection import train_test_split
4 | import xgboost as xgb
5 | from datetime import datetime
6 | import pandas as pd
7 | from sklearn.utils import resample
8 | import f_pack
9 | import matplotlib.pyplot as plt
10 | import operator
11 |
12 | def statistic(group):
13 | group = group.sort_values('label', ascending=False).head(1)
14 | return group
15 |
16 | def create_feature_map(features):
17 | outfile = open('xgb.fmap', 'w')
18 | i = 0
19 | for feat in features:
20 | outfile.write('{0}\t{1}\tq\n'.format(i, feat))
21 | i = i + 1
22 | outfile.close()
23 |
24 | def underSampling(training_data, label):
25 | n = label.size // 8
26 | small_data, small_label = resample(training_data, label, n_samples=n)
27 | data = pd.concat([training_data, label], axis=1)
28 | positive = data[data['label'] == 1]
29 |     small_data = pd.concat([small_data, positive.iloc[:, :positive.columns.size - 1]])
30 |     small_label = pd.concat([small_label, positive.iloc[:, positive.columns.size - 1]])
31 | return small_data, small_label
32 |
33 |
34 | def xgboost_make_submission():
35 | training_data, label = make_train_set()
36 | instanceID, test_trainning_data = make_test_set()
37 | instanceID = instanceID.reset_index()
38 | test_trainning_data = xgb.DMatrix(test_trainning_data.values)
39 | print('start fit')
40 | for i in range(1):
41 | small_data, small_label = underSampling(training_data, label)
42 |         create_feature_map(small_data.columns)
43 | X_train, X_test, y_train, y_test = train_test_split(small_data.values, small_label.values, test_size=0.2,
44 | random_state=0)
45 | dtrain = xgb.DMatrix(X_train, label=y_train)
46 | dtest = xgb.DMatrix(X_test, label=y_test)
47 | param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3,
48 | 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
49 | 'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
50 | num_round = 300
51 | watchList = [(dtest, 'eval'), (dtrain, 'train')]
52 | plst = list(param.items()) + [('eval_metric', 'logloss')]
53 | bst = xgb.train(plst, dtrain, num_round, watchList)
54 | y = bst.predict(test_trainning_data)
55 | instanceID = pd.concat([instanceID, pd.Series(y)], axis=1)
56 | # feature importance
57 | feature_score=bst.get_fscore(fmap='xgb.fmap')
58 | print(feature_score)
59 | feature_score = sorted(feature_score.items(), key=operator.itemgetter(1))
60 | df = pd.DataFrame(feature_score, columns=['feature', 'fscore'])
61 | df['fscore'] = df['fscore'] / df['fscore'].sum()
62 | df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(16, 10))
63 | plt.show()
64 | instanceID.to_csv(f_pack.file_output_test, index=False)
65 |
66 | # output
67 | data = f_pack.read_file(f_pack.file_output_test)
68 | data = data.iloc[:, 1:]
69 | print(data.head())
70 | data['Prob'] = data.iloc[:, 1:].sum(axis=1) / 10
71 | data = data[['instanceID', 'Prob']]
72 | data.to_csv(f_pack.file_submission, index=False)
73 |
74 |
75 | if __name__ == '__main__':
76 | print(datetime.now())
77 | xgboost_make_submission()
78 | print(datetime.now())
79 |
--------------------------------------------------------------------------------
/train_cv.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import scipy as sp
4 | import lightgbm as lgb
5 | import gc
6 | import datetime
7 | import random
8 | import scipy.special as special
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.model_selection import StratifiedKFold
11 |
12 | rawpath='C:\\final\\'
13 | temppath='C:\\final\\temp\\'
14 | iapath='C:\\final\\temp\\installedactions\\'
15 |
16 | def logloss(act, preds):
17 | epsilon = 1e-15
18 |     preds = np.maximum(epsilon, preds)
19 |     preds = np.minimum(1 - epsilon, preds)
20 |     ll = sum(act * np.log(preds) + np.subtract(1, act) * np.log(np.subtract(1, preds)))
21 | ll = ll * -1.0 / len(act)
22 | return ll
23 |
24 |
25 | def getTrainVal(X_train, scope=(28, 29), val_type='30', seed=1000):
26 | if val_type == '30':
27 | X_val = X_train.loc[X_train['day'] == 30, :]
28 | X_train = X_train.loc[(X_train['day'] >= scope[0]) & (X_train['day'] <= scope[1]), :]
29 | elif val_type == '73':
30 | X_train = X_train.loc[(X_train['day'] >= scope[0]) & (X_train['day'] <= scope[1]), :]
31 | X_train, X_val, y_train, y_val = train_test_split(X_train, X_train['label'], test_size=0.3, random_state=seed)
32 | return X_train, X_val
33 |
34 |
35 | t_start = datetime.datetime.now()
36 | X_loc_train=pd.read_csv(temppath+'2_smooth.csv')
37 | print('load train over...')
38 | X_loc_test=pd.read_csv(temppath+'2_test_smooth.csv')
39 | print('load test over...')
40 |
41 | ########################################################## For CV prediction, validating on day 30 works poorly; instead one fold serves as the validation set for early stopping, and the day-30 data is dropped below
42 | X_loc_train, X_loc_val = getTrainVal(X_loc_train, scope=(28, 29), val_type='30', seed=1000)
43 |
44 | drop = ['label', 'day']
45 | y_loc_train = X_loc_train.loc[:, 'label']
46 | X_loc_train.drop(drop, axis=1, inplace=True)
47 |
48 | # y_loc_val = X_loc_val.loc[:, 'label']
49 | # X_loc_val.drop(drop, axis=1, inplace=True)
50 |
51 | res = X_loc_test.loc[:, ['instanceID']]
52 | X_loc_test.drop(['instanceID'], axis=1, inplace=True)
53 | X_loc_test.drop(drop, axis=1, inplace=True)
54 |
55 |
56 | gc.collect()
57 | print('preprocess over...', X_loc_train.shape)
58 |
59 | ########################################################## Only a single LightGBM model was used in the competition
60 | X_loc_train=X_loc_train.values
61 | y_loc_train=y_loc_train.values
62 | # X_loc_val=X_loc_val.values
63 | # y_loc_val=y_loc_val.values
64 | X_loc_test=X_loc_test.values
65 |
66 | ########################################################## Cross prediction: effectively what the first layer of stacking does
67 | # Perturb the fold count, parameters, features and samples (random seeds), then weight-average the results for the final score
68 | model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=29, max_depth=-1, learning_rate=0.1, n_estimators=10000,
69 | max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
70 | min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1,
71 | colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, nthread=-1, silent=True)
72 | del X_loc_val
73 |
74 | skf=list(StratifiedKFold(n_splits=10, shuffle=True, random_state=1024).split(X_loc_train, y_loc_train))
75 | for i, (train, test) in enumerate(skf):
76 | print("Fold", i)
77 | model.fit(X_loc_train[train], y_loc_train[train], eval_metric='logloss',eval_set=[(X_loc_train[train], y_loc_train[train]), (X_loc_train[test], y_loc_train[test])],early_stopping_rounds=100)
78 |     preds= model.predict_proba(X_loc_test, num_iteration=model.best_iteration_)[:, 1]
79 | print('mean:', preds.mean())
80 | res['prob_%s' % str(i)] = preds
81 |
82 | # There are many ways to average or weight the fold predictions; the "台大三傻" team's competition sharing describes averaging via the inverse sigmoid (logit) function, which works well
83 | now = datetime.datetime.now()
84 | now = now.strftime('%m-%d-%H-%M')
85 | print(now)
86 | res.sort_values("instanceID", ascending=True, inplace=True)
87 | res.to_csv(rawpath+"%s.csv" % now, index=False)
88 |
89 | t_end = datetime.datetime.now()
90 | print('training time: %s' % ((t_end - t_start).seconds/60))
91 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TencentSocialAds
2 | Tencent Social Ads College Algorithm Competition
3 | Objective: predict the probability that a mobile ad is activated (converted) after being clicked [pCVR = P(conversion=1 | ad, user, context)]
4 |
5 | Data Cleaning:
6 |
7 | 1. Day 30 is inaccurate but valuable, because the prediction problem is time-sensitive. So how do we take advantage of it? Conversions are reported back with varying delays, so the labels of the last five days may be unreliable, especially day 30. Deleting all of day 30 would throw away a lot of useful information, while keeping all of it would introduce considerable noise. We found that the report-back delay is related to the app, so we computed the average conversion report-back time per appID and removed only the day-30 rows whose appID has a long average delay (a sketch follows below).
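A minimal sketch of that cleaning step (not the exact script used): it assumes the raw competition files train.csv (label, clickTime, conversionTime, creativeID, ...) and ad.csv (creativeID → appID), with times encoded as DDHHMM; the 600-minute threshold is an illustrative choice.

```python
import pandas as pd

train = pd.read_csv('train.csv')
ad = pd.read_csv('ad.csv')
train = train.merge(ad[['creativeID', 'appID']], how='left', on='creativeID')

def to_minutes(t):
    # clickTime / conversionTime are encoded as DDHHMM
    return (t // 10000) * 1440 + (t % 10000 // 100) * 60 + t % 100

conv = train[train['label'] == 1].copy()
conv['report_delay'] = to_minutes(conv['conversionTime']) - to_minutes(conv['clickTime'])
avg_delay = conv.groupby('appID')['report_delay'].mean()

slow_apps = avg_delay[avg_delay > 600].index          # appIDs whose conversions report back slowly
day = train['clickTime'] // 10000
train_clean = train[~((day == 30) & train['appID'].isin(slow_apps))]
```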
8 |
9 | Feature engineering:
10 |
11 | 1. Feature types: raw features, statistical features, time-series features, cross features.
12 | 2. Statistical features (click/conversion counts and rates) need Bayesian smoothing; see the sketch after this list.
13 | 3. Time-series features, e.g. the number of apps a user installed before the click time, or the number of installed apps of the same category before the click time.
14 | 4. How to select cross features? Use XGBoost feature importance, then rerun XGBoost with the candidate crosses to get updated importances.
15 | 5. How to encode cross features? Either (a) hash the pair and one-hot encode it, or (b) group by the pair and turn the cross feature into a statistical feature.
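Item 2 in one possible form: estimate a Beta(alpha, beta) prior from the per-key click/conversion counts by moment matching and shrink each key's conversion rate toward the prior mean (the repository's HyperParam class does the same thing via fixed-point iteration); the numbers below are made up.

```python
import pandas as pd

def beta_moment_match(clicks, conversions):
    """Fit Beta(alpha, beta) to per-key CVRs by matching mean and variance."""
    cvr = conversions / clicks.clip(lower=1)
    mean, var = cvr.mean(), cvr.var()
    common = mean * (1 - mean) / var - 1
    return mean * common, (1 - mean) * common   # alpha, beta

stats = pd.DataFrame({'clicks': [1200, 35, 3], 'conversions': [30, 1, 1]})
alpha, beta = beta_moment_match(stats['clicks'], stats['conversions'])
# low-count keys are pulled toward the prior mean alpha / (alpha + beta)
stats['smoothed_cvr'] = (stats['conversions'] + alpha) / (stats['clicks'] + alpha + beta)
print(stats)
```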
16 |
17 | Data Set Construction:
18 |
19 | 1. Use the data of days 28 and 29 to predict day 31 (the test set).
20 | 2. The conversion ratio of days 17-20 looks unstable, so that period should be removed.
21 |
22 | Model and Training:
23 |
24 | 1. Be careful of data leakage.
25 | 2. A model ensemble should mix different kinds of models, e.g. XGBoost with LR. XGBoost and LightGBM are both tree-based, so blending them did not help as much as expected.
26 | 3. Ensemble methods: weighted averaging, stacking, varying random seeds.
27 | 4. The final predictions can be rescaled by a ratio so that their mean approaches the platform's average conversion ratio; see the calibration sketch below.
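A small sketch of the calibration in item 4, assuming a submission frame with a prob column and a known platform mean CVR (0.0273 here is illustrative); the logit-shift variant solved by bisection is what Other.py implements.

```python
import numpy as np
import pandas as pd

target_mean = 0.0273                       # platform-level average CVR (illustrative)
sub = pd.read_csv('submission.csv')        # assumed columns: instanceID, prob

# variant 1: rescale by a constant ratio
sub['prob_ratio'] = sub['prob'] * target_mean / sub['prob'].mean()

# variant 2: add a constant in logit space, found by bisection (as in Other.py)
def shift(p, c):
    p = p.clip(1e-6, 1 - 1e-6)
    return 1.0 / (1.0 + np.exp(-(np.log(p / (1 - p)) + c)))

lo, hi = -5.0, 5.0
while hi - lo > 1e-6:
    mid = (lo + hi) / 2.0
    lo, hi = (lo, mid) if shift(sub['prob'], mid).mean() > target_mean else (mid, hi)
sub['prob_logit'] = shift(sub['prob'], (lo + hi) / 2.0)
```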
28 |
29 | ## Highlights shared by other teams
30 | ### Rank 14, team 竟然有这些操作
31 |
32 | Trick features:
33 | Looking at the raw data it is easy to find many duplicated rows that differ only in clickTime and label. After sorting by time, when a duplicate group converts the label is usually attached to the first or the last click and only occasionally to one in the middle, and what appears in the training set also appears in the test set. So we mark these positions, one-hot encode the marks, and let the model learn from them; we also add time-difference features. My competition write-up describes this trick in more detail, and late in the competition we found two similar write-ups (article 1 and article 2) that can serve as references. A simplified sketch follows below.
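A simplified sketch of this trick (the full version is doTrick in doFeats_2.py): it assumes a DataFrame with the listed ID columns plus clickTime.

```python
import pandas as pd

def add_trick_features(data):
    """Mark duplicated clicks and add time gaps to the first/last click of each duplicate group."""
    subset = ['creativeID', 'positionID', 'adID', 'appID', 'userID']
    data = data.sort_values('clickTime')
    dup = data.duplicated(subset=subset, keep=False)

    # 0 = unique click, 1 = middle of a group, 2 = first of a group, 3 = last of a group
    data['maybe'] = 0
    data.loc[dup, 'maybe'] = 1
    data.loc[dup & ~data.duplicated(subset=subset, keep='first'), 'maybe'] = 2
    data.loc[dup & ~data.duplicated(subset=subset, keep='last'), 'maybe'] = 3
    data = pd.get_dummies(data, columns=['maybe'])   # one-hot encoding is what gave the big boost

    data['diffTime_first'] = data['clickTime'] - data.groupby(subset)['clickTime'].transform('min')
    data['diffTime_last'] = data.groupby(subset)['clickTime'].transform('max') - data['clickTime']
    data.loc[~dup, ['diffTime_first', 'diffTime_last']] = -1
    return data
```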
34 |
35 | Statistical features:
36 | The raw features fall into three groups (ad, user, position), and statistics are built over their cross combinations. Because of machine limits, the statistical features mainly keep conversion rates and drop the click and conversion counts. The preliminary round used a 7-day sliding window; the final round switched, following the weekly champion's sharing, to statistics over all days before clickTime. The three-way combinations also come from that sharing: download behaviour under network-condition constraints, and user attributes that reflect app demand. Bayesian smoothing of user-related features is very time-consuming; in the preliminary round we also gated the conversion rate by a click-count threshold, which worked about as well as smoothing, but the threshold is hard to pick. A sliding-window sketch follows below.
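A sketch of the 7-day sliding-window statistic for a single key (positionID as an example), assuming day and label columns; the final-round variant simply replaces the window with all days before the current one.

```python
import pandas as pd

def window_stats(df, key, window=7):
    """For each (key, day): clicks and conversions from the previous `window` days only (no leakage)."""
    rows = []
    for day in sorted(df['day'].unique()):
        past = df[(df['day'] < day) & (df['day'] > day - 1 - window)]
        g = past.groupby(key)['label'].agg(['count', 'sum']).reset_index()
        g.columns = [key, key + '_clicks', key + '_convs']
        g['day'] = day
        rows.append(g)
    stat = pd.concat(rows, ignore_index=True)
    stat[key + '_cvr'] = stat[key + '_convs'] / stat[key + '_clicks']
    return stat   # merge back on [key, 'day'], then smooth the rate

# pos_stat = window_stats(train, 'positionID')
```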
37 |
38 | Active-count features:
39 | The idea comes from another shared solution, e.g. the number of distinct apps shown in a given ad position.
40 |
41 | Mean features:
42 | For example, the average age of the users who click a given ad.
43 |
44 | Average report-back time features:
45 | Using the report-back time the wrong way easily causes leakage. Following a sharing in the official group, we computed the average conversion report-back time per appID; apps without conversions use the average of their app category instead.
46 |
47 | User activity and history features:
48 | Join the installed file on user and app to get historical statistics, and run a 7-day sliding window over the actions file to get user and app activity features.
49 |
50 | Some other features:
51 |
52 | Cold-start features;
53 | Rank features;
54 | User click counts and conversion counts are too sparse, so LabelEncode each, concatenate them, and LabelEncode the result;
55 | Bin the 24 hours of a day into half-hour buckets;
56 | Discretize continuous features, e.g. by binning, by decision-tree split points, or by XGBoost leaf index, then concatenate with the original discrete features and feed them into FFM (see the sketch after this list);
57 | Build statistical features with cross-validation to prevent leakage;
58 | User conversion sequences (e.g. 0010);
59 | In the last few days, drop the rows of appIDs or advertiserIDs with long average report-back times (the report-back time depends on the advertiser or the app);
60 | Multi-window statistics (within 1 minute, 1 hour, 1 day, ...); use the windows to turn each sample into a 2-D array and feed it into a CNN (strong at capturing local patterns), without pooling but with dropout, etc.;
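A sketch of the XGBoost-leaf discretization idea from the list above, using xgboost's pred_leaf option; the data and parameters are synthetic.

```python
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

X = np.random.rand(1000, 5)                       # continuous features (synthetic)
y = (np.random.rand(1000) < 0.03).astype(int)     # sparse labels, pCVR-like

dtrain = xgb.DMatrix(X, label=y)
bst = xgb.train({'objective': 'binary:logistic', 'max_depth': 3}, dtrain, num_boost_round=30)

# one leaf index per (sample, tree); each tree's column becomes a categorical feature
leaves = bst.predict(dtrain, pred_leaf=True)      # shape (n_samples, n_trees)
leaf_onehot = OneHotEncoder().fit_transform(leaves)
# leaf_onehot can be concatenated with the original one-hot features and fed to LR / FFM
```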
61 |
62 | ### Rank 20, team unknown
63 |
64 | User click-log mining (_2_1_gen_user_click_features.py)
65 | Mine the ad click logs and extract statistical features of user click behaviour at different time granularities (day, hour) and along different attribute dimensions (clicked creative, ad, campaign, advertiser type, ad position, etc.).
66 |
67 | User install-log mining (_2_2_gen_app_install_features.py)
68 | From users' historical app-install logs, analyse users' install preferences and apps' popularity trends, and use the install-time information to build a time-dimension description vector for each app. In the end only one kind of feature from this step was used.
69 |
70 | Advertiser report-back mechanism analysis (_2_4_gen_tricks.py)
71 | Different advertisers count conversions differently, e.g. the first click converts, the last click converts, or the click at install time converts. Analysing this and building the corresponding descriptive features improves prediction accuracy.
72 |
73 | Ad conversion-rate feature extraction (_2_5_gen_smooth_cvr.py)
74 | Build conversion-rate features: compute single-feature and combined-feature conversion rates with global and sliding-window schemes, and correct them with mean filling, hierarchical filling, Bayesian smoothing, Laplace smoothing, etc.
75 |
76 | Ad description-vector feature extraction (_2_6_gen_ID_click_vectors.py)
77 | An ad is delivered to a specific audience, and that audience in turn describes the ad. Represent each ad ID and appID as a vector over different demographic attributes to learn latent semantic features (a sketch follows below).
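One simple way to build such a description vector, assuming a click table that already carries appID and the clicking user's age (after merging user.csv); the attribute choice is illustrative.

```python
import pandas as pd

def app_audience_vector(clicks, attr='age'):
    """Represent each appID by the normalized distribution of `attr` among its clickers."""
    vec = pd.crosstab(clicks['appID'], clicks[attr])
    vec = vec.div(vec.sum(axis=1), axis=0)             # row-normalize -> audience profile
    vec.columns = ['appID_%s_%s' % (attr, c) for c in vec.columns]
    return vec.reset_index()                            # merge back on appID
```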
78 | Modelling and prediction
79 | Train several kinds of models, including LightGBM, XGBoost, FFM and neural networks, then combine them with weighted blending to improve the final performance.
80 |
81 | Summary: early on we were satisfied with the scores LR gave us and did not realise that, as features accumulated, LR could no longer express them well. We switched to XGBoost too late, so the score improved slowly and we probably missed many important features. Our handling of sparse matrices also needs work; the initially poor XGBoost results came from a code bug (the wrong label column was selected, a consequence of pasting code from the earlier JData competition). Next time we meet this kind of sparse matrix we should think about how to handle it cleanly with pandas.
82 |
83 | Mu Li has pointed out that whether a model uses discrete or continuous features is really a trade-off between "massive discrete features + a simple model" and "a few continuous features + a complex model". You can discretize everything and use a linear model, or keep continuous features and use deep learning; it depends on whether you prefer to iterate on features or on models. The former is easier, parallelizes well across people, and has a proven track record; the latter looks promising, but how far it can go remains to be seen.
84 | Logistic regression is a generalized linear model with limited expressiveness. Discretizing a single variable into N buckets gives each bucket its own weight, which introduces non-linearity and increases the model's fitting capacity.
85 | After discretization, features can also be crossed, going from M + N variables to up to M * N variables, which introduces further non-linearity and expressiveness.
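A tiny illustration of those two points with synthetic data: bucketing one variable gives each bucket its own LR weight, and crossing two bucketed variables multiplies the number of columns.

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

age = np.array([23, 31, 45, 19, 52])
education = np.array([2, 3, 1, 2, 4])

age_bucket = age // 10                                                    # discretize
single = OneHotEncoder().fit_transform(np.c_[age_bucket, education])     # M + N columns
cross = OneHotEncoder().fit_transform((age_bucket * 100 + education).reshape(-1, 1))  # up to M * N columns
print(single.shape, cross.shape)
```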
86 |
87 | References:
88 | 30th https://github.com/oooozhizhi/TencentSocialAdvertising-30th-solutions
89 | 26th https://jiayi797.github.io/categories/腾讯算法大赛-CVR预估/
90 | 23rd https://github.com/BladeCoda/Tencent2017_Final_Coda_Allegro
91 | 20th https://github.com/shenweichen/Tencent_Social_Ads2017_Mobile_App_pCVR
92 | 14th https://github.com/freelzy/Tencent_Social_Ads
93 | 7th http://blog.csdn.net/ben3ben/article/details/74838338
94 |
--------------------------------------------------------------------------------
/model - xgb - cross features.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import f_pack
3 | from scipy import sparse
4 | from sklearn.preprocessing import OneHotEncoder
5 | from sklearn.model_selection import train_test_split
6 | import numpy as np
7 | import matplotlib.pyplot as plt
8 | from sklearn.utils import resample
9 | import xgboost as xgb
10 | import operator
11 |
12 |
13 | def underSampling(training_data, label):
14 | n = label.size // 10
15 | small_data, small_label = resample(training_data, label, n_samples=n)
16 | positive = training_data[training_data['label'] == 1]
17 | small_data = pd.concat([small_data, positive])
18 | small_label = pd.concat([pd.DataFrame(small_label), positive['label']])
19 | return small_data, small_label
20 |
21 |
22 | def format_cross_features(dfTrain, dfTest, feat1, feat2):
23 | feat = feat1 + '-' + feat2
24 | dfTrain[feat] = dfTrain[feat1] + '.' + dfTrain[feat2]
25 | dfTrain[feat] = dfTrain[feat].astype(float)
26 | dfTrain[feat] = dfTrain[feat] * 10000
27 | dfTest[feat] = dfTest[feat1] + '.' + dfTest[feat2]
28 | dfTest[feat] = dfTest[feat].astype(float)
29 | dfTest[feat] = dfTest[feat] * 10000
30 | return dfTrain, dfTest
31 |
32 |
33 | def model(n_round):
34 | # load data
35 | dfTrain, dfTest, y_label = f_pack.load_data()
36 | # cross feature
37 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "sitesetID", "advertiserID",
38 | "appPlatform", 'gender', 'marriageStatus', 'haveBaby', 'residence', 'age', 'education', 'appID']
39 | for feat in feats:
40 | dfTrain[feat], dfTest[feat] = dfTrain[feat].astype(str), dfTest[feat].astype(str)
41 |
42 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'age', 'education')
43 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'appID')
44 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'appPlatform')
45 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'connectionType')
46 |
47 | dfTrain = dfTrain.fillna(0)
48 | dfTrain = dfTrain.replace(np.inf, 0)
49 | dfTest = dfTest.replace(np.inf, 0)
50 | dfTest = dfTest.fillna(0)
51 |
52 | enc = OneHotEncoder()
53 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "camgaignID", "advertiserID",
54 | "appPlatform", 'gender', 'marriageStatus', 'haveBaby', 'residence', 'age', 'education', 'positionID-appID',
55 | 'positionID-appPlatform', 'positionID-connectionType']
56 |
57 | for i, feat in enumerate(feats):
58 | x_train = enc.fit_transform(dfTrain[feat].values.reshape(-1, 1))
59 | x_test = enc.transform(dfTest[feat].values.reshape(-1, 1))
60 | if i == 0:
61 | X_train, X_test = x_train, x_test
62 | else:
63 | X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test))
64 |
65 | feats = ['download_num', 'avg_cvr', 'user_install_num', 'app_install_num', 'h_potential', 'h_potential02']
66 | for feat in feats:
67 | X_train = sparse.hstack((X_train, dfTrain[feat].values.reshape(-1, 1)))
68 | X_test = sparse.hstack((X_test, dfTest[feat].values.reshape(-1, 1)))
69 |
70 | # model training
71 | print("start modeling")
72 | X_train, valid_set, y_train, y_valid = train_test_split(X_train, y_label, test_size=0.05, random_state=0)
73 | dtrain = xgb.DMatrix(X_train, label=y_train)
74 | dtest = xgb.DMatrix(valid_set, label=y_valid)
75 | param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 10,
76 | 'min_child_weight': 5, 'gamma': 0, 'silent': 1, 'objective': 'binary:logistic',
77 | 'early_stopping_rounds': 50}
78 | # xgb.cv(param, dtrain, n_round, nfold=5, metrics={'auc'}, seed=0,
79 | # callbacks=[xgb.callback.print_evaluation(show_stdv=True)], early_stopping_rounds=20)
80 | watchList = [(dtest, 'eval'), (dtrain, 'train')]
81 | plst = list(param.items()) + [('eval_metric', 'logloss')]
82 | bst = xgb.train(plst, dtrain, n_round, watchList)
83 | y = bst.predict(xgb.DMatrix(X_test))
84 | res = pd.concat([dfTest['instanceID'], pd.Series(y)], axis=1)
85 | res = res.sort_values('instanceID')
86 | res['instanceID'] = res['instanceID'].astype(int)
87 | res.columns = ['instanceID', 'proba']
88 | print(res.shape)
89 | res.to_csv(f_pack.file_submission, index=False)
90 |
91 |
92 | model(230)
93 |
94 | '''
95 | # feature importance
96 | feature_score = bst.get_fscore()
97 | feature_score = sorted(feature_score.items(), key=operator.itemgetter(1))
98 | print(feature_score)
99 | df = pd.DataFrame(feature_score, columns=['feature', 'fscore'])
100 | df['fscore'] = df['fscore'] / df['fscore'].sum()
101 | df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(16, 10))
102 | plt.show()
103 |
104 | proba_test = lr.predict_proba(X_test)[:, 1]
105 | # submission
106 | df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": proba_test})
107 | df.sort_values("instanceID", inplace=True)
108 | df.to_csv(f_pack.file_submission, index=False)
109 | '''
110 |
--------------------------------------------------------------------------------
/doFeats_1.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import scipy as sp
4 | import gc
5 | import datetime
6 | import random
7 | import scipy.special as special
8 |
9 | rawpath='C:\\final\\'
10 | temppath='C:\\final\\temp\\'
11 | iapath='C:\\final\\temp\\installedactions\\'
12 |
13 | def logloss(act, preds):
14 | epsilon = 1e-15
15 |     preds = np.maximum(epsilon, preds)
16 |     preds = np.minimum(1 - epsilon, preds)
17 |     ll = sum(act * np.log(preds) + np.subtract(1, act) * np.log(np.subtract(1, preds)))
18 | ll = ll * -1.0 / len(act)
19 | return ll
20 |
21 | class HyperParam(object):  # Bayesian smoothing (this variant is faster); usage: hyper=HyperParam(1, 1); hyper.update_from_data_by_moment(show, click)
22 | def __init__(self, alpha, beta):
23 | self.alpha = alpha
24 | self.beta = beta
25 |
26 | def sample_from_beta(self, alpha, beta, num, imp_upperbound):
27 |         sample = np.random.beta(alpha, beta, num)
28 | I = []
29 | C = []
30 | for click_ratio in sample:
31 | imp = random.random() * imp_upperbound
32 | #imp = imp_upperbound
33 | click = imp * click_ratio
34 | I.append(imp)
35 | C.append(click)
36 | return I, C
37 |
38 | def update_from_data_by_FPI(self, tries, success, iter_num, epsilon):
39 | '''estimate alpha, beta using fixed point iteration'''
40 | for i in range(iter_num):
41 | new_alpha, new_beta = self.__fixed_point_iteration(tries, success, self.alpha, self.beta)
42 |             if abs(new_alpha-self.alpha)<epsilon and abs(new_beta-self.beta)<epsilon:
43 |                 break
44 |             self.alpha = new_alpha
45 |             self.beta = new_beta
191 |     count=temp.groupby(['userID']).apply(lambda x: x['appID'][(x['day']<day).values&(x['day']>day-8).values].count()).reset_index(name='appcount')
192 |     count['day']=day
193 |     res=res.append(count,ignore_index=True)
194 | res.to_csv(iapath+'all_user_seven_day_cnt.csv',index=False)
195 | res=pd.DataFrame()
196 | temp=actions[['userID','day','appID']]
197 | for day in range(28,32):
198 |     count=temp.groupby(['appID']).apply(lambda x: x['userID'][(x['day']<day).values&(x['day']>day-8).values].count()).reset_index(name='usercount')
199 |     count['day']=day
200 |     res=res.append(count,ignore_index=True)
201 | res.to_csv(iapath+'all_app_seven_day_cnt.csv',index=False)
202 | print('actions over...')
203 |
204 |
205 |
206 |
207 | X_loc_train,X_loc_test=readData(m_type='inner',drop=True)
208 | print('readData over')
209 | X_loc_train=doPre(X_loc_train)
210 | X_loc_test=doPre(X_loc_test)
211 | print('doPre over...')
212 |
213 | ########################################################## Statistical features: click count, conversion count, and conversion rate = conversions / clicks
214 | ########################################################## The preliminary round used a 7-day sliding window; the final round, following the weekly champion's sharing, uses all days before clickTime
215 | for feat_1 in ['creativeID','positionID','userID','sitesetID']:
216 | gc.collect()
217 | res=pd.DataFrame()
218 | temp=X_loc_train[[feat_1,'day','label']]
219 | for day in range(28,32):
220 |         count=temp.groupby([feat_1]).apply(lambda x: x['label'][(x['day']
--------------------------------------------------------------------------------
/doFeats_2.py:
--------------------------------------------------------------------------------
69 |     pos=(X_train['clickTime']//10000000>=scope[0]).values & (X_train['clickTime']//10000000<=scope[1]).values
70 | X_train=X_train.loc[pos,:]
71 | X_test = pd.read_csv(rawpath+'test.csv')
72 | X_train.drop('conversionTime', axis=1, inplace=True)
73 |
74 | userfile = pd.read_csv(rawpath+'user.csv')
75 | X_train = X_train.merge(userfile, how=m_type, on='userID')
76 | X_test = X_test.merge(userfile, how=m_type, on='userID')
77 | del userfile
78 | gc.collect()
79 |
80 | adfile = pd.read_csv(rawpath+'ad.csv')
81 | X_train = X_train.merge(adfile, how=m_type, on='creativeID')
82 | X_test = X_test.merge(adfile, how=m_type, on='creativeID')
83 | del adfile
84 | gc.collect()
85 |
86 | appcatfile = pd.read_csv(rawpath+'app_categories.csv')
87 | X_train = X_train.merge(appcatfile, how=m_type, on='appID')
88 | X_test = X_test.merge(appcatfile, how=m_type, on='appID')
89 | del appcatfile
90 | gc.collect()
91 |
92 | positionfile = pd.read_csv(rawpath+'position.csv')
93 | X_train = X_train.merge(positionfile, how=m_type, on='positionID')
94 | X_test = X_test.merge(positionfile, how=m_type, on='positionID')
95 | del positionfile
96 | gc.collect()
97 | print('merge type:', m_type)
98 | return X_train, X_test
99 |
100 | ################################## Duplicate-row trick: ~3.5 permille improvement in the preliminary round, ~3 permille on top of the raw features in the final round
101 | # What happens in the training set also happens in the test set
102 | def doTrick(data):
103 | subset = ['creativeID', 'positionID', 'adID', 'appID', 'userID']
104 | data['maybe'] = 0
105 | pos = data.duplicated(subset=subset, keep=False)
106 | data.loc[pos, 'maybe'] = 1
107 | pos = (~data.duplicated(subset=subset, keep='first')) & data.duplicated(subset=subset, keep=False)
108 | data.loc[pos, 'maybe'] = 2
109 | pos = (~data.duplicated(subset=subset, keep='last')) & data.duplicated(subset=subset, keep=False)
110 | data.loc[pos, 'maybe'] = 3
111 |
112 |     # A fairly critical step: when the trick was first found in the preliminary round the gain was small; one-hot encoding it added nearly 3 permille
113 | features_trans = ['maybe']
114 | data = pd.get_dummies(data, columns=features_trans)
115 | data['maybe_0'] = data['maybe_0'].astype(np.int8)
116 | data['maybe_1'] = data['maybe_1'].astype(np.int8)
117 | data['maybe_2'] = data['maybe_2'].astype(np.int8)
118 | data['maybe_3'] = data['maybe_3'].astype(np.int8)
119 |
120 |     # Time-difference trick: converting clickTime to seconds or to minutes was tried with only marginal differences, so in the end it is left unconverted
121 | temp = data.loc[:,['clickTime', 'creativeID', 'positionID', 'adID', 'appID', 'userID']].drop_duplicates(subset=subset, keep='first')
122 | # temp = temp.drop_duplicates(subset=subset, keep='first')
123 | temp.rename(columns={'clickTime': 'diffTime_first'}, inplace=True)
124 | data = pd.merge(data, temp, how='left', on=subset)
125 | data['diffTime_first'] = data['clickTime'] - data['diffTime_first']
126 | del temp,pos
127 | gc.collect()
128 | temp = data.loc[:,['clickTime', 'creativeID', 'positionID', 'adID', 'appID', 'userID']].drop_duplicates(subset=subset, keep='last')
129 | # temp = temp.drop_duplicates(subset=subset, keep='last')
130 | temp.rename(columns={'clickTime': 'diffTime_last'}, inplace=True)
131 | data = pd.merge(data, temp, how='left', on=subset)
132 | data['diffTime_last'] = data['diffTime_last'] - data['clickTime']
133 | del temp
134 | gc.collect()
135 |     data.loc[~data.duplicated(subset=subset, keep=False), ['diffTime_first', 'diffTime_last']] = -1  # setting these to 0 makes things worse
136 |
137 |     # whether the duplicate group has more than 2 clicks
138 | temp=data.groupby(subset)['label'].count().reset_index()
139 | temp.columns=['creativeID', 'positionID', 'adID', 'appID', 'userID','large2']
140 | temp['large2']=1*(temp['large2']>2)
141 | data = pd.merge(data, temp, how='left', on=subset)
142 | #-----------
143 | # data['last_click'] = data['clickTime']
144 | # pos = data.duplicated(subset=subset, keep=False)
145 | # data.loc[pos, 'last_click'] = data.loc[pos, 'last_click'].diff(periods=1)
146 | # pos = ~data.duplicated(subset=subset, keep='first')
147 | # data.loc[pos, 'last_click'] = -1
148 | # data['next_click'] = data['clickTime']
149 | # pos = data.duplicated(subset=subset, keep=False)
150 | # data.loc[pos, 'next_click'] = -1 * data.loc[pos, 'next_click'].diff(periods=-1)
151 | # pos = ~data.duplicated(subset=subset, keep='last')
152 | # data.loc[pos, 'next_click'] = -1
153 | # del pos
154 | # data['maybe_4']=data['maybe_1']+data['maybe_2']
155 | # data['maybe_5']=data['maybe_1']+data['maybe_3']
156 | # data['diffTime_span']=data['diffTime_last']+data['diffTime_first']
157 | #-------------
158 | del temp
159 | gc.collect()
160 | return data
161 |
162 | ################################## Trick2: the same idea applied to rows duplicated only by userID; high feature importance but poor online score, it duplicates the information of the first Trick
163 | def doTrick2(X_train,X_test):
164 | res = X_test[['instanceID']]
165 | X_test.drop('instanceID', axis=1, inplace=True)
166 | data = X_train.append(X_test, ignore_index=True)
167 | del X_train, X_test
168 | gc.collect()
169 |
170 | subset = ['userID']
171 | data['umaybe'] = 0
172 | pos = data.duplicated(subset=subset, keep=False)
173 | data.loc[pos, 'umaybe'] = 1
174 | pos = (~data.duplicated(subset=subset, keep='first')) & data.duplicated(subset=subset, keep=False)
175 | data.loc[pos, 'umaybe'] = 2
176 | pos = (~data.duplicated(subset=subset, keep='last')) & data.duplicated(subset=subset, keep=False)
177 | data.loc[pos, 'umaybe'] = 3
178 | del pos
179 | gc.collect()
180 | features_trans = ['umaybe']
181 | data = pd.get_dummies(data, columns=features_trans)
182 | data['umaybe_0'] = data['umaybe_0'].astype(np.int8)
183 | data['umaybe_1'] = data['umaybe_1'].astype(np.int8)
184 | data['umaybe_2'] = data['umaybe_2'].astype(np.int8)
185 | data['umaybe_3'] = data['umaybe_3'].astype(np.int8)
186 |
187 | temp = data[['clickTime','userID']]
188 | temp = temp.drop_duplicates(subset=subset, keep='first')
189 | temp.rename(columns={'clickTime': 'udiffTime_first'}, inplace=True)
190 | data = pd.merge(data, temp, how='left', on=subset)
191 | data['udiffTime_first'] = data['clickTime'] - data['udiffTime_first']
192 | del temp
193 | gc.collect()
194 | temp = data[['clickTime','userID']]
195 | temp = temp.drop_duplicates(subset=subset, keep='last')
196 | temp.rename(columns={'clickTime': 'udiffTime_last'}, inplace=True)
197 | data = pd.merge(data, temp, how='left', on=subset)
198 | data['udiffTime_last'] = data['udiffTime_last'] - data['clickTime']
199 | del temp
200 | gc.collect()
201 | data.loc[~data.duplicated(subset=subset, keep=False), ['udiffTime_first', 'udiffTime_last']] = -1
202 |
203 | X_train = data.loc[data['label'] != -1, :]
204 | X_test = data.loc[data['label'] == -1, :]
205 | X_test.loc[:, 'instanceID'] = res.values
206 |     del data
207 | gc.collect()
208 | return X_train, X_test
209 |
210 |
211 | def doPre(data):
212 | data['day'] = data['clickTime'] // 1000000
213 | data['hour'] = data['clickTime'] % 1000000 // 10000
214 | # data['clickTime'] = data['day'] * 1440 + (data['clickTime'] % 1000000 // 10000) * 60 + (data['clickTime'] % 10000 // 100) * 60 + data['clickTime'] % 100 # 默认
215 | # data['clickTime'] = data['day'] * 1440 + (data['clickTime'] % 1000000 // 10000) * 60 + data['clickTime'] % 10000#best
216 |
217 | # data['week'] = data['day'] % 7
218 |
219 | # data['appCategory_main'] = data['appCategory']
220 | # data.loc[data['appCategory'] > 99, 'appCategory_main'] = data.loc[data['appCategory'] > 99, 'appCategory'] // 100
221 | # data['appCategory'] = data['appCategory'] % 100
222 |
223 | # data.loc[data['age'] < 10,'age']=0
224 | # data.loc[(data['age'] >= 10)&(data['age']< 18), 'age'] = 1
225 | # data.loc[(data['age'] >= 18) & (data['age'] < 24), 'age'] = 2
226 | # data.loc[(data['age'] >= 24) & (data['age'] < 30), 'age'] = 3
227 | # data.loc[(data['age'] >= 30) & (data['age'] < 40), 'age'] = 4
228 | # data.loc[(data['age'] >= 40) & (data['age'] < 60), 'age'] = 5
229 | # data.loc[data['age'] >= 60, 'age'] = 6
230 |
231 | # data.loc[(data['hour'] >= 8) & (data['hour'] <14 ), 'preiod'] = 0
232 | # data.loc[(data['hour'] >= 14) | (data['hour'] < 8), 'preiod'] = 1
233 | # data = pd.get_dummies(data, columns=['preiod'])
234 | return data
235 |
236 | ################################## Mean-value features
237 | def doAvg(X_train, X_test):
238 | res = X_test[['instanceID']]
239 | X_test.drop('instanceID', axis=1, inplace=True)
240 | data = X_train.append(X_test, ignore_index=True)
241 | del X_train, X_test
242 | gc.collect()
243 |
244 |     # hourly mean features
245 | grouped = data.groupby('userID')['hour'].mean().reset_index()
246 | grouped.columns = ['userID', 'user_mean_hour']
247 | data = data.merge(grouped, how='left', on='userID')
248 | grouped = data.groupby('appID')['hour'].mean().reset_index()
249 | grouped.columns = ['appID', 'app_mean_hour']
250 | data = data.merge(grouped, how='left', on='appID')
251 | grouped = data.groupby('appCategory')['hour'].mean().reset_index()
252 | grouped.columns = ['appCategory', 'appCategory_mean_hour']
253 | data = data.merge(grouped, how='left', on='appCategory')
254 | grouped = data.groupby('positionID')['hour'].mean().reset_index()
255 | grouped.columns = ['positionID', 'position_mean_hour']
256 | data = data.merge(grouped, how='left', on='positionID')
257 |
258 |     # age mean features
259 | grouped = data.groupby('appID')['age'].mean().reset_index()
260 | grouped.columns = ['appID', 'app_mean_age']
261 | data = data.merge(grouped, how='left', on='appID')
262 | grouped = data.groupby('positionID')['age'].mean().reset_index()
263 | grouped.columns = ['positionID', 'position_mean_age']
264 | data = data.merge(grouped, how='left', on='positionID')
265 | grouped = data.groupby('appCategory')['age'].mean().reset_index()
266 | grouped.columns = ['appCategory', 'appCategory_mean_age']
267 | data = data.merge(grouped, how='left', on='appCategory')
268 | # grouped = data.groupby('creativeID')['age'].mean().reset_index()
269 | # grouped.columns = ['creativeID', 'creative_mean_age']
270 | # data = data.merge(grouped, how='left', on='creativeID')
271 | # grouped = data.groupby('adID')['age'].mean().reset_index()
272 | # grouped.columns = ['adID', 'ad_mean_age']
273 | # data = data.merge(grouped, how='left', on='adID')
274 |
275 | X_train = data.loc[data['label'] != -1, :]
276 | X_test = data.loc[data['label'] == -1, :]
277 | X_test.loc[:, 'instanceID'] = res.values
278 | del data, grouped
279 | gc.collect()
280 | return X_train, X_test
281 |
282 | ################################## Active-count (nunique) features
283 | def doActive(X_train, X_test):
284 | res = X_test[['instanceID']]
285 | X_test.drop('instanceID', axis=1, inplace=True)
286 | data = X_train.append(X_test, ignore_index=True)
287 | del X_train, X_test
288 | gc.collect()
289 |
290 |     # For active counts pick attributes with many categories; with only a few categories the nunique values barely differ. Ads are always online while users are not, so a user seen only once has active ad/app/advertiser/campaign/creative counts of 1
291 |     # number of distinct hours in which the user is active
292 | add = pd.DataFrame(data.groupby(["userID"]).hour.nunique()).reset_index()
293 | add.columns = ["userID", "user_active_hour"]
294 | data = data.merge(add, on=["userID"], how="left")
295 |
296 |     # number of distinct active apps
297 | add = pd.DataFrame(data.groupby(["appCategory"]).appID.nunique()).reset_index()
298 | add.columns = ["appCategory", "appCategory_active_app"]
299 | data = data.merge(add, on=["appCategory"], how="left")
300 | # add = pd.DataFrame(data.groupby(["userID"]).appID.nunique()).reset_index()
301 | # add.columns = ["userID", "user_active_app"]
302 | # data = data.merge(add, on=["userID"], how="left")
303 | # add = pd.DataFrame(data.groupby(["age"]).appID.nunique()).reset_index()
304 | # add.columns = ["age", "age_active_app"]
305 | # data = data.merge(add, on=["age"], how="left")
306 | # add = pd.DataFrame(data.groupby(["sitesetID"]).appID.nunique()).reset_index()
307 | # add.columns = ["sitesetID", "siteset_active_app"]
308 | # data = data.merge(add, on=["sitesetID"], how="left")
309 | # add = pd.DataFrame(data.groupby(["positionType"]).appID.nunique()).reset_index()
310 | # add.columns = ["positionType", "positionType_active_app"]
311 | # data = data.merge(add, on=["positionType"], how="left")
312 | # add = pd.DataFrame(data.groupby(["positionID"]).appID.nunique()).reset_index()
313 | # add.columns = ["positionID", "position_active_app"]
314 | # data = data.merge(add, on=["positionID"], how="left")
315 | add = pd.DataFrame(data.groupby(["connectionType"]).appID.nunique()).reset_index()
316 | add.columns = ["connectionType", "connectionType_active_app"]
317 | data = data.merge(add, on=["connectionType"], how="left")
318 |
319 |     # number of distinct active positions
320 | add = pd.DataFrame(data.groupby(["appID"]).positionID.nunique()).reset_index()
321 | add.columns = ["appID", "app_active_position"]
322 | data = data.merge(add, on=["appID"], how="left")
323 | add = pd.DataFrame(data.groupby(["appCategory"]).positionID.nunique()).reset_index()
324 | add.columns = ["appCategory", "appCategory_active_position"]
325 | data = data.merge(add, on=["appCategory"], how="left")
326 | # add = pd.DataFrame(data.groupby(["userID"]).positionID.nunique()).reset_index()
327 | # add.columns = ["userID", "user_active_position"]
328 | # data = data.merge(add, on=["userID"], how="left")
329 | # add = pd.DataFrame(data.groupby(["age"]).positionID.nunique()).reset_index()
330 | # add.columns = ["age", "age_active_position"]
331 | # data = data.merge(add, on=["age"], how="left")
332 | # add = pd.DataFrame(data.groupby(["positionType"]).positionID.nunique()).reset_index()
333 | # add.columns = ["positionType", "positionType_active_position"]
334 | # data = data.merge(add, on=["positionType"], how="left")
335 | # add = pd.DataFrame(data.groupby(["advertiserID"]).positionID.nunique()).reset_index()
336 | # add.columns = ["advertiserID", "advertiser_active_position"]
337 | # data = data.merge(add, on=["advertiserID"], how="left")
338 |
339 |     # number of distinct active users
340 | add = pd.DataFrame(data.groupby(["appID"]).userID.nunique()).reset_index()
341 | add.columns = ["appID", "app_active_user"]
342 | data = data.merge(add, on=["appID"], how="left")
343 | add = pd.DataFrame(data.groupby(["positionID"]).userID.nunique()).reset_index()
344 | add.columns = ["positionID", "position_active_user"]
345 | data = data.merge(add, on=["positionID"], how="left")
346 | add = pd.DataFrame(data.groupby(["appCategory"]).userID.nunique()).reset_index()
347 | add.columns = ["appCategory", "appCategory_active_user"]
348 | data = data.merge(add, on=["appCategory"], how="left")
349 |
350 | add = pd.DataFrame(data.groupby(["userID"]).creativeID.nunique()).reset_index()
351 | add.columns = ["userID", "user_active_creative"]
352 | data = data.merge(add, on=["userID"], how="left")
353 | # add = pd.DataFrame(data.groupby(["userID"]).sitesetID.nunique()).reset_index()
354 | # add.columns = ["userID", "user_active_siteset"]
355 | # data = data.merge(add, on=["userID"], how="left")
356 | # add = pd.DataFrame(data.groupby(["userID"]).appCategory.nunique()).reset_index()
357 | # add.columns = ["userID", "user_active_appCategory"]
358 | # data = data.merge(add, on=["userID"], how="left")
359 | add = pd.DataFrame(data.groupby(["positionID"]).advertiserID.nunique()).reset_index()
360 | add.columns = ["positionID", "positionID_active_advertiser"]
361 | data = data.merge(add, on=["positionID"], how="left")
362 |
363 |
364 | X_train = data.loc[data['label'] != -1, :]
365 | X_test = data.loc[data['label'] == -1, :]
366 | X_test.loc[:, 'instanceID'] = res.values
367 | del data, add
368 | gc.collect()
369 | return X_train, X_test
370 |
371 | ################################## The operations below were tried, did not help, and were eventually abandoned
372 | def doOneHot(X_train, X_test):
373 | res = X_test[['instanceID']]
374 | X_test.drop('instanceID', axis=1, inplace=True)
375 | data = X_train.append(X_test, ignore_index=True)
376 | del X_train, X_test
377 | gc.collect()
378 |
379 | features_trans = ['gender','appCategory_main','connectionType']
380 | data = pd.get_dummies(data, columns=features_trans)
381 |
382 | X_train = data.loc[data['label'] != -1, :]
383 | X_test = data.loc[data['label'] == -1, :]
384 | X_test.loc[:, 'instanceID'] = res.values
385 | del data
386 | gc.collect()
387 | return X_train, X_test
388 | def doCrossProduct(data):
389 | data['position_creative'] = data['positionID'] * data['creativeID']
390 | data['creative_age'] = data['creativeID'] * data['age']
391 | return data
392 | def doDescartes(X_train, X_test):
393 | res = X_test[['instanceID']]
394 | X_test.drop('instanceID', axis=1, inplace=True)
395 | data = X_train.append(X_test, ignore_index=True)
396 | del X_train, X_test
397 | gc.collect()
398 |
399 | for feat_1 in ['maybe_0', 'maybe_2']:
400 | for feat_2 in ['connectionType', 'creativeID', 'positionID']:
401 | le = LabelEncoder()
402 | data[feat_1 + '_' + feat_2] = le.fit_transform(data[feat_1].astype('str') + data[feat_2].astype('str'))
403 | X_train = data.loc[data['label'] != -1, :]
404 | X_test = data.loc[data['label'] == -1, :]
405 | X_test.loc[:, 'instanceID'] = res.values
406 | del data
407 | gc.collect()
408 | return X_train, X_test
409 | def doSpecial(X_train, X_test):
410 | res = X_test[['instanceID']]
411 | X_test.drop('instanceID', axis=1, inplace=True)
412 | data = X_train.append(X_test, ignore_index=True)
413 | del X_train, X_test
414 | gc.collect()
415 |
416 |     ##### slope between listing_id and time (left over from another project; these columns are not part of this competition's data)
417 | Min_id = data["listing_id"].min()
418 | Min_time = data["time"].min()
419 | data["gradient"] = ((data["listing_id"]) - Min_id) / (data["time"] - Min_time)
420 |
421 | X_train = data.loc[data['label'] != -1, :]
422 | X_test = data.loc[data['label'] == -1, :]
423 | X_test.loc[:, 'instanceID'] = res.values
424 | del data
425 | gc.collect()
426 | return X_train, X_test
427 |
428 |
429 | X_loc_train,X_loc_test=readData(m_type='inner',scope=(28,30))
430 | print('readData over')
431 | X_loc_train=doPre(X_loc_train)
432 | X_loc_test=doPre(X_loc_test)
433 | print('doPre over...')
434 |
435 | ########################################################## Features from the actions and installed files
436 | temp = pd.read_csv(iapath+'all_app_seven_day_cnt.csv')
437 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appID', 'day'])
438 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appID', 'day'])
439 | temp = pd.read_csv(iapath+'all_user_seven_day_cnt.csv')
440 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['userID', 'day'])
441 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['userID', 'day'])
442 | temp = pd.read_csv(iapath+'userInstalledappscount.csv')
443 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['userID'])
444 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['userID'])
445 | temp = pd.read_csv(iapath+'appInstalledusercount.csv')
446 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appID'])
447 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appID'])
448 | temp = pd.read_csv(iapath+'ageuserInstalledappscount.csv')
449 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['age'])
450 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['age'])
451 | temp = pd.read_csv(iapath+'appCatInstalledusercount.csv')
452 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appCategory'])
453 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appCategory'])
454 | temp = pd.read_csv(iapath+'eduuserInstalledappscount.csv')
455 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['education'])
456 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['education'])
457 | temp = pd.read_csv(iapath+'genderuserInstalledappscount.csv')
458 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['gender'])
459 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['gender'])
460 |
461 | ########################################################## Average appID conversion report-back time features
462 | temp = pd.read_csv(temppath+'app_cov_diffTime.csv')
463 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appID'])
464 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appID'])
465 | temp = pd.read_csv(temppath+'appCat_cov_diffTime.csv')
466 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appCategory'])
467 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appCategory'])
468 | X_loc_train['cov_diffTime'].fillna(value=X_loc_train['appCat_cov_diffTime'], inplace=True)
469 | X_loc_test['cov_diffTime'].fillna(value=X_loc_test['appCat_cov_diffTime'], inplace=True)
470 | X_loc_train.drop(['appCat_cov_diffTime'],axis=1,inplace=True)
471 | X_loc_test.drop(['appCat_cov_diffTime'],axis=1,inplace=True)
472 | print('app_cov_diffTime over...')
473 |
474 | ########################################################## Active-count features
475 | X_loc_train,X_loc_test=doActive(X_loc_train,X_loc_test)
476 | print('doActive over...')
477 |
478 | ########################################################## Mean-value features
479 | X_loc_train,X_loc_test=doAvg(X_loc_train,X_loc_test)
480 | print('doAvg over...')
481 |
482 | print(X_loc_train.shape)
483 | print(X_loc_train.columns)
484 | # res = X_loc_test[['instanceID']]
485 | # X_loc_test.drop('instanceID', axis=1, inplace=True)
486 | # data = X_loc_train.append(X_loc_test, ignore_index=True)
487 | # del X_loc_train, X_loc_test
488 | # gc.collect()
489 | # # data.sort_values(['userID','clickTime'],inplace=True,kind='mergesort')
490 | # # data['ulast_click']=data['clickTime']
491 | # # pos=data.duplicated(subset=['userID'], keep=False)
492 | # # data.loc[pos,'ulast_click']=data.loc[pos,'ulast_click'].diff(periods=1)
493 | # # pos=~data.duplicated(subset=['userID'], keep='first')
494 | # # data.loc[pos,'ulast_click']=-1
495 | # # data['unext_click']=data['clickTime']
496 | # # pos=data.duplicated(subset=['userID'], keep=False)
497 | # # data.loc[pos,'unext_click']=-1*data.loc[pos,'unext_click'].diff(periods=-1)
498 | # # pos=~data.duplicated(subset=['userID'], keep='last')
499 | # # data.loc[pos,'unext_click']=-1
500 | # # del pos
501 | # # temp = data.loc[:, ['clickTime', 'userID']].drop_duplicates(subset=['userID'],keep='first')
502 | # # temp.rename(columns={'clickTime': 'udiffTime_first'}, inplace=True)
503 | # # data = pd.merge(data, temp, how='left', on=['userID'])
504 | # # data['udiffTime_first'] = data['clickTime'] - data['udiffTime_first']
505 | # # del temp
506 | # # gc.collect()
507 | # # temp = data.loc[:, ['clickTime', 'userID']].drop_duplicates(subset=['userID'],keep='last')
508 | # # temp.rename(columns={'clickTime': 'udiffTime_last'}, inplace=True)
509 | # # data = pd.merge(data, temp, how='left', on=['userID'])
510 | # # data['udiffTime_last'] = data['udiffTime_last'] - data['clickTime']
511 | # # del temp
512 | # # gc.collect()
513 | # # data.loc[~data.duplicated(subset=['userID'], keep=False), ['udiffTime_first', 'udiffTime_last']] = -1
514 | #
515 | # X_loc_train = data.loc[data['label'] != -1, :]
516 | # X_loc_test = data.loc[data['label'] == -1, :]
517 | # X_loc_test.loc[:, 'instanceID'] = res.values
518 | # # del data
519 | # del data
520 | # gc.collect()
521 |
522 |
523 | ########################################################## Statistical features: the final round uses all days before clickTime; essentially only the smoothed conversion rate is kept, click and conversion counts are dropped
524 | # A submission accidentally included both creativeID_smooth and creativeID_rate and scored well; dropping the rate made things worse, so it has been kept ever since
525 | # Bayesian smoothing of user-related features is very time-consuming; in the preliminary round a click-count threshold on the conversion rate worked about as well as smoothing, but the threshold is hard to choose
526 | for feat_1 in ['creativeID','positionID','userID']:
527 | temp = pd.read_csv(temppath+'%s.csv' %feat_1)
528 | bs = BayesianSmoothing(1, 1)
529 | bs.update(temp[feat_1 + '_all'].values, temp[feat_1 + '_1'].values, 1000, 0.001)
530 | temp[feat_1 + '_smooth'] = (temp[feat_1 + '_1'] + bs.alpha) / (temp[feat_1 + '_all'] + bs.alpha + bs.beta)
531 | if feat_1 in ['creativeID']:
532 | temp[feat_1 + '_rate'] = temp[feat_1 + '_1'] / temp[feat_1 + '_all']
533 | temp.drop([feat_1 + '_1',feat_1 + '_all'],axis=1,inplace=True)
534 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=[feat_1, 'day'])
535 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=[feat_1, 'day'])
536 | del temp
537 | gc.collect()
538 | print(feat_1 + ' over...')
539 | X_loc_train.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True)
540 | X_loc_test.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True)
541 | # few categories, no smoothing needed
542 | for feat_1 in ['sitesetID']:
543 | temp = pd.read_csv(temppath+'%s.csv' %feat_1)
544 | temp[feat_1 + '_rate'] = temp[feat_1 + '_1'] / temp[feat_1 + '_all']
545 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=[feat_1, 'day'])
546 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=[feat_1, 'day'])
547 | del temp
548 | gc.collect()
549 | print(feat_1 + ' over...')
550 | X_loc_train.fillna(value=0, inplace=True)
551 | X_loc_test.fillna(value=0, inplace=True)
552 |
553 | # The three-feature combinations come from the weekly champion's sharing: download behaviour under network-condition constraints, and user attributes reflecting app demand
554 | for feat_1,feat_2,feat_3 in[('appID','connectionType','positionID'),('appID','haveBaby','gender')]:
555 | temp = pd.read_csv(temppath+'%s.csv' % (feat_1+'_'+feat_2+'_'+feat_3))
556 | bs = BayesianSmoothing(1, 1)
557 | bs.update(temp[feat_1+'_'+feat_2+'_'+feat_3 + '_all'].values, temp[feat_1+'_'+feat_2+'_'+feat_3 + '_1'].values, 1000, 0.001)
558 | temp[feat_1+'_'+feat_2+'_'+feat_3 + '_smooth'] = (temp[feat_1+'_'+feat_2+'_'+feat_3 + '_1'] + bs.alpha) / (temp[feat_1+'_'+feat_2+'_'+feat_3 + '_all'] + bs.alpha + bs.beta)
559 | temp.drop([feat_1+'_'+feat_2+'_'+feat_3+ '_1',feat_1+'_'+feat_2+'_'+feat_3 + '_all'],axis=1,inplace=True)
560 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=[feat_1,feat_2,feat_3, 'day'])
561 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=[feat_1,feat_2,feat_3, 'day'])
562 | del temp
563 | gc.collect()
564 | print(feat_1 + '_' + feat_2+'_'+feat_3+ ' over...')
565 | X_loc_train.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True)
566 | X_loc_test.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True)
567 |
568 | # The click counts of userID and positionID rank high in feature importance; this is the only click count kept among all the statistical features
569 | for feat_1,feat_2 in[('positionID','advertiserID'),('userID','sitesetID'),('positionID','connectionType'),('userID','positionID'),
570 | ('appPlatform','positionType'),('advertiserID','connectionType'),('positionID','appCategory'),('appID','age'),
571 | ('userID', 'appID'),('userID','connectionType'),('appCategory','connectionType'),('appID','hour'),('hour','age')]:
572 | temp = pd.read_csv(temppath+'%s.csv' % (feat_1+'_'+feat_2))
573 | bs = BayesianSmoothing(1, 1)
574 | bs.update(temp[feat_1+'_'+feat_2 + '_all'].values, temp[feat_1+'_'+feat_2 + '_1'].values, 1000, 0.001)
575 | temp[feat_1+'_'+feat_2 + '_smooth'] = (temp[feat_1+'_'+feat_2 + '_1'] + bs.alpha) / (temp[feat_1+'_'+feat_2 + '_all'] + bs.alpha + bs.beta)
576 | if (feat_1,feat_2) in [('userID','positionID')]:
577 | temp.drop([feat_1 + '_' + feat_2 + '_1'], axis=1, inplace=True)
578 | else:
579 | temp.drop([feat_1+'_'+feat_2 + '_1',feat_1+'_'+feat_2 + '_all'],axis=1,inplace=True)
580 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=[feat_1,feat_2, 'day'])
581 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=[feat_1,feat_2, 'day'])
582 | del temp
583 | gc.collect()
584 | print(feat_1 + '_' + feat_2 + ' over...')
585 | X_loc_train.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True)
586 | X_loc_test.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True)
587 |
588 |
589 | ##########################################################doTrick
590 | X_loc_train=doTrick(X_loc_train)
591 | X_loc_test=doTrick(X_loc_test)
592 |
593 | ########################################################## Drop raw features with low importance or many missing values
594 | drop = ['hometown', 'haveBaby', 'telecomsOperator', 'userID', 'clickTime',
595 | 'appPlatform', 'connectionType', 'marriageStatus', 'positionType',
596 | 'gender', 'education', 'camgaignID', 'positionID','maybe_0'
597 | ]
598 | X_loc_train.drop(drop, axis=1, inplace=True)
599 | X_loc_train.fillna(value=0, inplace=True)
600 | X_loc_test.drop(drop, axis=1, inplace=True)
601 | X_loc_test.fillna(value=0, inplace=True)
602 | print('over')
603 | print(X_loc_train.shape)
604 | print(X_loc_train.columns)
605 | X_loc_train.to_csv(temppath+'2_smooth.csv',index=False)
606 | X_loc_test.to_csv(temppath+'2_test_smooth.csv',index=False)
607 |
--------------------------------------------------------------------------------