├── .idea ├── vcs.xml ├── misc.xml ├── modules.xml └── TencentSocialAds.iml ├── cross features stat.py ├── Other.py ├── model - appID.py ├── feat_label.py ├── model - group_value.py ├── model - LR06.py ├── train_xgb.py ├── train_cv.py ├── README.md ├── model - xgb - cross features.py ├── doFeats_1.py └── doFeats_2.py /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/TencentSocialAds.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /cross features stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import f_pack 4 | 5 | data = pd.read_csv(f_pack.file_train) 6 | dfAd = pd.read_csv(f_pack.file_ad) 7 | data = pd.merge(data, dfAd, how='left', on='creativeID') 8 | ''' 9 | g1 = data.groupby(['positionID', 'connectionType']).apply(lambda x: np.mean(x["label"])).reset_index() 10 | g1.columns = ['positionID', 'connectionType', 'mean_pos_conn'] 11 | g1.to_csv(f_pack.file_cf_pos_conn) 12 | 13 | g2 = data.groupby(['positionID', 'advertiserID']).apply(lambda x: np.mean(x["label"])).reset_index() 14 | g2.columns = ['positionID', 'advertiserID', 'mean_pos_adv'] 15 | g2.to_csv(f_pack.file_cf_pos_adv) 16 | ''' 17 | 18 | g3 = data.groupby(['userID']).apply(lambda x: np.mean(x["label"])).reset_index() 19 | g3.columns = ['userID', 'mean_userID'] 20 | g3.to_csv(f_pack.file_f_mean_userID) 21 | -------------------------------------------------------------------------------- /Other.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import math 4 | 5 | def zeroMean(dataMat): 6 | meanVal=np.mean(dataMat,axis=0) 7 | newData=dataMat-meanVal 8 | return newData,meanVal 9 | 10 | newData,meanVal=zeroMean(X_train) 11 | res=newData.corr(method='pearson') 12 | 13 | def f(x): 14 | res=1/(1+np.e**(-x)) 15 | return res 16 | 17 | def f_ver(x): 18 | res=np.log(x/(1-x)) 19 | return res 20 | 21 | 22 | df=pd.read_csv(r'a.csv') 23 | print(df.prob.mean()) 24 | avg=0.027232030146226882#0.0273 25 | b=[-2,2] 26 | df.prob1=df.prob 27 | while abs(np.mean(df.prob1)-avg)>0.00001: 28 | mid=(b[0]+b[1])/2.0 29 | df.prob1=df.prob.apply(lambda x:math.log(x/(1-x))) 30 | df.prob1=df.prob1.apply(lambda x:x+mid) 31 | df.prob1=df.prob1.apply(lambda x:1/(1+math.exp(-x))) 32 | if np.mean(df.prob1)>avg: 33 | b[1]=mid 34 | else: 35 | b[0]=mid 36 | df.prob=df.prob1 37 | del df.prob1 38 | df.to_csv(r'submission.csv',index=False) -------------------------------------------------------------------------------- /model - appID.py: -------------------------------------------------------------------------------- 1 | """ 2 | baseline 1: history pCVR of creativeID/adID/camgaignID/advertiserID/appID/appPlatform 3 | """ 4 | 5 | # res: 0.1 appID is a great feature 6 | 7 | import 
zipfile 8 | import numpy as np 9 | import pandas as pd 10 | import f_pack 11 | 12 | # load data 13 | dfTrain = pd.read_csv(f_pack.file_train) 14 | dfTest = pd.read_csv(f_pack.file_test) 15 | dfAd = pd.read_csv(f_pack.file_ad) 16 | 17 | # process data 18 | dfTrain = pd.merge(dfTrain, dfAd, on="creativeID") 19 | dfTest = pd.merge(dfTest, dfAd, on="creativeID") 20 | y_train = dfTrain["label"].values 21 | 22 | # model building 23 | key = "appID" 24 | dfCvr = dfTrain.groupby(key).apply(lambda df: np.mean(df["label"])).reset_index() 25 | dfCvr.columns = [key, "avg_cvr"] 26 | dfCvr.to_csv(f_pack.file_appID_score) 27 | dfTest = pd.merge(dfTest, dfCvr, how="left", on=key) 28 | dfTest["avg_cvr"].fillna(np.mean(dfTrain["label"]), inplace=True) 29 | proba_test = dfTest["avg_cvr"].values 30 | 31 | # submission 32 | df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": proba_test}) 33 | df.sort_values("instanceID", inplace=True) 34 | df.to_csv(f_pack.file_submission, index=False) 35 | -------------------------------------------------------------------------------- /feat_label.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import f_pack 3 | 4 | 5 | def stat(group): 6 | group = group.sort_values('minute') 7 | group = group.reset_index() 8 | group['h_potential'] = 0 9 | group['h_potential02'] = 0 10 | length = group.size / len(group.columns) 11 | if length < 2: 12 | return group 13 | 14 | minute = group['minute'] 15 | 16 | if length == 2: 17 | if minute[1] - minute[0] <= 3 or (0 <= minute[1] <= 1 and 58 <= minute[0] < 60): 18 | group.loc[0, 'h_potential'] = 1 19 | return group 20 | 21 | count = 0 22 | for i in range(len(minute) - 1): 23 | if minute[i + 1] - minute[i] <= 3 or (0 <= minute[i + 1] <= 1 and 58 <= minute[i] < 60): 24 | count = 1 25 | group.loc[i, 'h_potential'] = 1 26 | if i > 0 and group.loc[i - 1, 'h_potential02'] == 1: 27 | group.loc[i, 'h_potential'] = 0 28 | if count == 1 and i < len(minute) - 2: 29 | if minute[i + 2] - minute[i + 1] <= 3 or (0 <= minute[i + 2] <= 1 and 58 <= minute[i + 1] < 60): 30 | group.loc[i, 'h_potential02'] = 1 31 | count = 0 32 | return group 33 | 34 | 35 | data = pd.read_csv(f_pack.file_train) 36 | print(data.shape) 37 | data['day'] = data['clickTime'].map(lambda x: x // 10000) 38 | days = data['day'] 39 | days = days.unique() 40 | save_data = pd.DataFrame() 41 | for day in days: 42 | test_data = data[data['day'] == day] 43 | print('test_data num is %d' % len(test_data)) 44 | sub_data = data[data['day'] == day].copy() 45 | print('sub_data num is %d' % len(sub_data)) 46 | sub_data['minute'] = sub_data['clickTime'] % 100 47 | sub_data = sub_data.groupby('userID', as_index=False).apply(stat).reset_index() 48 | sub_data = sub_data.drop(['level_0', 'level_1', 'index'], axis=1) 49 | print('sub_data num is %d' % len(sub_data)) 50 | save_data = pd.concat([save_data, sub_data]) 51 | print(save_data.shape) 52 | 53 | save_data.to_csv(f_pack.file_train_s, index=False) 54 | data = pd.read_csv(f_pack.file_train_s) 55 | print(data.shape) 56 | -------------------------------------------------------------------------------- /model - group_value.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy import sparse 4 | from sklearn.preprocessing import OneHotEncoder 5 | from sklearn.linear_model import LogisticRegression 6 | import f_pack 7 | 8 | # load data 9 | dfTrain = pd.read_csv(f_pack.file_train) 10 | 
dfTest = pd.read_csv(f_pack.file_test) 11 | dfAd = pd.read_csv(f_pack.file_ad) 12 | dfUser = pd.read_csv(f_pack.file_user) 13 | dfUser['age'] = dfUser['age'].map(lambda x: (x // 5) + 1 if x > 0 else 0) 14 | dfUser['haveBaby'] = dfUser['haveBaby'].map(lambda x: 3 if x >= 3 else x) 15 | dfAppCate = pd.read_csv(f_pack.file_app_categories) 16 | dfPos = pd.read_csv(f_pack.file_position) 17 | dfAppScore = pd.read_csv(f_pack.file_appID_score) 18 | # process data 19 | print(dfTrain.shape) 20 | print(dfTest.shape) 21 | dfTrain = pd.merge(dfTrain, dfAd, on="creativeID") 22 | dfTest = pd.merge(dfTest, dfAd, on="creativeID") 23 | dfTrain = pd.merge(dfTrain, dfUser, on='userID') 24 | dfTest = pd.merge(dfTest, dfUser, on='userID') 25 | dfTrain = pd.merge(dfTrain, dfAppCate, on='appID') 26 | dfTest = pd.merge(dfTest, dfAppCate, on='appID') 27 | dfTrain = pd.merge(dfTrain, dfPos, on='positionID') 28 | dfTest = pd.merge(dfTest, dfPos, on='positionID') 29 | y_train = dfTrain["label"].values 30 | print(dfTrain.shape) 31 | print(dfTest.shape) 32 | 33 | # feature engineering/encoding 34 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "camgaignID", "advertiserID", 35 | "appPlatform", 'age', 'gender', 'education', 'marriageStatus', 'haveBaby', 'residence', 'appID', 36 | 'positionType', 'sitesetID', 'hometown', 'appCategory'] 37 | 38 | for i, feat in enumerate(feats): 39 | dfCvr = dfTrain.groupby(feat).apply(lambda df: np.mean(df["label"])).reset_index() 40 | dfCvr.columns = [feat, feat + "_avg_cvr"] 41 | dfTrain = pd.merge(dfTrain, dfCvr, how="left", on=feat) 42 | dfTest = pd.merge(dfTest, dfCvr, how="left", on=feat) 43 | dfTest.fillna(0, inplace=True) 44 | 45 | filt = '.*_avg_cvr' 46 | X_train = dfTrain.filter(regex=filt) 47 | X_test = dfTest.filter(regex=filt) 48 | 49 | # model building 50 | lr = LogisticRegression(penalty='l1') 51 | lr.fit(X_train, y_train) 52 | proba_test = lr.predict_proba(X_test)[:, 1] 53 | param=pd.DataFrame({"columns":list(X_train.columns), "coef":list(lr.coef_.T)}) 54 | print(param) 55 | 56 | # submission 57 | df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": proba_test}) 58 | df.sort_values("instanceID", inplace=True) 59 | df.to_csv(f_pack.file_submission, index=False) 60 | -------------------------------------------------------------------------------- /model - LR06.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | from scipy import sparse 4 | from sklearn.preprocessing import OneHotEncoder 5 | from sklearn.linear_model import LogisticRegression 6 | import f_pack 7 | from sklearn.model_selection import train_test_split 8 | import numpy as np 9 | 10 | 11 | def format_cross_features(dfTrain, dfTest, feat1, feat2): 12 | feat = feat1 + '-' + feat2 13 | dfTrain[feat] = dfTrain[feat1] + '.' + dfTrain[feat2] 14 | dfTrain[feat] = dfTrain[feat].astype(float) 15 | dfTrain[feat] = dfTrain[feat] * 10000 16 | dfTest[feat] = dfTest[feat1] + '.' 
+ dfTest[feat2] 17 | dfTest[feat] = dfTest[feat].astype(float) 18 | dfTest[feat] = dfTest[feat] * 10000 19 | return dfTrain, dfTest 20 | 21 | 22 | # load data 23 | dfTrain, dfTest, y_label = f_pack.load_data() 24 | # cross feature 25 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "camgaignID", "advertiserID", 26 | "appPlatform", 'gender', 'marriageStatus', 'haveBaby', 'residence', 'age', 'education', 'appID'] 27 | for feat in feats: 28 | dfTrain[feat], dfTest[feat] = dfTrain[feat].astype(str), dfTest[feat].astype(str) 29 | 30 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'age', 'education') 31 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'appID') 32 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'appPlatform') 33 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'connectionType') 34 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'advertiserID') 35 | 36 | dfTrain = dfTrain.fillna(0) 37 | dfTrain = dfTrain.replace(np.inf, 0) 38 | dfTest = dfTest.replace(np.inf, 0) 39 | dfTest = dfTest.fillna(0) 40 | 41 | enc = OneHotEncoder() 42 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "camgaignID", "advertiserID", 43 | "appPlatform", 'gender', 'marriageStatus', 'haveBaby', 'residence', 'age', 'education', 'positionID-appID', 44 | 'positionID-appPlatform', 'positionID-connectionType'] 45 | 46 | for i, feat in enumerate(feats): 47 | x_train = enc.fit_transform(dfTrain[feat].values.reshape(-1, 1)) 48 | x_test = enc.transform(dfTest[feat].values.reshape(-1, 1)) 49 | if i == 0: 50 | X_train, X_test = x_train, x_test 51 | else: 52 | X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test)) 53 | 54 | feats = ['download_num', 'avg_cvr', 'user_install_num', 'app_install_num'] 55 | for feat in feats: 56 | X_train = sparse.hstack((X_train, dfTrain[feat].values.reshape(-1, 1))) 57 | X_test = sparse.hstack((X_test, dfTest[feat].values.reshape(-1, 1))) 58 | 59 | X_train, test_set, y_train, y_test_set = train_test_split(X_train, y_label, test_size=0.2, random_state=0) 60 | # model training 61 | print("start modeling") 62 | lr = LogisticRegression(penalty='l1') 63 | lr.fit(X_train, y_train) 64 | proba_test = lr.predict_proba(X_test)[:, 1] 65 | 66 | # submission 67 | df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": proba_test}) 68 | df.sort_values("instanceID", inplace=True) 69 | df.to_csv(f_pack.file_submission, index=False) 70 | 71 | # metric 72 | predictions = lr.predict(test_set) 73 | f_pack.print_metrics(y_test_set, predictions) 74 | -------------------------------------------------------------------------------- /train_xgb.py: -------------------------------------------------------------------------------- 1 | from feature_set import make_train_set 2 | from feature_set import make_test_set 3 | from sklearn.model_selection import train_test_split 4 | import xgboost as xgb 5 | from datetime import datetime 6 | import pandas as pd 7 | from sklearn.utils import resample 8 | import f_pack 9 | import matplotlib.pyplot as plt 10 | import operator 11 | 12 | def statistic(group): 13 | group = group.sort_values('label', ascending=False).head(1) 14 | return group 15 | 16 | def ceate_feature_map(features): 17 | outfile = open('xgb.fmap', 'w') 18 | i = 0 19 | for feat in features: 20 | outfile.write('{0}\t{1}\tq\n'.format(i, feat)) 21 | i = i + 1 22 | outfile.close() 23 | 24 | def 
underSampling(training_data, label): 25 | n = label.size // 8 26 | small_data, small_label = resample(training_data, label, n_samples=n) 27 | data = pd.concat([training_data, label], axis=1) 28 | positive = data[data['label'] == 1] 29 | small_data = pd.concat([small_data, positive.ix[:, :positive.columns.size - 1]]) 30 | small_label = pd.concat([small_label, positive.ix[:, positive.columns.size - 1]]) 31 | return small_data, small_label 32 | 33 | 34 | def xgboost_make_submission(): 35 | training_data, label = make_train_set() 36 | instanceID, test_trainning_data = make_test_set() 37 | instanceID = instanceID.reset_index() 38 | test_trainning_data = xgb.DMatrix(test_trainning_data.values) 39 | print('start fit') 40 | for i in range(1): 41 | small_data, small_label = underSampling(training_data, label) 42 | ceate_feature_map(small_data.columns) 43 | X_train, X_test, y_train, y_test = train_test_split(small_data.values, small_label.values, test_size=0.2, 44 | random_state=0) 45 | dtrain = xgb.DMatrix(X_train, label=y_train) 46 | dtest = xgb.DMatrix(X_test, label=y_test) 47 | param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3, 48 | 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8, 49 | 'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'} 50 | num_round = 300 51 | watchList = [(dtest, 'eval'), (dtrain, 'train')] 52 | plst = list(param.items()) + [('eval_metric', 'logloss')] 53 | bst = xgb.train(plst, dtrain, num_round, watchList) 54 | y = bst.predict(test_trainning_data) 55 | instanceID = pd.concat([instanceID, pd.Series(y)], axis=1) 56 | # feature importance 57 | feature_score=bst.get_fscore(fmap='xgb.fmap') 58 | print(feature_score) 59 | feature_score = sorted(feature_score.items(), key=operator.itemgetter(1)) 60 | df = pd.DataFrame(feature_score, columns=['feature', 'fscore']) 61 | df['fscore'] = df['fscore'] / df['fscore'].sum() 62 | df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(16, 10)) 63 | plt.show() 64 | instanceID.to_csv(f_pack.file_output_test, index=False) 65 | 66 | # output 67 | data = f_pack.read_file(f_pack.file_output_test) 68 | data = data.ix[:, 1:] 69 | print(data.head()) 70 | data['Prob'] = data.ix[:, 1:].sum(axis=1) / 10 71 | data = data[['instanceID', 'Prob']] 72 | data.to_csv(f_pack.file_submission, index=False) 73 | 74 | 75 | if __name__ == '__main__': 76 | print(datetime.now()) 77 | xgboost_make_submission() 78 | print(datetime.now()) 79 | -------------------------------------------------------------------------------- /train_cv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy as sp 4 | import lightgbm as lgb 5 | import gc 6 | import datetime 7 | import random 8 | import scipy.special as special 9 | from sklearn.cross_validation import train_test_split 10 | from sklearn.cross_validation import StratifiedKFold 11 | 12 | rawpath='C:\\final\\' 13 | temppath='C:\\final\\temp\\' 14 | iapath='C:\\final\\temp\\installedactions\\' 15 | 16 | def logloss(act, preds): 17 | epsilon = 1e-15 18 | preds = sp.maximum(epsilon, preds) 19 | preds = sp.minimum(1 - epsilon, preds) 20 | ll = sum(act * sp.log(preds) + sp.subtract(1, act) * sp.log(sp.subtract(1, preds))) 21 | ll = ll * -1.0 / len(act) 22 | return ll 23 | 24 | 25 | def getTrainVal(X_train, scope=(28, 29), val_type='30', seed=1000): 26 | if val_type == '30': 27 | X_val = X_train.loc[X_train['day'] == 30, :] 28 | X_train = 
X_train.loc[(X_train['day'] >= scope[0]) & (X_train['day'] <= scope[1]), :] 29 | elif val_type == '73': 30 | X_train = X_train.loc[(X_train['day'] >= scope[0]) & (X_train['day'] <= scope[1]), :] 31 | X_train, X_val, y_train, y_val = train_test_split(X_train, X_train['label'], test_size=0.3, random_state=seed) 32 | return X_train, X_val 33 | 34 | 35 | t_start = datetime.datetime.now() 36 | X_loc_train=pd.read_csv(temppath+'2_smooth.csv') 37 | print('load train over...') 38 | X_loc_test=pd.read_csv(temppath+'2_test_smooth.csv') 39 | print('load test over...') 40 | 41 | ##########################################################CV预测时30号验证效果不好,而其中一折做验证来提前停止,后面删除30天数据 42 | X_loc_train, X_loc_val = getTrainVal(X_loc_train, scope=(28, 29), val_type='30', seed=1000) 43 | 44 | drop = ['label', 'day'] 45 | y_loc_train = X_loc_train.loc[:, 'label'] 46 | X_loc_train.drop(drop, axis=1, inplace=True) 47 | 48 | # y_loc_val = X_loc_val.loc[:, 'label'] 49 | # X_loc_val.drop(drop, axis=1, inplace=True) 50 | 51 | res = X_loc_test.loc[:, ['instanceID']] 52 | X_loc_test.drop(['instanceID'], axis=1, inplace=True) 53 | X_loc_test.drop(drop, axis=1, inplace=True) 54 | 55 | 56 | gc.collect() 57 | print('preprocess over...', X_loc_train.shape) 58 | 59 | ##########################################################比赛只用了lightGBM单模型 60 | X_loc_train=X_loc_train.values 61 | y_loc_train=y_loc_train.values 62 | # X_loc_val=X_loc_val.values 63 | # y_loc_val=y_loc_val.values 64 | X_loc_test=X_loc_test.values 65 | 66 | ##########################################################交叉预测,实际上是stacking第一层做的操作 67 | # 利用不同折数加参数,特征,样本(随机数种子)扰动,再加权平均得到最终成绩 68 | model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=29, max_depth=-1, learning_rate=0.1, n_estimators=10000, 69 | max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0, 70 | min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1, 71 | colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, nthread=-1, silent=True) 72 | del X_loc_val 73 | 74 | skf=list(StratifiedKFold(y_loc_train, n_folds=10, shuffle=True, random_state=1024)) 75 | for i, (train, test) in enumerate(skf): 76 | print("Fold", i) 77 | model.fit(X_loc_train[train], y_loc_train[train], eval_metric='logloss',eval_set=[(X_loc_train[train], y_loc_train[train]), (X_loc_train[test], y_loc_train[test])],early_stopping_rounds=100) 78 | preds= model.predict_proba(X_loc_test, num_iteration=model.best_iteration)[:, 1] 79 | print('mean:', preds.mean()) 80 | res['prob_%s' % str(i)] = preds 81 | 82 | #平均或者加权的方式有很多种,台大三傻的比赛分享里有一个利用sigmoid反函数来平均的方法效果不错 83 | now = datetime.datetime.now() 84 | now = now.strftime('%m-%d-%H-%M') 85 | print(now) 86 | res.sort_values("instanceID", ascending=True, inplace=True) 87 | res.to_csv(rawpath+"%s.csv" % now, index=False) 88 | 89 | t_end = datetime.datetime.now() 90 | print('training time: %s' % ((t_end - t_start).seconds/60)) 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TencentSocialAds 2 | 腾讯社交广告高校算法大赛 3 | objective: 预测移动广告被点击后激活的概率 [pCVR=P(conversion=1 | ad, user, context)] 4 | 5 | Data Cleaning: 6 | 7 | 1. 30th day is inaccuarate but valuable because the prediction problem is time-sensitive. So how to take advantage it? 
Because conversion report-back times vary in length, the labels of the last five days may be inaccurate, especially those of day 30. Deleting all of day 30 would lose a large amount of useful information, while keeping all of it introduces considerable noise. We found that the report-back time is related to the appID, so we computed the average conversion report-back time of each appID and removed the day-30 records whose appIDs have long average report-back times.
8 |
9 | Feature engineering:
10 |
11 | 1. Feature types: raw features, statistical features, time-series features, cross features.
12 | 2. Statistical features need Bayesian smoothing (a minimal sketch is given after the "Model and Training" list below).
13 | 3. Time-series features, such as the number of apps installed before clickTime, or the number of installed apps of the same category before clickTime.
14 | 4. How to select cross features? Use xgb feature importance, then run xgb again to get the updated feature importance.
15 | 5. How to encode cross features? (1) hash plus one-hot encoding; (2) groupby, i.e. turn the cross feature into a statistical feature.
16 |
17 | Data Set Construction:
18 |
19 | 1. Use the data of days 28 and 29 to predict day 31.
20 | 2. The conversion ratio of days 17-20 seems unstable; we should remove it.
21 |
22 | Model and Training:
23 |
24 | 1. Be careful of data leakage.
25 | 2. A model ensemble should combine different kinds of models, such as xgb with LR; xgb and LightGBM are both tree-based models, so that combination is not as good as one might expect.
26 | 3. Ensemble methods: weighted average, stacking, random seeds.
27 | 4. The final result can be multiplied by a ratio to approach the platform's mean conversion ratio.
28 |
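The Bayesian smoothing mentioned in item 2 of the feature-engineering list (and applied in doFeats_1.py / doFeats_2.py through a Beta prior on per-key conversion rates) can be illustrated with a minimal moment-matching sketch. This is only an illustration, not the repository's BayesianSmoothing/HyperParam code; the DataFrame layout and the column names `clicks` and `conversions` are hypothetical.

```python
import pandas as pd

def fit_beta_by_moments(clicks, conversions):
    # Method-of-moments fit of a Beta(alpha, beta) prior to the observed per-key rates.
    cvr = conversions / clicks.clip(lower=1)
    mean, var = cvr.mean(), cvr.var()
    common = mean * (1 - mean) / var - 1   # assumes var > 0
    return mean * common, (1 - mean) * common

def smoothed_cvr(df, click_col='clicks', conv_col='conversions'):
    # Shrink each key's raw conversion rate toward the prior mean: keys with few
    # clicks are pulled strongly toward the global behaviour, popular keys barely move.
    alpha, beta = fit_beta_by_moments(df[click_col], df[conv_col])
    return (df[conv_col] + alpha) / (df[click_col] + alpha + beta)

# Hypothetical usage: per-positionID smoothed CVR.
# stats = train.groupby('positionID')['label'].agg(clicks='size', conversions='sum').reset_index()
# stats['positionID_smooth_cvr'] = smoothed_cvr(stats)
```

Item 4 under "Model and Training" (rescaling predictions toward the platform's mean conversion ratio) corresponds to what Other.py does: a bisection search over a constant logit shift until the mean predicted probability matches the target average.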
29 | ## Highlights shared by other teams
30 | ### Rank 14th, team: 竟然有这些操作
31 |
32 | Trick features:
33 | Inspecting the raw data, it is easy to find many duplicate records that differ only in clickTime and label. Sorting by time shows that when such a duplicate group converts, the positive label usually sits at the head or the tail of the group and only occasionally in the middle. Patterns that appear in the training set also appear in the test set, so these positions are marked and one-hot encoded for the model to learn, together with time-difference features. The article shared by this team during the competition explains the trick in more detail; towards the end of the competition several similar articles (article 1 and article 2 in the original write-up) were found and are worth consulting.
34 |
35 | Statistical features:
36 | The raw features fall into three main groups: ad features, user features, and position features. Statistical features are built from their cross combinations; because of hardware limits, mainly conversion rates were kept while click and conversion counts were dropped. The preliminary round used a 7-day sliding window; the final round followed the weekly champion's sharing and computed statistics over all days before clickTime. The three-way combination features also come from that sharing: download behaviour under network-condition constraints, plus user attributes mined for app demand. Bayesian smoothing of user-related features is very time-consuming; in the preliminary round a click-count threshold was applied to the conversion rate instead, which worked about as well as smoothing, although the threshold was hard to choose precisely.
37 |
38 | Active-count features:
39 | Inspired by another shared solution, e.g. the number of distinct apps served in a given ad position.
40 |
41 | Mean features:
42 | For example, the average age of the users who clicked a given ad.
43 |
44 | Average report-back time features:
45 | Using report-back time the wrong way easily causes leakage. Following a share in the official group, the average report-back time of each appID was computed; apps with no conversions use the average of their app category instead.
46 |
47 | User action-stream and history features:
48 | Join the installed file on user and app to obtain historical statistical features, and run a 7-day sliding window over the actions file to obtain user and app activity-stream features.
49 |
50 | Some other features:
51 |
52 | Cold-start features;
53 | Rank features;
54 | User click counts and conversion counts are very sparse: LabelEncode each separately, then concatenate and LabelEncode again;
55 | Bin the 24 hours of a day into half-hour buckets;
56 | Discretize continuous features (by binning, by decision-tree split points, or by XGB leaf indices), then concatenate them with the raw discrete features and feed them to FFM;
57 | Build statistical features in a cross-validation fashion to prevent leakage;
58 | User conversion sequences (e.g. 0010);
59 | Delete, from the last few days, the records of appIDs or advertiserIDs with long average report-back times (report-back time depends on the advertiser or the app);
60 | Multi-window statistics (within 1 minute, 1 hour, 1 day, ...): use multiple windows to turn each sample into a 2-D input and feed it to a CNN (strong at capturing local information), with no pooling but with dropout, etc.;
61 |
62 | ### Rank 20, team: unknown
63 |
64 | User click-log mining _2_1_gen_user_click_features.py
65 | Mine the ad click logs and extract statistical features of user click behaviour at different time granularities (day, hour) and along different attribute dimensions (clicked creative, ad, campaign, advertiser type, position, etc.).
66 |
67 | User install-log mining _2_2_gen_app_install_features.py
68 | From users' historical app-install logs, analyse install preferences and app popularity trends, and combine app install-time information to extract time-dimension descriptor vectors for apps. In the end only one feature of this kind was used.
69 |
70 | Advertiser conversion report-back mechanism analysis _2_4_gen_tricks.py
71 | Different advertisers count conversions differently (first click, last click, or the click at install time); analysing this and building the corresponding descriptive features improves prediction accuracy.
72 |
73 | Ad conversion-rate feature extraction _2_5_gen_smooth_cvr.py
74 | Build conversion-rate features: compute single-feature and combined-feature conversion rates globally and over sliding windows, and correct them with mean filling, hierarchical filling, Bayesian smoothing, Laplace smoothing, and so on.
75 |
76 | Ad descriptor-vector feature extraction _2_6_gen_ID_click_vectors.py
77 | Ads are delivered to specific audiences, and those audiences in turn describe the ad; represent adIDs and appIDs as vectors over different demographic attributes to learn latent semantic features.
78 | Modelling and prediction
79 | Train several kinds of models, including LightGBM, XGBoost, FFM and neural networks, and finally combine them with weighted fusion to improve the overall performance.
80 |
81 | Summary: early on we were satisfied with the scores LR gave us and did not anticipate that, as features accumulated, LR could no longer express them well. We switched to xgboost too late, so the score improved slowly and we probably missed many important features. Our use of sparse matrices also needs to improve. The initially poor xgb result came from a coding error (the wrong label column was selected, a consequence of pasting code from the earlier JD competition); next time we face this kind of sparse matrix we should think about how to handle it cleanly with pandas.
82 |
83 | Mu Li has pointed out that whether a model uses discrete or continuous features is really a trade-off between "massive discrete features + simple model" and "a few continuous features + complex model". One can discretize and use a linear model, or keep continuous features and use deep learning; it depends on whether you prefer to iterate on features or on models. Generally the former is easier, can be parallelized across many people, and has a proven track record; the latter currently looks very promising, but how far it can go remains to be seen.
84 | Logistic regression is a generalized linear model with limited expressive power; after a single variable is discretized into N indicators, each indicator gets its own weight, which effectively introduces non-linearity, strengthens the model's expressive power, and improves the fit.
85 | After discretization, features can be crossed, turning M+N variables into M*N variables, which introduces further non-linearity and expressive power.
86 |
87 | References:
88 | 30th https://github.com/oooozhizhi/TencentSocialAdvertising-30th-solutions
89 | 26th https://jiayi797.github.io/categories/腾讯算法大赛-CVR预估/
90 | 23rd https://github.com/BladeCoda/Tencent2017_Final_Coda_Allegro
91 | 20th https://github.com/shenweichen/Tencent_Social_Ads2017_Mobile_App_pCVR
92 | 14th https://github.com/freelzy/Tencent_Social_Ads
93 | 7th http://blog.csdn.net/ben3ben/article/details/74838338
94 |
-------------------------------------------------------------------------------- /model - xgb - cross features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import f_pack 3 | from scipy import sparse 4 | from sklearn.preprocessing import OneHotEncoder 5 | from sklearn.model_selection import train_test_split 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from sklearn.utils import resample 9 | import xgboost as xgb 10 | import operator 11 | 12 | 13 | def underSampling(training_data, label): 14 | n = label.size // 10 15 | small_data, small_label = resample(training_data, label, n_samples=n) 16 | positive = training_data[training_data['label'] == 1] 17 | small_data = pd.concat([small_data, positive]) 18 | small_label = pd.concat([pd.DataFrame(small_label), positive['label']]) 19 | return small_data, small_label 20 | 21 | 22 | def format_cross_features(dfTrain, dfTest, feat1, feat2): 23 | feat = feat1 + '-' + feat2 24 | dfTrain[feat] = dfTrain[feat1] + '.' + dfTrain[feat2] 25 | dfTrain[feat] = dfTrain[feat].astype(float) 26 | dfTrain[feat] = dfTrain[feat] * 10000 27 | dfTest[feat] = dfTest[feat1] + '.' 
+ dfTest[feat2] 28 | dfTest[feat] = dfTest[feat].astype(float) 29 | dfTest[feat] = dfTest[feat] * 10000 30 | return dfTrain, dfTest 31 | 32 | 33 | def model(n_round): 34 | # load data 35 | dfTrain, dfTest, y_label = f_pack.load_data() 36 | # cross feature 37 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "sitesetID", "advertiserID", 38 | "appPlatform", 'gender', 'marriageStatus', 'haveBaby', 'residence', 'age', 'education', 'appID'] 39 | for feat in feats: 40 | dfTrain[feat], dfTest[feat] = dfTrain[feat].astype(str), dfTest[feat].astype(str) 41 | 42 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'age', 'education') 43 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'appID') 44 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'appPlatform') 45 | dfTrain, dfTest = format_cross_features(dfTrain, dfTest, 'positionID', 'connectionType') 46 | 47 | dfTrain = dfTrain.fillna(0) 48 | dfTrain = dfTrain.replace(np.inf, 0) 49 | dfTest = dfTest.replace(np.inf, 0) 50 | dfTest = dfTest.fillna(0) 51 | 52 | enc = OneHotEncoder() 53 | feats = ['positionID', 'connectionType', 'telecomsOperator', "creativeID", "adID", "camgaignID", "advertiserID", 54 | "appPlatform", 'gender', 'marriageStatus', 'haveBaby', 'residence', 'age', 'education', 'positionID-appID', 55 | 'positionID-appPlatform', 'positionID-connectionType'] 56 | 57 | for i, feat in enumerate(feats): 58 | x_train = enc.fit_transform(dfTrain[feat].values.reshape(-1, 1)) 59 | x_test = enc.transform(dfTest[feat].values.reshape(-1, 1)) 60 | if i == 0: 61 | X_train, X_test = x_train, x_test 62 | else: 63 | X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test)) 64 | 65 | feats = ['download_num', 'avg_cvr', 'user_install_num', 'app_install_num', 'h_potential', 'h_potential02'] 66 | for feat in feats: 67 | X_train = sparse.hstack((X_train, dfTrain[feat].values.reshape(-1, 1))) 68 | X_test = sparse.hstack((X_test, dfTest[feat].values.reshape(-1, 1))) 69 | 70 | # model training 71 | print("start modeling") 72 | X_train, valid_set, y_train, y_valid = train_test_split(X_train, y_label, test_size=0.05, random_state=0) 73 | dtrain = xgb.DMatrix(X_train, label=y_train) 74 | dtest = xgb.DMatrix(valid_set, label=y_valid) 75 | param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 10, 76 | 'min_child_weight': 5, 'gamma': 0, 'silent': 1, 'objective': 'binary:logistic', 77 | 'early_stopping_rounds': 50} 78 | # xgb.cv(param, dtrain, n_round, nfold=5, metrics={'auc'}, seed=0, 79 | # callbacks=[xgb.callback.print_evaluation(show_stdv=True)], early_stopping_rounds=20) 80 | watchList = [(dtest, 'eval'), (dtrain, 'train')] 81 | plst = list(param.items()) + [('eval_metric', 'logloss')] 82 | bst = xgb.train(plst, dtrain, n_round, watchList) 83 | y = bst.predict(xgb.DMatrix(X_test)) 84 | res = pd.concat([dfTest['instanceID'], pd.Series(y)], axis=1) 85 | res = res.sort_values('instanceID') 86 | res['instanceID'] = res['instanceID'].astype(int) 87 | res.columns = ['instanceID', 'proba'] 88 | print(res.shape) 89 | res.to_csv(f_pack.file_submission, index=False) 90 | 91 | 92 | model(230) 93 | 94 | ''' 95 | # feature importance 96 | feature_score = bst.get_fscore() 97 | feature_score = sorted(feature_score.items(), key=operator.itemgetter(1)) 98 | print(feature_score) 99 | df = pd.DataFrame(feature_score, columns=['feature', 'fscore']) 100 | df['fscore'] = df['fscore'] / df['fscore'].sum() 101 | df.plot(kind='barh', x='feature', y='fscore', 
legend=False, figsize=(16, 10)) 102 | plt.show() 103 | 104 | proba_test = lr.predict_proba(X_test)[:, 1] 105 | # submission 106 | df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": proba_test}) 107 | df.sort_values("instanceID", inplace=True) 108 | df.to_csv(f_pack.file_submission, index=False) 109 | ''' 110 | -------------------------------------------------------------------------------- /doFeats_1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy as sp 4 | import gc 5 | import datetime 6 | import random 7 | import scipy.special as special 8 | 9 | rawpath='C:\\final\\' 10 | temppath='C:\\final\\temp\\' 11 | iapath='C:\\final\\temp\\installedactions\\' 12 | 13 | def logloss(act, preds): 14 | epsilon = 1e-15 15 | preds = sp.maximum(epsilon, preds) 16 | preds = sp.minimum(1 - epsilon, preds) 17 | ll = sum(act * sp.log(preds) + sp.subtract(1, act) * sp.log(sp.subtract(1, preds))) 18 | ll = ll * -1.0 / len(act) 19 | return ll 20 | 21 | class HyperParam(object):#平滑,这个快一点;hyper=HyperParam(1, 1); hyper.update_from_data_by_moment(show, click) 22 | def __init__(self, alpha, beta): 23 | self.alpha = alpha 24 | self.beta = beta 25 | 26 | def sample_from_beta(self, alpha, beta, num, imp_upperbound): 27 | sample = numpy.random.beta(alpha, beta, num) 28 | I = [] 29 | C = [] 30 | for click_ratio in sample: 31 | imp = random.random() * imp_upperbound 32 | #imp = imp_upperbound 33 | click = imp * click_ratio 34 | I.append(imp) 35 | C.append(click) 36 | return I, C 37 | 38 | def update_from_data_by_FPI(self, tries, success, iter_num, epsilon): 39 | '''estimate alpha, beta using fixed point iteration''' 40 | for i in range(iter_num): 41 | new_alpha, new_beta = self.__fixed_point_iteration(tries, success, self.alpha, self.beta) 42 | if abs(new_alpha-self.alpha)day-8).values].count()).reset_index(name='appcount') 192 | count['day']=day 193 | res=res.append(count,ignore_index=True) 194 | res.to_csv(iapath+'all_user_seven_day_cnt.csv',index=False) 195 | res=pd.DataFrame() 196 | temp=actions[['userID','day','appID']] 197 | for day in range(28,32): 198 | count=temp.groupby(['appID']).apply(lambda x: x['userID'][(x['day']day-8).values].count()).reset_index(name='usercount') 199 | count['day']=day 200 | res=res.append(count,ignore_index=True) 201 | res.to_csv(iapath+'all_app_seven_day_cnt.csv',index=False) 202 | print('actions over...') 203 | 204 | 205 | 206 | 207 | X_loc_train,X_loc_test=readData(m_type='inner',drop=True) 208 | print('readData over') 209 | X_loc_train=doPre(X_loc_train) 210 | X_loc_test=doPre(X_loc_test) 211 | print('doPre over...') 212 | 213 | ##########################################################统计特征,统计特征为点击数,转化数,转化率为转化数/点击数, 214 | ##########################################################初赛用7天滑窗算统计,决赛根据周冠军分享改为了使用了clickTime之前所有天算统计 215 | for feat_1 in ['creativeID','positionID','userID','sitesetID']: 216 | gc.collect() 217 | res=pd.DataFrame() 218 | temp=X_loc_train[[feat_1,'day','label']] 219 | for day in range(28,32): 220 | count=temp.groupby([feat_1]).apply(lambda x: x['label'][(x['day']=scope[0]).values & (X_train['clickTime']//10000000<=scope[1]).values 70 | X_train=X_train.loc[pos,:] 71 | X_test = pd.read_csv(rawpath+'test.csv') 72 | X_train.drop('conversionTime', axis=1, inplace=True) 73 | 74 | userfile = pd.read_csv(rawpath+'user.csv') 75 | X_train = X_train.merge(userfile, how=m_type, on='userID') 76 | X_test = X_test.merge(userfile, how=m_type, on='userID') 77 | 
del userfile 78 | gc.collect() 79 | 80 | adfile = pd.read_csv(rawpath+'ad.csv') 81 | X_train = X_train.merge(adfile, how=m_type, on='creativeID') 82 | X_test = X_test.merge(adfile, how=m_type, on='creativeID') 83 | del adfile 84 | gc.collect() 85 | 86 | appcatfile = pd.read_csv(rawpath+'app_categories.csv') 87 | X_train = X_train.merge(appcatfile, how=m_type, on='appID') 88 | X_test = X_test.merge(appcatfile, how=m_type, on='appID') 89 | del appcatfile 90 | gc.collect() 91 | 92 | positionfile = pd.read_csv(rawpath+'position.csv') 93 | X_train = X_train.merge(positionfile, how=m_type, on='positionID') 94 | X_test = X_test.merge(positionfile, how=m_type, on='positionID') 95 | del positionfile 96 | gc.collect() 97 | print('merge type:', m_type) 98 | return X_train, X_test 99 | 100 | ##################################重复数据Trick,初赛有3.5个千分点提升,决赛在原始数据的基础上有3个千分点提升 101 | #训练集上的情况也会在测试集上出现 102 | def doTrick(data): 103 | subset = ['creativeID', 'positionID', 'adID', 'appID', 'userID'] 104 | data['maybe'] = 0 105 | pos = data.duplicated(subset=subset, keep=False) 106 | data.loc[pos, 'maybe'] = 1 107 | pos = (~data.duplicated(subset=subset, keep='first')) & data.duplicated(subset=subset, keep=False) 108 | data.loc[pos, 'maybe'] = 2 109 | pos = (~data.duplicated(subset=subset, keep='last')) & data.duplicated(subset=subset, keep=False) 110 | data.loc[pos, 'maybe'] = 3 111 | 112 | #比较关键的一步,初赛刚发现trick时提升不多,经过onehot后提升近3个千分点 113 | features_trans = ['maybe'] 114 | data = pd.get_dummies(data, columns=features_trans) 115 | data['maybe_0'] = data['maybe_0'].astype(np.int8) 116 | data['maybe_1'] = data['maybe_1'].astype(np.int8) 117 | data['maybe_2'] = data['maybe_2'].astype(np.int8) 118 | data['maybe_3'] = data['maybe_3'].astype(np.int8) 119 | 120 | #时间差Trick,对clickTime处理成秒,分钟都尝试过,效果有些微差别,最后选择不进行处理 121 | temp = data.loc[:,['clickTime', 'creativeID', 'positionID', 'adID', 'appID', 'userID']].drop_duplicates(subset=subset, keep='first') 122 | # temp = temp.drop_duplicates(subset=subset, keep='first') 123 | temp.rename(columns={'clickTime': 'diffTime_first'}, inplace=True) 124 | data = pd.merge(data, temp, how='left', on=subset) 125 | data['diffTime_first'] = data['clickTime'] - data['diffTime_first'] 126 | del temp,pos 127 | gc.collect() 128 | temp = data.loc[:,['clickTime', 'creativeID', 'positionID', 'adID', 'appID', 'userID']].drop_duplicates(subset=subset, keep='last') 129 | # temp = temp.drop_duplicates(subset=subset, keep='last') 130 | temp.rename(columns={'clickTime': 'diffTime_last'}, inplace=True) 131 | data = pd.merge(data, temp, how='left', on=subset) 132 | data['diffTime_last'] = data['diffTime_last'] - data['clickTime'] 133 | del temp 134 | gc.collect() 135 | data.loc[~data.duplicated(subset=subset, keep=False), ['diffTime_first', 'diffTime_last']] = -1 #置0会变差 136 | 137 | #重复次数是否大于2 138 | temp=data.groupby(subset)['label'].count().reset_index() 139 | temp.columns=['creativeID', 'positionID', 'adID', 'appID', 'userID','large2'] 140 | temp['large2']=1*(temp['large2']>2) 141 | data = pd.merge(data, temp, how='left', on=subset) 142 | #----------- 143 | # data['last_click'] = data['clickTime'] 144 | # pos = data.duplicated(subset=subset, keep=False) 145 | # data.loc[pos, 'last_click'] = data.loc[pos, 'last_click'].diff(periods=1) 146 | # pos = ~data.duplicated(subset=subset, keep='first') 147 | # data.loc[pos, 'last_click'] = -1 148 | # data['next_click'] = data['clickTime'] 149 | # pos = data.duplicated(subset=subset, keep=False) 150 | # data.loc[pos, 'next_click'] = -1 * data.loc[pos, 
'next_click'].diff(periods=-1) 151 | # pos = ~data.duplicated(subset=subset, keep='last') 152 | # data.loc[pos, 'next_click'] = -1 153 | # del pos 154 | # data['maybe_4']=data['maybe_1']+data['maybe_2'] 155 | # data['maybe_5']=data['maybe_1']+data['maybe_3'] 156 | # data['diffTime_span']=data['diffTime_last']+data['diffTime_first'] 157 | #------------- 158 | del temp 159 | gc.collect() 160 | return data 161 | 162 | ##################################Trick2基于userID重复的数据做,重要性高但是线上效果不好,和Trick信息重复了 163 | def doTrick2(X_train,X_test): 164 | res = X_test[['instanceID']] 165 | X_test.drop('instanceID', axis=1, inplace=True) 166 | data = X_train.append(X_test, ignore_index=True) 167 | del X_train, X_test 168 | gc.collect() 169 | 170 | subset = ['userID'] 171 | data['umaybe'] = 0 172 | pos = data.duplicated(subset=subset, keep=False) 173 | data.loc[pos, 'umaybe'] = 1 174 | pos = (~data.duplicated(subset=subset, keep='first')) & data.duplicated(subset=subset, keep=False) 175 | data.loc[pos, 'umaybe'] = 2 176 | pos = (~data.duplicated(subset=subset, keep='last')) & data.duplicated(subset=subset, keep=False) 177 | data.loc[pos, 'umaybe'] = 3 178 | del pos 179 | gc.collect() 180 | features_trans = ['umaybe'] 181 | data = pd.get_dummies(data, columns=features_trans) 182 | data['umaybe_0'] = data['umaybe_0'].astype(np.int8) 183 | data['umaybe_1'] = data['umaybe_1'].astype(np.int8) 184 | data['umaybe_2'] = data['umaybe_2'].astype(np.int8) 185 | data['umaybe_3'] = data['umaybe_3'].astype(np.int8) 186 | 187 | temp = data[['clickTime','userID']] 188 | temp = temp.drop_duplicates(subset=subset, keep='first') 189 | temp.rename(columns={'clickTime': 'udiffTime_first'}, inplace=True) 190 | data = pd.merge(data, temp, how='left', on=subset) 191 | data['udiffTime_first'] = data['clickTime'] - data['udiffTime_first'] 192 | del temp 193 | gc.collect() 194 | temp = data[['clickTime','userID']] 195 | temp = temp.drop_duplicates(subset=subset, keep='last') 196 | temp.rename(columns={'clickTime': 'udiffTime_last'}, inplace=True) 197 | data = pd.merge(data, temp, how='left', on=subset) 198 | data['udiffTime_last'] = data['udiffTime_last'] - data['clickTime'] 199 | del temp 200 | gc.collect() 201 | data.loc[~data.duplicated(subset=subset, keep=False), ['udiffTime_first', 'udiffTime_last']] = -1 202 | 203 | X_train = data.loc[data['label'] != -1, :] 204 | X_test = data.loc[data['label'] == -1, :] 205 | X_test.loc[:, 'instanceID'] = res.values 206 | del temp,data 207 | gc.collect() 208 | return X_train, X_test 209 | 210 | 211 | def doPre(data): 212 | data['day'] = data['clickTime'] // 1000000 213 | data['hour'] = data['clickTime'] % 1000000 // 10000 214 | # data['clickTime'] = data['day'] * 1440 + (data['clickTime'] % 1000000 // 10000) * 60 + (data['clickTime'] % 10000 // 100) * 60 + data['clickTime'] % 100 # 默认 215 | # data['clickTime'] = data['day'] * 1440 + (data['clickTime'] % 1000000 // 10000) * 60 + data['clickTime'] % 10000#best 216 | 217 | # data['week'] = data['day'] % 7 218 | 219 | # data['appCategory_main'] = data['appCategory'] 220 | # data.loc[data['appCategory'] > 99, 'appCategory_main'] = data.loc[data['appCategory'] > 99, 'appCategory'] // 100 221 | # data['appCategory'] = data['appCategory'] % 100 222 | 223 | # data.loc[data['age'] < 10,'age']=0 224 | # data.loc[(data['age'] >= 10)&(data['age']< 18), 'age'] = 1 225 | # data.loc[(data['age'] >= 18) & (data['age'] < 24), 'age'] = 2 226 | # data.loc[(data['age'] >= 24) & (data['age'] < 30), 'age'] = 3 227 | # data.loc[(data['age'] >= 30) & (data['age'] < 40), 
'age'] = 4 228 | # data.loc[(data['age'] >= 40) & (data['age'] < 60), 'age'] = 5 229 | # data.loc[data['age'] >= 60, 'age'] = 6 230 | 231 | # data.loc[(data['hour'] >= 8) & (data['hour'] <14 ), 'preiod'] = 0 232 | # data.loc[(data['hour'] >= 14) | (data['hour'] < 8), 'preiod'] = 1 233 | # data = pd.get_dummies(data, columns=['preiod']) 234 | return data 235 | 236 | ##################################均值特征 237 | def doAvg(X_train, X_test): 238 | res = X_test[['instanceID']] 239 | X_test.drop('instanceID', axis=1, inplace=True) 240 | data = X_train.append(X_test, ignore_index=True) 241 | del X_train, X_test 242 | gc.collect() 243 | 244 | # 小时均值特征 245 | grouped = data.groupby('userID')['hour'].mean().reset_index() 246 | grouped.columns = ['userID', 'user_mean_hour'] 247 | data = data.merge(grouped, how='left', on='userID') 248 | grouped = data.groupby('appID')['hour'].mean().reset_index() 249 | grouped.columns = ['appID', 'app_mean_hour'] 250 | data = data.merge(grouped, how='left', on='appID') 251 | grouped = data.groupby('appCategory')['hour'].mean().reset_index() 252 | grouped.columns = ['appCategory', 'appCategory_mean_hour'] 253 | data = data.merge(grouped, how='left', on='appCategory') 254 | grouped = data.groupby('positionID')['hour'].mean().reset_index() 255 | grouped.columns = ['positionID', 'position_mean_hour'] 256 | data = data.merge(grouped, how='left', on='positionID') 257 | 258 | # 年龄均值特征 259 | grouped = data.groupby('appID')['age'].mean().reset_index() 260 | grouped.columns = ['appID', 'app_mean_age'] 261 | data = data.merge(grouped, how='left', on='appID') 262 | grouped = data.groupby('positionID')['age'].mean().reset_index() 263 | grouped.columns = ['positionID', 'position_mean_age'] 264 | data = data.merge(grouped, how='left', on='positionID') 265 | grouped = data.groupby('appCategory')['age'].mean().reset_index() 266 | grouped.columns = ['appCategory', 'appCategory_mean_age'] 267 | data = data.merge(grouped, how='left', on='appCategory') 268 | # grouped = data.groupby('creativeID')['age'].mean().reset_index() 269 | # grouped.columns = ['creativeID', 'creative_mean_age'] 270 | # data = data.merge(grouped, how='left', on='creativeID') 271 | # grouped = data.groupby('adID')['age'].mean().reset_index() 272 | # grouped.columns = ['adID', 'ad_mean_age'] 273 | # data = data.merge(grouped, how='left', on='adID') 274 | 275 | X_train = data.loc[data['label'] != -1, :] 276 | X_test = data.loc[data['label'] == -1, :] 277 | X_test.loc[:, 'instanceID'] = res.values 278 | del data, grouped 279 | gc.collect() 280 | return X_train, X_test 281 | 282 | ##################################活跃数特征 283 | def doActive(X_train, X_test): 284 | res = X_test[['instanceID']] 285 | X_test.drop('instanceID', axis=1, inplace=True) 286 | data = X_train.append(X_test, ignore_index=True) 287 | del X_train, X_test 288 | gc.collect() 289 | 290 | # 活跃特征选取类别多的,类别太少,nunique差别不大,广告随时都在,用户不是时刻都在,一个只出现一次的用户活跃的ad,app,advertiser,camgaign,creative都为1 291 | # 用户活跃小时数 292 | add = pd.DataFrame(data.groupby(["userID"]).hour.nunique()).reset_index() 293 | add.columns = ["userID", "user_active_hour"] 294 | data = data.merge(add, on=["userID"], how="left") 295 | 296 | # 活跃app数特征 297 | add = pd.DataFrame(data.groupby(["appCategory"]).appID.nunique()).reset_index() 298 | add.columns = ["appCategory", "appCategory_active_app"] 299 | data = data.merge(add, on=["appCategory"], how="left") 300 | # add = pd.DataFrame(data.groupby(["userID"]).appID.nunique()).reset_index() 301 | # add.columns = ["userID", "user_active_app"] 302 | # data 
= data.merge(add, on=["userID"], how="left") 303 | # add = pd.DataFrame(data.groupby(["age"]).appID.nunique()).reset_index() 304 | # add.columns = ["age", "age_active_app"] 305 | # data = data.merge(add, on=["age"], how="left") 306 | # add = pd.DataFrame(data.groupby(["sitesetID"]).appID.nunique()).reset_index() 307 | # add.columns = ["sitesetID", "siteset_active_app"] 308 | # data = data.merge(add, on=["sitesetID"], how="left") 309 | # add = pd.DataFrame(data.groupby(["positionType"]).appID.nunique()).reset_index() 310 | # add.columns = ["positionType", "positionType_active_app"] 311 | # data = data.merge(add, on=["positionType"], how="left") 312 | # add = pd.DataFrame(data.groupby(["positionID"]).appID.nunique()).reset_index() 313 | # add.columns = ["positionID", "position_active_app"] 314 | # data = data.merge(add, on=["positionID"], how="left") 315 | add = pd.DataFrame(data.groupby(["connectionType"]).appID.nunique()).reset_index() 316 | add.columns = ["connectionType", "connectionType_active_app"] 317 | data = data.merge(add, on=["connectionType"], how="left") 318 | 319 | # 活跃position数特征 320 | add = pd.DataFrame(data.groupby(["appID"]).positionID.nunique()).reset_index() 321 | add.columns = ["appID", "app_active_position"] 322 | data = data.merge(add, on=["appID"], how="left") 323 | add = pd.DataFrame(data.groupby(["appCategory"]).positionID.nunique()).reset_index() 324 | add.columns = ["appCategory", "appCategory_active_position"] 325 | data = data.merge(add, on=["appCategory"], how="left") 326 | # add = pd.DataFrame(data.groupby(["userID"]).positionID.nunique()).reset_index() 327 | # add.columns = ["userID", "user_active_position"] 328 | # data = data.merge(add, on=["userID"], how="left") 329 | # add = pd.DataFrame(data.groupby(["age"]).positionID.nunique()).reset_index() 330 | # add.columns = ["age", "age_active_position"] 331 | # data = data.merge(add, on=["age"], how="left") 332 | # add = pd.DataFrame(data.groupby(["positionType"]).positionID.nunique()).reset_index() 333 | # add.columns = ["positionType", "positionType_active_position"] 334 | # data = data.merge(add, on=["positionType"], how="left") 335 | # add = pd.DataFrame(data.groupby(["advertiserID"]).positionID.nunique()).reset_index() 336 | # add.columns = ["advertiserID", "advertiser_active_position"] 337 | # data = data.merge(add, on=["advertiserID"], how="left") 338 | 339 | #活跃user数特征 340 | add = pd.DataFrame(data.groupby(["appID"]).userID.nunique()).reset_index() 341 | add.columns = ["appID", "app_active_user"] 342 | data = data.merge(add, on=["appID"], how="left") 343 | add = pd.DataFrame(data.groupby(["positionID"]).userID.nunique()).reset_index() 344 | add.columns = ["positionID", "position_active_user"] 345 | data = data.merge(add, on=["positionID"], how="left") 346 | add = pd.DataFrame(data.groupby(["appCategory"]).userID.nunique()).reset_index() 347 | add.columns = ["appCategory", "appCategory_active_user"] 348 | data = data.merge(add, on=["appCategory"], how="left") 349 | 350 | add = pd.DataFrame(data.groupby(["userID"]).creativeID.nunique()).reset_index() 351 | add.columns = ["userID", "user_active_creative"] 352 | data = data.merge(add, on=["userID"], how="left") 353 | # add = pd.DataFrame(data.groupby(["userID"]).sitesetID.nunique()).reset_index() 354 | # add.columns = ["userID", "user_active_siteset"] 355 | # data = data.merge(add, on=["userID"], how="left") 356 | # add = pd.DataFrame(data.groupby(["userID"]).appCategory.nunique()).reset_index() 357 | # add.columns = ["userID", "user_active_appCategory"] 358 
| # data = data.merge(add, on=["userID"], how="left") 359 | add = pd.DataFrame(data.groupby(["positionID"]).advertiserID.nunique()).reset_index() 360 | add.columns = ["positionID", "positionID_active_advertiser"] 361 | data = data.merge(add, on=["positionID"], how="left") 362 | 363 | 364 | X_train = data.loc[data['label'] != -1, :] 365 | X_test = data.loc[data['label'] == -1, :] 366 | X_test.loc[:, 'instanceID'] = res.values 367 | del data, add 368 | gc.collect() 369 | return X_train, X_test 370 | 371 | ##################################这几个操作尝试过,效果不佳,后来放弃了 372 | def doOneHot(X_train, X_test): 373 | res = X_test[['instanceID']] 374 | X_test.drop('instanceID', axis=1, inplace=True) 375 | data = X_train.append(X_test, ignore_index=True) 376 | del X_train, X_test 377 | gc.collect() 378 | 379 | features_trans = ['gender','appCategory_main','connectionType'] 380 | data = pd.get_dummies(data, columns=features_trans) 381 | 382 | X_train = data.loc[data['label'] != -1, :] 383 | X_test = data.loc[data['label'] == -1, :] 384 | X_test.loc[:, 'instanceID'] = res.values 385 | del data 386 | gc.collect() 387 | return X_train, X_test 388 | def doCrossProduct(data): 389 | data['position_creative'] = data['positionID'] * data['creativeID'] 390 | data['creative_age'] = data['creativeID'] * data['age'] 391 | return data 392 | def doDescartes(X_train, X_test): 393 | res = X_test[['instanceID']] 394 | X_test.drop('instanceID', axis=1, inplace=True) 395 | data = X_train.append(X_test, ignore_index=True) 396 | del X_train, X_test 397 | gc.collect() 398 | 399 | for feat_1 in ['maybe_0', 'maybe_2']: 400 | for feat_2 in ['connectionType', 'creativeID', 'positionID']: 401 | le = LabelEncoder() 402 | data[feat_1 + '_' + feat_2] = le.fit_transform(data[feat_1].astype('str') + data[feat_2].astype('str')) 403 | X_train = data.loc[data['label'] != -1, :] 404 | X_test = data.loc[data['label'] == -1, :] 405 | X_test.loc[:, 'instanceID'] = res.values 406 | del data 407 | gc.collect() 408 | return X_train, X_test 409 | def doSpecial(X_train, X_test): 410 | res = X_test[['instanceID']] 411 | X_test.drop('instanceID', axis=1, inplace=True) 412 | data = X_train.append(X_test, ignore_index=True) 413 | del X_train, X_test 414 | gc.collect() 415 | 416 | #####增加id与时间的斜率 417 | Min_id = data["listing_id"].min() 418 | Min_time = data["time"].min() 419 | data["gradient"] = ((data["listing_id"]) - Min_id) / (data["time"] - Min_time) 420 | 421 | X_train = data.loc[data['label'] != -1, :] 422 | X_test = data.loc[data['label'] == -1, :] 423 | X_test.loc[:, 'instanceID'] = res.values 424 | del data 425 | gc.collect() 426 | return X_train, X_test 427 | 428 | 429 | X_loc_train,X_loc_test=readData(m_type='inner',scope=(28,30)) 430 | print('readData over') 431 | X_loc_train=doPre(X_loc_train) 432 | X_loc_test=doPre(X_loc_test) 433 | print('doPre over...') 434 | 435 | ##########################################################actions和installed文件特征 436 | temp = pd.read_csv(iapath+'all_app_seven_day_cnt.csv') 437 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appID', 'day']) 438 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appID', 'day']) 439 | temp = pd.read_csv(iapath+'all_user_seven_day_cnt.csv') 440 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['userID', 'day']) 441 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['userID', 'day']) 442 | temp = pd.read_csv(iapath+'userInstalledappscount.csv') 443 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['userID']) 444 | X_loc_test = 
pd.merge(X_loc_test, temp, how='left', on=['userID']) 445 | temp = pd.read_csv(iapath+'appInstalledusercount.csv') 446 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appID']) 447 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appID']) 448 | temp = pd.read_csv(iapath+'ageuserInstalledappscount.csv') 449 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['age']) 450 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['age']) 451 | temp = pd.read_csv(iapath+'appCatInstalledusercount.csv') 452 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appCategory']) 453 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appCategory']) 454 | temp = pd.read_csv(iapath+'eduuserInstalledappscount.csv') 455 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['education']) 456 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['education']) 457 | temp = pd.read_csv(iapath+'genderuserInstalledappscount.csv') 458 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['gender']) 459 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['gender']) 460 | 461 | ##########################################################appID平均回流时间特征 462 | temp = pd.read_csv(temppath+'app_cov_diffTime.csv') 463 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appID']) 464 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appID']) 465 | temp = pd.read_csv(temppath+'appCat_cov_diffTime.csv') 466 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=['appCategory']) 467 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=['appCategory']) 468 | X_loc_train['cov_diffTime'].fillna(value=X_loc_train['appCat_cov_diffTime'], inplace=True) 469 | X_loc_test['cov_diffTime'].fillna(value=X_loc_test['appCat_cov_diffTime'], inplace=True) 470 | X_loc_train.drop(['appCat_cov_diffTime'],axis=1,inplace=True) 471 | X_loc_test.drop(['appCat_cov_diffTime'],axis=1,inplace=True) 472 | print('app_cov_diffTime over...') 473 | 474 | ##########################################################活跃数特征 475 | X_loc_train,X_loc_test=doActive(X_loc_train,X_loc_test) 476 | print('doActive over...') 477 | 478 | ##########################################################均值特征 479 | X_loc_train,X_loc_test=doAvg(X_loc_train,X_loc_test) 480 | print('doAvg over...') 481 | 482 | print(X_loc_train.shape) 483 | print(X_loc_train.columns) 484 | # res = X_loc_test[['instanceID']] 485 | # X_loc_test.drop('instanceID', axis=1, inplace=True) 486 | # data = X_loc_train.append(X_loc_test, ignore_index=True) 487 | # del X_loc_train, X_loc_test 488 | # gc.collect() 489 | # # data.sort_values(['userID','clickTime'],inplace=True,kind='mergesort') 490 | # # data['ulast_click']=data['clickTime'] 491 | # # pos=data.duplicated(subset=['userID'], keep=False) 492 | # # data.loc[pos,'ulast_click']=data.loc[pos,'ulast_click'].diff(periods=1) 493 | # # pos=~data.duplicated(subset=['userID'], keep='first') 494 | # # data.loc[pos,'ulast_click']=-1 495 | # # data['unext_click']=data['clickTime'] 496 | # # pos=data.duplicated(subset=['userID'], keep=False) 497 | # # data.loc[pos,'unext_click']=-1*data.loc[pos,'unext_click'].diff(periods=-1) 498 | # # pos=~data.duplicated(subset=['userID'], keep='last') 499 | # # data.loc[pos,'unext_click']=-1 500 | # # del pos 501 | # # temp = data.loc[:, ['clickTime', 'userID']].drop_duplicates(subset=['userID'],keep='first') 502 | # # temp.rename(columns={'clickTime': 'udiffTime_first'}, inplace=True) 503 | # # data = pd.merge(data, temp, how='left', 
on=['userID']) 504 | # # data['udiffTime_first'] = data['clickTime'] - data['udiffTime_first'] 505 | # # del temp 506 | # # gc.collect() 507 | # # temp = data.loc[:, ['clickTime', 'userID']].drop_duplicates(subset=['userID'],keep='last') 508 | # # temp.rename(columns={'clickTime': 'udiffTime_last'}, inplace=True) 509 | # # data = pd.merge(data, temp, how='left', on=['userID']) 510 | # # data['udiffTime_last'] = data['udiffTime_last'] - data['clickTime'] 511 | # # del temp 512 | # # gc.collect() 513 | # # data.loc[~data.duplicated(subset=['userID'], keep=False), ['udiffTime_first', 'udiffTime_last']] = -1 514 | # 515 | # X_loc_train = data.loc[data['label'] != -1, :] 516 | # X_loc_test = data.loc[data['label'] == -1, :] 517 | # X_loc_test.loc[:, 'instanceID'] = res.values 518 | # # del data 519 | # del data 520 | # gc.collect() 521 | 522 | 523 | ##########################################################统计特征决赛用了clickTime之前所有天的统计,基本只用了平滑转化率特征,丢弃了点击数和转化数 524 | #由于操作错误提交了包含creativeID_smooth和creativeID_rate两个特征的结果,后来丢掉rate效果会变差就一直留着了 525 | #平滑user相关的特征特别废时间,初赛做过根据点击次数阈值来操作转化率,效果和平滑差不多但是阈值选择不太准 526 | for feat_1 in ['creativeID','positionID','userID']: 527 | temp = pd.read_csv(temppath+'%s.csv' %feat_1) 528 | bs = BayesianSmoothing(1, 1) 529 | bs.update(temp[feat_1 + '_all'].values, temp[feat_1 + '_1'].values, 1000, 0.001) 530 | temp[feat_1 + '_smooth'] = (temp[feat_1 + '_1'] + bs.alpha) / (temp[feat_1 + '_all'] + bs.alpha + bs.beta) 531 | if feat_1 in ['creativeID']: 532 | temp[feat_1 + '_rate'] = temp[feat_1 + '_1'] / temp[feat_1 + '_all'] 533 | temp.drop([feat_1 + '_1',feat_1 + '_all'],axis=1,inplace=True) 534 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=[feat_1, 'day']) 535 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=[feat_1, 'day']) 536 | del temp 537 | gc.collect() 538 | print(feat_1 + ' over...') 539 | X_loc_train.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True) 540 | X_loc_test.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True) 541 | #类别少,不用平滑 542 | for feat_1 in ['sitesetID']: 543 | temp = pd.read_csv(temppath+'%s.csv' %feat_1) 544 | temp[feat_1 + '_rate'] = temp[feat_1 + '_1'] / temp[feat_1 + '_all'] 545 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=[feat_1, 'day']) 546 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=[feat_1, 'day']) 547 | del temp 548 | gc.collect() 549 | print(feat_1 + ' over...') 550 | X_loc_train.fillna(value=0, inplace=True) 551 | X_loc_test.fillna(value=0, inplace=True) 552 | 553 | #三特征组合从周冠军分享的下载行为和网络条件限制,以及用户属性对app需求挖掘出 554 | for feat_1,feat_2,feat_3 in[('appID','connectionType','positionID'),('appID','haveBaby','gender')]: 555 | temp = pd.read_csv(temppath+'%s.csv' % (feat_1+'_'+feat_2+'_'+feat_3)) 556 | bs = BayesianSmoothing(1, 1) 557 | bs.update(temp[feat_1+'_'+feat_2+'_'+feat_3 + '_all'].values, temp[feat_1+'_'+feat_2+'_'+feat_3 + '_1'].values, 1000, 0.001) 558 | temp[feat_1+'_'+feat_2+'_'+feat_3 + '_smooth'] = (temp[feat_1+'_'+feat_2+'_'+feat_3 + '_1'] + bs.alpha) / (temp[feat_1+'_'+feat_2+'_'+feat_3 + '_all'] + bs.alpha + bs.beta) 559 | temp.drop([feat_1+'_'+feat_2+'_'+feat_3+ '_1',feat_1+'_'+feat_2+'_'+feat_3 + '_all'],axis=1,inplace=True) 560 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=[feat_1,feat_2,feat_3, 'day']) 561 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=[feat_1,feat_2,feat_3, 'day']) 562 | del temp 563 | gc.collect() 564 | print(feat_1 + '_' + feat_2+'_'+feat_3+ ' over...') 565 | X_loc_train.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True) 566 
| X_loc_test.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True) 567 | 568 | #userID和positionID的点击次数重要性排名靠前,所有统计特征只加了这一个点击次数 569 | for feat_1,feat_2 in[('positionID','advertiserID'),('userID','sitesetID'),('positionID','connectionType'),('userID','positionID'), 570 | ('appPlatform','positionType'),('advertiserID','connectionType'),('positionID','appCategory'),('appID','age'), 571 | ('userID', 'appID'),('userID','connectionType'),('appCategory','connectionType'),('appID','hour'),('hour','age')]: 572 | temp = pd.read_csv(temppath+'%s.csv' % (feat_1+'_'+feat_2)) 573 | bs = BayesianSmoothing(1, 1) 574 | bs.update(temp[feat_1+'_'+feat_2 + '_all'].values, temp[feat_1+'_'+feat_2 + '_1'].values, 1000, 0.001) 575 | temp[feat_1+'_'+feat_2 + '_smooth'] = (temp[feat_1+'_'+feat_2 + '_1'] + bs.alpha) / (temp[feat_1+'_'+feat_2 + '_all'] + bs.alpha + bs.beta) 576 | if (feat_1,feat_2) in [('userID','positionID')]: 577 | temp.drop([feat_1 + '_' + feat_2 + '_1'], axis=1, inplace=True) 578 | else: 579 | temp.drop([feat_1+'_'+feat_2 + '_1',feat_1+'_'+feat_2 + '_all'],axis=1,inplace=True) 580 | X_loc_train = pd.merge(X_loc_train, temp, how='left', on=[feat_1,feat_2, 'day']) 581 | X_loc_test = pd.merge(X_loc_test, temp, how='left', on=[feat_1,feat_2, 'day']) 582 | del temp 583 | gc.collect() 584 | print(feat_1 + '_' + feat_2 + ' over...') 585 | X_loc_train.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True) 586 | X_loc_test.fillna(value=bs.alpha/(bs.alpha + bs.beta), inplace=True) 587 | 588 | 589 | ##########################################################doTrick 590 | X_loc_train=doTrick(X_loc_train) 591 | X_loc_test=doTrick(X_loc_test) 592 | 593 | ##########################################################丢掉重要性低,缺失值多的原始特征 594 | drop = ['hometown', 'haveBaby', 'telecomsOperator', 'userID', 'clickTime', 595 | 'appPlatform', 'connectionType', 'marriageStatus', 'positionType', 596 | 'gender', 'education', 'camgaignID', 'positionID','maybe_0' 597 | ] 598 | X_loc_train.drop(drop, axis=1, inplace=True) 599 | X_loc_train.fillna(value=0, inplace=True) 600 | X_loc_test.drop(drop, axis=1, inplace=True) 601 | X_loc_test.fillna(value=0, inplace=True) 602 | print('over') 603 | print(X_loc_train.shape) 604 | print(X_loc_train.columns) 605 | X_loc_train.to_csv(temppath+'2_smooth.csv',index=False) 606 | X_loc_test.to_csv(temppath+'2_test_smooth.csv',index=False) 607 | --------------------------------------------------------------------------------