├── README.md
├── huawei2020-recommand.py
└── huawei2020-ctr.py

/README.md:
--------------------------------------------------------------------------------
# huawei-digix-2020-baseline

Huawei DIGIX 2020 competition baselines for CTR prediction & search relevance.

In the CTR part, we encode the ID features with count, nunique, and target encodings, cross the categorical and numerical features, and construct Word2Vec embedding features. Everything is then fed to the xDeepFM model from the elegant and easy-to-use deepctr library as a simple neural-network baseline. Depending on which days are kept in the training set, and because of the instability of the neural network, the score fluctuates between `0.76-0.77`. A transfer-learning scheme across days might help here (see plantsgo's solution to IJCAI 2018).

In the search relevance prediction part, we exploit the monotonicity between labels by replacing the ranking model with a regression model, filter the raw features by variance, and feed them directly into xgboost and catboost. Regression modeling alone lifts the baseline to a score of `0.43+`. Cross-combining different features may improve the score further.
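For reference, the count/nunique/target encodings mentioned above are all one-line pandas groupby-transforms. A minimal sketch with toy column names (not the competition schema; the CTR script below uses an out-of-fold variant of the target mean to avoid leakage):

```python
import pandas as pd

df = pd.DataFrame({'uid':    [1, 1, 2, 2, 2],
                   'adv_id': [10, 11, 10, 10, 12],
                   'label':  [0, 1, 0, 1, 1]})

# count encoding: how often each uid occurs
df['uid_count'] = df.groupby('uid')['uid'].transform('count')
# nunique encoding: how many distinct ads each uid has seen
df['uid_adv_nunique'] = df.groupby('uid')['adv_id'].transform('nunique')
# naive target encoding (leaky as written; the script uses a k-fold version)
df['uid_label_mean'] = df.groupby('uid')['label'].transform('mean')
```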
Thanks: https://github.com/shenweichen/DeepCTR-Torch & https://github.com/shenweichen/DeepCTR

--------------------------------------------------------------------------------
/huawei2020-recommand.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import xgboost as xgb
import catboost as cbt
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3,4,5,6,7"

from sklearn.metrics import mean_squared_error as mse

def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))

train = pd.read_csv("../inputs/train_dataset.csv", sep="\t",
                    names=['label', 'query_id', 'doc_id'] + ["feature_{}".format(i) for i in range(362)])
test = pd.read_csv("../inputs/test_dataset_A.csv", sep="\t",
                   names=['query_id', 'doc_id'] + ["feature_{}".format(i) for i in range(362)])
df = pd.concat([train, test], ignore_index=True)

# drop constant (zero-variance) features; build the filtered list with a
# comprehension -- calling list.remove() while iterating skips elements
feature_name = [i for i in df.columns if 'feature' in i]
feature_name = [i for i in tqdm(feature_name) if df[i].std() != 0]
print(len(feature_name))

target = 'label'

nfold = 5
kf = KFold(n_splits=nfold, shuffle=True, random_state=2020)

oof = np.zeros((len(train), ))
predictions = np.zeros((len(test), ))

ITERATIONS = 100000
EARLY_STOP = 500
VERBOSE = 500

i = 0
for train_index, valid_index in kf.split(train):
    print("\nFold {}".format(i + 1))
    X_train, label_train = train.iloc[train_index][feature_name], train.iloc[train_index][target].astype(int).values
    X_valid, label_valid = train.iloc[valid_index][feature_name], train.iloc[valid_index][target].astype(int).values

    clf = cbt.CatBoostRegressor(iterations=ITERATIONS, learning_rate=0.1, depth=10,
                                l2_leaf_reg=10, loss_function='RMSE', eval_metric="RMSE",
                                task_type='GPU', devices="0:1", simple_ctr='FeatureFreq',
                                combinations_ctr='FeatureFreq')
    clf.fit(X_train, label_train, eval_set=[(X_valid, label_valid)],
            early_stopping_rounds=EARLY_STOP, verbose=VERBOSE * 10)
    x1 = clf.predict(X_valid)
    y1 = clf.predict(test[feature_name])

    clf = xgb.XGBRegressor(learning_rate=0.1, max_depth=7,
                           subsample=0.5, colsample_bytree=0.5, n_estimators=ITERATIONS,
                           eval_metric='rmse', tree_method='gpu_hist')
    clf.fit(X_train, label_train, eval_set=[(X_valid, label_valid)],
            early_stopping_rounds=EARLY_STOP, verbose=VERBOSE)
    x2 = clf.predict(X_valid)
    y2 = clf.predict(test[feature_name])

    # simple average blend of the two regressors
    oof[valid_index] = (x1 + x2) / 2
    predictions += ((y1 + y2) / 2) / nfold
    i += 1

print(rmse(oof, train[target]))

submit = test[['query_id', 'doc_id']].reset_index(drop=True)
submit['predict_label'] = predictions
submit.columns = ['queryid', 'documentid', 'predict_label']
submit.to_csv("../submit/baseline.csv", index=False)
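
# Optional post-processing (an assumption, not part of the original baseline):
# the relevance labels are discrete and bounded, so clipping the blended
# regression outputs to the observed label range never moves a prediction
# further from any in-range label.
low, high = train[target].min(), train[target].max()
submit['predict_label'] = submit['predict_label'].clip(low, high)
submit.to_csv("../submit/baseline_clipped.csv", index=False)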
--------------------------------------------------------------------------------
/huawei2020-ctr.py:
--------------------------------------------------------------------------------
'''
Torch == 1.4.0

Expected directory layout:

CTR
    /inputs
    /models
    /vector
    **model.py
    /submit
'''

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold

from deepctr_torch.models import xDeepFM
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

import torch
import os

import warnings
warnings.filterwarnings("ignore")

os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3,4,5,6,7"

def model_feed_dict(df):
    # deepctr-torch consumes a {feature_name: column} dict
    return {name: df[name] for name in tqdm(feature_name)}

if not os.path.exists("../inputs/train_data.pickle"):
    print("Reading raw CSVs")
    train = pd.read_csv("../inputs/train_data.csv", sep="|")
    test = pd.read_csv("../inputs/test_data_A.csv", sep="|")
    train.to_pickle("../inputs/train_data.pickle")
    test.to_pickle("../inputs/test_data_A.pickle")
    df = pd.concat([train, test], ignore_index=True)
    df.to_pickle("../inputs/all_data.pickle")
else:
    print("Loading cached pickle")
    df = pd.read_pickle("../inputs/all_data.pickle")

# keep only the last three days; which days go into training strongly
# affects the score (see README)
df = df[df['pt_d'].isin([6, 7, 8])]

# Feature engineering
'''
1. Count / nunique / target encoding of the ID features
2. Cross statistics between categorical and numerical features
3. Word2Vec embedding features
'''

from gensim.models import Word2Vec

def w2v_id_feature(df, key1, key2, mode='group',
                   embedding_size=64, window_size=20, n_iter=10, workers=20, min_count=0,
                   use_cache=True):

    df = df[[key1, key2]]
    if mode == 'group':
        lbl = LabelEncoder()
        try:
            df[key2] = lbl.fit_transform(df[key2])
        except (TypeError, ValueError):
            df[key2] = lbl.fit_transform(df[key2].astype(str))
        sentences = df[[key1, key2]].groupby([key1])[key2].apply(list)
    else:
        # note: this branch is never used below and would fail later,
        # since `lbl` is only defined in 'group' mode
        sentences = df[[key1, key2]].groupby([key1])[key2].apply(lambda x: list(x)[0])

    model_path = "./vector/{}_{}_{}_{}.model".format(key1, key2, embedding_size, window_size)
    if os.path.exists(model_path) and use_cache:
        model = Word2Vec.load(model_path)
    else:
        # `seed` is a module-level global, set in Part 3 before these calls;
        # gensim 3.x uses `size`/`iter` (renamed in 4.x)
        model = Word2Vec(df[[key1, key2]].groupby([key1])[key2].apply(lambda x: [str(i) for i in x]).values.tolist(),
                         size=embedding_size, window=window_size,
                         min_count=min_count, sg=1, seed=seed, iter=n_iter, workers=workers)
        model.save(model_path)

    embedding = pd.DataFrame()
    embedding[key2] = model.wv.vocab.keys()
    embedding['embedding'] = [model.wv[i] for i in embedding[key2].values]
    # sort by the label-encoded id so that row i of emb_matrix is the vector of id i
    embedding[key2] = embedding[key2].astype(int)
    embedding = embedding.sort_values(by=[key2], ascending=True)
    embedding[key2] = lbl.inverse_transform(embedding[key2])
    emb_matrix = np.array([i for i in embedding['embedding'].values])

    # per-key1 mean of the vectors of all co-occurring key2 values
    emb_mean = []
    for i in tqdm(sentences.values.tolist()):
        emb_mean.append(np.mean(emb_matrix[i], axis=0))

    emb_feature = np.asarray(emb_mean)
    mean_col = ['{}(MainKEY)_{}_MEAN_Window{}_{}'.format(key1, key2, window_size, i) for i in range(embedding_size)]
    emb_feature = pd.DataFrame(emb_feature, columns=mean_col)
    emb_feature[key1] = sentences.index

    # flatten the raw key2 vectors into one column per dimension
    embeddings = np.concatenate(embedding['embedding'].values).reshape(-1, embedding_size)
    embeddings = pd.DataFrame(embeddings, columns=["{}_{}(MainKEY)_Window{}_{}".format(key1, key2, window_size, i) for i in range(embedding_size)])
    embedding[embeddings.columns] = embeddings
    del embedding['embedding']

    return emb_feature.reset_index(drop=True), embedding.reset_index(drop=True)
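
# What w2v_id_feature returns (two frames, merged back on their keys below):
#   emb_feature: one row per key1 -- the mean of the Word2Vec vectors of all
#                key2 values co-occurring with it (a profile-style feature)
#   embedding:   one row per key2 -- its raw Word2Vec vector
# A usage sketch (the real calls are in Part 3):
#   user_side, item_side = w2v_id_feature(df, 'uid', 'adv_id', embedding_size=32)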

def kfold_stats_feature(train, test, feats, k):
    '''Out-of-fold target encoding: each training row gets the target mean
    computed on the other k-1 folds; the test set gets the full-train mean.'''
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)

    train['fold'] = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
        train.loc[val_idx, 'fold'] = fold_

    kfold_features = []
    for feat in tqdm(feats):
        nums_columns = ['label']
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
                tmp_trn = train.iloc[trn_idx]
                order_label = tmp_trn.groupby([feat])[f].mean()
                tmp = train.loc[train.fold == fold_, [feat]]
                train.loc[train.fold == fold_, colname] = tmp[feat].map(order_label)
                # fill categories unseen in the other folds with the global mean
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)

        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            # fill categories unseen in train with the global mean
            global_mean = train[f].mean()
            test[colname] = test[colname].fillna(global_mean)
            test[colname] = test[colname].astype(float)
    del train['fold']
    return train, test
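
# Why out-of-fold: encoding a row with a target mean that includes its own
# label leaks the target and inflates validation AUC. kfold_stats_feature
# therefore encodes fold i only with statistics computed on the other k-1
# folds, while the test set uses the mean over the full training set.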

# Part 1: count encodings of the single ID columns

to_count = [['uid'], ['task_id'], ['adv_id'], ['creat_type_cd'], ['adv_prim_id'],
            ['dev_id'], ['inter_type_cd'], ['slot_id'], ['spread_app_id'], ['tags'], ['app_first_class'],
            ['app_second_class'], ['age'], ['city'], ['city_rank'], ['device_name'], ['device_size'],
            ['career'], ['gender'], ['net_type'], ['residence'], ['his_app_size'], ['his_on_shelf_time'],
            ['app_score'], ['emui_dev'], ['list_time'], ['device_price'], ['up_life_duration'], ['up_membership_grade'],
            ['membership_life_duration'], ['consume_purchase'], ['communication_onlinerate'], ['communication_avgonline_30d'],
            ['indu_name']]

for i in tqdm(to_count):
    df["{}_count".format("_".join(i))] = df[i].groupby(i)[i[0]].transform('count')
    # df["{}_rank".format("_".join(i))] = df["{}_count".format("_".join(i))].rank(method='min')

# second-order interactions between uid and the ad/context columns
to_group = [
    ['uid', 'task_id'], ['uid', 'adv_id'], ['uid', 'adv_prim_id'], ['uid', 'dev_id'], ['uid', 'slot_id'],
    ['uid', 'spread_app_id'], ['uid', 'app_first_class'], ['uid', 'city'], ['uid', 'device_name'], ['uid', 'net_type'],
    ['uid', 'communication_onlinerate'], ['uid', 'list_time']
]

feature = pd.DataFrame()
for i in tqdm(to_group):
    feature["STAT_{}_nunique_1".format("_".join(i))] = df[i].groupby(i[1])[i[0]].transform('nunique')
    feature["STAT_{}_nunique_2".format("_".join(i))] = df[i].groupby(i[0])[i[1]].transform('nunique')
    feature["COUNT-2order_{}".format("_".join(i))] = df[i].groupby(i)[i[0]].transform("count")

# Part 2: statistics of numerical columns grouped by ad-side IDs
to_group = [
    ['task_id'], ['dev_id'], ['adv_prim_id'], ['adv_id'],
    ['inter_type_cd'], ['slot_id'], ['tags'], ['app_first_class'],
]

to_inter = [
    'age',
    'city_rank',
    'career',
    'his_app_size',
    'his_on_shelf_time',
    'app_score',
    'emui_dev',
    'device_price',
    'up_life_duration',
    'communication_avgonline_30d',
]

# (name, aggregator) pairs so that lambda aggregators get readable column names
to_calc = [
    ('std', 'std'),
    ('mean', 'mean'),
    ('min', 'min'),
    ('max', 'max'),
    # spread of the Fourier spectrum; np.std of a complex array is a real scalar
    ('fft_std', lambda x: np.std(np.fft.fft(x))),
]

for i in tqdm(to_group):
    for j in to_inter:
        for name, k in to_calc:
            feature["STAT_{}_{}_{}".format("_".join(i), j, name)] = df[i + [j]].groupby(i)[j].transform(k)

# day 8 is the (unlabeled) test day: build target encodings on days 6-7 only
choose = df['pt_d'] != 8
train, test = df[choose].reset_index(drop=True), df[~choose].reset_index(drop=True)
target_encode_cols = ['uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
                      'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class']
train, test = kfold_stats_feature(train, test, target_encode_cols, 5)
df = pd.concat([train, test], ignore_index=True)

# Part 3: Word2Vec features over uid / ad-side-ID co-occurrence

merge_features = []
embedding_size = 32
seed = 2020

for key2 in ['task_id', 'adv_id', 'slot_id', 'tags']:
    tmp = w2v_id_feature(df, 'uid', key2, embedding_size=embedding_size)
    merge_features.append(['uid', tmp[0]])
    merge_features.append([key2, tmp[1]])

merges = []
for key, fea in tqdm(merge_features):
    tmp = df[[key]].merge(fea, how='left', on=key)
    merges.append(tmp)

feature.reset_index(drop=True, inplace=True)
df[feature.columns] = feature

for fea in tqdm(merges):
    fea = fea.reset_index(drop=True)
    df[fea.columns] = fea

'''
Preprocessing & model
'''

drop_feature = ['label', 'id', 'pt_d']
feature_name = [i for i in df.columns if i not in drop_feature]
print(len(feature_name))

sparse_feature = ['uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
                  'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
                  'app_second_class', 'age', 'city', 'city_rank', 'device_name', 'device_size',
                  'career', 'gender', 'net_type', 'residence', 'his_app_size', 'his_on_shelf_time',
                  'app_score', 'emui_dev', 'list_time', 'device_price', 'up_life_duration', 'up_membership_grade',
                  'membership_life_duration', 'consume_purchase', 'communication_onlinerate', 'communication_avgonline_30d',
                  'indu_name']

dense_feature = [i for i in feature_name if i not in sparse_feature]

for i in tqdm(sparse_feature):
    lbl = LabelEncoder()
    try:
        df[i] = lbl.fit_transform(df[i])
    except (TypeError, ValueError):
        # mixed-type column: encode its string form instead
        df[i] = lbl.fit_transform(df[i].astype('str'))

df = df.fillna(-1)

for i in tqdm(list(dense_feature)):  # iterate over a copy so removal is safe
    try:
        df[i] = MinMaxScaler().fit_transform(df[i].values.reshape(-1, 1))
    except Exception:
        feature_name.remove(i)
        dense_feature.remove(i)
        print("Remove", i)

# df was restricted to days 6-8 above, so this keeps day 6 for training,
# day 7 for validation and day 8 for the submission
train = df[df['pt_d'].isin([1, 2, 3, 4, 5, 6])]
valid = df[df['pt_d'] == 7]
test = df[df['pt_d'] == 8]

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=8) for feat in sparse_feature] + \
                         [DenseFeat(feat, 1) for feat in dense_feature]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

X_train = model_feed_dict(train[feature_name])
X_valid = model_feed_dict(valid[feature_name])
X_test = model_feed_dict(test[feature_name])

Y = train['label'].values
valid_Y = valid['label'].values
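
# Training scheme, as I read the code below: fit on the earlier training days
# while validating on day 7, then run one more pass over day 7 itself so the
# model has seen the day closest to the test day (day 8) before predicting.
# The AUC printed during that second fit is computed on data the model is
# training on, so it should not be read as a validation score.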

torch.cuda.empty_cache()

use_cuda = True
device = 'cpu'  # fall back to CPU so `device` is always defined
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# torch.autograd.set_detect_anomaly(True)

model = xDeepFM(linear_feature_columns, dnn_feature_columns, device=device)
model.compile("adam",
              'binary_crossentropy',
              ["auc"])
model.fit(X_train, Y, batch_size=4096, epochs=1,
          validation_data=(X_valid, valid_Y), verbose=1)

# one more pass over day 7 before predicting day 8 (see note above)
model.fit(X_valid, valid_Y, batch_size=4096)
answer = model.predict(X_test, batch_size=8192)

submit = pd.DataFrame()
submit['id'] = test['id'].astype(int)
submit['probability'] = np.round(answer.flatten(), 6)
submit.to_csv("../submit/xDeepFM-deepctr-baseline.csv", index=False)
--------------------------------------------------------------------------------