├── README.md
├── huawei2020-recommand.py
└── huawei2020-ctr.py

/README.md:
--------------------------------------------------------------------------------
# huawei-digix-2020-baseline

Huawei DIGIX 2020 competition baselines for CTR prediction & search relevance.

In the CTR part, we encode the ID features with count, nunique, and target encodings, cross the categorical and numerical features, and construct Word2Vec embedding features. Everything is then fed to the xDeepFM model from the elegant and easy-to-use deepctr library as a simple neural-network baseline. Depending on which days are kept in the training set, and because of the instability of the neural network, the score fluctuates between `0.76-0.77`. A transfer-learning scheme across days might help here (see plantsgo's solution to IJCAI 2018).

In the search relevance prediction part, we exploit the monotonicity between labels by replacing the ranking model with a regression model, filter the raw features by variance, and feed them directly into xgboost and catboost. Regression modeling alone lifts the baseline to a score of `0.43+`. Cross-combining different features may improve the score further.
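For reference, the count/nunique/target encodings mentioned above are all one-line pandas groupby-transforms. A minimal sketch with toy column names (not the competition schema; the CTR script below uses an out-of-fold variant of the target mean to avoid leakage):

```python
import pandas as pd

df = pd.DataFrame({'uid':    [1, 1, 2, 2, 2],
                   'adv_id': [10, 11, 10, 10, 12],
                   'label':  [0, 1, 0, 1, 1]})

# count encoding: how often each uid occurs
df['uid_count'] = df.groupby('uid')['uid'].transform('count')
# nunique encoding: how many distinct ads each uid has seen
df['uid_adv_nunique'] = df.groupby('uid')['adv_id'].transform('nunique')
# naive target encoding (leaky as written; the script uses a k-fold version)
df['uid_label_mean'] = df.groupby('uid')['label'].transform('mean')
```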
Thanks: https://github.com/shenweichen/DeepCTR-Torch & https://github.com/shenweichen/DeepCTR

--------------------------------------------------------------------------------
/huawei2020-recommand.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import xgboost as xgb
import catboost as cbt
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3,4,5,6,7"

from sklearn.metrics import mean_squared_error as mse

def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))

train = pd.read_csv("../inputs/train_dataset.csv", sep="\t",
                    names=['label', 'query_id', 'doc_id'] + ["feature_{}".format(i) for i in range(362)])
test = pd.read_csv("../inputs/test_dataset_A.csv", sep="\t",
                   names=['query_id', 'doc_id'] + ["feature_{}".format(i) for i in range(362)])
df = pd.concat([train, test], ignore_index=True)

# drop constant (zero-variance) features; build the filtered list with a
# comprehension -- calling list.remove() while iterating skips elements
feature_name = [i for i in df.columns if 'feature' in i]
feature_name = [i for i in tqdm(feature_name) if df[i].std() != 0]
print(len(feature_name))

target = 'label'

nfold = 5
kf = KFold(n_splits=nfold, shuffle=True, random_state=2020)

oof = np.zeros((len(train), ))
predictions = np.zeros((len(test), ))

ITERATIONS = 100000
EARLY_STOP = 500
VERBOSE = 500

i = 0
for train_index, valid_index in kf.split(train):
    print("\nFold {}".format(i + 1))
    X_train, label_train = train.iloc[train_index][feature_name], train.iloc[train_index][target].astype(int).values
    X_valid, label_valid = train.iloc[valid_index][feature_name], train.iloc[valid_index][target].astype(int).values

    clf = cbt.CatBoostRegressor(iterations=ITERATIONS, learning_rate=0.1, depth=10,
                                l2_leaf_reg=10, loss_function='RMSE', eval_metric="RMSE",
                                task_type='GPU', devices="0:1", simple_ctr='FeatureFreq',
                                combinations_ctr='FeatureFreq')
    clf.fit(X_train, label_train, eval_set=[(X_valid, label_valid)],
            early_stopping_rounds=EARLY_STOP, verbose=VERBOSE * 10)
    x1 = clf.predict(X_valid)
    y1 = clf.predict(test[feature_name])

    clf = xgb.XGBRegressor(learning_rate=0.1, max_depth=7,
                           subsample=0.5, colsample_bytree=0.5, n_estimators=ITERATIONS,
                           eval_metric='rmse', tree_method='gpu_hist')
    clf.fit(X_train, label_train, eval_set=[(X_valid, label_valid)],
            early_stopping_rounds=EARLY_STOP, verbose=VERBOSE)
    x2 = clf.predict(X_valid)
    y2 = clf.predict(test[feature_name])

    # simple average blend of the two regressors
    oof[valid_index] = (x1 + x2) / 2
    predictions += ((y1 + y2) / 2) / nfold
    i += 1

print(rmse(oof, train[target]))

submit = test[['query_id', 'doc_id']].reset_index(drop=True)
submit['predict_label'] = predictions
submit.columns = ['queryid', 'documentid', 'predict_label']
submit.to_csv("../submit/baseline.csv", index=False)
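
# Optional post-processing (an assumption, not part of the original baseline):
# the relevance labels are discrete and bounded, so clipping the blended
# regression outputs to the observed label range never moves a prediction
# further from any in-range label.
low, high = train[target].min(), train[target].max()
submit['predict_label'] = submit['predict_label'].clip(low, high)
submit.to_csv("../submit/baseline_clipped.csv", index=False)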
--------------------------------------------------------------------------------
/huawei2020-ctr.py:
--------------------------------------------------------------------------------
'''
Torch == 1.4.0

Expected directory layout:

CTR
    /inputs
    /models
    /vector
    **model.py
    /submit
'''

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold

from deepctr_torch.models import xDeepFM
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

import torch
import os

import warnings
warnings.filterwarnings("ignore")

os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3,4,5,6,7"

def model_feed_dict(df):
    # deepctr-torch consumes a {feature_name: column} dict
    return {name: df[name] for name in tqdm(feature_name)}

if not os.path.exists("../inputs/train_data.pickle"):
    print("Reading raw CSVs")
    train = pd.read_csv("../inputs/train_data.csv", sep="|")
    test = pd.read_csv("../inputs/test_data_A.csv", sep="|")
    train.to_pickle("../inputs/train_data.pickle")
    test.to_pickle("../inputs/test_data_A.pickle")
    df = pd.concat([train, test], ignore_index=True)
    df.to_pickle("../inputs/all_data.pickle")
else:
    print("Loading cached pickle")
    df = pd.read_pickle("../inputs/all_data.pickle")

# keep only the last three days; which days go into training strongly
# affects the score (see README)
df = df[df['pt_d'].isin([6, 7, 8])]

# Feature engineering
'''
1. Count / nunique / target encoding of the ID features
2. Cross statistics between categorical and numerical features
3. Word2Vec embedding features
'''

from gensim.models import Word2Vec

def w2v_id_feature(df, key1, key2, mode='group',
                   embedding_size=64, window_size=20, n_iter=10, workers=20, min_count=0,
                   use_cache=True):

    df = df[[key1, key2]]
    if mode == 'group':
        lbl = LabelEncoder()
        try:
            df[key2] = lbl.fit_transform(df[key2])
        except (TypeError, ValueError):
            df[key2] = lbl.fit_transform(df[key2].astype(str))
        sentences = df[[key1, key2]].groupby([key1])[key2].apply(list)
    else:
        # note: this branch is never used below and would fail later,
        # since `lbl` is only defined in 'group' mode
        sentences = df[[key1, key2]].groupby([key1])[key2].apply(lambda x: list(x)[0])

    model_path = "./vector/{}_{}_{}_{}.model".format(key1, key2, embedding_size, window_size)
    if os.path.exists(model_path) and use_cache:
        model = Word2Vec.load(model_path)
    else:
        # `seed` is a module-level global, set in Part 3 before these calls;
        # gensim 3.x uses `size`/`iter` (renamed in 4.x)
        model = Word2Vec(df[[key1, key2]].groupby([key1])[key2].apply(lambda x: [str(i) for i in x]).values.tolist(),
                         size=embedding_size, window=window_size,
                         min_count=min_count, sg=1, seed=seed, iter=n_iter, workers=workers)
        model.save(model_path)

    embedding = pd.DataFrame()
    embedding[key2] = model.wv.vocab.keys()
    embedding['embedding'] = [model.wv[i] for i in embedding[key2].values]
    # sort by the label-encoded id so that row i of emb_matrix is the vector of id i
    embedding[key2] = embedding[key2].astype(int)
    embedding = embedding.sort_values(by=[key2], ascending=True)
    embedding[key2] = lbl.inverse_transform(embedding[key2])
    emb_matrix = np.array([i for i in embedding['embedding'].values])

    # per-key1 mean of the vectors of all co-occurring key2 values
    emb_mean = []
    for i in tqdm(sentences.values.tolist()):
        emb_mean.append(np.mean(emb_matrix[i], axis=0))

    emb_feature = np.asarray(emb_mean)
    mean_col = ['{}(MainKEY)_{}_MEAN_Window{}_{}'.format(key1, key2, window_size, i) for i in range(embedding_size)]
    emb_feature = pd.DataFrame(emb_feature, columns=mean_col)
    emb_feature[key1] = sentences.index

    # flatten the raw key2 vectors into one column per dimension
    embeddings = np.concatenate(embedding['embedding'].values).reshape(-1, embedding_size)
    embeddings = pd.DataFrame(embeddings, columns=["{}_{}(MainKEY)_Window{}_{}".format(key1, key2, window_size, i) for i in range(embedding_size)])
    embedding[embeddings.columns] = embeddings
    del embedding['embedding']

    return emb_feature.reset_index(drop=True), embedding.reset_index(drop=True)
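
# What w2v_id_feature returns (two frames, merged back on their keys below):
#   emb_feature: one row per key1 -- the mean of the Word2Vec vectors of all
#                key2 values co-occurring with it (a profile-style feature)
#   embedding:   one row per key2 -- its raw Word2Vec vector
# A usage sketch (the real calls are in Part 3):
#   user_side, item_side = w2v_id_feature(df, 'uid', 'adv_id', embedding_size=32)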

def kfold_stats_feature(train, test, feats, k):
    '''Out-of-fold target encoding: each training row gets the target mean
    computed on the other k-1 folds; the test set gets the full-train mean.'''
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)

    train['fold'] = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
        train.loc[val_idx, 'fold'] = fold_

    kfold_features = []
    for feat in tqdm(feats):
        nums_columns = ['label']
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
                tmp_trn = train.iloc[trn_idx]
                order_label = tmp_trn.groupby([feat])[f].mean()
                tmp = train.loc[train.fold == fold_, [feat]]
                train.loc[train.fold == fold_, colname] = tmp[feat].map(order_label)
                # fill categories unseen in the other folds with the global mean
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)

        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            # fill categories unseen in train with the global mean
            global_mean = train[f].mean()
            test[colname] = test[colname].fillna(global_mean)
            test[colname] = test[colname].astype(float)
    del train['fold']
    return train, test
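
# Why out-of-fold: encoding a row with a target mean that includes its own
# label leaks the target and inflates validation AUC. kfold_stats_feature
# therefore encodes fold i only with statistics computed on the other k-1
# folds, while the test set uses the mean over the full training set.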

# Part 1: count encodings of the single ID columns

to_count = [['uid'], ['task_id'], ['adv_id'], ['creat_type_cd'], ['adv_prim_id'],
            ['dev_id'], ['inter_type_cd'], ['slot_id'], ['spread_app_id'], ['tags'], ['app_first_class'],
            ['app_second_class'], ['age'], ['city'], ['city_rank'], ['device_name'], ['device_size'],
            ['career'], ['gender'], ['net_type'], ['residence'], ['his_app_size'], ['his_on_shelf_time'],
            ['app_score'], ['emui_dev'], ['list_time'], ['device_price'], ['up_life_duration'], ['up_membership_grade'],
            ['membership_life_duration'], ['consume_purchase'], ['communication_onlinerate'], ['communication_avgonline_30d'],
            ['indu_name']]

for i in tqdm(to_count):
    df["{}_count".format("_".join(i))] = df[i].groupby(i)[i[0]].transform('count')
    # df["{}_rank".format("_".join(i))] = df["{}_count".format("_".join(i))].rank(method='min')

# second-order interactions between uid and the ad/context columns
to_group = [
    ['uid', 'task_id'], ['uid', 'adv_id'], ['uid', 'adv_prim_id'], ['uid', 'dev_id'], ['uid', 'slot_id'],
    ['uid', 'spread_app_id'], ['uid', 'app_first_class'], ['uid', 'city'], ['uid', 'device_name'], ['uid', 'net_type'],
    ['uid', 'communication_onlinerate'], ['uid', 'list_time']
]

feature = pd.DataFrame()
for i in tqdm(to_group):
    feature["STAT_{}_nunique_1".format("_".join(i))] = df[i].groupby(i[1])[i[0]].transform('nunique')
    feature["STAT_{}_nunique_2".format("_".join(i))] = df[i].groupby(i[0])[i[1]].transform('nunique')
    feature["COUNT-2order_{}".format("_".join(i))] = df[i].groupby(i)[i[0]].transform("count")

# Part 2: statistics of numerical columns grouped by ad-side IDs
to_group = [
    ['task_id'], ['dev_id'], ['adv_prim_id'], ['adv_id'],
    ['inter_type_cd'], ['slot_id'], ['tags'], ['app_first_class'],
]

to_inter = [
    'age',
    'city_rank',
    'career',
    'his_app_size',
    'his_on_shelf_time',
    'app_score',
    'emui_dev',
    'device_price',
    'up_life_duration',
    'communication_avgonline_30d',
]

# (name, aggregator) pairs so that lambda aggregators get readable column names
to_calc = [
    ('std', 'std'),
    ('mean', 'mean'),
    ('min', 'min'),
    ('max', 'max'),
    # spread of the Fourier spectrum; np.std of a complex array is a real scalar
    ('fft_std', lambda x: np.std(np.fft.fft(x))),
]

for i in tqdm(to_group):
    for j in to_inter:
        for name, k in to_calc:
            feature["STAT_{}_{}_{}".format("_".join(i), j, name)] = df[i + [j]].groupby(i)[j].transform(k)

# day 8 is the (unlabeled) test day: build target encodings on days 6-7 only
choose = df['pt_d'] != 8
train, test = df[choose].reset_index(drop=True), df[~choose].reset_index(drop=True)
target_encode_cols = ['uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
                      'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class']
train, test = kfold_stats_feature(train, test, target_encode_cols, 5)
df = pd.concat([train, test], ignore_index=True)

# Part 3: Word2Vec features over uid / ad-side-ID co-occurrence

merge_features = []
embedding_size = 32
seed = 2020

for key2 in ['task_id', 'adv_id', 'slot_id', 'tags']:
    tmp = w2v_id_feature(df, 'uid', key2, embedding_size=embedding_size)
    merge_features.append(['uid', tmp[0]])
    merge_features.append([key2, tmp[1]])

merges = []
for key, fea in tqdm(merge_features):
    tmp = df[[key]].merge(fea, how='left', on=key)
    merges.append(tmp)

feature.reset_index(drop=True, inplace=True)
df[feature.columns] = feature

for fea in tqdm(merges):
    fea = fea.reset_index(drop=True)
    df[fea.columns] = fea

'''
Preprocessing & model
'''

drop_feature = ['label', 'id', 'pt_d']
feature_name = [i for i in df.columns if i not in drop_feature]
print(len(feature_name))

sparse_feature = ['uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
                  'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
                  'app_second_class', 'age', 'city', 'city_rank', 'device_name', 'device_size',
                  'career', 'gender', 'net_type', 'residence', 'his_app_size', 'his_on_shelf_time',
                  'app_score', 'emui_dev', 'list_time', 'device_price', 'up_life_duration', 'up_membership_grade',
                  'membership_life_duration', 'consume_purchase', 'communication_onlinerate', 'communication_avgonline_30d',
                  'indu_name']

dense_feature = [i for i in feature_name if i not in sparse_feature]

for i in tqdm(sparse_feature):
    lbl = LabelEncoder()
    try:
        df[i] = lbl.fit_transform(df[i])
    except (TypeError, ValueError):
        # mixed-type column: encode its string form instead
        df[i] = lbl.fit_transform(df[i].astype('str'))

df = df.fillna(-1)

for i in tqdm(list(dense_feature)):  # iterate over a copy so removal is safe
    try:
        df[i] = MinMaxScaler().fit_transform(df[i].values.reshape(-1, 1))
    except Exception:
        feature_name.remove(i)
        dense_feature.remove(i)
        print("Remove", i)

# df was restricted to days 6-8 above, so this keeps day 6 for training,
# day 7 for validation and day 8 for the submission
train = df[df['pt_d'].isin([1, 2, 3, 4, 5, 6])]
valid = df[df['pt_d'] == 7]
test = df[df['pt_d'] == 8]

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=8) for feat in sparse_feature] + \
                         [DenseFeat(feat, 1) for feat in dense_feature]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

X_train = model_feed_dict(train[feature_name])
X_valid = model_feed_dict(valid[feature_name])
X_test = model_feed_dict(test[feature_name])

Y = train['label'].values
valid_Y = valid['label'].values
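
# Training scheme, as I read the code below: fit on the earlier training days
# while validating on day 7, then run one more pass over day 7 itself so the
# model has seen the day closest to the test day (day 8) before predicting.
# The AUC printed during that second fit is computed on data the model is
# training on, so it should not be read as a validation score.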

torch.cuda.empty_cache()

use_cuda = True
device = 'cpu'  # fall back to CPU so `device` is always defined
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# torch.autograd.set_detect_anomaly(True)

model = xDeepFM(linear_feature_columns, dnn_feature_columns, device=device)
model.compile("adam",
              'binary_crossentropy',
              ["auc"])
model.fit(X_train, Y, batch_size=4096, epochs=1,
          validation_data=(X_valid, valid_Y), verbose=1)

# one more pass over day 7 before predicting day 8 (see note above)
model.fit(X_valid, valid_Y, batch_size=4096)
answer = model.predict(X_test, batch_size=8192)

submit = pd.DataFrame()
submit['id'] = test['id'].astype(int)
submit['probability'] = np.round(answer.flatten(), 6)
submit.to_csv("../submit/xDeepFM-deepctr-baseline.csv", index=False)
--------------------------------------------------------------------------------