├── .gitignore ├── README.md └── xunfei_dl_gru.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | archive 4 | .pyc 5 | .idea 6 | __pyache__/ 7 | dist/ 8 | ~$* 9 | /data 10 | resources/ 11 | .ipynb_checkpoints -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 讯飞移动广告反欺诈算法挑战赛深度学习模型 2 | > 该深度学习模型仅供尝鲜。目前最好成绩为94.12672。 3 | 4 | **比赛链接:** 5 | - http://challenge.xfyun.cn/2019/gamedetail?type=detail/mobileAD 6 | 7 | ## 使用方式 8 | - 将`data_dir`指定数据目录即可。 9 | 10 | ## 参考资料 11 | - [kaggle talkingdata比赛](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection) 12 | 13 | @**Galen**_20190717_ -------------------------------------------------------------------------------- /xunfei_dl_gru.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from datetime import timedelta, datetime 3 | 4 | import gc 5 | import pandas as pd 6 | from keras.callbacks import EarlyStopping 7 | from keras.layers import Input, Embedding, Dense, Dropout, concatenate, Reshape 8 | from keras.layers import Lambda, GaussianDropout, CuDNNGRU, BatchNormalization, PReLU 9 | from keras.models import Model 10 | from keras.optimizers import Adam 11 | 12 | warnings.filterwarnings('ignore') 13 | 14 | base_cols = ['ip'] 15 | media_cols = ['pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype'] 16 | time_cols = ['hour'] 17 | location_cols = ['city'] 18 | device_cols = ['adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 'dvctype', 'model', 'make', 'ntt', 'carrier', 19 | 'osv', 'orientation', 'ppi', 'screen_area', 'creative_dpi'] 20 | total_cate = [base_cols, media_cols, time_cols, location_cols, device_cols] 21 | 22 | data_dir = '/home/galen/workspace/competition/data/' 23 | print('read data') 24 | df_test = pd.read_csv(data_dir + 
'round1_iflyad_anticheat_testdata_feature.txt', sep='\t') 25 | df_train = pd.read_csv(data_dir + 'round1_iflyad_anticheat_traindata.txt', sep='\t') 26 | df_uni = pd.concat([df_train, df_test], ignore_index=True) 27 | df_uni['label'] = df_uni['label'].fillna(-1).astype(int) 28 | 29 | # 数据预处理 30 | print('prework') 31 | # 处理ip。ip 为空时,使用 reqrealip。 32 | df_uni.ip.fillna(df_uni.reqrealip, inplace=True) 33 | # 屏幕尺寸 合并成宽和高 34 | df_uni['screen_area'] = (df_uni['w'] * df_uni['h']).astype('category') 35 | df_uni['creative_dpi'] = df_uni['w'].astype(str) + "_" + df_uni['h'].astype(str) 36 | # orientation 出现异常值 90度和2 归为 0 37 | df_uni.orientation[(df_uni.orientation == 90) | (df_uni.orientation == 2)] = 0 38 | # carrier -1 就是0 39 | df_uni.carrier[df_uni.carrier == -1] = 0 40 | # ntt 网络类型。0 未知 -> 0 , 1 2 宽带 1 , 4,5,6 移动网络 -> 2 41 | df_uni.ntt[(df_uni.ntt <= 0) | (df_uni.ntt > 6)] = 0 42 | df_uni.ntt[(df_uni.ntt <= 2) | (df_uni.ntt >= 1)] = 1 43 | df_uni.ntt[(df_uni.ntt <= 6) | (df_uni.ntt >= 4)] = 2 44 | # 运营商 carrier 45 | df_uni.ntt[(df_uni.carrier <= 0) | (df_uni.carrier > 46003)] = 0 46 | 47 | 48 | # make 49 | def make_fix(x): 50 | """ 51 | iphone,iPhone,Apple,APPLE>--apple 52 | redmi>--xiaomi 53 | honor>--huawei 54 | Best sony,Best-sony,Best_sony,BESTSONY>--best_sony 55 | :param x: 56 | :return: 57 | """ 58 | x = x.lower() 59 | if 'iphone' in x or 'apple' in x: 60 | return 'apple' 61 | if '华为' in x or 'huawei' in x or "荣耀" in x: 62 | return 'huawei' 63 | if "魅族" in x: 64 | return 'meizu' 65 | if "金立" in x: 66 | return 'gionee' 67 | if "三星" in x: 68 | return 'samsung' 69 | if 'xiaomi' in x or 'redmi' in x: 70 | return 'xiaomi' 71 | if 'oppo' in x: 72 | return 'oppo' 73 | return x 74 | 75 | 76 | df_uni['make'] = df_uni['make'].astype('str').apply(lambda x: x.lower()) 77 | df_uni['make'] = df_uni['make'].apply(make_fix) 78 | 79 | print('feature time...') 80 | # 处理时间 81 | df_uni['datetime'] = pd.to_datetime(df_uni['nginxtime'] / 1000, unit='s') + timedelta(hours=8) 82 | 
df_uni['hour'] = df_uni['datetime'].dt.hour 83 | # 将天数归零成有序数列。[0,1,2,3,4,5,6] 84 | df_uni['day'] = df_uni['datetime'].dt.day - df_uni['datetime'].dt.day.min() 85 | 86 | 87 | def unique_count(index_col, feature, df_data): 88 | if isinstance(index_col, list): 89 | name = "{0}_{1}_nq".format('_'.join(index_col), feature) 90 | else: 91 | name = "{0}_{1}_nq".format(index_col, feature) 92 | print(name) 93 | gp1 = df_data.groupby(index_col)[feature].nunique().reset_index().rename( 94 | columns={feature: name}) 95 | df_data = pd.merge(df_data, gp1, how='left', on=[index_col]) 96 | return df_data.fillna(0) 97 | 98 | 99 | # 设备下的媒体数 model_mediashowid_nq model_city_nq 100 | df_uni = unique_count('model', 'mediashowid', df_uni) 101 | df_uni = unique_count('model', 'city', df_uni) 102 | # 设备 103 | df_uni = unique_count('adidmd5', 'model', df_uni) 104 | df_uni = unique_count('imeimd5', 'model', df_uni) 105 | df_uni = unique_count('macmd5', 'model', df_uni) 106 | df_uni = unique_count('openudidmd5', 'model', df_uni) 107 | df_uni = unique_count('ip', 'model', df_uni) 108 | df_uni = unique_count('reqrealip', 'model', df_uni) 109 | 110 | # 屏幕密度 111 | df_uni = unique_count('adidmd5', 'ppi', df_uni) 112 | df_uni = unique_count('imeimd5', 'ppi', df_uni) 113 | df_uni = unique_count('macmd5', 'ppi', df_uni) 114 | df_uni = unique_count('openudidmd5', 'ppi', df_uni) 115 | df_uni = unique_count('ip', 'ppi', df_uni) 116 | df_uni = unique_count('reqrealip', 'ppi', df_uni) 117 | 118 | # 网络类型 119 | df_uni = unique_count('adidmd5', 'dvctype', df_uni) 120 | df_uni = unique_count('imeimd5', 'dvctype', df_uni) 121 | df_uni = unique_count('macmd5', 'dvctype', df_uni) 122 | df_uni = unique_count('openudidmd5', 'dvctype', df_uni) 123 | df_uni = unique_count('ip', 'dvctype', df_uni) 124 | df_uni = unique_count('reqrealip', 'dvctype', df_uni) 125 | 126 | # 地理位置 127 | df_uni = unique_count('ip', 'city', df_uni) 128 | df_uni = unique_count('reqrealip', 'city', df_uni) 129 | 130 | # 用户下的ip数 131 | df_uni = 
unique_count('adidmd5', 'ip', df_uni) 132 | df_uni = unique_count('imeimd5', 'ip', df_uni) 133 | df_uni = unique_count('macmd5', 'ip', df_uni) 134 | df_uni = unique_count('openudidmd5', 'ip', df_uni) 135 | 136 | # 统计数据 137 | value_counts_col = [ 138 | # 'adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 139 | 'make', 'pkgname', 'adunitshowid', 'mediashowid', 'ip', 'city', 'model', 'hour', 140 | 'screen_area', 'creative_dpi', 'h', 'w', 141 | 'dvctype', 142 | ] 143 | 144 | 145 | def gen_value_counts(data, col): 146 | """ 147 | # 统计每个种类的个数。 148 | :param data: 149 | :param col: 150 | :return: 151 | """ 152 | print('value counts', col) 153 | df_tmp = pd.DataFrame(data[col].value_counts().reset_index()) 154 | df_tmp.columns = [col, 'tmp'] 155 | r = pd.merge(data, df_tmp, how='left', on=col)['tmp'] 156 | return r.fillna(0) 157 | 158 | 159 | # 统计值 160 | counts_col_name = [] 161 | for col_values in value_counts_col: 162 | new_name = 'vc_' + col_values 163 | df_uni[new_name] = gen_value_counts(df_uni, col_values) 164 | counts_col_name.append(new_name) 165 | 166 | # ip 167 | gp = df_uni[['ip', 'mediashowid', 'adunitshowid']].groupby(by=['ip', 'mediashowid'])[ 168 | ['adunitshowid']].count().reset_index().rename(index=str, columns={'adunitshowid': 'ip_media_count_ad'}) 169 | df_uni = df_uni.merge(gp, on=['ip', 'mediashowid', ], how='left') 170 | del gp 171 | gc.collect() 172 | 173 | gp = df_uni[['ip', 'mediashowid', 'dvctype', 'hour']].groupby(by=['ip', 'mediashowid', 'dvctype'])[ 174 | ['hour']].var().reset_index().rename( 175 | index=str, columns={'hour': 'ip_media_dvctype_var_hour'}) 176 | df_uni = df_uni.merge(gp, on=['ip', 'mediashowid', 'dvctype'], how='left') 177 | del gp 178 | gc.collect() 179 | 180 | gp = df_uni[['ip', 'mediashowid', 'dvctype', 'hour']].groupby(by=['ip', 'mediashowid', 'dvctype'])[ 181 | ['hour']].mean().reset_index().rename(index=str, columns={'hour': 'ip_media_dvctype_mean_hour'}) 182 | df_uni = df_uni.merge(gp, on=['ip', 'mediashowid', 
'dvctype'], how='left') 183 | del gp 184 | 185 | # make 186 | gp = df_uni[['make', 'mediashowid', 'adunitshowid']].groupby(by=['make', 'mediashowid'])[ 187 | ['adunitshowid']].count().reset_index().rename(index=str, columns={'adunitshowid': 'make_media_count_ad'}) 188 | df_uni = df_uni.merge(gp, on=['make', 'mediashowid', ], how='left') 189 | del gp 190 | gc.collect() 191 | 192 | gp = df_uni[['make', 'mediashowid', 'dvctype', 'hour']].groupby(by=['make', 'mediashowid', 'dvctype'])[ 193 | ['hour']].var().reset_index().rename( 194 | index=str, columns={'hour': 'make_media_dvctype_var_hour'}) 195 | df_uni = df_uni.merge(gp, on=['make', 'mediashowid', 'dvctype'], how='left') 196 | del gp 197 | gc.collect() 198 | 199 | gp = df_uni[['make', 'mediashowid', 'dvctype', 'hour']].groupby(by=['make', 'mediashowid', 'dvctype'])[ 200 | ['hour']].mean().reset_index().rename(index=str, columns={'hour': 'make_media_dvctype_mean_hour'}) 201 | df_uni = df_uni.merge(gp, on=['make', 'mediashowid', 'dvctype'], how='left') 202 | del gp 203 | 204 | # model 205 | gp = df_uni[['model', 'mediashowid', 'adunitshowid']].groupby(by=['model', 'mediashowid'])[ 206 | ['adunitshowid']].count().reset_index().rename(index=str, columns={'adunitshowid': 'model_media_count_ad'}) 207 | df_uni = df_uni.merge(gp, on=['model', 'mediashowid', ], how='left') 208 | del gp 209 | gc.collect() 210 | 211 | gp = df_uni[['model', 'mediashowid', 'dvctype', 'hour']].groupby(by=['model', 'mediashowid', 'dvctype'])[ 212 | ['hour']].var().reset_index().rename( 213 | index=str, columns={'hour': 'model_media_dvctype_var_hour'}) 214 | df_uni = df_uni.merge(gp, on=['model', 'mediashowid', 'dvctype'], how='left') 215 | del gp 216 | gc.collect() 217 | 218 | gp = df_uni[['model', 'mediashowid', 'dvctype', 'hour']].groupby(by=['model', 'mediashowid', 'dvctype'])[ 219 | ['hour']].mean().reset_index().rename(index=str, columns={'hour': 'model_media_dvctype_mean_hour'}) 220 | df_uni = df_uni.merge(gp, on=['model', 'mediashowid', 
'dvctype'], how='left') 221 | del gp 222 | 223 | # city dvctype 224 | gp = df_uni[['city', 'dvctype']].groupby(by=['city'])[ 225 | ['dvctype']].count().reset_index().rename(index=str, columns={'dvctype': 'city_count_dvctype'}) 226 | df_uni = df_uni.merge(gp, on=['city'], how='left') 227 | del gp 228 | gc.collect() 229 | 230 | # 'dvctype', 'orientation', 'city' 231 | gp = df_uni[['dvctype', 'orientation', 'city']].groupby(by=['dvctype', 'orientation'])[ 232 | ['city']].count().reset_index().rename(index=str, columns={'city': 'dvctype_orientation_count_city'}) 233 | df_uni = df_uni.merge(gp, on=['dvctype', 'orientation'], how='left') 234 | del gp 235 | gc.collect() 236 | 237 | # 'dvctype', 'ppi', 'city' 238 | gp = df_uni[['dvctype', 'ppi', 'city']].groupby(by=['dvctype', 'ppi'])[ 239 | ['city']].count().reset_index().rename(index=str, columns={'city': 'dvctype_ppi_count_city'}) 240 | df_uni = df_uni.merge(gp, on=['dvctype', 'ppi'], how='left') 241 | del gp 242 | gc.collect() 243 | 244 | print("merging success...") 245 | # 将种类编码成数字 246 | print('post process') 247 | cat_cols = [ 248 | 'model', 'make', 'ppi', 'screen_area', 'creative_dpi', 249 | 'pkgname', 'ver', 'osv', 'city', 250 | 'adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 251 | 'adunitshowid', 'mediashowid', 252 | 'apptype', 'dvctype', 'ntt', 'carrier', 'orientation', 253 | 'hour', 'reqrealip', 'ip', 'h', 'w', 'lan', 254 | ] 255 | print(set(df_uni.columns) - (set(cat_cols) | set(counts_col_name))) 256 | for col_values in cat_cols: 257 | # 将种类进行 映射成唯一编码 {"A":1"} .unique() 获得唯一值。 258 | df_uni[col_values] = df_uni[col_values].map( 259 | dict(zip(df_uni[col_values].unique(), range(0, df_uni[col_values].nunique())))) 260 | 261 | # print('model', df_uni['model'].max()) 262 | # 数据集索引。最后一天数据用于预测,不提供“是否作弊”标识,其余日期的数据作为训练数据。 263 | all_train_index = (df_uni['day'] <= 6).values 264 | test_index = (df_uni['day'] == 7).values 265 | train_label = df_uni['label'] 266 | 267 | train_df = df_uni.iloc[all_train_index, :] 
y_train = train_label.iloc[all_train_index].values

test_df = df_uni.iloc[test_index, :]


def get_keras_data(dataset, cate_list, num_list):
    """Split a DataFrame into the two input arrays the Keras model expects.

    :param dataset: source DataFrame
    :param cate_list: categorical column names -> 'category_inp'
    :param num_list: numerical column names -> 'continous_inp'
    :return: dict of input-name -> ndarray
    """
    X = {
        'category_inp': dataset[cate_list].values,
        'continous_inp': dataset[num_list].values,
    }
    return X


category = [
    # 'adidmd5', 'idfamd5', 'imeimd5', 'macmd5', 'openudidmd5', 'ip', 'reqrealip',
    # 'idfamd5',
    'adunitshowid', 'apptype', 'carrier', 'city', 'dvctype', 'make', 'model', 'mediashowid', 'ntt',
    'orientation', 'osv', 'pkgname', 'ppi', 'hour',
    'screen_area', 'creative_dpi', 'ver', 'h', 'w', 'lan',
]

numerical = [
    'ip_media_count_ad', 'ip_media_dvctype_var_hour', 'ip_media_dvctype_mean_hour',
    'make_media_count_ad', 'make_media_dvctype_var_hour', 'make_media_dvctype_mean_hour',
    'model_media_count_ad', 'model_media_dvctype_var_hour', 'model_media_dvctype_mean_hour',
    'city_count_dvctype', 'dvctype_orientation_count_city', 'dvctype_ppi_count_city',

    'model_mediashowid_nq',
    'model_city_nq',
    # model
    'adidmd5_model_nq',
    'ip_model_nq',
    'imeimd5_model_nq',
    'macmd5_model_nq',
    'openudidmd5_model_nq',
    'reqrealip_model_nq',

    # ppi
    'adidmd5_ppi_nq',
    'ip_ppi_nq',
    'imeimd5_ppi_nq',
    'macmd5_ppi_nq',
    'openudidmd5_ppi_nq',
    'reqrealip_ppi_nq',

    # dvctype
    'adidmd5_dvctype_nq',
    'ip_dvctype_nq',
    'imeimd5_dvctype_nq',
    'macmd5_dvctype_nq',
    'openudidmd5_dvctype_nq',
    'reqrealip_dvctype_nq',

    'ip_city_nq',
    'reqrealip_city_nq',

    'adidmd5_ip_nq',
    'imeimd5_ip_nq',
    'macmd5_ip_nq',
    'openudidmd5_ip_nq',
]


def gru_model():
    """Build the embedding + CuDNNGRU binary classifier.

    Each categorical input is embedded separately; the embeddings and a dense
    projection of the numerical inputs are concatenated into a length-1
    "sequence" fed to a GRU, followed by an MLP head with a sigmoid output.
    :return: uncompiled keras Model with inputs [category_inp, continous_inp]
    """
    emb_n = 64
    # (vocabulary size, embedding dim) per categorical column; the sizes were
    # presumably measured from the encoded data — verify if the data changes.
    category_num = {
        # 'adidmd5': (780369, emb_n),
        # 'idfamd5': (360, emb_n),
        # 'imeimd5': (1021836, emb_n),
        # 'macmd5': (329184, emb_n),
        # 'openudidmd5': (85051, emb_n),
        # 'ip': (813719, emb_n),
        # 'reqrealip': (9748, emb_n),
        'adunitshowid': (800, emb_n),
        'apptype': (91, emb_n),
        'carrier': (4, emb_n),
        'city': (331, emb_n),
        'dvctype': (3, emb_n),
        'model': (5923, emb_n),  # 7957 7958 5922
        'make': (1704, emb_n),
        'mediashowid': (313, emb_n),
        'ntt': (7, emb_n),
        'orientation': (2, emb_n),
        'osv': (185, emb_n),
        'pkgname': (2368, emb_n),
        'ppi': (119, emb_n),
        'ver': (3268, emb_n),
        'screen_area': (1396, emb_n),
        'creative_dpi': (1763, emb_n),
        'hour': (24, emb_n),
        'lan': (33, emb_n),
        'h': (985, emb_n),
        'w': (449, emb_n),

    }
    # Categorical inputs: slice out one column per feature and embed it.
    category_inp = Input(shape=(len(category),), name='category_inp')
    cat_embeds = []
    for idx, col in enumerate(category):
        # ROBUSTNESS FIX: bind idx as a default argument. The original
        # `lambda x: x[:, idx, None]` late-binds the loop variable, which is
        # fragile if the Lambda is ever re-invoked or deserialized.
        x = Lambda(lambda t, i=idx: t[:, i, None])(category_inp)
        x = Embedding(category_num[col][0], category_num[col][1], input_length=1)(x)
        cat_embeds.append(x)
    embeds = concatenate(cat_embeds, axis=2)
    embeds = GaussianDropout(0.5)(embeds)
    # Numerical inputs: small dense projection, then reshape to a 1-step sequence.
    numerical_inp = Input(shape=(len(numerical),), name='continous_inp')
    dense_units = len(numerical) // 8 + 8
    # BUG FIX: the original printed `len(numerical) // 8 * 8 + 8`, which did
    # not match the layer width actually used below.
    print('numerical', dense_units)
    x2 = Dense(dense_units, activation='relu', kernel_initializer='random_uniform',
               bias_initializer='zeros')(
        numerical_inp)
    x2 = Dropout(0.5)(x2)
    x2 = BatchNormalization()(x2)
    x2 = Reshape([1, int(x2.shape[1])])(x2)
    x = concatenate([embeds, x2], axis=2)
    # Trunk: GRU over the concatenated features, then an MLP head.
    x = CuDNNGRU(128)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.50)(x)
    x = Dense(64, activation='relu', kernel_initializer='random_uniform')(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    x = Dropout(0.50)(x)
    x = Dense(32, activation='relu', kernel_initializer='random_uniform')(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    x = Dropout(0.50)(x)
    out_p = Dense(1, activation='sigmoid')(x)
    return Model(inputs=[category_inp, numerical_inp], outputs=out_p)


model = gru_model()
# model.summary()

batch_size = 1024  # earlier experiments: 20000, 512
epochs = 20

# Exponential learning-rate decay from lr_init to lr_fin across all steps.
steps = int(len(train_df) / batch_size) * epochs
exp_decay = lambda init, fin, steps: (init / fin) ** (1 / (steps - 1)) - 1
lr_init, lr_fin = 0.001, 0.0001
lr_decay = exp_decay(lr_init, lr_fin, steps)
optimizer_adam = Adam(lr=lr_init, decay=lr_decay)
model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=['accuracy'])

train_df = get_keras_data(train_df, category, numerical)

# BUG FIX: the original monitored 'va', which is not a metric Keras tracks,
# so early stopping never triggered; watch the validation loss instead.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(train_df, y_train, callbacks=[early_stopping], validation_split=0.2, batch_size=batch_size, epochs=epochs,
          shuffle=True, verbose=1)

test_df = get_keras_data(test_df, category, numerical)

print("predicting....")
test_y = model.predict(test_df, batch_size=batch_size)

# Binarize the sigmoid scores at 0.5.
result = [1 if score > 0.5 else 0 for score in test_y.flatten().tolist()]

df_sub = pd.concat([df_test['sid'], pd.Series(result)], axis=1)
df_sub.columns = ['sid', 'label']
save_path = 'submit-{}.csv'.format(datetime.now().strftime('%m%d_%H%M%S'))
print(save_path)
df_sub.to_csv(save_path, sep=',', index=False)