├── .gitignore ├── README.md └── xunfei_dl_gru.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | archive 4 | .pyc 5 | .idea 6 | __pyache__/ 7 | dist/ 8 | ~$* 9 | /data 10 | resources/ 11 | .ipynb_checkpoints -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 讯飞移动广告反欺诈算法挑战赛深度学习模型 2 | > 该深度学习模型仅供尝鲜。目前最好成绩为94.12672。 3 | 4 | **比赛链接:** 5 | - http://challenge.xfyun.cn/2019/gamedetail?type=detail/mobileAD 6 | 7 | ## 使用方式 8 | - 将`data_dir`指定数据目录即可。 9 | 10 | ## 参考资料 11 | - [kaggle talkingdata比赛](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection) 12 | 13 | @**Galen**_20190717_ -------------------------------------------------------------------------------- /xunfei_dl_gru.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from datetime import timedelta, datetime 3 | 4 | import gc 5 | import pandas as pd 6 | from keras.callbacks import EarlyStopping 7 | from keras.layers import Input, Embedding, Dense, Dropout, concatenate, Reshape 8 | from keras.layers import Lambda, GaussianDropout, CuDNNGRU, BatchNormalization, PReLU 9 | from keras.models import Model 10 | from keras.optimizers import Adam 11 | 12 | warnings.filterwarnings('ignore') 13 | 14 | base_cols = ['ip'] 15 | media_cols = ['pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype'] 16 | time_cols = ['hour'] 17 | location_cols = ['city'] 18 | device_cols = ['adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 'dvctype', 'model', 'make', 'ntt', 'carrier', 19 | 'osv', 'orientation', 'ppi', 'screen_area', 'creative_dpi'] 20 | total_cate = [base_cols, media_cols, time_cols, location_cols, device_cols] 21 | 22 | data_dir = '/home/galen/workspace/competition/data/' 23 | print('read data') 24 | df_test = pd.read_csv(data_dir + 
'round1_iflyad_anticheat_testdata_feature.txt', sep='\t') 25 | df_train = pd.read_csv(data_dir + 'round1_iflyad_anticheat_traindata.txt', sep='\t') 26 | df_uni = pd.concat([df_train, df_test], ignore_index=True) 27 | df_uni['label'] = df_uni['label'].fillna(-1).astype(int) 28 | 29 | # 数据预处理 30 | print('prework') 31 | # 处理ip。ip 为空时,使用 reqrealip。 32 | df_uni.ip.fillna(df_uni.reqrealip, inplace=True) 33 | # 屏幕尺寸 合并成宽和高 34 | df_uni['screen_area'] = (df_uni['w'] * df_uni['h']).astype('category') 35 | df_uni['creative_dpi'] = df_uni['w'].astype(str) + "_" + df_uni['h'].astype(str) 36 | # orientation 出现异常值 90度和2 归为 0 37 | df_uni.orientation[(df_uni.orientation == 90) | (df_uni.orientation == 2)] = 0 38 | # carrier -1 就是0 39 | df_uni.carrier[df_uni.carrier == -1] = 0 40 | # ntt 网络类型。0 未知 -> 0 , 1 2 宽带 1 , 4,5,6 移动网络 -> 2 41 | df_uni.ntt[(df_uni.ntt <= 0) | (df_uni.ntt > 6)] = 0 42 | df_uni.ntt[(df_uni.ntt <= 2) | (df_uni.ntt >= 1)] = 1 43 | df_uni.ntt[(df_uni.ntt <= 6) | (df_uni.ntt >= 4)] = 2 44 | # 运营商 carrier 45 | df_uni.ntt[(df_uni.carrier <= 0) | (df_uni.carrier > 46003)] = 0 46 | 47 | 48 | # make 49 | def make_fix(x): 50 | """ 51 | iphone,iPhone,Apple,APPLE>--apple 52 | redmi>--xiaomi 53 | honor>--huawei 54 | Best sony,Best-sony,Best_sony,BESTSONY>--best_sony 55 | :param x: 56 | :return: 57 | """ 58 | x = x.lower() 59 | if 'iphone' in x or 'apple' in x: 60 | return 'apple' 61 | if '华为' in x or 'huawei' in x or "荣耀" in x: 62 | return 'huawei' 63 | if "魅族" in x: 64 | return 'meizu' 65 | if "金立" in x: 66 | return 'gionee' 67 | if "三星" in x: 68 | return 'samsung' 69 | if 'xiaomi' in x or 'redmi' in x: 70 | return 'xiaomi' 71 | if 'oppo' in x: 72 | return 'oppo' 73 | return x 74 | 75 | 76 | df_uni['make'] = df_uni['make'].astype('str').apply(lambda x: x.lower()) 77 | df_uni['make'] = df_uni['make'].apply(make_fix) 78 | 79 | print('feature time...') 80 | # 处理时间 81 | df_uni['datetime'] = pd.to_datetime(df_uni['nginxtime'] / 1000, unit='s') + timedelta(hours=8) 82 | 
df_uni['hour'] = df_uni['datetime'].dt.hour 83 | # 将天数归零成有序数列。[0,1,2,3,4,5,6] 84 | df_uni['day'] = df_uni['datetime'].dt.day - df_uni['datetime'].dt.day.min() 85 | 86 | 87 | def unique_count(index_col, feature, df_data): 88 | if isinstance(index_col, list): 89 | name = "{0}_{1}_nq".format('_'.join(index_col), feature) 90 | else: 91 | name = "{0}_{1}_nq".format(index_col, feature) 92 | print(name) 93 | gp1 = df_data.groupby(index_col)[feature].nunique().reset_index().rename( 94 | columns={feature: name}) 95 | df_data = pd.merge(df_data, gp1, how='left', on=[index_col]) 96 | return df_data.fillna(0) 97 | 98 | 99 | # 设备下的媒体数 model_mediashowid_nq model_city_nq 100 | df_uni = unique_count('model', 'mediashowid', df_uni) 101 | df_uni = unique_count('model', 'city', df_uni) 102 | # 设备 103 | df_uni = unique_count('adidmd5', 'model', df_uni) 104 | df_uni = unique_count('imeimd5', 'model', df_uni) 105 | df_uni = unique_count('macmd5', 'model', df_uni) 106 | df_uni = unique_count('openudidmd5', 'model', df_uni) 107 | df_uni = unique_count('ip', 'model', df_uni) 108 | df_uni = unique_count('reqrealip', 'model', df_uni) 109 | 110 | # 屏幕密度 111 | df_uni = unique_count('adidmd5', 'ppi', df_uni) 112 | df_uni = unique_count('imeimd5', 'ppi', df_uni) 113 | df_uni = unique_count('macmd5', 'ppi', df_uni) 114 | df_uni = unique_count('openudidmd5', 'ppi', df_uni) 115 | df_uni = unique_count('ip', 'ppi', df_uni) 116 | df_uni = unique_count('reqrealip', 'ppi', df_uni) 117 | 118 | # 网络类型 119 | df_uni = unique_count('adidmd5', 'dvctype', df_uni) 120 | df_uni = unique_count('imeimd5', 'dvctype', df_uni) 121 | df_uni = unique_count('macmd5', 'dvctype', df_uni) 122 | df_uni = unique_count('openudidmd5', 'dvctype', df_uni) 123 | df_uni = unique_count('ip', 'dvctype', df_uni) 124 | df_uni = unique_count('reqrealip', 'dvctype', df_uni) 125 | 126 | # 地理位置 127 | df_uni = unique_count('ip', 'city', df_uni) 128 | df_uni = unique_count('reqrealip', 'city', df_uni) 129 | 130 | # 用户下的ip数 131 | df_uni = 
unique_count('adidmd5', 'ip', df_uni) 132 | df_uni = unique_count('imeimd5', 'ip', df_uni) 133 | df_uni = unique_count('macmd5', 'ip', df_uni) 134 | df_uni = unique_count('openudidmd5', 'ip', df_uni) 135 | 136 | # 统计数据 137 | value_counts_col = [ 138 | # 'adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 139 | 'make', 'pkgname', 'adunitshowid', 'mediashowid', 'ip', 'city', 'model', 'hour', 140 | 'screen_area', 'creative_dpi', 'h', 'w', 141 | 'dvctype', 142 | ] 143 | 144 | 145 | def gen_value_counts(data, col): 146 | """ 147 | # 统计每个种类的个数。 148 | :param data: 149 | :param col: 150 | :return: 151 | """ 152 | print('value counts', col) 153 | df_tmp = pd.DataFrame(data[col].value_counts().reset_index()) 154 | df_tmp.columns = [col, 'tmp'] 155 | r = pd.merge(data, df_tmp, how='left', on=col)['tmp'] 156 | return r.fillna(0) 157 | 158 | 159 | # 统计值 160 | counts_col_name = [] 161 | for col_values in value_counts_col: 162 | new_name = 'vc_' + col_values 163 | df_uni[new_name] = gen_value_counts(df_uni, col_values) 164 | counts_col_name.append(new_name) 165 | 166 | # ip 167 | gp = df_uni[['ip', 'mediashowid', 'adunitshowid']].groupby(by=['ip', 'mediashowid'])[ 168 | ['adunitshowid']].count().reset_index().rename(index=str, columns={'adunitshowid': 'ip_media_count_ad'}) 169 | df_uni = df_uni.merge(gp, on=['ip', 'mediashowid', ], how='left') 170 | del gp 171 | gc.collect() 172 | 173 | gp = df_uni[['ip', 'mediashowid', 'dvctype', 'hour']].groupby(by=['ip', 'mediashowid', 'dvctype'])[ 174 | ['hour']].var().reset_index().rename( 175 | index=str, columns={'hour': 'ip_media_dvctype_var_hour'}) 176 | df_uni = df_uni.merge(gp, on=['ip', 'mediashowid', 'dvctype'], how='left') 177 | del gp 178 | gc.collect() 179 | 180 | gp = df_uni[['ip', 'mediashowid', 'dvctype', 'hour']].groupby(by=['ip', 'mediashowid', 'dvctype'])[ 181 | ['hour']].mean().reset_index().rename(index=str, columns={'hour': 'ip_media_dvctype_mean_hour'}) 182 | df_uni = df_uni.merge(gp, on=['ip', 'mediashowid', 
'dvctype'], how='left') 183 | del gp 184 | 185 | # make 186 | gp = df_uni[['make', 'mediashowid', 'adunitshowid']].groupby(by=['make', 'mediashowid'])[ 187 | ['adunitshowid']].count().reset_index().rename(index=str, columns={'adunitshowid': 'make_media_count_ad'}) 188 | df_uni = df_uni.merge(gp, on=['make', 'mediashowid', ], how='left') 189 | del gp 190 | gc.collect() 191 | 192 | gp = df_uni[['make', 'mediashowid', 'dvctype', 'hour']].groupby(by=['make', 'mediashowid', 'dvctype'])[ 193 | ['hour']].var().reset_index().rename( 194 | index=str, columns={'hour': 'make_media_dvctype_var_hour'}) 195 | df_uni = df_uni.merge(gp, on=['make', 'mediashowid', 'dvctype'], how='left') 196 | del gp 197 | gc.collect() 198 | 199 | gp = df_uni[['make', 'mediashowid', 'dvctype', 'hour']].groupby(by=['make', 'mediashowid', 'dvctype'])[ 200 | ['hour']].mean().reset_index().rename(index=str, columns={'hour': 'make_media_dvctype_mean_hour'}) 201 | df_uni = df_uni.merge(gp, on=['make', 'mediashowid', 'dvctype'], how='left') 202 | del gp 203 | 204 | # model 205 | gp = df_uni[['model', 'mediashowid', 'adunitshowid']].groupby(by=['model', 'mediashowid'])[ 206 | ['adunitshowid']].count().reset_index().rename(index=str, columns={'adunitshowid': 'model_media_count_ad'}) 207 | df_uni = df_uni.merge(gp, on=['model', 'mediashowid', ], how='left') 208 | del gp 209 | gc.collect() 210 | 211 | gp = df_uni[['model', 'mediashowid', 'dvctype', 'hour']].groupby(by=['model', 'mediashowid', 'dvctype'])[ 212 | ['hour']].var().reset_index().rename( 213 | index=str, columns={'hour': 'model_media_dvctype_var_hour'}) 214 | df_uni = df_uni.merge(gp, on=['model', 'mediashowid', 'dvctype'], how='left') 215 | del gp 216 | gc.collect() 217 | 218 | gp = df_uni[['model', 'mediashowid', 'dvctype', 'hour']].groupby(by=['model', 'mediashowid', 'dvctype'])[ 219 | ['hour']].mean().reset_index().rename(index=str, columns={'hour': 'model_media_dvctype_mean_hour'}) 220 | df_uni = df_uni.merge(gp, on=['model', 'mediashowid', 
'dvctype'], how='left') 221 | del gp 222 | 223 | # city dvctype 224 | gp = df_uni[['city', 'dvctype']].groupby(by=['city'])[ 225 | ['dvctype']].count().reset_index().rename(index=str, columns={'dvctype': 'city_count_dvctype'}) 226 | df_uni = df_uni.merge(gp, on=['city'], how='left') 227 | del gp 228 | gc.collect() 229 | 230 | # 'dvctype', 'orientation', 'city' 231 | gp = df_uni[['dvctype', 'orientation', 'city']].groupby(by=['dvctype', 'orientation'])[ 232 | ['city']].count().reset_index().rename(index=str, columns={'city': 'dvctype_orientation_count_city'}) 233 | df_uni = df_uni.merge(gp, on=['dvctype', 'orientation'], how='left') 234 | del gp 235 | gc.collect() 236 | 237 | # 'dvctype', 'ppi', 'city' 238 | gp = df_uni[['dvctype', 'ppi', 'city']].groupby(by=['dvctype', 'ppi'])[ 239 | ['city']].count().reset_index().rename(index=str, columns={'city': 'dvctype_ppi_count_city'}) 240 | df_uni = df_uni.merge(gp, on=['dvctype', 'ppi'], how='left') 241 | del gp 242 | gc.collect() 243 | 244 | print("merging success...") 245 | # 将种类编码成数字 246 | print('post process') 247 | cat_cols = [ 248 | 'model', 'make', 'ppi', 'screen_area', 'creative_dpi', 249 | 'pkgname', 'ver', 'osv', 'city', 250 | 'adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 251 | 'adunitshowid', 'mediashowid', 252 | 'apptype', 'dvctype', 'ntt', 'carrier', 'orientation', 253 | 'hour', 'reqrealip', 'ip', 'h', 'w', 'lan', 254 | ] 255 | print(set(df_uni.columns) - (set(cat_cols) | set(counts_col_name))) 256 | for col_values in cat_cols: 257 | # 将种类进行 映射成唯一编码 {"A":1"} .unique() 获得唯一值。 258 | df_uni[col_values] = df_uni[col_values].map( 259 | dict(zip(df_uni[col_values].unique(), range(0, df_uni[col_values].nunique())))) 260 | 261 | # print('model', df_uni['model'].max()) 262 | # 数据集索引。最后一天数据用于预测,不提供“是否作弊”标识,其余日期的数据作为训练数据。 263 | all_train_index = (df_uni['day'] <= 6).values 264 | test_index = (df_uni['day'] == 7).values 265 | train_label = df_uni['label'] 266 | 267 | train_df = df_uni.iloc[all_train_index, :] 
y_train = train_label.iloc[all_train_index].values

test_df = df_uni.iloc[test_index, :]


def get_keras_data(dataset, cate_list, num_list):
    """Split a DataFrame into the two input arrays the Keras model expects.

    :param dataset: source DataFrame
    :param cate_list: categorical column names -> 'category_inp'
    :param num_list: numerical column names -> 'continous_inp'
    :return: dict of input-name -> ndarray
    """
    X = {
        'category_inp': dataset[cate_list].values,
        'continous_inp': dataset[num_list].values,
    }
    return X


category = [
    # 'adidmd5', 'idfamd5', 'imeimd5', 'macmd5', 'openudidmd5', 'ip', 'reqrealip',
    # 'idfamd5',
    'adunitshowid', 'apptype', 'carrier', 'city', 'dvctype', 'make', 'model', 'mediashowid', 'ntt',
    'orientation', 'osv', 'pkgname', 'ppi', 'hour',
    'screen_area', 'creative_dpi', 'ver', 'h', 'w', 'lan',
]

numerical = [
    'ip_media_count_ad', 'ip_media_dvctype_var_hour', 'ip_media_dvctype_mean_hour',
    'make_media_count_ad', 'make_media_dvctype_var_hour', 'make_media_dvctype_mean_hour',
    'model_media_count_ad', 'model_media_dvctype_var_hour', 'model_media_dvctype_mean_hour',
    'city_count_dvctype', 'dvctype_orientation_count_city', 'dvctype_ppi_count_city',

    'model_mediashowid_nq',
    'model_city_nq',
    # model
    'adidmd5_model_nq',
    'ip_model_nq',
    'imeimd5_model_nq',
    'macmd5_model_nq',
    'openudidmd5_model_nq',
    'reqrealip_model_nq',

    # ppi
    'adidmd5_ppi_nq',
    'ip_ppi_nq',
    'imeimd5_ppi_nq',
    'macmd5_ppi_nq',
    'openudidmd5_ppi_nq',
    'reqrealip_ppi_nq',

    # dvctype
    'adidmd5_dvctype_nq',
    'ip_dvctype_nq',
    'imeimd5_dvctype_nq',
    'macmd5_dvctype_nq',
    'openudidmd5_dvctype_nq',
    'reqrealip_dvctype_nq',

    'ip_city_nq',
    'reqrealip_city_nq',

    'adidmd5_ip_nq',
    'imeimd5_ip_nq',
    'macmd5_ip_nq',
    'openudidmd5_ip_nq',
]


def gru_model():
    """Build the embedding + CuDNNGRU binary classifier.

    Each categorical input is embedded separately; the embeddings and a dense
    projection of the numerical inputs are concatenated into a length-1
    "sequence" fed to a GRU, followed by an MLP head with a sigmoid output.
    :return: uncompiled keras Model with inputs [category_inp, continous_inp]
    """
    emb_n = 64
    # (vocabulary size, embedding dim) per categorical column; the sizes were
    # presumably measured from the encoded data — verify if the data changes.
    category_num = {
        # 'adidmd5': (780369, emb_n),
        # 'idfamd5': (360, emb_n),
        # 'imeimd5': (1021836, emb_n),
        # 'macmd5': (329184, emb_n),
        # 'openudidmd5': (85051, emb_n),
        # 'ip': (813719, emb_n),
        # 'reqrealip': (9748, emb_n),
        'adunitshowid': (800, emb_n),
        'apptype': (91, emb_n),
        'carrier': (4, emb_n),
        'city': (331, emb_n),
        'dvctype': (3, emb_n),
        'model': (5923, emb_n),  # 7957 7958 5922
        'make': (1704, emb_n),
        'mediashowid': (313, emb_n),
        'ntt': (7, emb_n),
        'orientation': (2, emb_n),
        'osv': (185, emb_n),
        'pkgname': (2368, emb_n),
        'ppi': (119, emb_n),
        'ver': (3268, emb_n),
        'screen_area': (1396, emb_n),
        'creative_dpi': (1763, emb_n),
        'hour': (24, emb_n),
        'lan': (33, emb_n),
        'h': (985, emb_n),
        'w': (449, emb_n),

    }
    # Categorical inputs: slice out one column per feature and embed it.
    category_inp = Input(shape=(len(category),), name='category_inp')
    cat_embeds = []
    for idx, col in enumerate(category):
        # ROBUSTNESS FIX: bind idx as a default argument. The original
        # `lambda x: x[:, idx, None]` late-binds the loop variable, which is
        # fragile if the Lambda is ever re-invoked or deserialized.
        x = Lambda(lambda t, i=idx: t[:, i, None])(category_inp)
        x = Embedding(category_num[col][0], category_num[col][1], input_length=1)(x)
        cat_embeds.append(x)
    embeds = concatenate(cat_embeds, axis=2)
    embeds = GaussianDropout(0.5)(embeds)
    # Numerical inputs: small dense projection, then reshape to a 1-step sequence.
    numerical_inp = Input(shape=(len(numerical),), name='continous_inp')
    dense_units = len(numerical) // 8 + 8
    # BUG FIX: the original printed `len(numerical) // 8 * 8 + 8`, which did
    # not match the layer width actually used below.
    print('numerical', dense_units)
    x2 = Dense(dense_units, activation='relu', kernel_initializer='random_uniform',
               bias_initializer='zeros')(
        numerical_inp)
    x2 = Dropout(0.5)(x2)
    x2 = BatchNormalization()(x2)
    x2 = Reshape([1, int(x2.shape[1])])(x2)
    x = concatenate([embeds, x2], axis=2)
    # Trunk: GRU over the concatenated features, then an MLP head.
    x = CuDNNGRU(128)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.50)(x)
    x = Dense(64, activation='relu', kernel_initializer='random_uniform')(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    x = Dropout(0.50)(x)
    x = Dense(32, activation='relu', kernel_initializer='random_uniform')(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    x = Dropout(0.50)(x)
    out_p = Dense(1, activation='sigmoid')(x)
    return Model(inputs=[category_inp, numerical_inp], outputs=out_p)


model = gru_model()
# model.summary()

batch_size = 1024  # earlier experiments: 20000, 512
epochs = 20

# Exponential learning-rate decay from lr_init to lr_fin across all steps.
steps = int(len(train_df) / batch_size) * epochs
exp_decay = lambda init, fin, steps: (init / fin) ** (1 / (steps - 1)) - 1
lr_init, lr_fin = 0.001, 0.0001
lr_decay = exp_decay(lr_init, lr_fin, steps)
optimizer_adam = Adam(lr=lr_init, decay=lr_decay)
model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=['accuracy'])

train_df = get_keras_data(train_df, category, numerical)

# BUG FIX: the original monitored 'va', which is not a metric Keras tracks,
# so early stopping never triggered; watch the validation loss instead.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(train_df, y_train, callbacks=[early_stopping], validation_split=0.2, batch_size=batch_size, epochs=epochs,
          shuffle=True, verbose=1)

test_df = get_keras_data(test_df, category, numerical)

print("predicting....")
test_y = model.predict(test_df, batch_size=batch_size)

# Binarize the sigmoid scores at 0.5.
result = [1 if score > 0.5 else 0 for score in test_y.flatten().tolist()]

df_sub = pd.concat([df_test['sid'], pd.Series(result)], axis=1)
df_sub.columns = ['sid', 'label']
save_path = 'submit-{}.csv'.format(datetime.now().strftime('%m%d_%H%M%S'))
print(save_path)
df_sub.to_csv(save_path, sep=',', index=False)