├── 001.gdbt_version.py ├── 002.dart_version.py ├── CompetitionPlatform.md ├── Factorization Meets the Neighborhood_ a Multifaceted Collaborative Filtering Model.pdf ├── Item-Based Collaborative Filtering Recommendation Algorithms.pdf ├── Recommender-Systems-[Netflix].pdf ├── baselin └── baseline /001.gdbt_version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.metrics import roc_auc_score, f1_score 11 | from scipy.stats import entropy 12 | from gensim.models import Word2Vec 13 | import lightgbm as lgb 14 | import time 15 | import gc 16 | import Geohash 17 | # unix 内核的加速操作 18 | from pandarallel import pandarallel 19 | pandarallel.initialize() 20 | pd.set_option('display.max_columns', None) 21 | import os 22 | import warnings 23 | warnings.filterwarnings('ignore') 24 | 25 | 26 | # In[2]: 27 | 28 | 29 | np.random.seed(42) 30 | 31 | 32 | # In[3]: 33 | 34 | 35 | def reduce_mem(df): 36 | start_mem = df.memory_usage().sum() / 1024 ** 2 37 | for col in df.columns: 38 | col_type = df[col].dtypes 39 | if col_type != object: 40 | c_min = df[col].min() 41 | c_max = df[col].max() 42 | if str(col_type)[:3] == 'int': 43 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 44 | df[col] = df[col].astype(np.int8) 45 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 46 | df[col] = df[col].astype(np.int16) 47 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 48 | df[col] = df[col].astype(np.int32) 49 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 50 | df[col] = df[col].astype(np.int64) 51 | else: 52 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 53 | df[col] = df[col].astype(np.float16) 54 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 55 | df[col] = df[col].astype(np.float32) 56 | else: 57 | df[col] = df[col].astype(np.float64) 58 | end_mem = df.memory_usage().sum() / 1024 ** 2 59 | print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem)) 60 | gc.collect() 61 | return df 62 | 63 | 64 | # In[4]: 65 | 66 | 67 | def get_ctr(data,col): 68 | f1 = data.groupby(col,as_index=False)['target'].agg({ 69 | '{}_ctr_mean'.format('_'.join(col)):'mean', 70 | }) 71 | f2 = data.groupby(col + ['hour'],as_index=False)['target'].agg({ 72 | '{}_hour_ctr_mean'.format('_'.join(col)):'mean', 73 | }) 74 | f = pd.merge(f1,f2,on=col,how='outer',copy=False) 75 | del f1,f2 76 | return f 77 | 78 | 79 | # In[5]: 80 | 81 | 82 | def get_ctr_mean(data,col): 83 | f1 = data.groupby(col,as_index=False)['{}_ctr_mean'.format('_'.join(col))].agg({ 84 | '{}_ctr_mean_mean'.format('_'.join(col)):'mean', 85 | '{}_ctr_mean_max'.format('_'.join(col)):'max', 86 | '{}_ctr_mean_median'.format('_'.join(col)):'median', 87 | '{}_ctr_mean_min'.format('_'.join(col)):'min', 88 | '{}_ctr_mean_var'.format('_'.join(col)):'var', 89 | }) 90 | 91 | f2 = data.groupby(col + ['hour'],as_index=False)['{}_hour_ctr_mean'.format('_'.join(col))].agg({ 92 | '{}_hour_ctr_mean_mean'.format('_'.join(col)):'mean', 93 | '{}_hour_ctr_mean_max'.format('_'.join(col)):'max', 94 | '{}_hour_ctr_mean_median'.format('_'.join(col)):'median', 95 | '{}_hour_ctr_mean_min'.format('_'.join(col)):'min', 96 | '{}_hour_ctr_mean_var'.format('_'.join(col)):'var', 97 | }) 98 | 
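# get_ctr() computes the mean of `target` per key and per key + hour on whichever
# slice of data it is given (one training day at a time, further below), i.e. a
# daily CTR; get_ctr_mean() then summarises those daily CTRs across several days
# into mean/max/median/min/var statistics. The merge below joins the per-key and
# per-key-per-hour summaries back together on `col`.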
99 | f = pd.merge(f1,f2,on=col,how='outer',copy=False) 100 | del f1,f2 101 | return f 102 | 103 | 104 | # In[6]: 105 | 106 | 107 | import os 108 | def get_emb(data,f1,f2): 109 | tmp = data.groupby([f1], as_index=False)[f2].agg({'{}_{}_list'.format(f1, f2): list}) 110 | sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist() 111 | del tmp['{}_{}_list'.format(f1, f2)] 112 | for i in range(len(sentences)): 113 | sentences[i] = [str(x) for x in sentences[i]] 114 | if os.path.exists('./w2v_{}_{}.model'.format(f1, f2)): 115 | model = Word2Vec.load('./w2v_{}_{}.model'.format(f1, f2)) 116 | else: 117 | model = Word2Vec(sentences, size=8, window=5, min_count=1, sg=1, hs=0, seed=42) 118 | model.save('./w2v_{}_{}.model'.format(f1, f2)) 119 | emb_matrix = [] 120 | for seq in sentences: 121 | vec = [] 122 | for w in seq: 123 | if w in model: 124 | vec.append(model[w]) 125 | if len(vec) > 0: 126 | emb_matrix.append(np.mean(vec, axis=0)) 127 | else: 128 | emb_matrix.append([0] * 8) 129 | emb_matrix = np.array(emb_matrix) 130 | for i in range(8): 131 | tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i] 132 | del model, emb_matrix, sentences 133 | tmp = reduce_mem(tmp) 134 | return tmp 135 | 136 | 137 | # In[7]: 138 | 139 | 140 | def get_ts_feature(data,gap_list = [1],col=['deviceid']): 141 | for gap in gap_list: 142 | data['ts_{}_{}_diff_next'.format('_'.join(col),gap)] = data.groupby(col)['ts'].shift(-gap) 143 | data['ts_{}_{}_diff_next'.format('_'.join(col),gap)] = data['ts_{}_{}_diff_next'.format('_'.join(col),gap)] - data['ts'] 144 | 145 | data['ts_{}_{}_diff_last'.format('_'.join(col),gap)] = data.groupby(col)['ts'].shift(+gap) 146 | data['ts_{}_{}_diff_last'.format('_'.join(col),gap)] = data['ts'] - data['ts_{}_{}_diff_last'.format('_'.join(col),gap)] 147 | 148 | data['ts_{}_{}_diff_next_count'.format('_'.join(col),gap)] = data.groupby(col)['ts_{}_{}_diff_next'.format('_'.join(col),gap)].transform('count') 149 | data['ts_{}_{}_diff_last_count'.format('_'.join(col),gap)] = data.groupby(col)['ts_{}_{}_diff_last'.format('_'.join(col),gap)].transform('count') 150 | 151 | data = reduce_mem(data) 152 | return data 153 | def get_second_ts(data,gap_list = [1,2,3],col=['deviceid'],con_list=[1],f='next'): 154 | for gap in gap_list: 155 | for con in con_list: 156 | data['ts_s_{}_{}_{}_next_{}'.format(f,'_'.join(col),gap,con)] = data.groupby(col)['ts_{}_{}_diff_{}'.format('_'.join(col),con,f)].shift(-gap) 157 | data['ts_s_{}_{}_{}_next_{}'.format(f,'_'.join(col),gap,con)] = data['ts_s_{}_{}_{}_next_{}'.format(f,'_'.join(col),gap,con)] - data['ts_{}_{}_diff_{}'.format('_'.join(col),con,f)] 158 | 159 | data['ts_s_{}_{}_{}_last_{}'.format(f,'_'.join(col),gap,con)] = data.groupby(col)['ts_{}_{}_diff_{}'.format('_'.join(col),con,f)].shift(+gap) 160 | data['ts_s_{}_{}_{}_last_{}'.format(f,'_'.join(col),gap,con)] = data['ts_{}_{}_diff_{}'.format('_'.join(col),con,f)] - data['ts_s_{}_{}_{}_last_{}'.format(f,'_'.join(col),gap,con)] 161 | 162 | data = reduce_mem(data) 163 | return data 164 | 165 | 166 | # In[8]: 167 | 168 | 169 | user = pd.read_csv('./user.csv') 170 | user['guid'] = user['guid'].fillna('none') 171 | 172 | 173 | # In[9]: 174 | 175 | 176 | user['outertag'] = user['outertag'].fillna('none') 177 | user['outertag'] = user['outertag'].astype(str) 178 | 179 | 180 | # In[10]: 181 | 182 | 183 | user['outertag_list'] = user['outertag'].parallel_apply(lambda x:x.split('|') if x!='none' else ':') 184 | 185 | 186 | # In[11]: 187 | 188 | 189 | def get_key_words(x): 190 | if x == ':': 191 | t = [] 192 | 
else: 193 | t = [t.split(':')[0].split('_')[0] for t in x] 194 | return ' '.join(t) 195 | 196 | 197 | # In[12]: 198 | 199 | 200 | def get_key_values(x): 201 | try: 202 | if x == ':': 203 | t = [0.0] 204 | else: 205 | t = [t.split(':')[1] for t in x] 206 | 207 | return t 208 | except: 209 | return [0.0] 210 | 211 | 212 | # In[13]: 213 | 214 | 215 | user['outertag_words'] = user['outertag_list'].parallel_apply(get_key_words) 216 | 217 | 218 | # In[14]: 219 | 220 | 221 | user['outertag_values'] = user['outertag_list'].parallel_apply(get_key_values) 222 | 223 | 224 | # In[15]: 225 | 226 | 227 | user['tag'] = user['tag'].fillna('none') 228 | user['tag'] = user['tag'].astype(str) 229 | user['tag_list'] = user['tag'].parallel_apply(lambda x:x.split('|') if x!='none' else ':') 230 | 231 | 232 | # In[16]: 233 | 234 | 235 | user['tag_words'] = user['tag_list'].parallel_apply(get_key_words) 236 | 237 | 238 | # In[17]: 239 | 240 | 241 | user['tag_values'] = user['tag_list'].parallel_apply(get_key_values) 242 | 243 | 244 | # In[18]: 245 | 246 | 247 | user = user.drop(['outertag','tag','outertag_list','tag_list'],axis=1) 248 | 249 | 250 | # In[19]: 251 | 252 | 253 | def f(x): 254 | x = [float(t) for t in x] 255 | return x 256 | 257 | 258 | # In[20]: 259 | 260 | 261 | user['tag_values'] = user['tag_values'].apply(lambda x:f(x)) 262 | user['mean_tag_values'] = user['tag_values'].apply(lambda x:np.mean(x)) 263 | user['max_tag_values'] = user['tag_values'].parallel_apply(lambda x:max(x)) 264 | user['min_tag_values'] = user['tag_values'].parallel_apply(lambda x:min(x)) 265 | 266 | 267 | # In[21]: 268 | 269 | 270 | if os.path.exists('./w2v_{}_{}.model'.format('user', 'tag_words')): 271 | model = Word2Vec.load('./w2v_{}_{}.model'.format('user', 'tag_words')) 272 | else: 273 | model = Word2Vec(user['tag_words'].parallel_apply(lambda x:x.split(' ')), size=8, window=5, min_count=1, sg=1, hs=0, seed=42) 274 | model.save('./w2v_{}_{}.model'.format('user', 'tag_words')) 275 | 276 | 277 | # In[22]: 278 | 279 | 280 | model['约会'] 281 | 282 | 283 | # In[23]: 284 | 285 | 286 | emb_matrix = [] 287 | for seq in user['tag_words'].parallel_apply(lambda x:x.split(' ')): 288 | vec = [] 289 | for w in seq: 290 | if w in model: 291 | vec.append(model[w]) 292 | if len(vec) > 0: 293 | emb_matrix.append(np.mean(vec, axis=0)) 294 | else: 295 | emb_matrix.append([0] * 8) 296 | emb_matrix = np.array(emb_matrix) 297 | for i in range(8): 298 | user['{}_{}_emb_{}'.format('user', 'tag_words', i)] = emb_matrix[:, i] 299 | del model, emb_matrix 300 | 301 | 302 | # In[24]: 303 | 304 | 305 | if os.path.exists('./w2v_{}_{}.model'.format('user', 'outertag_words')): 306 | model = Word2Vec.load('./w2v_{}_{}.model'.format('user', 'outertag_words')) 307 | else: 308 | model = Word2Vec(user['outertag_words'].parallel_apply(lambda x:x.split(' ')), size=8, window=5, min_count=1, sg=1, hs=0, seed=42) 309 | model.save('./w2v_{}_{}.model'.format('user', 'outertag_words')) 310 | emb_matrix = [] 311 | for seq in user['outertag_words'].parallel_apply(lambda x:x.split(' ')): 312 | vec = [] 313 | for w in seq: 314 | if w in model: 315 | vec.append(model[w]) 316 | if len(vec) > 0: 317 | emb_matrix.append(np.mean(vec, axis=0)) 318 | else: 319 | emb_matrix.append([0] * 8) 320 | emb_matrix = np.array(emb_matrix) 321 | for i in range(8): 322 | user['{}_{}_emb_{}'.format('user', 'outertag_words', i)] = emb_matrix[:, i] 323 | del model, emb_matrix 324 | 325 | 326 | # In[25]: 327 | 328 | 329 | user.columns 330 | 331 | 332 | # In[26]: 333 | 334 | 335 | user = 
user[['deviceid', 'guid','user_tag_words_emb_0', 'user_tag_words_emb_1', 'user_tag_words_emb_2', 336 | 'user_tag_words_emb_3', 'user_tag_words_emb_4', 'user_tag_words_emb_5', 337 | 'user_tag_words_emb_6', 'user_tag_words_emb_7']] 338 | 339 | 340 | # In[27]: 341 | 342 | 343 | user.head() 344 | 345 | 346 | # In[28]: 347 | 348 | 349 | print('read train and test data') 350 | train = pd.read_csv('./train.csv') 351 | test = pd.read_csv('./test.csv') 352 | 353 | 354 | # In[29]: 355 | 356 | 357 | train['is_train'] = 1 358 | test['is_train'] = 0 359 | 360 | 361 | # In[30]: 362 | 363 | 364 | data = pd.concat([train, test], axis=0, ignore_index=False) 365 | data = data.sort_values('ts').reset_index(drop=True) 366 | del train,test 367 | print('finish data concat ing') 368 | 369 | 370 | # In[31]: 371 | 372 | 373 | data['guid'] = data['guid'].fillna('none') 374 | 375 | 376 | # In[32]: 377 | 378 | 379 | data = pd.merge(data,user,on=['deviceid', 'guid'],how='left',copy=False) 380 | del user 381 | 382 | 383 | # In[33]: 384 | 385 | 386 | print('change data format ing ... ...') 387 | data['date'] = pd.to_datetime( 388 | data['ts'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000))) 389 | ) 390 | data['day'] = data['date'].dt.day 391 | data['hour'] = data['date'].dt.hour 392 | data['minute'] = data['date'].dt.minute 393 | del data['date'] 394 | gc.collect() 395 | 396 | 397 | # In[34]: 398 | 399 | 400 | del data['timestamp'] 401 | data = reduce_mem(data) 402 | 403 | 404 | # In[35]: 405 | 406 | 407 | print('geohash g6') 408 | data['g6'] = data[['lat','lng']].parallel_apply(lambda x:Geohash.encode(x[0],x[1],6),axis=1) 409 | print('geohash g7') 410 | data['g7'] = data[['lat','lng']].parallel_apply(lambda x:Geohash.encode(x[0],x[1],7),axis=1) 411 | 412 | 413 | # In[36]: 414 | 415 | 416 | print('交叉转化率特征') 417 | train_7 = data[(data['is_train']==1)&(data['day']==7)][['deviceid','g6','newsid','netmodel','target','hour']] 418 | train_8 = data[(data['is_train']==1)&(data['day']==8)][['deviceid','g6','newsid','netmodel','target','hour']] 419 | train_9 = data[(data['is_train']==1)&(data['day']==9)][['deviceid','g6','newsid','netmodel','target','hour']] 420 | train_10 = data[(data['is_train']==1)&(data['day']==10)][['deviceid','g6','newsid','netmodel','target','hour']] 421 | for col in [['deviceid'],['deviceid','netmodel'],['deviceid','g6'],['newsid']]: 422 | print(col) 423 | ctr_7 = get_ctr(train_7,col) 424 | ctr_8 = get_ctr(train_8,col) 425 | ctr_9 = get_ctr(train_9,col) 426 | ctr_10 = get_ctr(train_10,col) 427 | 428 | ctr_1 = pd.concat([ctr_7,ctr_8,ctr_9,ctr_10],axis=0,ignore_index=True,sort=False) 429 | ctr_2 = pd.concat([ctr_8,ctr_9,ctr_10],axis=0,ignore_index=True,sort=False) 430 | ctr_3 = pd.concat([ctr_7,ctr_9,ctr_10],axis=0,ignore_index=True,sort=False) 431 | ctr_4 = pd.concat([ctr_7,ctr_8,ctr_10],axis=0,ignore_index=True,sort=False) 432 | ctr_5 = pd.concat([ctr_7,ctr_8,ctr_9],axis=0,ignore_index=True,sort=False) 433 | 434 | ctr_1 = get_ctr_mean(ctr_1,col) 435 | ctr_2 = get_ctr_mean(ctr_2,col) 436 | ctr_3 = get_ctr_mean(ctr_3,col) 437 | ctr_4 = get_ctr_mean(ctr_4,col) 438 | ctr_5 = get_ctr_mean(ctr_5,col) 439 | 440 | ctr_1['day'] = 11 441 | ctr_2['day'] = 7 442 | ctr_3['day'] = 8 443 | ctr_4['day'] = 9 444 | ctr_5['day'] = 10 445 | 446 | ctr = pd.concat([ctr_2,ctr_3,ctr_4,ctr_5,ctr_1],axis=0,ignore_index=True,sort=False) 447 | ctr = reduce_mem(ctr) 448 | del ctr_1,ctr_2,ctr_3,ctr_4,ctr_5,ctr_7,ctr_8,ctr_9,ctr_10 449 | data = pd.merge(data,ctr,on=['hour','day']+col,how='left',copy=False) 
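# The block above builds leave-one-day-out CTR features: each training day gets
# CTR statistics computed only from the *other* training days (day 7 <- days
# 8/9/10, ..., day 10 <- days 7/8/9), while the test day (11) gets statistics
# from all four training days, so a day's own targets never feed its own CTR
# feature. A minimal, self-contained sketch of the same idea (illustrative only,
# ignoring the hour dimension; `key`, `day_col` and `target_col` are hypothetical
# names, not part of this script):
def leave_one_day_out_ctr(df, key, day_col='day', target_col='target'):
    import pandas as pd
    parts = []
    for d in df[day_col].unique():
        # CTR of `key` estimated on every day except `d`, then assigned to day `d`
        others = df[df[day_col] != d]
        ctr = others.groupby(key, as_index=False)[target_col].mean()
        ctr = ctr.rename(columns={target_col: '{}_loo_ctr'.format(key)})
        ctr[day_col] = d
        parts.append(ctr)
    return pd.concat(parts, ignore_index=True)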
450 | del train_7,train_8,train_9,train_10 451 | 452 | 453 | # In[37]: 454 | 455 | 456 | data['t'] = data['ts'].parallel_apply(lambda x:x//1000) 457 | 458 | 459 | # In[38]: 460 | 461 | 462 | tcmp = data.groupby(['deviceid','t']).size().reset_index() 463 | tcmp.columns = ['deviceid','t','items'] 464 | tcmp['h_d_items'] = tcmp.groupby(['deviceid'])['items'].cumsum() - tcmp['items'] 465 | tcmp = tcmp.sort_values(['t'],ascending=False) 466 | tcmp['f_d_items'] = tcmp.groupby(['deviceid'])['items'].cumsum() - tcmp['items'] 467 | data = pd.merge(data,tcmp,on=['deviceid','t'],copy=False,how='left') 468 | del tcmp 469 | 470 | 471 | # In[39]: 472 | 473 | 474 | data['netmodel'] = data['netmodel'].map({'o':-1, 'w':1, 'g4':2, 'g2':4, 'g3':3}) 475 | data['netmodel'] = data['netmodel'].astype(int) 476 | 477 | 478 | # In[40]: 479 | 480 | 481 | data['app_version'] = data['app_version'].parallel_apply(lambda x:''.join(x.split('.'))) 482 | data['app_version'] = data['app_version'].astype(int) 483 | 484 | 485 | # In[41]: 486 | 487 | 488 | data['osversion'] = data['osversion'].parallel_apply(lambda x:''.join(x.split('.')[::-1])) 489 | data['osversion'] = data['osversion'].astype(int) 490 | 491 | 492 | # In[42]: 493 | 494 | 495 | for cat_f in ['device_vendor','device_version']: 496 | data[cat_f] = data[cat_f].parallel_apply(lambda x:str(x).lower()) 497 | data[cat_f] = data[cat_f].astype("category") 498 | data[cat_f] = data[cat_f].cat.codes 499 | 500 | 501 | # In[43]: 502 | 503 | 504 | for cat_f in ['deviceid','guid','newsid','g6','g7']: 505 | data[cat_f] = data[cat_f].astype("category") 506 | data[cat_f] = data[cat_f].cat.codes 507 | 508 | 509 | # In[44]: 510 | 511 | 512 | # 类被count编码 513 | for cat in ['app_version','device_vendor','device_version','deviceid','lat','lng','netmodel','newsid','osversion','pos','g6','g7','guid']: 514 | data['{}_count'.format(cat)] = data.groupby(cat)['id'].transform('count') 515 | 516 | 517 | # In[45]: 518 | 519 | 520 | data = reduce_mem(data) 521 | 522 | 523 | # In[46]: 524 | 525 | 526 | data.head() 527 | 528 | 529 | # In[47]: 530 | 531 | 532 | print('make embedding feature ing') 533 | for emb in [['deviceid','newsid']]: 534 | print(emb) 535 | tmp = get_emb(data,emb[0],emb[1]) 536 | data = pd.merge(data,tmp,on=emb[0],how='left',copy=False) 537 | del tmp 538 | 539 | 540 | # In[48]: 541 | 542 | 543 | for col in [['deviceid'], 544 | ['pos','deviceid'], 545 | ['netmodel','deviceid'], 546 | ]: 547 | print('_'.join(col),'make','feature') 548 | data = get_ts_feature(data,gap_list = [1,2,3],col=col) 549 | data = get_second_ts(data,gap_list = [1,2,3],col=col,con_list=[1],f='next') 550 | data = get_second_ts(data,gap_list = [1,2,3],col=col,con_list=[1],f='last') 551 | 552 | 553 | # In[49]: 554 | 555 | 556 | # 增加pos信息的偏移量 pos shift(-1/+1) 557 | for col in [['deviceid'],['netmodel','deviceid']]: 558 | for gap in [1,2,3]: 559 | print(col,gap) 560 | data['pos_{}_{}_diff_next'.format('_'.join(col),gap)] = data.groupby(col)['pos'].shift(-gap) 561 | data['pos_{}_{}_diff_next'.format('_'.join(col),gap)] = data['pos_{}_{}_diff_next'.format('_'.join(col),gap)] - data['pos'] 562 | 563 | data['pos_{}_{}_diff_last'.format('_'.join(col),gap)] = data.groupby(col)['pos'].shift(+gap) 564 | data['pos_{}_{}_diff_last'.format('_'.join(col),gap)] = data['pos'] - data['pos_{}_{}_diff_last'.format('_'.join(col),gap)] 565 | data = reduce_mem(data) 566 | 567 | 568 | # In[50]: 569 | 570 | 571 | # 增加 netmodel 信息的偏移量 pos shift(-1/+1) 572 | for col in [['deviceid'],['pos','deviceid']]: 573 | for gap in [1,2,3]: 574 | 
print(col,gap) 575 | data['netmodel_{}_{}_diff_next'.format('_'.join(col),gap)] = data.groupby(col)['netmodel'].shift(-gap) 576 | data['netmodel_{}_{}_diff_next'.format('_'.join(col),gap)] = data['netmodel_{}_{}_diff_next'.format('_'.join(col),gap)] - data['netmodel'] 577 | 578 | data['netmodel_{}_{}_diff_last'.format('_'.join(col),gap)] = data.groupby(col)['netmodel'].shift(+gap) 579 | data['netmodel_{}_{}_diff_last'.format('_'.join(col),gap)] = data['netmodel'] - data['netmodel_{}_{}_diff_last'.format('_'.join(col),gap)] 580 | data = reduce_mem(data) 581 | 582 | 583 | # In[51]: 584 | 585 | 586 | data = reduce_mem(data) 587 | 588 | 589 | # In[52]: 590 | 591 | 592 | data.to_pickle('./data.pkl') 593 | 594 | 595 | # In[ ]: 596 | 597 | 598 | 599 | 600 | 601 | # In[4]: 602 | 603 | 604 | data = pd.read_pickle('./data.pkl') 605 | 606 | 607 | # In[5]: 608 | 609 | 610 | train_data = data[data['is_train']==1] 611 | del train_data['is_train'] 612 | del train_data['id'] 613 | X_train = train_data[train_data['day'].isin([7,8,9])] 614 | X_valid = train_data[train_data['day'].isin([10])] 615 | del X_train['day'] 616 | del X_valid['day'] 617 | del train_data 618 | gc.collect() 619 | 620 | 621 | # In[6]: 622 | 623 | 624 | X_train = reduce_mem(X_train) 625 | X_valid = reduce_mem(X_valid) 626 | 627 | 628 | # In[7]: 629 | 630 | 631 | gc.collect() 632 | 633 | 634 | # In[8]: 635 | 636 | 637 | lgb_param = { 638 | 'learning_rate': 0.01, 639 | 'boosting_type': 'gbdt', 640 | 'objective': 'binary', 641 | 'metric': 'auc', 642 | 'max_depth': -1, 643 | 'seed':42, 644 | 'num_leaves':512, 645 | # 'boost_from_average':'false', 646 | 'two_round':'true', 647 | 'num_threads':-1, 648 | } 649 | 650 | feature = [x for x in X_train.columns if x not in ['id', 'is_train','target','day','ts','t','items', 'max_tag_values','min_tag_values','mean_tag_values', 651 | 'level', 652 | 'personidentification', 653 | 'followscore', 654 | 'personalscore', 655 | 'gender']] 656 | target = 'target' 657 | 658 | 659 | # In[9]: 660 | 661 | 662 | lgb_train = lgb.Dataset(X_train[feature].values, X_train[target].values,free_raw_data=True) 663 | del X_train 664 | 665 | 666 | # In[10]: 667 | 668 | 669 | xx_score = X_valid[[target]].copy() 670 | 671 | 672 | # In[11]: 673 | 674 | 675 | lgb_valid = lgb.Dataset(X_valid[feature].values, X_valid[target].values, reference=lgb_train,free_raw_data=True) 676 | 677 | 678 | # In[12]: 679 | 680 | 681 | del data 682 | 683 | 684 | # In[13]: 685 | 686 | 687 | lgb_model = lgb.train(lgb_param, lgb_train, num_boost_round=10000, valid_sets=[lgb_train,lgb_valid], 688 | early_stopping_rounds=50,verbose_eval=250 689 | # ,learning_rates=lambda iter: 0.01 if iter <=7500 else 0.015 690 | ) 691 | 692 | 693 | # Training until validation scores don't improve for 50 rounds 694 | # [25] training's auc: 0.979075 valid_1's auc: 0.976442 695 | # [50] training's auc: 0.982931 valid_1's auc: 0.978983 696 | # [75] training's auc: 0.985128 valid_1's auc: 0.979763 697 | # [100] training's auc: 0.985864 valid_1's auc: 0.978559 698 | # [125] training's auc: 0.985951 valid_1's auc: 0.977391 699 | # Early stopping, best iteration is: 700 | # [79] training's auc: 0.985358 valid_1's auc: 0.979778 701 | # 702 | # 0.8082029608320163 703 | 704 | # In[14]: 705 | 706 | 707 | gc.collect() 708 | del lgb_train,lgb_valid 709 | 710 | 711 | # In[15]: 712 | 713 | 714 | p_test = lgb_model.predict(X_valid[feature].values,num_iteration=lgb_model.best_iteration) 715 | del X_valid 716 | 717 | 718 | # In[16]: 719 | 720 | 721 | xx_score['predict'] = p_test 722 | 
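# Validation scoring: the predictions are ranked and the top ~10.3% of rows are
# labelled 1 (presumably chosen to be close to the positive rate of the training
# days), then F1 is computed against the true targets; the same top-fraction
# trick is reused for the final submission further below. A roughly equivalent,
# sort-free sketch (illustrative only; `frac` is a hypothetical parameter, not
# taken from this script):
def top_fraction_labels(scores, frac=0.103):
    import numpy as np
    s = np.asarray(scores, dtype=float)
    # threshold at the (1 - frac) quantile so roughly `frac` of the rows fall above it
    thr = np.quantile(s, 1.0 - frac)
    return (s >= thr).astype(int)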
xx_score = xx_score.sort_values('predict',ascending=False) 723 | xx_score = xx_score.reset_index() 724 | xx_score.loc[xx_score.index<=int(xx_score.shape[0]*0.103),'score'] = 1 725 | xx_score['score'] = xx_score['score'].fillna(0) 726 | 727 | 728 | # In[17]: 729 | 730 | 731 | ux = f1_score(xx_score['target'],xx_score['score']) 732 | print(ux) 733 | 734 | 735 | # In[18]: 736 | 737 | 738 | f_imp = lgb_model.feature_importance() 739 | f_nam = feature 740 | f_imp_df = pd.DataFrame({'f_imp':f_imp,'f_nam':f_nam}) 741 | 742 | 743 | # In[19]: 744 | 745 | 746 | f_imp_df.sort_values(['f_imp']) 747 | 748 | 749 | # In[20]: 750 | 751 | 752 | f_imp_df.to_csv('./f_imp_df.csv',index=False) 753 | 754 | 755 | # In[ ]: 756 | 757 | 758 | 759 | 760 | 761 | # In[21]: 762 | 763 | 764 | best_rounds = lgb_model.best_iteration 765 | 766 | 767 | # In[22]: 768 | 769 | 770 | del lgb_model 771 | 772 | 773 | # In[23]: 774 | 775 | 776 | data = pd.read_pickle('./data.pkl') 777 | 778 | 779 | # In[24]: 780 | 781 | 782 | train_data = data[data['is_train']==1] 783 | 784 | 785 | # In[25]: 786 | 787 | 788 | X_test = data[data['is_train']==0] 789 | 790 | 791 | # In[26]: 792 | 793 | 794 | del data 795 | 796 | 797 | # In[27]: 798 | 799 | 800 | gc.collect() 801 | 802 | 803 | # In[28]: 804 | 805 | 806 | print(best_rounds) 807 | lgb_train_online = lgb.Dataset(train_data[feature].values, train_data[target].values,free_raw_data=True) 808 | del train_data 809 | 810 | 811 | # In[29]: 812 | 813 | 814 | lgb_model_online = lgb.train(lgb_param, lgb_train_online, num_boost_round=best_rounds, valid_sets=[lgb_train_online],verbose_eval=250 815 | # ,learning_rates=lambda iter: 0.01 if iter <=7500 else 0.015 816 | ) 817 | 818 | 819 | # In[30]: 820 | 821 | 822 | X_submit = X_test[['id']].copy() 823 | p_test_online = lgb_model_online.predict(X_test[feature].values) 824 | del X_test 825 | 826 | 827 | # In[31]: 828 | 829 | 830 | X_submit['predict'] = p_test_online 831 | X_submit = X_submit.sort_values('predict',ascending=False) 832 | X_submit = X_submit.reset_index() 833 | 834 | 835 | # In[32]: 836 | 837 | 838 | X_submit['target'] = 0 839 | X_submit.loc[X_submit.index<=int(X_submit.shape[0]*0.10632930998240937) + 1,'target'] = 1 840 | # X_submit.loc[X_submit.index<=int(X_submit.shape[0]*0.103),'target'] = 1 841 | X_submit['target'] = X_submit['target'].fillna(0) 842 | X_submit['target'] = X_submit['target'].astype(int) 843 | X_submit[['id','target']].to_csv('./baseline{}.csv'.format(str(ux).split('.')[1]),index=False) 844 | 845 | 846 | # In[33]: 847 | 848 | 849 | X_submit.to_csv('./X_submit.csv',index=False) 850 | 851 | 852 | # In[ ]: 853 | 854 | 855 | 856 | 857 | -------------------------------------------------------------------------------- /002.dart_version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.metrics import roc_auc_score, f1_score 11 | from scipy.stats import entropy 12 | from gensim.models import Word2Vec 13 | import lightgbm as lgb 14 | import time 15 | import gc 16 | import Geohash 17 | # unix 内核的加速操作 18 | from pandarallel import pandarallel 19 | pandarallel.initialize() 20 | pd.set_option('display.max_columns', None) 21 | import os 22 | import warnings 23 | warnings.filterwarnings('ignore') 24 | 25 | 26 | # In[2]: 27 | 28 | 29 | np.random.seed(42) 30 | 31 | 32 | # In[3]: 33 | 34 | 35 | def reduce_mem(df): 
36 | start_mem = df.memory_usage().sum() / 1024 ** 2 37 | for col in df.columns: 38 | col_type = df[col].dtypes 39 | if col_type != object: 40 | c_min = df[col].min() 41 | c_max = df[col].max() 42 | if str(col_type)[:3] == 'int': 43 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 44 | df[col] = df[col].astype(np.int8) 45 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 46 | df[col] = df[col].astype(np.int16) 47 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 48 | df[col] = df[col].astype(np.int32) 49 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 50 | df[col] = df[col].astype(np.int64) 51 | else: 52 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 53 | df[col] = df[col].astype(np.float16) 54 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 55 | df[col] = df[col].astype(np.float32) 56 | else: 57 | df[col] = df[col].astype(np.float64) 58 | end_mem = df.memory_usage().sum() / 1024 ** 2 59 | print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem)) 60 | gc.collect() 61 | return df 62 | 63 | 64 | # In[4]: 65 | 66 | 67 | def get_ctr(data,col): 68 | f1 = data.groupby(col,as_index=False)['target'].agg({ 69 | '{}_ctr_mean'.format('_'.join(col)):'mean', 70 | }) 71 | f2 = data.groupby(col + ['hour'],as_index=False)['target'].agg({ 72 | '{}_hour_ctr_mean'.format('_'.join(col)):'mean', 73 | }) 74 | f = pd.merge(f1,f2,on=col,how='outer',copy=False) 75 | del f1,f2 76 | return f 77 | 78 | 79 | # In[5]: 80 | 81 | 82 | def get_ctr_mean(data,col): 83 | f1 = data.groupby(col,as_index=False)['{}_ctr_mean'.format('_'.join(col))].agg({ 84 | '{}_ctr_mean_mean'.format('_'.join(col)):'mean', 85 | '{}_ctr_mean_max'.format('_'.join(col)):'max', 86 | '{}_ctr_mean_median'.format('_'.join(col)):'median', 87 | '{}_ctr_mean_min'.format('_'.join(col)):'min', 88 | '{}_ctr_mean_var'.format('_'.join(col)):'var', 89 | }) 90 | 91 | f2 = data.groupby(col + ['hour'],as_index=False)['{}_hour_ctr_mean'.format('_'.join(col))].agg({ 92 | '{}_hour_ctr_mean_mean'.format('_'.join(col)):'mean', 93 | '{}_hour_ctr_mean_max'.format('_'.join(col)):'max', 94 | '{}_hour_ctr_mean_median'.format('_'.join(col)):'median', 95 | '{}_hour_ctr_mean_min'.format('_'.join(col)):'min', 96 | '{}_hour_ctr_mean_var'.format('_'.join(col)):'var', 97 | }) 98 | 99 | f = pd.merge(f1,f2,on=col,how='outer',copy=False) 100 | del f1,f2 101 | return f 102 | 103 | 104 | # In[6]: 105 | 106 | 107 | import os 108 | def get_emb(data,f1,f2): 109 | tmp = data.groupby([f1], as_index=False)[f2].agg({'{}_{}_list'.format(f1, f2): list}) 110 | sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist() 111 | del tmp['{}_{}_list'.format(f1, f2)] 112 | for i in range(len(sentences)): 113 | sentences[i] = [str(x) for x in sentences[i]] 114 | if os.path.exists('./w2v_{}_{}.model'.format(f1, f2)): 115 | model = Word2Vec.load('./w2v_{}_{}.model'.format(f1, f2)) 116 | else: 117 | model = Word2Vec(sentences, size=8, window=5, min_count=1, sg=1, hs=0, seed=42) 118 | model.save('./w2v_{}_{}.model'.format(f1, f2)) 119 | emb_matrix = [] 120 | for seq in sentences: 121 | vec = [] 122 | for w in seq: 123 | if w in model: 124 | vec.append(model[w]) 125 | if len(vec) > 0: 126 | emb_matrix.append(np.mean(vec, axis=0)) 127 | else: 128 | emb_matrix.append([0] * 8) 129 | emb_matrix = np.array(emb_matrix) 130 | for i in range(8): 131 | tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i] 132 
| del model, emb_matrix, sentences 133 | tmp = reduce_mem(tmp) 134 | return tmp 135 | 136 | 137 | # In[7]: 138 | 139 | 140 | def get_ts_feature(data,gap_list = [1],col=['deviceid']): 141 | for gap in gap_list: 142 | data['ts_{}_{}_diff_next'.format('_'.join(col),gap)] = data.groupby(col)['ts'].shift(-gap) 143 | data['ts_{}_{}_diff_next'.format('_'.join(col),gap)] = data['ts_{}_{}_diff_next'.format('_'.join(col),gap)] - data['ts'] 144 | 145 | data['ts_{}_{}_diff_last'.format('_'.join(col),gap)] = data.groupby(col)['ts'].shift(+gap) 146 | data['ts_{}_{}_diff_last'.format('_'.join(col),gap)] = data['ts'] - data['ts_{}_{}_diff_last'.format('_'.join(col),gap)] 147 | 148 | data['ts_{}_{}_diff_next_count'.format('_'.join(col),gap)] = data.groupby(col)['ts_{}_{}_diff_next'.format('_'.join(col),gap)].transform('count') 149 | data['ts_{}_{}_diff_last_count'.format('_'.join(col),gap)] = data.groupby(col)['ts_{}_{}_diff_last'.format('_'.join(col),gap)].transform('count') 150 | 151 | data = reduce_mem(data) 152 | return data 153 | def get_second_ts(data,gap_list = [1,2,3],col=['deviceid'],con_list=[1],f='next'): 154 | for gap in gap_list: 155 | for con in con_list: 156 | data['ts_s_{}_{}_{}_next_{}'.format(f,'_'.join(col),gap,con)] = data.groupby(col)['ts_{}_{}_diff_{}'.format('_'.join(col),con,f)].shift(-gap) 157 | data['ts_s_{}_{}_{}_next_{}'.format(f,'_'.join(col),gap,con)] = data['ts_s_{}_{}_{}_next_{}'.format(f,'_'.join(col),gap,con)] - data['ts_{}_{}_diff_{}'.format('_'.join(col),con,f)] 158 | 159 | data['ts_s_{}_{}_{}_last_{}'.format(f,'_'.join(col),gap,con)] = data.groupby(col)['ts_{}_{}_diff_{}'.format('_'.join(col),con,f)].shift(+gap) 160 | data['ts_s_{}_{}_{}_last_{}'.format(f,'_'.join(col),gap,con)] = data['ts_{}_{}_diff_{}'.format('_'.join(col),con,f)] - data['ts_s_{}_{}_{}_last_{}'.format(f,'_'.join(col),gap,con)] 161 | 162 | data = reduce_mem(data) 163 | return data 164 | 165 | 166 | # In[23]: 167 | 168 | 169 | user = pd.read_csv('./user.csv') 170 | user['guid'] = user['guid'].fillna('none') 171 | 172 | 173 | # In[24]: 174 | 175 | 176 | user['outertag'] = user['outertag'].fillna('none') 177 | user['outertag'] = user['outertag'].astype(str) 178 | 179 | 180 | # In[25]: 181 | 182 | 183 | user['outertag_list'] = user['outertag'].parallel_apply(lambda x:x.split('|') if x!='none' else ':') 184 | 185 | 186 | # In[26]: 187 | 188 | 189 | def get_key_words(x): 190 | if x == ':': 191 | t = [] 192 | else: 193 | t = [t.split(':')[0].split('_')[0] for t in x] 194 | return ' '.join(t) 195 | 196 | 197 | # In[27]: 198 | 199 | 200 | def get_key_values(x): 201 | try: 202 | if x == ':': 203 | t = [0.0] 204 | else: 205 | t = [t.split(':')[1] for t in x] 206 | 207 | return t 208 | except: 209 | return [0.0] 210 | 211 | 212 | # In[28]: 213 | 214 | 215 | user['outertag_words'] = user['outertag_list'].parallel_apply(get_key_words) 216 | 217 | 218 | # In[29]: 219 | 220 | 221 | user['outertag_values'] = user['outertag_list'].parallel_apply(get_key_values) 222 | 223 | 224 | # In[30]: 225 | 226 | 227 | user['tag'] = user['tag'].fillna('none') 228 | user['tag'] = user['tag'].astype(str) 229 | user['tag_list'] = user['tag'].parallel_apply(lambda x:x.split('|') if x!='none' else ':') 230 | 231 | 232 | # In[31]: 233 | 234 | 235 | user['tag_words'] = user['tag_list'].parallel_apply(get_key_words) 236 | 237 | 238 | # In[32]: 239 | 240 | 241 | user['tag_values'] = user['tag_list'].parallel_apply(get_key_values) 242 | 243 | 244 | # In[33]: 245 | 246 | 247 | user = 
user.drop(['outertag','tag','outertag_list','tag_list'],axis=1) 248 | 249 | 250 | # In[34]: 251 | 252 | 253 | def f(x): 254 | x = [float(t) for t in x] 255 | return x 256 | 257 | 258 | # In[35]: 259 | 260 | 261 | user['tag_values'] = user['tag_values'].apply(lambda x:f(x)) 262 | user['mean_tag_values'] = user['tag_values'].apply(lambda x:np.mean(x)) 263 | user['max_tag_values'] = user['tag_values'].parallel_apply(lambda x:max(x)) 264 | user['min_tag_values'] = user['tag_values'].parallel_apply(lambda x:min(x)) 265 | 266 | 267 | # In[36]: 268 | 269 | 270 | if os.path.exists('./w2v_{}_{}.model'.format('user', 'tag_words')): 271 | model = Word2Vec.load('./w2v_{}_{}.model'.format('user', 'tag_words')) 272 | else: 273 | model = Word2Vec(user['tag_words'].parallel_apply(lambda x:x.split(' ')), size=8, window=5, min_count=1, sg=1, hs=0, seed=42) 274 | model.save('./w2v_{}_{}.model'.format('user', 'tag_words')) 275 | 276 | 277 | # In[37]: 278 | 279 | 280 | model['约会'] 281 | 282 | 283 | # In[38]: 284 | 285 | 286 | emb_matrix = [] 287 | for seq in user['tag_words'].parallel_apply(lambda x:x.split(' ')): 288 | vec = [] 289 | for w in seq: 290 | if w in model: 291 | vec.append(model[w]) 292 | if len(vec) > 0: 293 | emb_matrix.append(np.mean(vec, axis=0)) 294 | else: 295 | emb_matrix.append([0] * 8) 296 | emb_matrix = np.array(emb_matrix) 297 | for i in range(8): 298 | user['{}_{}_emb_{}'.format('user', 'tag_words', i)] = emb_matrix[:, i] 299 | del model, emb_matrix 300 | 301 | 302 | # In[39]: 303 | 304 | 305 | if os.path.exists('./w2v_{}_{}.model'.format('user', 'outertag_words')): 306 | model = Word2Vec.load('./w2v_{}_{}.model'.format('user', 'outertag_words')) 307 | else: 308 | model = Word2Vec(user['outertag_words'].parallel_apply(lambda x:x.split(' ')), size=8, window=5, min_count=1, sg=1, hs=0, seed=42) 309 | model.save('./w2v_{}_{}.model'.format('user', 'outertag_words')) 310 | emb_matrix = [] 311 | for seq in user['outertag_words'].parallel_apply(lambda x:x.split(' ')): 312 | vec = [] 313 | for w in seq: 314 | if w in model: 315 | vec.append(model[w]) 316 | if len(vec) > 0: 317 | emb_matrix.append(np.mean(vec, axis=0)) 318 | else: 319 | emb_matrix.append([0] * 8) 320 | emb_matrix = np.array(emb_matrix) 321 | for i in range(8): 322 | user['{}_{}_emb_{}'.format('user', 'outertag_words', i)] = emb_matrix[:, i] 323 | del model, emb_matrix 324 | 325 | 326 | # In[40]: 327 | 328 | 329 | user.columns 330 | 331 | 332 | # In[41]: 333 | 334 | 335 | user = user[['deviceid', 'guid','user_tag_words_emb_0', 'user_tag_words_emb_1', 'user_tag_words_emb_2', 336 | 'user_tag_words_emb_3', 'user_tag_words_emb_4', 'user_tag_words_emb_5', 337 | 'user_tag_words_emb_6', 'user_tag_words_emb_7']] 338 | 339 | 340 | # In[42]: 341 | 342 | 343 | user.head() 344 | 345 | 346 | # In[43]: 347 | 348 | 349 | print('read train and test data') 350 | train = pd.read_csv('./train.csv') 351 | test = pd.read_csv('./test.csv') 352 | 353 | 354 | # In[44]: 355 | 356 | 357 | train['is_train'] = 1 358 | test['is_train'] = 0 359 | 360 | 361 | # In[45]: 362 | 363 | 364 | data = pd.concat([train, test], axis=0, ignore_index=False) 365 | data = data.sort_values('ts').reset_index(drop=True) 366 | del train,test 367 | print('finish data concat ing') 368 | 369 | 370 | # In[46]: 371 | 372 | 373 | data['guid'] = data['guid'].fillna('none') 374 | 375 | 376 | # In[47]: 377 | 378 | 379 | data = pd.merge(data,user,on=['deviceid', 'guid'],how='left',copy=False) 380 | del user 381 | 382 | 383 | # In[49]: 384 | 385 | 386 | print('change data format ing ... 
...') 387 | data['date'] = pd.to_datetime( 388 | data['ts'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000))) 389 | ) 390 | data['day'] = data['date'].dt.day 391 | data['hour'] = data['date'].dt.hour 392 | data['minute'] = data['date'].dt.minute 393 | del data['date'] 394 | gc.collect() 395 | 396 | 397 | # In[50]: 398 | 399 | 400 | del data['timestamp'] 401 | data = reduce_mem(data) 402 | 403 | 404 | # In[51]: 405 | 406 | 407 | print('geohash g6') 408 | data['g6'] = data[['lat','lng']].parallel_apply(lambda x:Geohash.encode(x[0],x[1],6),axis=1) 409 | print('geohash g7') 410 | data['g7'] = data[['lat','lng']].parallel_apply(lambda x:Geohash.encode(x[0],x[1],7),axis=1) 411 | 412 | 413 | # In[52]: 414 | 415 | 416 | print('交叉转化率特征') 417 | train_7 = data[(data['is_train']==1)&(data['day']==7)][['deviceid','g6','newsid','netmodel','target','hour']] 418 | train_8 = data[(data['is_train']==1)&(data['day']==8)][['deviceid','g6','newsid','netmodel','target','hour']] 419 | train_9 = data[(data['is_train']==1)&(data['day']==9)][['deviceid','g6','newsid','netmodel','target','hour']] 420 | train_10 = data[(data['is_train']==1)&(data['day']==10)][['deviceid','g6','newsid','netmodel','target','hour']] 421 | for col in [['deviceid'],['deviceid','netmodel'],['deviceid','g6'],['newsid']]: 422 | print(col) 423 | ctr_7 = get_ctr(train_7,col) 424 | ctr_8 = get_ctr(train_8,col) 425 | ctr_9 = get_ctr(train_9,col) 426 | ctr_10 = get_ctr(train_10,col) 427 | 428 | ctr_1 = pd.concat([ctr_7,ctr_8,ctr_9,ctr_10],axis=0,ignore_index=True,sort=False) 429 | ctr_2 = pd.concat([ctr_8,ctr_9,ctr_10],axis=0,ignore_index=True,sort=False) 430 | ctr_3 = pd.concat([ctr_7,ctr_9,ctr_10],axis=0,ignore_index=True,sort=False) 431 | ctr_4 = pd.concat([ctr_7,ctr_8,ctr_10],axis=0,ignore_index=True,sort=False) 432 | ctr_5 = pd.concat([ctr_7,ctr_8,ctr_9],axis=0,ignore_index=True,sort=False) 433 | 434 | ctr_1 = get_ctr_mean(ctr_1,col) 435 | ctr_2 = get_ctr_mean(ctr_2,col) 436 | ctr_3 = get_ctr_mean(ctr_3,col) 437 | ctr_4 = get_ctr_mean(ctr_4,col) 438 | ctr_5 = get_ctr_mean(ctr_5,col) 439 | 440 | ctr_1['day'] = 11 441 | ctr_2['day'] = 7 442 | ctr_3['day'] = 8 443 | ctr_4['day'] = 9 444 | ctr_5['day'] = 10 445 | 446 | ctr = pd.concat([ctr_2,ctr_3,ctr_4,ctr_5,ctr_1],axis=0,ignore_index=True,sort=False) 447 | ctr = reduce_mem(ctr) 448 | del ctr_1,ctr_2,ctr_3,ctr_4,ctr_5,ctr_7,ctr_8,ctr_9,ctr_10 449 | data = pd.merge(data,ctr,on=['hour','day']+col,how='left',copy=False) 450 | del train_7,train_8,train_9,train_10 451 | 452 | 453 | # In[53]: 454 | 455 | 456 | data['t'] = data['ts'].parallel_apply(lambda x:x//1000) 457 | 458 | 459 | # In[54]: 460 | 461 | 462 | tcmp = data.groupby(['deviceid','t']).size().reset_index() 463 | tcmp.columns = ['deviceid','t','items'] 464 | tcmp['h_d_items'] = tcmp.groupby(['deviceid'])['items'].cumsum() - tcmp['items'] 465 | tcmp = tcmp.sort_values(['t'],ascending=False) 466 | tcmp['f_d_items'] = tcmp.groupby(['deviceid'])['items'].cumsum() - tcmp['items'] 467 | data = pd.merge(data,tcmp,on=['deviceid','t'],copy=False,how='left') 468 | del tcmp 469 | 470 | 471 | # In[55]: 472 | 473 | 474 | data['netmodel'] = data['netmodel'].map({'o':-1, 'w':1, 'g4':2, 'g2':4, 'g3':3}) 475 | data['netmodel'] = data['netmodel'].astype(int) 476 | 477 | 478 | # In[56]: 479 | 480 | 481 | data['app_version'] = data['app_version'].parallel_apply(lambda x:''.join(x.split('.'))) 482 | data['app_version'] = data['app_version'].astype(int) 483 | 484 | 485 | # In[57]: 486 | 487 | 488 | data['osversion'] = 
data['osversion'].parallel_apply(lambda x:''.join(x.split('.')[::-1])) 489 | data['osversion'] = data['osversion'].astype(int) 490 | 491 | 492 | # In[58]: 493 | 494 | 495 | for cat_f in ['device_vendor','device_version']: 496 | data[cat_f] = data[cat_f].parallel_apply(lambda x:str(x).lower()) 497 | data[cat_f] = data[cat_f].astype("category") 498 | data[cat_f] = data[cat_f].cat.codes 499 | 500 | 501 | # In[59]: 502 | 503 | 504 | for cat_f in ['deviceid','guid','newsid','g6','g7']: 505 | data[cat_f] = data[cat_f].astype("category") 506 | data[cat_f] = data[cat_f].cat.codes 507 | 508 | 509 | # In[60]: 510 | 511 | 512 | # 类被count编码 513 | for cat in ['app_version','device_vendor','device_version','deviceid','lat','lng','netmodel','newsid','osversion','pos','g6','g7','guid']: 514 | data['{}_count'.format(cat)] = data.groupby(cat)['id'].transform('count') 515 | 516 | 517 | # In[61]: 518 | 519 | 520 | data = reduce_mem(data) 521 | 522 | 523 | # In[62]: 524 | 525 | 526 | data.head() 527 | 528 | 529 | # In[63]: 530 | 531 | 532 | print('make embedding feature ing') 533 | for emb in [['deviceid','newsid']]: 534 | print(emb) 535 | tmp = get_emb(data,emb[0],emb[1]) 536 | data = pd.merge(data,tmp,on=emb[0],how='left',copy=False) 537 | del tmp 538 | 539 | 540 | # In[64]: 541 | 542 | 543 | for col in [['deviceid'], 544 | ['pos','deviceid'], 545 | ['netmodel','deviceid'], 546 | ]: 547 | print('_'.join(col),'make','feature') 548 | data = get_ts_feature(data,gap_list = [1,2,3],col=col) 549 | data = get_second_ts(data,gap_list = [1,2,3],col=col,con_list=[1],f='next') 550 | data = get_second_ts(data,gap_list = [1,2,3],col=col,con_list=[1],f='last') 551 | 552 | 553 | # In[65]: 554 | 555 | 556 | # 增加pos信息的偏移量 pos shift(-1/+1) 557 | for col in [['deviceid'],['netmodel','deviceid']]: 558 | for gap in [1,2,3]: 559 | print(col,gap) 560 | data['pos_{}_{}_diff_next'.format('_'.join(col),gap)] = data.groupby(col)['pos'].shift(-gap) 561 | data['pos_{}_{}_diff_next'.format('_'.join(col),gap)] = data['pos_{}_{}_diff_next'.format('_'.join(col),gap)] - data['pos'] 562 | 563 | data['pos_{}_{}_diff_last'.format('_'.join(col),gap)] = data.groupby(col)['pos'].shift(+gap) 564 | data['pos_{}_{}_diff_last'.format('_'.join(col),gap)] = data['pos'] - data['pos_{}_{}_diff_last'.format('_'.join(col),gap)] 565 | data = reduce_mem(data) 566 | 567 | 568 | # In[66]: 569 | 570 | 571 | # 增加 netmodel 信息的偏移量 pos shift(-1/+1) 572 | for col in [['deviceid'],['pos','deviceid']]: 573 | for gap in [1,2,3]: 574 | print(col,gap) 575 | data['netmodel_{}_{}_diff_next'.format('_'.join(col),gap)] = data.groupby(col)['netmodel'].shift(-gap) 576 | data['netmodel_{}_{}_diff_next'.format('_'.join(col),gap)] = data['netmodel_{}_{}_diff_next'.format('_'.join(col),gap)] - data['netmodel'] 577 | 578 | data['netmodel_{}_{}_diff_last'.format('_'.join(col),gap)] = data.groupby(col)['netmodel'].shift(+gap) 579 | data['netmodel_{}_{}_diff_last'.format('_'.join(col),gap)] = data['netmodel'] - data['netmodel_{}_{}_diff_last'.format('_'.join(col),gap)] 580 | data = reduce_mem(data) 581 | 582 | 583 | # In[67]: 584 | 585 | 586 | data = reduce_mem(data) 587 | 588 | 589 | # In[68]: 590 | 591 | 592 | data.to_pickle('./data.pkl') 593 | 594 | 595 | # In[ ]: 596 | 597 | 598 | 599 | 600 | 601 | # In[4]: 602 | 603 | 604 | data = pd.read_pickle('./data.pkl') 605 | 606 | 607 | # In[5]: 608 | 609 | 610 | train_data = data[data['is_train']==1] 611 | del train_data['is_train'] 612 | del train_data['id'] 613 | X_train = train_data[train_data['day'].isin([7,8,9])] 614 | X_valid = 
train_data[train_data['day'].isin([10])] 615 | del X_train['day'] 616 | del X_valid['day'] 617 | del train_data 618 | gc.collect() 619 | 620 | 621 | # In[6]: 622 | 623 | 624 | X_train = reduce_mem(X_train) 625 | X_valid = reduce_mem(X_valid) 626 | 627 | 628 | # In[7]: 629 | 630 | 631 | gc.collect() 632 | 633 | 634 | # In[16]: 635 | 636 | 637 | lgb_param = { 638 | 'learning_rate': 0.05, 639 | 'boosting_type': 'dart', 640 | 'objective': 'binary', 641 | 'metric': 'auc', 642 | 'max_depth': -1, 643 | 'seed':42, 644 | 'num_leaves':512, 645 | 'boost_from_average':'false', 646 | # 'two_round':'true', 647 | 'num_threads':-1, 648 | # 'max_bin':512 649 | # 'device': 'gpu', 650 | } 651 | 652 | 653 | # In[9]: 654 | 655 | 656 | feature = [x for x in X_train.columns if x not in ['id', 'is_train','target','day','ts','t','items', 'max_tag_values','min_tag_values','mean_tag_values', 657 | 'level', 658 | 'personidentification', 659 | 'followscore', 660 | 'personalscore', 661 | 'gender']] 662 | target = 'target' 663 | 664 | 665 | # In[10]: 666 | 667 | 668 | lgb_train = lgb.Dataset(X_train[feature].values, X_train[target].values,free_raw_data=True) 669 | del X_train 670 | 671 | 672 | # In[11]: 673 | 674 | 675 | xx_score = X_valid[[target]].copy() 676 | 677 | 678 | # In[12]: 679 | 680 | 681 | lgb_valid = lgb.Dataset(X_valid[feature].values, X_valid[target].values, reference=lgb_train,free_raw_data=True) 682 | 683 | 684 | # In[13]: 685 | 686 | 687 | del data 688 | 689 | 690 | # In[17]: 691 | 692 | 693 | lgb_model = lgb.train(lgb_param, lgb_train, num_boost_round=1500, valid_sets=[lgb_train,lgb_valid], 694 | early_stopping_rounds=50,verbose_eval=25 695 | # ,learning_rates=lambda iter: 0.01 if iter <=7500 else 0.015 696 | ) 697 | 698 | 699 | # Training until validation scores don't improve for 50 rounds 700 | # [25] training's auc: 0.979075 valid_1's auc: 0.976442 701 | # [50] training's auc: 0.982931 valid_1's auc: 0.978983 702 | # [75] training's auc: 0.985128 valid_1's auc: 0.979763 703 | # [100] training's auc: 0.985864 valid_1's auc: 0.978559 704 | # [125] training's auc: 0.985951 valid_1's auc: 0.977391 705 | # Early stopping, best iteration is: 706 | # [79] training's auc: 0.985358 valid_1's auc: 0.979778 707 | # 708 | # 0.8082029608320163 709 | 710 | # In[18]: 711 | 712 | 713 | gc.collect() 714 | del lgb_train,lgb_valid 715 | 716 | 717 | # In[19]: 718 | 719 | 720 | p_test = lgb_model.predict(X_valid[feature].values,num_iteration=lgb_model.best_iteration) 721 | del X_valid 722 | 723 | 724 | # In[20]: 725 | 726 | 727 | xx_score['predict'] = p_test 728 | xx_score = xx_score.sort_values('predict',ascending=False) 729 | xx_score = xx_score.reset_index() 730 | xx_score.loc[xx_score.index<=int(xx_score.shape[0]*0.103),'score'] = 1 731 | xx_score['score'] = xx_score['score'].fillna(0) 732 | 733 | 734 | # In[21]: 735 | 736 | 737 | ux = f1_score(xx_score['target'],xx_score['score']) 738 | print(ux) 739 | 740 | 741 | # In[22]: 742 | 743 | 744 | f_imp = lgb_model.feature_importance() 745 | f_nam = feature 746 | f_imp_df = pd.DataFrame({'f_imp':f_imp,'f_nam':f_nam}) 747 | 748 | 749 | # In[23]: 750 | 751 | 752 | f_imp_df.sort_values(['f_imp']) 753 | 754 | 755 | # In[24]: 756 | 757 | 758 | f_imp_df.to_csv('./f_imp_df.csv',index=False) 759 | 760 | 761 | # In[ ]: 762 | 763 | 764 | 765 | 766 | 767 | # In[25]: 768 | 769 | 770 | best_rounds = lgb_model.best_iteration 771 | 772 | 773 | # In[26]: 774 | 775 | 776 | best_rounds = 1000 777 | 778 | 779 | # In[27]: 780 | 781 | 782 | del lgb_model 783 | 784 | 785 | # In[28]: 786 | 
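# Final refit for submission: the cached feature table is reloaded and a model is
# trained on all training days with no hold-out. Note that, unlike the gbdt
# script (001), the number of boosting rounds here is fixed by hand
# (best_rounds is overridden to 1000 and 500 extra rounds are added below)
# rather than taken from early stopping on the validation split.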
787 | 788 | data = pd.read_pickle('./data.pkl') 789 | 790 | 791 | # In[29]: 792 | 793 | 794 | train_data = data[data['is_train']==1] 795 | 796 | 797 | # In[30]: 798 | 799 | 800 | X_test = data[data['is_train']==0] 801 | 802 | 803 | # In[31]: 804 | 805 | 806 | del data 807 | 808 | 809 | # In[32]: 810 | 811 | 812 | gc.collect() 813 | 814 | 815 | # In[33]: 816 | 817 | 818 | print(best_rounds) 819 | lgb_train_online = lgb.Dataset(train_data[feature].values, train_data[target].values,free_raw_data=True) 820 | del train_data 821 | 822 | 823 | # In[ ]: 824 | 825 | 826 | lgb_model_online = lgb.train(lgb_param, lgb_train_online, num_boost_round=best_rounds+500, valid_sets=[lgb_train_online],verbose_eval=25 827 | # ,learning_rates=lambda iter: 0.01 if iter <=7500 else 0.015 828 | ) 829 | 830 | 831 | # In[ ]: 832 | 833 | 834 | X_submit = X_test[['id']].copy() 835 | p_test_online = lgb_model_online.predict(X_test[feature].values) 836 | del X_test 837 | 838 | 839 | # In[ ]: 840 | 841 | 842 | X_submit['predict'] = p_test_online 843 | X_submit = X_submit.sort_values('predict',ascending=False) 844 | X_submit = X_submit.reset_index() 845 | 846 | 847 | # In[ ]: 848 | 849 | 850 | X_submit['target'] = 0 851 | X_submit.loc[X_submit.index<=int(X_submit.shape[0]*0.10632930998240937) + 1,'target'] = 1 852 | # X_submit.loc[X_submit.index<=int(X_submit.shape[0]*0.103),'target'] = 1 853 | X_submit['target'] = X_submit['target'].fillna(0) 854 | X_submit['target'] = X_submit['target'].astype(int) 855 | X_submit[['id','target']].to_csv('./baseline{}.csv'.format(str(ux).split('.')[1]),index=False) 856 | 857 | 858 | # In[ ]: 859 | 860 | 861 | X_submit.to_csv('./X_submit.csv',index=False) 862 | 863 | 864 | # In[ ]: 865 | 866 | 867 | 868 | 869 | -------------------------------------------------------------------------------- /CompetitionPlatform.md: -------------------------------------------------------------------------------- 1 | ### Big data competition platform for domestic competition 2 | > 收录了国内的一些比赛平台信息 3 | * [天池大数据比赛平台(TianChi big data)](https://tianchi.aliyun.com/) 4 | * [datafountain](https://www.datafountain.cn) 5 | * [biendata](https://www.biendata.com/) 6 | * [科赛(kesci)](https://www.kesci.com/) 7 | * [DC竞赛-大数据竞赛平台(DC competition)](http://www.dcjingsai.com/) 8 | * [JDATA](https://jdata.jd.com/) 9 | * [JDDiscovery](https://jdder.jd.com/) 10 | * [数源(datadreams)](http://www.datadreams.org/#/index) 11 | 12 | ### 13 | -------------------------------------------------------------------------------- /Factorization Meets the Neighborhood_ a Multifaceted Collaborative Filtering Model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZS167275/Eternity-in-an-hour/30addae3a0d5f183d63e2d02b4694de2d14cec39/Factorization Meets the Neighborhood_ a Multifaceted Collaborative Filtering Model.pdf -------------------------------------------------------------------------------- /Item-Based Collaborative Filtering Recommendation Algorithms.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZS167275/Eternity-in-an-hour/30addae3a0d5f183d63e2d02b4694de2d14cec39/Item-Based Collaborative Filtering Recommendation Algorithms.pdf -------------------------------------------------------------------------------- /Recommender-Systems-[Netflix].pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZS167275/Eternity-in-an-hour/30addae3a0d5f183d63e2d02b4694de2d14cec39/Recommender-Systems-[Netflix].pdf -------------------------------------------------------------------------------- /baselin: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import time,datetime 4 | import lightgbm as lgb 5 | from sklearn.metrics import f1_score 6 | 7 | 8 | train = pd.read_csv('./train.csv') 9 | test = pd.read_csv('./test.csv') 10 | 11 | # 对数据进行排序 12 | train = train.sort_values(['deviceid','guid','ts']) 13 | test = test.sort_values(['deviceid','guid','ts']) 14 | 15 | # 查看数据是否存在交集 16 | # train deviceid 104736 17 | # test deviceid 56681 18 | # train&test deviceid 46833 19 | # train guid 104333 20 | # test guid 56861 21 | # train&test guid 46654 22 | 23 | print('train deviceid',len((set(train['deviceid'])))) 24 | print('test deviceid',len((set(test['deviceid'])))) 25 | print('train&test deviceid',len((set(train['deviceid'])&set(test['deviceid'])))) 26 | print('train guid',len((set(train['guid'])))) 27 | print('test guid',len((set(test['guid'])))) 28 | print('train&test guid',len((set(train['guid'])&set(test['guid'])))) 29 | 30 | # 时间格式转化 ts 31 | def time_data2(time_sj): 32 | data_sj = time.localtime(time_sj/1000) 33 | time_str = time.strftime("%Y-%m-%d %H:%M:%S",data_sj) 34 | return time_str 35 | 36 | train['datetime'] = train['ts'].apply(time_data2) 37 | test['datetime'] = test['ts'].apply(time_data2) 38 | 39 | train['datetime'] = pd.to_datetime(train['datetime']) 40 | test['datetime'] = pd.to_datetime(test['datetime']) 41 | 42 | # 时间范围 43 | # 2019-11-07 23:59:59 2019-11-10 23:59:59 44 | # 2019-11-10 23:59:59 2019-11-11 23:59:59 45 | print(train['datetime'].min(),train['datetime'].max()) 46 | print(test['datetime'].min(),test['datetime'].max()) 47 | # 7 0.000000 48 | # 8 0.107774 49 | # 9 0.106327 50 | # 10 0.105583 51 | 52 | # 7 11 53 | # 8 3674871 54 | # 9 3743690 55 | # 10 3958109 56 | # 11 3653592 57 | 58 | train['days'] = train['datetime'].dt.day 59 | test['days'] = test['datetime'].dt.day 60 | 61 | train['flag'] = train['days'] 62 | test['flag'] = 11 63 | 64 | # 8 9 10 11 65 | data = pd.concat([train,test],axis=0,sort=False) 66 | del train,test 67 | 68 | 69 | # 小时信息 70 | data['hour'] = data['datetime'].dt.hour 71 | data['minute'] = data['datetime'].dt.minute 72 | 73 | # 缺失值填充 74 | data['guid'] = data['guid'].fillna('abc') 75 | 76 | # 构造历史特征 分别统计前一天 guid deviceid 的相关信息 77 | # 8 9 10 11 78 | history_9 = data[data['days']==8] 79 | history_10 = data[data['days']==9] 80 | history_11 = data[data['days']==10] 81 | history_12 = data[data['days']==11] 82 | del data 83 | # 61326 84 | # 64766 85 | # 66547 86 | # 41933 87 | # 42546 88 | print(len(set(history_9['deviceid']))) 89 | print(len(set(history_10['deviceid']))) 90 | print(len(set(history_11['deviceid']))) 91 | print(len(set(history_12['deviceid']))) 92 | print(len(set(history_9['deviceid'])&set(history_10['deviceid']))) 93 | print(len(set(history_10['deviceid'])&set(history_11['deviceid']))) 94 | print(len(set(history_11['deviceid'])&set(history_12['deviceid']))) 95 | 96 | # 61277 97 | # 64284 98 | # 66286 99 | # 41796 100 | # 42347 101 | 102 | print(len(set(history_9['guid']))) 103 | print(len(set(history_10['guid']))) 104 | print(len(set(history_11['guid']))) 105 | print(len(set(history_12['guid']))) 106 | print(len(set(history_9['guid'])&set(history_10['guid']))) 107 | print(len(set(history_10['guid'])&set(history_11['guid']))) 108 | 
print(len(set(history_11['guid'])&set(history_12['guid']))) 109 | 110 | # 640066 111 | # 631547 112 | # 658787 113 | # 345742 114 | # 350542 115 | 116 | print(len(set(history_9['newsid']))) 117 | print(len(set(history_10['newsid']))) 118 | print(len(set(history_11['newsid']))) 119 | print(len(set(history_12['newsid']))) 120 | print(len(set(history_9['newsid'])&set(history_10['newsid']))) 121 | print(len(set(history_10['newsid'])&set(history_11['newsid']))) 122 | print(len(set(history_11['newsid'])&set(history_12['newsid']))) 123 | 124 | # deviceid guid timestamp ts 时间特征 125 | def get_history_visit_time(data1,date2): 126 | data1 = data1.sort_values(['ts','timestamp']) 127 | data1['timestamp_ts'] = data1['timestamp'] - data1['ts'] 128 | data1_tmp = data1[data1['target']==1].copy() 129 | del data1 130 | for col in ['deviceid','guid']: 131 | for ts in ['timestamp_ts']: 132 | f_tmp = data1_tmp.groupby([col],as_index=False)[ts].agg({ 133 | '{}_{}_max'.format(col,ts):'max', 134 | '{}_{}_mean'.format(col,ts):'mean', 135 | '{}_{}_min'.format(col,ts):'min', 136 | '{}_{}_median'.format(col,ts):'median' 137 | }) 138 | date2 = pd.merge(date2,f_tmp,on=[col],how='left',copy=False) 139 | 140 | return date2 141 | 142 | history_10 = get_history_visit_time(history_9,history_10) 143 | history_11 = get_history_visit_time(history_10,history_11) 144 | history_12 = get_history_visit_time(history_11,history_12) 145 | 146 | data = pd.concat([history_10,history_11],axis=0,sort=False,ignore_index=True) 147 | data = pd.concat([data,history_12],axis=0,sort=False,ignore_index=True) 148 | del history_9,history_10,history_11,history_12 149 | 150 | data = data.sort_values('ts') 151 | data['ts_next'] = data.groupby(['deviceid'])['ts'].shift(-1) 152 | data['ts_next_ts'] = data['ts_next'] - data['ts'] 153 | 154 | # 当前一天内的特征 leak 155 | for col in [['deviceid'],['guid'],['newsid']]: 156 | print(col) 157 | data['{}_days_count'.format('_'.join(col))] = data.groupby(['days'] + col)['id'].transform('count') 158 | 159 | 160 | # netmodel 161 | data['netmodel'] = data['netmodel'].map({'o':1, 'w':2, 'g4':4, 'g3':3, 'g2':2}) 162 | 163 | # pos 164 | data['pos'] = data['pos'] 165 | 166 | 167 | print('train and predict') 168 | X_train = data[data['flag'].isin([9])] 169 | X_valid = data[data['flag'].isin([10])] 170 | X_test = data[data['flag'].isin([11])] 171 | 172 | 173 | lgb_param = { 174 | 'learning_rate': 0.1, 175 | 'boosting_type': 'gbdt', 176 | 'objective': 'binary', 177 | 'metric': 'auc', 178 | 'max_depth': -1, 179 | 'seed':42, 180 | 'boost_from_average':'false', 181 | } 182 | 183 | 184 | feature = [ 185 | 'pos','netmodel', 'hour', 'minute', 186 | 'deviceid_timestamp_ts_max', 'deviceid_timestamp_ts_mean', 187 | 'deviceid_timestamp_ts_min', 'deviceid_timestamp_ts_median', 188 | 'guid_timestamp_ts_max', 'guid_timestamp_ts_mean', 189 | 'guid_timestamp_ts_min', 'guid_timestamp_ts_median', 190 | 'deviceid_days_count', 'guid_days_count','newsid_days_count', 191 | 'ts_next_ts' 192 | ] 193 | target = 'target' 194 | 195 | 196 | lgb_train = lgb.Dataset(X_train[feature].values, X_train[target].values) 197 | lgb_valid = lgb.Dataset(X_valid[feature].values, X_valid[target].values, reference=lgb_train) 198 | lgb_model = lgb.train(lgb_param, lgb_train, num_boost_round=10000, valid_sets=[lgb_train,lgb_valid], 199 | early_stopping_rounds=50,verbose_eval=10) 200 | 201 | p_test = lgb_model.predict(X_valid[feature].values,num_iteration=lgb_model.best_iteration) 202 | xx_score = X_valid[[target]].copy() 203 | xx_score['predict'] = p_test 204 | xx_score = 
xx_score.sort_values('predict',ascending=False) 205 | xx_score = xx_score.reset_index() 206 | xx_score.loc[xx_score.index<=int(xx_score.shape[0]*0.103),'score'] = 1 207 | xx_score['score'] = xx_score['score'].fillna(0) 208 | print(f1_score(xx_score['target'],xx_score['score'])) 209 | 210 | del lgb_train,lgb_valid 211 | del X_train,X_valid 212 | # 没加 newsid 之前的 f1 score 213 | # 0.5129179717875857 214 | # 0.5197833317587095 215 | # 0.6063125458760602 216 | X_train_2 = data[data['flag'].isin([9,10])] 217 | 218 | 219 | lgb_train_2 = lgb.Dataset(X_train_2[feature].values, X_train_2[target].values) 220 | lgb_model_2 = lgb.train(lgb_param, lgb_train_2, num_boost_round=lgb_model.best_iteration, valid_sets=[lgb_train_2],verbose_eval=10) 221 | 222 | p_predict = lgb_model_2.predict(X_test[feature].values) 223 | 224 | submit_score = X_test[['id']].copy() 225 | submit_score['predict'] = p_predict 226 | submit_score = submit_score.sort_values('predict',ascending=False) 227 | submit_score = submit_score.reset_index() 228 | submit_score.loc[submit_score.index<=int(submit_score.shape[0]*0.103),'target'] = 1 229 | submit_score['target'] = submit_score['target'].fillna(0) 230 | 231 | submit_score = submit_score.sort_values('id') 232 | submit_score['target'] = submit_score['target'].astype(int) 233 | 234 | sample = pd.read_csv('./sample.csv') 235 | sample.columns = ['id','non_target'] 236 | submit_score = pd.merge(sample,submit_score,on=['id'],how='left') 237 | 238 | submit_score[['id','target']].to_csv('./baseline.csv',index=False) 239 | -------------------------------------------------------------------------------- /baseline: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import time,datetime 4 | import lightgbm as lgb 5 | from sklearn.metrics import f1_score 6 | 7 | 8 | train = pd.read_csv('./train.csv') 9 | test = pd.read_csv('./test.csv') 10 | 11 | # 对数据进行排序 12 | train = train.sort_values(['deviceid','guid','ts']) 13 | test = test.sort_values(['deviceid','guid','ts']) 14 | 15 | # 查看数据是否存在交集 16 | # train deviceid 104736 17 | # test deviceid 56681 18 | # train&test deviceid 46833 19 | # train guid 104333 20 | # test guid 56861 21 | # train&test guid 46654 22 | 23 | print('train deviceid',len((set(train['deviceid'])))) 24 | print('test deviceid',len((set(test['deviceid'])))) 25 | print('train&test deviceid',len((set(train['deviceid'])&set(test['deviceid'])))) 26 | print('train guid',len((set(train['guid'])))) 27 | print('test guid',len((set(test['guid'])))) 28 | print('train&test guid',len((set(train['guid'])&set(test['guid'])))) 29 | 30 | # 时间格式转化 ts 31 | def time_data2(time_sj): 32 | data_sj = time.localtime(time_sj/1000) 33 | time_str = time.strftime("%Y-%m-%d %H:%M:%S",data_sj) 34 | return time_str 35 | 36 | train['datetime'] = train['ts'].apply(time_data2) 37 | test['datetime'] = test['ts'].apply(time_data2) 38 | 39 | train['datetime'] = pd.to_datetime(train['datetime']) 40 | test['datetime'] = pd.to_datetime(test['datetime']) 41 | 42 | # 时间范围 43 | # 2019-11-07 23:59:59 2019-11-10 23:59:59 44 | # 2019-11-10 23:59:59 2019-11-11 23:59:59 45 | print(train['datetime'].min(),train['datetime'].max()) 46 | print(test['datetime'].min(),test['datetime'].max()) 47 | # 7 0.000000 48 | # 8 0.107774 49 | # 9 0.106327 50 | # 10 0.105583 51 | 52 | # 7 11 53 | # 8 3674871 54 | # 9 3743690 55 | # 10 3958109 56 | # 11 3653592 57 | 58 | train['days'] = train['datetime'].dt.day 59 | test['days'] = test['datetime'].dt.day 60 | 61 | 
/baseline:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import time,datetime
4 | import lightgbm as lgb
5 | from sklearn.metrics import f1_score
6 | 
7 | 
8 | train = pd.read_csv('./train.csv')
9 | test = pd.read_csv('./test.csv')
10 | 
11 | # sort the data
12 | train = train.sort_values(['deviceid','guid','ts'])
13 | test = test.sort_values(['deviceid','guid','ts'])
14 | 
15 | # check the overlap between train and test
16 | # train deviceid 104736
17 | # test deviceid 56681
18 | # train&test deviceid 46833
19 | # train guid 104333
20 | # test guid 56861
21 | # train&test guid 46654
22 | 
23 | print('train deviceid',len((set(train['deviceid']))))
24 | print('test deviceid',len((set(test['deviceid']))))
25 | print('train&test deviceid',len((set(train['deviceid'])&set(test['deviceid']))))
26 | print('train guid',len((set(train['guid']))))
27 | print('test guid',len((set(test['guid']))))
28 | print('train&test guid',len((set(train['guid'])&set(test['guid']))))
29 | 
30 | # convert ts (epoch milliseconds) to a readable datetime string
31 | def time_data2(time_sj):
32 |     data_sj = time.localtime(time_sj/1000)
33 |     time_str = time.strftime("%Y-%m-%d %H:%M:%S",data_sj)
34 |     return time_str
35 | 
36 | train['datetime'] = train['ts'].apply(time_data2)
37 | test['datetime'] = test['ts'].apply(time_data2)
38 | 
39 | train['datetime'] = pd.to_datetime(train['datetime'])
40 | test['datetime'] = pd.to_datetime(test['datetime'])
41 | 
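# Note: time.localtime() above converts the millisecond timestamps using the timezone
# of the machine running the script, so the derived day/hour fields depend on the local
# environment. A vectorized alternative (assuming the timestamps should be read as
# Beijing time, Asia/Shanghai) that avoids the row-wise apply would be:
#
#   train['datetime'] = (pd.to_datetime(train['ts'], unit='ms')
#                          .dt.tz_localize('UTC')
#                          .dt.tz_convert('Asia/Shanghai')
#                          .dt.tz_localize(None))
#
# and likewise for test; the rest of the script would be unchanged.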
42 | # time range
43 | # 2019-11-07 23:59:59 2019-11-10 23:59:59
44 | # 2019-11-10 23:59:59 2019-11-11 23:59:59
45 | print(train['datetime'].min(),train['datetime'].max())
46 | print(test['datetime'].min(),test['datetime'].max())
47 | # target mean by day: 7 0.000000
48 | # 8 0.107774
49 | # 9 0.106327
50 | # 10 0.105583
51 | 
52 | # number of rows by day: 7 11
53 | # 8 3674871
54 | # 9 3743690
55 | # 10 3958109
56 | # 11 3653592
57 | 
58 | train['days'] = train['datetime'].dt.day
59 | test['days'] = test['datetime'].dt.day
60 | 
61 | train['flag'] = train['days']
62 | test['flag'] = 11
63 | 
64 | # days 8 9 10 11 together
65 | data = pd.concat([train,test],axis=0,sort=False)
66 | del train,test
67 | 
68 | 
69 | # hour / minute information
70 | data['hour'] = data['datetime'].dt.hour
71 | data['minute'] = data['datetime'].dt.minute
72 | 
73 | # fill missing values
74 | data['guid'] = data['guid'].fillna('abc')
75 | 
76 | # build history features: statistics of the previous day's deviceid / guid behaviour
77 | # days 8 9 10 11
78 | history_9 = data[data['days']==8]
79 | history_10 = data[data['days']==9]
80 | history_11 = data[data['days']==10]
81 | history_12 = data[data['days']==11]
82 | del data
83 | # 61326
84 | # 64766
85 | # 66547
86 | # 41933
87 | # 42546
88 | print(len(set(history_9['deviceid'])))
89 | print(len(set(history_10['deviceid'])))
90 | print(len(set(history_11['deviceid'])))
91 | print(len(set(history_12['deviceid'])))
92 | print(len(set(history_9['deviceid'])&set(history_10['deviceid'])))
93 | print(len(set(history_10['deviceid'])&set(history_11['deviceid'])))
94 | print(len(set(history_11['deviceid'])&set(history_12['deviceid'])))
95 | 
96 | # 61277
97 | # 64284
98 | # 66286
99 | # 41796
100 | # 42347
101 | 
102 | print(len(set(history_9['guid'])))
103 | print(len(set(history_10['guid'])))
104 | print(len(set(history_11['guid'])))
105 | print(len(set(history_12['guid'])))
106 | print(len(set(history_9['guid'])&set(history_10['guid'])))
107 | print(len(set(history_10['guid'])&set(history_11['guid'])))
108 | print(len(set(history_11['guid'])&set(history_12['guid'])))
109 | 
110 | # 640066
111 | # 631547
112 | # 658787
113 | # 345742
114 | # 350542
115 | 
116 | print(len(set(history_9['newsid'])))
117 | print(len(set(history_10['newsid'])))
118 | print(len(set(history_11['newsid'])))
119 | print(len(set(history_12['newsid'])))
120 | print(len(set(history_9['newsid'])&set(history_10['newsid'])))
121 | print(len(set(history_10['newsid'])&set(history_11['newsid'])))
122 | print(len(set(history_11['newsid'])&set(history_12['newsid'])))
123 | 
124 | # deviceid / guid time features: gap between timestamp and ts (click delay) on the previous day's clicked records
125 | def get_history_visit_time(data1, data2):  # merge previous-day (data1) click-delay stats onto data2
126 |     data1 = data1.sort_values(['ts','timestamp'])
127 |     data1['timestamp_ts'] = data1['timestamp'] - data1['ts']
128 |     data1_tmp = data1[data1['target']==1].copy()
129 |     del data1
130 |     for col in ['deviceid','guid']:
131 |         for ts in ['timestamp_ts']:
132 |             f_tmp = data1_tmp.groupby([col],as_index=False)[ts].agg({
133 |                 '{}_{}_max'.format(col,ts):'max',
134 |                 '{}_{}_mean'.format(col,ts):'mean',
135 |                 '{}_{}_min'.format(col,ts):'min',
136 |                 '{}_{}_median'.format(col,ts):'median'
137 |             })
138 |             data2 = pd.merge(data2,f_tmp,on=[col],how='left',copy=False)
139 | 
140 |     return data2
141 | 
142 | history_10 = get_history_visit_time(history_9,history_10)
143 | history_11 = get_history_visit_time(history_10,history_11)
144 | history_12 = get_history_visit_time(history_11,history_12)
145 | 
146 | data = pd.concat([history_10,history_11],axis=0,sort=False,ignore_index=True)
147 | data = pd.concat([data,history_12],axis=0,sort=False,ignore_index=True)
148 | del history_9,history_10,history_11,history_12
149 | 
150 | data = data.sort_values('ts')
151 | data['ts_next'] = data.groupby(['deviceid'])['ts'].shift(-1)  # next request time of the same device
152 | data['ts_next_ts'] = data['ts_next'] - data['ts']
153 | 
154 | # same-day count features (they use information from the whole day, i.e. leak-style features)
155 | for col in [['deviceid'],['guid'],['newsid']]:
156 |     print(col)
157 |     data['{}_days_count'.format('_'.join(col))] = data.groupby(['days'] + col)['id'].transform('count')
158 | 
159 | 
160 | # netmodel: encode the network type as an ordinal code (note that 'w' and 'g2' both map to 2)
161 | data['netmodel'] = data['netmodel'].map({'o':1, 'w':2, 'g4':4, 'g3':3, 'g2':2})
162 | 
163 | # pos: already numeric, used as-is
164 | data['pos'] = data['pos']
165 | 
166 | 
167 | print('train and predict')
168 | X_train = data[data['flag'].isin([9])]   # train on day 9
169 | X_valid = data[data['flag'].isin([10])]  # validate on day 10
170 | X_test = data[data['flag'].isin([11])]   # predict day 11 (the test set)
171 | 
172 | 
173 | lgb_param = {
174 |     'learning_rate': 0.1,
175 |     'boosting_type': 'gbdt',
176 |     'objective': 'binary',
177 |     'metric': 'auc',
178 |     'max_depth': -1,
179 |     'seed':42,
180 |     'boost_from_average':'false',
181 | }
182 | 
183 | 
184 | feature = [
185 |     'pos','netmodel', 'hour', 'minute',
186 |     'deviceid_timestamp_ts_max', 'deviceid_timestamp_ts_mean',
187 |     'deviceid_timestamp_ts_min', 'deviceid_timestamp_ts_median',
188 |     'guid_timestamp_ts_max', 'guid_timestamp_ts_mean',
189 |     'guid_timestamp_ts_min', 'guid_timestamp_ts_median',
190 |     'deviceid_days_count', 'guid_days_count','newsid_days_count',
191 |     'ts_next_ts'
192 | ]
193 | target = 'target'
194 | 
195 | 
196 | lgb_train = lgb.Dataset(X_train[feature].values, X_train[target].values)
197 | lgb_valid = lgb.Dataset(X_valid[feature].values, X_valid[target].values, reference=lgb_train)
198 | lgb_model = lgb.train(lgb_param, lgb_train, num_boost_round=10000, valid_sets=[lgb_train,lgb_valid],
199 |                       early_stopping_rounds=50,verbose_eval=10)
200 | 
201 | p_test = lgb_model.predict(X_valid[feature].values,num_iteration=lgb_model.best_iteration)
202 | xx_score = X_valid[[target]].copy()
203 | xx_score['predict'] = p_test
204 | xx_score = xx_score.sort_values('predict',ascending=False)
205 | xx_score = xx_score.reset_index()
206 | xx_score.loc[xx_score.index<=int(xx_score.shape[0]*0.103),'score'] = 1  # mark the top ~10.3% of predictions as clicks
207 | xx_score['score'] = xx_score['score'].fillna(0)
208 | print(f1_score(xx_score['target'],xx_score['score']))
209 | 
210 | del lgb_train,lgb_valid
211 | del X_train,X_valid
212 | # f1 score before adding newsid
213 | # 0.5129179717875857
214 | # 0.5197833317587095
215 | # 0.6063125458760602
216 | X_train_2 = data[data['flag'].isin([9,10])]  # retrain on days 9 and 10 together
217 | 
218 | 
219 | lgb_train_2 = lgb.Dataset(X_train_2[feature].values, X_train_2[target].values)
220 | lgb_model_2 = lgb.train(lgb_param, lgb_train_2, num_boost_round=lgb_model.best_iteration, valid_sets=[lgb_train_2],verbose_eval=10)
221 | 
222 | p_predict = lgb_model_2.predict(X_test[feature].values)
223 | 
224 | submit_score = X_test[['id']].copy()
225 | submit_score['predict'] = p_predict
226 | submit_score = submit_score.sort_values('predict',ascending=False)
227 | submit_score = submit_score.reset_index()
228 | submit_score.loc[submit_score.index<=int(submit_score.shape[0]*0.103),'target'] = 1
229 | submit_score['target'] = submit_score['target'].fillna(0)
230 | 
231 | submit_score = submit_score.sort_values('id')
232 | submit_score['target'] = submit_score['target'].astype(int)
233 | 
234 | sample = pd.read_csv('./sample.csv')
235 | sample.columns = ['id','non_target']
236 | submit_score = pd.merge(sample,submit_score,on=['id'],how='left')
237 | 
238 | submit_score[['id','target']].to_csv('./baseline.csv',index=False)
239 | 
--------------------------------------------------------------------------------
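The training calls above pass early_stopping_rounds and verbose_eval directly to lgb.train, which matches the LightGBM 2.x/3.x API current when this baseline was written; in LightGBM 4.x those keyword arguments are no longer accepted and callbacks are used instead. A sketch of the equivalent call under the newer API, reusing lgb_param, lgb_train and lgb_valid from the script above (only the training call changes):

    import lightgbm as lgb

    # equivalent of early_stopping_rounds=50, verbose_eval=10 for LightGBM >= 4.0
    lgb_model = lgb.train(
        lgb_param,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),   # stop when the validation AUC stops improving
            lgb.log_evaluation(period=10),            # print metrics every 10 rounds
        ],
    )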