├── .gitattributes ├── README.md ├── code ├── chenzhiliang.py ├── fusion.py ├── gao.py └── zhao.py ├── data ├── test │ ├── test_agg.csv │ └── test_log.csv └── train │ ├── train_agg.csv │ ├── train_flg.csv │ └── train_log.csv └── run_all.sh /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [招商银行信用卡中心校园大赛:消费金融场景下的用户购买预测](https://www.datafountain.cn/competitions/287/details) 2 | 3 | 我们三个队友一起完成,B榜最后得分0.86295。 4 | 5 | 运行说明: 6 | 7 | 1.将训练集,测试集,提交样例放到data目录下(我保留了文件,但我将文件内容删除了,需要替换数据文件) 8 | 9 | 2.运行code/chenzhiliang.py 运行环境xgboost0.6 10 | 11 | 3.运行code/gao.py 运行环境xgboost0.71 12 | 13 | 4.运行code/zhao.py 运行环境xgboost0.6 14 | 15 | 5.运行code/fusion.py ,最终结果文件big_fusion_7_14_ratio1.35_18_1.1.txt 16 | -------------------------------------------------------------------------------- /code/chenzhiliang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jul 1 14:18:19 2018 4 | 5 | @author: CCL 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from pandas import DataFrame 11 | import xgboost as xgb 12 | from sklearn.model_selection import ShuffleSplit 13 | from sklearn.metrics import f1_score 14 | import sys 15 | import scipy as sp 16 | from sklearn.metrics import roc_curve 17 | from sklearn import metrics 18 | from sklearn.model_selection import StratifiedKFold 19 | import time 20 | from collections import Counter 21 | from sklearn.preprocessing import PolynomialFeatures 22 | from xgboost import plot_importance 23 | from matplotlib import pyplot 24 | import matplotlib.pyplot as plt 25 | from pylab import * 26 | mpl.rcParams['font.sans-serif'] = ['SimHei'] 27 | 28 | columns_rank_list = 
['user_EVT3_3169','user_EVT3_2067','user_EVT3_3744','user_EVT2_1261','user_EVT3_271','V30','V23','user_EVT2_1798','user_EVT3_3928','user_EVT3_3840','user_EVT3_4334','user_EVT3_3785','user_day_16',\ 29 | 'user_EVT3_3211','user_EVT2_43','user_EVT3_3923','user_EVT3_4374','user_EVT2_20','user_EVT3_3159','user_day_10','user_day_5','user_EVT3_1688','user_EVT1_181','user_day_2','user_EVT3_3922','user_EVT3_4282','user_EVT3_4383',\ 30 | 'user_EVT3_291','user_EVT3_3155','user_EVT3_3862','user_day_11','user_EVT3_3137','user_EVT3_3851','user_EVT3_1706','user_EVT2_555','user_EVT3_568','user_EVT3_2035','user_EVT3_3136','user_EVT3_4326','user_EVT2_1797','user_EVT2_224',\ 31 | 'user_EVT3_3857','user_EVT3_2068','user_EVT2_1047','user_EVT3_4317','user_EVT2_1043','user_EVT3_3754','user_EVT3_3145','user_EVT3_3662','user_EVT2_1914','user_EVT2_233','user_EVT3_2071','user_EVT2_1270','user_EVT3_3868','user_EVT1_139',\ 32 | 'user_EVT3_4314','user_EVT3_565','user_EVT2_2141','user_EVT2_2157','user_EVT2_2158','user_EVT3_3931','user_EVT3_2206','user_EVT3_3804','user_EVT2_1591','user_EVT2_221','user_EVT2_227','user_EVT3_3807','user_EVT3_3827','user_EVT3_3856',\ 33 | 'user_EVT3_3873','user_EVT3_3670','user_EVT3_3787','user_EVT1_259','V7','user_EVT3_1525','user_EVT3_3835','user_EVT3_3836','user_EVT3_3765','user_EVT3_2205','user_EVT3_3829','user_EVT3_4338','user_EVT3_4380','user_EVT2_1483','V25',\ 34 | 'user_EVT2_2162','user_EVT2_578','user_EVT3_2637','user_EVT3_1715','user_EVT3_3927','user_EVT3_1199','user_EVT3_273','user_EVT3_3801','user_EVT3_4351','user_EVT3_4353','user_EVT3_3871','user_EVT3_4393','user_EVT3_1703','user_EVT3_3813',\ 35 | 'user_EVT2_1852','user_EVT2_702','user_EVT2_2143','user_EVT1_102','user_EVT3_3182','user_EVT2_1233','user_EVT3_2037','user_EVT3_274','user_EVT2_1846','user_EVT3_2644','user_EVT3_3730','user_EVT3_3929','user_EVT3_3847','user_EVT3_566',\ 36 | 
'user_EVT3_4367','user_EVT2_1843','V29','user_EVT3_3149','user_EVT3_4373','user_EVT3_3618','user_EVT3_1724','user_EVT3_3615','user_EVT3_561','user_EVT3_3494','user_EVT3_3869','user_EVT3_1719','user_EVT3_1197','user_EVT3_3865','user_EVT1_359',\ 37 | 'user_EVT2_1234','user_EVT2_2164','user_EVT1_10','user_EVT2_1593','V3','user_EVT1_162','user_EVT2_2140','gap_log_day_min','user_day_7','user_EVT3_2049','user_EVT3_563','user_EVT3_2642','user_EVT2_1828','V9','user_EVT3_4360','user_EVT2_21',\ 38 | 'user_EVT3_3820','user_EVT3_4366','user_EVT2_1044','user_EVT3_4304','user_day_21','user_EVT3_3749','user_EVT3_2077','user_EVT3_2204','user_EVT3_3841','user_EVT2_1040','user_EVT3_4330','user_EVT3_564','user_EVT2_2154','user_EVT3_2045','user_EVT1_460',\ 39 | 'user_EVT3_1722','user_EVT3_17','user_EVT3_2062','user_EVT3_4280','user_EVT3_1682','user_EVT3_268','user_EVT3_4378','user_EVT2_231','user_EVT3_2056','user_day_22','user_EVT2_2136','user_EVT3_3127','user_EVT3_3860','user_EVT3_3170','user_EVT3_2643',\ 40 | 'user_EVT3_3800','user_EVT3_3614','user_EVT3_3158','user_EVT2_1482','user_EVT2_561','user_EVT2_15','user_EVT3_3728','user_EVT2_704','user_EVT3_1681','user_EVT3_912','user_EVT2_1911','user_day_4','user_EVT3_3142','user_day_24','user_EVT3_3133','user_EVT3_3864',\ 41 | 'user_EVT2_2138','user_EVT3_1710','user_EVT3_3623','user_EVT3_4318','user_EVT2_229','user_EVT3_1704','user_EVT3_22','user_EVT3_3129','user_EVT3_4288','user_EVT3_3737','user_EVT3_3805','user_EVT3_3621','V6','user_EVT2_2148','user_EVT2_1910','user_EVT3_4391',\ 42 | 'user_EVT3_3247','user_EVT1_604','V10','user_EVT3_3144','user_EVT3_3739','user_EVT3_3179','user_EVT2_2156','user_EVT3_4392','user_EVT3_4328','user_EVT3_3830','V11','user_EVT3_2004','user_EVT3_3202','user_day_28','user_EVT3_3121','user_EVT2_574','user_EVT2_1916',\ 43 | 
'user_EVT3_1677','user_EVT3_3861','user_EVT3_4289','user_EVT3_3736','user_EVT3_3828','user_EVT3_3814','user_EVT3_3842','user_EVT3_1679','user_EVT2_924','user_EVT3_4396','user_EVT3_3852','user_EVT3_1702','user_EVT3_1716','user_EVT3_1727','user_EVT2_1048',\ 44 | 'user_EVT3_3126','user_day_14','user_EVT2_1481','user_EVT2_1848','user_EVT3_4311','user_EVT3_1689','user_EVT1_163','user_day_18','user_EVT2_395','user_EVT3_3138','user_EVT2_575','user_EVT2_1260','user_EVT3_569','user_EVT3_3770','user_EVT3_1693','user_EVT3_3846',\ 45 | 'user_EVT3_3769','user_EVT3_3775','user_EVT3_4359','V28','user_EVT2_16','user_EVT3_3134','user_EVT2_1351','user_EVT3_3788','user_EVT3_1523','user_EVT3_4381','user_EVT2_2137','user_EVT3_3619','user_EVT3_3132','user_EVT3_3767','user_EVT3_4364','user_EVT3_2638',\ 46 | 'user_EVT3_1690','user_EVT3_2050','user_EVT2_2146','user_EVT2_1847','user_EVT3_560','user_EVT3_4299','user_EVT3_4365','user_day_30','user_EVT3_2031','user_EVT3_4344','user_EVT3_1687','user_day_15','user_EVT2_2163','user_EVT3_3786','user_EVT3_4291','user_EVT2_1913',\ 47 | 'user_day_6','user_day_31','user_EVT3_2639','user_EVT1_38','user_EVT3_1692','user_EVT3_3930','user_EVT3_1198','user_EVT2_1269','user_EVT3_3156','user_EVT2_1826','user_EVT3_2061','user_EVT3_18','user_EVT2_1841','user_EVT3_3866','user_EVT2_1042','user_EVT3_3748',\ 48 | 'user_EVT3_2640','user_EVT2_1831','user_EVT2_1854','user_EVT1_520','user_EVT2_1851','user_day_27','user_EVT3_2079','user_EVT3_3789','user_EVT3_4322','user_EVT3_4294','user_EVT2_577','user_EVT3_2072','user_EVT3_3731','user_EVT3_3243','user_EVT2_1235',\ 49 | 'user_EVT3_3755','user_EVT3_3773','user_EVT3_3757','user_EVT3_4387','user_EVT1_0','user_EVT2_576','V1','user_EVT3_3747','user_EVT3_4349','user_EVT2_1049','user_EVT3_3751','user_EVT3_3838','user_EVT3_3141','user_EVT2_1352','user_EVT3_4302','user_EVT3_3832',\ 50 | 
'user_EVT2_553','user_EVT3_1200','user_EVT3_4329','user_EVT3_1694','user_EVT3_570','user_EVT3_893','user_EVT3_4295','user_EVT2_2135','user_EVT3_117','user_EVT3_572','user_EVT3_3753','user_EVT2_1842','user_EVT3_267','user_EVT3_3776','user_EVT3_20','user_EVT3_2003',\ 51 | 'user_EVT3_4301','user_day_25','user_EVT3_913','user_EVT3_3771','gap_log_day_max','user_EVT3_3199','user_EVT3_2046','user_EVT3_3128','user_EVT2_1863','user_EVT2_1484','user_EVT3_2066','user_EVT3_3190','user_EVT2_1350','user_EVT3_3837','user_EVT2_1592',\ 52 | 'user_EVT3_269','user_EVT3_4323','user_EVT3_2075','user_TCH_TYP_0','user_EVT2_1479','user_EVT3_3245','user_EVT3_3872','user_EVT3_4309','user_EVT3_3613','user_EVT3_4375','user_EVT2_115','user_EVT3_3810','user_EVT3_272','user_EVT2_1830','user_EVT3_3492',\ 53 | 'user_EVT3_21','user_EVT3_1708','user_EVT2_1905','user_EVT3_4377','V22','user_EVT3_3617','user_EVT3_1711','user_EVT3_4346','user_EVT3_3834','user_EVT3_2073','user_EVT3_905','user_EVT3_3150','user_EVT2_1849','user_EVT3_3493','user_EVT3_3818','user_EVT3_4376',\ 54 | 'user_EVT3_3796','user_EVT3_1201','user_EVT2_1264','user_EVT3_4277','user_EVT3_3756','user_EVT3_3188','user_EVT3_3809','user_EVT3_4312','user_EVT2_314','user_EVT3_3663','user_EVT3_1709','user_EVT3_3867','user_EVT3_3151','user_EVT3_3496','user_EVT3_4320',\ 55 | 'user_EVT2_1268','user_EVT3_1691','user_EVT3_4270','user_EVT3_3784','user_EVT3_3174','user_EVT2_1858','user_EVT3_3664','user_EVT3_3759','user_EVT2_1857','user_EVT1_372','user_EVT3_19','user_EVT3_3772','user_EVT2_1909','user_EVT3_3797','user_EVT2_705','user_EVT2_222',\ 56 | 'user_EVT3_3147','user_EVT2_1796','user_EVT2_1795','user_EVT2_701','user_EVT3_3745','user_EVT3_573','user_EVT2_557','user_EVT3_3802','user_day_26','user_EVT3_1698','user_EVT3_3130','user_EVT3_15','user_EVT3_889','user_EVT3_3248','V4','user_EVT3_2207','user_TCH_TYP_2',\ 57 | 
'user_EVT3_4273','user_EVT3_3161','user_EVT3_4332','user_EVT3_3778','user_EVT2_1588','user_EVT3_4292','user_EVT2_1859','user_EVT2_1906','user_EVT3_3672','user_EVT2_1860','user_EVT2_1263','user_EVT3_3870','user_EVT3_1717','user_EVT2_1265','user_day_3',\ 58 | 'user_EVT3_4357','user_EVT3_2058','user_EVT3_3808','user_EVT2_392','user_EVT3_3817','user_EVT2_225','user_EVT2_703','user_day_8','user_EVT3_3859','user_EVT3_4286','user_EVT3_2044','user_EVT1_326','V13','user_EVT3_2047','user_EVT2_1853','user_EVT3_3791',\ 59 | 'user_EVT3_3741','user_EVT3_4271','user_EVT1_518','user_EVT3_1678','user_EVT3_3752','user_EVT3_891','user_EVT3_1718','user_EVT3_1729','user_EVT3_567','user_EVT2_2165','user_EVT3_3926','user_EVT2_1845','user_EVT3_3924','user_EVT3_3854','user_EVT2_2142',\ 60 | 'user_EVT3_3845','user_EVT2_228','user_day_23','user_day_20','user_EVT3_3742','user_EVT2_1838','user_EVT3_571','user_EVT2_393','V14','user_EVT3_2641','user_EVT3_3234','V16','V12','user_EVT2_706','user_EVT3_4327','user_EVT3_897','V8','user_EVT3_1699',\ 61 | 'user_EVT2_1855','user_EVT3_3612','user_EVT3_1684','user_EVT3_275','user_EVT3_3640','user_EVT3_3724','user_EVT3_462','user_EVT2_2145','V20','user_EVT3_2064','V24','user_EVT3_4287','user_EVT2_230','user_EVT3_2078','user_EVT3_3779','user_EVT2_18','user_EVT3_3793',\ 62 | 'user_EVT3_1728','user_EVT3_4278','user_EVT3_1683','user_EVT3_3758','user_EVT3_1697','user_EVT3_2032','user_EVT3_3743','user_EVT3_3135','user_EVT2_2161','user_EVT3_3811','user_EVT3_1707','user_EVT2_1590','user_EVT3_4296','user_EVT3_2057','user_EVT3_3781',\ 63 | 'user_EVT3_3863','user_EVT3_3723','user_EVT3_3153','user_EVT2_1836','user_EVT3_4368','user_EVT3_4285','user_EVT2_1844','user_EVT3_3803','user_EVT3_4372','user_EVT3_4382','user_EVT3_4361','user_EVT3_2041','user_EVT3_4388','user_day_17','user_EVT3_3642',\ 64 | 
'user_EVT3_4308','user_EVT2_1262','gap_log_day_mean','V26','user_EVT3_3734','user_EVT3_1723','user_EVT3_4303','user_EVT3_3131','user_EVT3_3782','user_EVT3_3495','user_EVT3_4350','V18','user_EVT2_1866','user_EVT3_4386','user_EVT3_2054','user_EVT3_4370',\ 65 | 'user_EVT3_4398','user_EVT3_1700','user_EVT3_2074','用户平均的访问时间','user_day_12','user_EVT1_438','user_EVT3_2053','user_EVT3_1685','user_EVT3_3853','user_EVT3_1686','user_EVT2_1349','user_EVT3_2051','user_EVT2_1799','user_EVT3_3795','user_EVT2_1041',\ 66 | 'user_EVT3_3646','user_EVT3_4290','user_EVT3_3806','user_EVT3_1705','user_EVT3_4389','user_EVT2_2155','user_EVT3_4352','user_EVT2_1480','user_day_29','user_EVT3_3921','user_EVT3_4293','user_EVT3_4315','user_EVT3_4319','user_EVT2_1837','user_EVT3_1726',\ 67 | 'user_EVT2_1589','user_EVT3_4395','user_EVT3_1701','user_EVT3_559','V19','user_EVT3_4325','user_EVT3_558','user_EVT3_1695','user_EVT3_1713','user_EVT1_257','user_EVT3_3831','user_EVT3_911','user_EVT2_922','user_EVT3_2033','user_EVT3_3622','user_EVT3_270',\ 68 | 'user_EVT3_3821','user_EVT3_3850','user_EVT3_2005','user_EVT3_4307','user_EVT2_1912','user_EVT3_1725','user_EVT3_4363','user_EVT3_3157','user_EVT3_4362','user_EVT3_3497','user_EVT3_3671','user_EVT3_3812','user_EVT3_2063','user_EVT3_3616','user_EVT3_3792',\ 69 | 'user_EVT2_1850','user_EVT3_2080','user_EVT3_3816','user_EVT2_1915','user_EVT3_3819','user_EVT3_3122','user_EVT3_3774','user_EVT3_3172','user_EVT3_2069','user_EVT3_3726','user_EVT2_2149','user_EVT3_3794','user_EVT3_1712','user_EVT3_3849','V21','user_EVT3_4331',\ 70 | 'user_EVT3_3620','user_EVT3_3746','user_EVT2_1839','user_EVT3_2052','user_EVT3_4281','user_EVT2_394','user_EVT3_3777','user_EVT3_3925','user_EVT2_1908','V17','user_EVT3_2043','user_EVT3_3729','user_EVT1_540','user_EVT2_1864','user_EVT3_3231','user_EVT3_277',\ 71 | 
def xgb_model(train_set_x, train_set_y, test_set_x):
    """Train the xgboost binary classifier and score the test rows.

    Args:
        train_set_x: training feature matrix (array-like).
        train_set_y: training labels (0/1).
        test_set_x: feature matrix to predict.

    Returns:
        1-D array of predicted probabilities for ``test_set_x``.

    Side effects:
        Prints and shows a feature-importance bar chart (kept from the
        original; note ``plt.show()`` blocks until the window is closed).
    """
    # hand-tuned model parameters (commented values are earlier trials)
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eta': 0.03,
              'max_depth': 5,  # 4 3
              'colsample_bytree': 0.7,  # 0.8
              'subsample': 0.7,
              'min_child_weight': 10,  # 2 3
              'silent': 1,
              'eval_metric': 'auc'}

    dtrain = xgb.DMatrix(train_set_x, label=train_set_y)
    dvali = xgb.DMatrix(test_set_x)
    model = xgb.train(params, dtrain, num_boost_round=490)
    # decomposed: the chart/printing side effects no longer interleave
    # with the train-and-predict path
    _plot_feature_importance(model)
    predict = model.predict(dvali)
    return predict


def _plot_feature_importance(model):
    """Print and bar-plot the importance of the first 20 log-derived
    features (the anonymous agg columns V1..V30 are excluded)."""
    importance = pd.Series(model.get_fscore()).sort_values(ascending=False)
    importance = pd.DataFrame(importance, columns=['importance'])
    importance.reset_index(inplace=True)
    importance.columns = ['name', 'importance']
    print(type(importance))
    # drop V1..V30 from the ranked feature list, keeping only log features;
    # list.remove raises if a name is absent — all V1..V30 are present above
    tmp = columns_rank_list.copy()
    for i in range(1, 31):
        tmp.remove(str('V' + str(i)))
    fsa = pd.DataFrame()
    fsa['name'] = tmp
    fsa['num'] = 1
    # inner merge keeps only features that are both important and log-derived
    importance = pd.merge(importance, fsa, on=['name'])
    print(importance)
    importance = importance.set_index(['name'])
    importance = importance.iloc[:20, :]
    importance = importance.sort_values(by=['importance'], ascending=True)
    importance.loc[:, 'importance'].plot.barh()
    plt.show()
= pd.merge(train_flg,train_agg,on=['USRID'],how='left') 231 | all_train = log_tabel(flg_agg_train,train_log) 232 | all_train.fillna(-999,inplace=True) 233 | if OFF_LINE == True: 234 | train_x = all_train.drop(['USRID', 'FLAG'], axis=1).values 235 | train_y = all_train['FLAG'].values 236 | auc_list = [] 237 | skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=False) 238 | for train_index, test_index in skf.split(train_x, train_y): 239 | print('Train: %s | test: %s' % (train_index, test_index)) 240 | X_train, X_test = train_x[train_index], train_x[test_index] 241 | y_train, y_test = train_y[train_index], train_y[test_index] 242 | 243 | pred_value = xgb_model(X_train, y_train, X_test) 244 | print(pred_value) 245 | print(y_test) 246 | 247 | pred_value = np.array(pred_value) 248 | pred_value = [ele + 1 for ele in pred_value] 249 | 250 | y_test = np.array(y_test) 251 | y_test = [ele + 1 for ele in y_test] 252 | 253 | fpr, tpr, thresholds = roc_curve(y_test, pred_value, pos_label=2) 254 | 255 | auc = metrics.auc(fpr, tpr) 256 | print('auc value:',auc) 257 | auc_list.append(auc) 258 | 259 | print('validate result:',np.mean(auc_list)) 260 | sys.exit(32) 261 | 262 | test_log = pd.read_csv('../data/test/test_log.csv',sep='\t',parse_dates = ['OCC_TIM']) 263 | test_agg = pd.read_csv('../data/test/test_agg.csv',sep='\t') 264 | test_set = log_tabel(test_agg,test_log) 265 | test_set = test_set.fillna(-999) 266 | ################################################################################## 267 | result_name = test_set[['USRID']] 268 | train_x = all_train.drop(['USRID', 'FLAG'], axis=1) 269 | train_x = train_x.reindex_axis(columns_rank_list,axis=1) 270 | train_y = all_train['FLAG'] 271 | test_x = test_set.drop(['USRID'], axis=1) 272 | 273 | test_x = test_x.reindex_axis(columns_rank_list,axis=1) 274 | 275 | # from sklearn import preprocessing 276 | # min_max_scaler = preprocessing.StandardScaler() 277 | # X_train_minmax = min_max_scaler.fit_transform(train_x) 278 | # 
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 25 17:26:21 2018

@author: CCL

Blend the three model outputs (chen / gao / zhao) and apply two manual
post-hoc score boosts, then write the final submission file.

Fixes over the original:
- removed the dead, never-called ``is_nan`` helper (a lambda did its job),
- read ``test_log.csv`` once instead of twice,
- dropped the unused ``day`` column.
"""

import pandas as pd

ans1 = pd.read_csv("chen_result.csv", sep='\t', engine='python')
ans1.columns = ['USRID', 'RST_c']

ans2 = pd.read_csv("gao.csv", sep='\t', engine='python')
ans2.columns = ['USRID', 'RST_g']

ans3 = pd.read_csv("zhao.csv", sep='\t', engine='python')
ans3.columns = ['USRID', 'RST_z']

result = pd.merge(ans1, ans2, on='USRID')
result = pd.merge(result, ans3, on='USRID')

# weighted blend: chen 50%, gao 32.5%, zhao 17.5%
result['RST'] = 0.5 * (0.35 * result['RST_z'] + 0.65 * result['RST_g']) + 0.5 * result['RST_c']
result = result[['USRID', 'RST']].copy()

################################
# single read of the test log, timestamps parsed up front
test_log = pd.read_csv('../data/test/test_log.csv', sep='\t', parse_dates=['OCC_TIM'])

# boost 1: users with no log rows at all (EVT_LBL is NaN after the left
# merge) get their score multiplied by 1.35
ans = result.copy()
one_row_per_user = test_log.drop_duplicates(subset=['USRID'])
result = pd.merge(ans, one_row_per_user, on=['USRID'], how='left')
result['is_nan'] = result.EVT_LBL.map(lambda x: 1 if str(x) == 'nan' else 0)
result.loc[result.is_nan == 1, 'RST'] = result.loc[result.is_nan == 1, 'RST'] * 1.35
result = result[['USRID', 'RST']].copy()

#########################
# boost 2: users who ever clicked during hour 18 get their score * 1.1
test_log['hour'] = test_log.OCC_TIM.map(lambda x: int(x.hour))


def func(x):
    # 1 if the user has any click in hour 18, else 0
    if 18 in list(x):
        return 1
    else:
        return 0


ee = test_log.groupby(by=['USRID'])['hour'].apply(func).reset_index(name='用户访问的小时')

test_flag = result.copy()
test_result = pd.merge(test_flag, ee, on=['USRID'], how='left')

test_result.loc[test_result['用户访问的小时'] == 1, 'RST'] = test_result.loc[test_result['用户访问的小时'] == 1, 'RST'] * 1.1

test_result[['USRID', 'RST']].to_csv('big_fusion_7_14_ratio1.35_18_1.1.csv', index=None, sep='\t')
def load_data():
    """Read the raw competition files and apply shared preprocessing.

    Preprocessing: split the 'a-b-c' event label into three level columns,
    derive day/hour from the click timestamp, and jointly label-encode the
    suspected-categorical agg columns V2/V4/V5 pairwise and as a triple.

    Returns:
        (train_agg, train_flg, train_log, test_agg, test_log) DataFrames.
    """
    # source train
    train_agg = pd.read_csv(r'../data/train/train_agg.csv', delimiter='\t')
    train_flg = pd.read_csv(r'../data/train/train_flg.csv', delimiter='\t')
    train_log = pd.read_csv(r'../data/train/train_log.csv', delimiter='\t')
    # source test
    test_agg = pd.read_csv(r'../data/test/test_agg.csv', delimiter='\t')
    test_log = pd.read_csv(r'../data/test/test_log.csv', delimiter='\t')

    def _prep_log(log):
        # split the 3-level event label, then derive day-of-month and hour
        log['EVT_LBL_1'] = log['EVT_LBL'].map(lambda x: x.split('-')[0])
        log['EVT_LBL_2'] = log['EVT_LBL'].map(lambda x: x.split('-')[1])
        log['EVT_LBL_3'] = log['EVT_LBL'].map(lambda x: x.split('-')[2])
        log.drop(['EVT_LBL'], axis=1, inplace=True)
        log['OCC_TIM'] = pd.to_datetime(log['OCC_TIM'])
        log['OCC_DAY'] = log['OCC_TIM'].map(lambda x: x.day)
        log['OCC_HOUR'] = log['OCC_TIM'].map(lambda x: x.hour)

    _prep_log(train_log)
    # BUG FIX: the original computed test OCC_HOUR from train_log's
    # timestamps (test_log['OCC_HOUR'] = train_log['OCC_TIM'].map(...))
    _prep_log(test_log)

    def _joint_encode(cols):
        # jointly label-encode a combination of suspected-discrete columns,
        # e.g. cols=['V2','V4'] produces the encoded column 'V2V4'
        name = ''.join(cols)
        train_agg[name] = ['_'.join(str(v) for v in row)
                           for row in zip(*(train_agg[c] for c in cols))]
        test_agg[name] = ['_'.join(str(v) for v in row)
                          for row in zip(*(test_agg[c] for c in cols))]
        le = preprocessing.LabelEncoder()
        le.fit(train_agg[name])
        # NOTE(review): transform raises on combos unseen in train — the
        # original had the same behavior; confirm test combos ⊆ train combos.
        train_agg[name] = le.transform(train_agg[name])
        test_agg[name] = le.transform(test_agg[name])

    _joint_encode(['V2', 'V4'])
    _joint_encode(['V2', 'V5'])
    _joint_encode(['V4', 'V5'])
    _joint_encode(['V2', 'V4', 'V5'])
    # 返回
    return train_agg, train_flg, train_log, test_agg, test_log
str(z),test_agg['V2'],test_agg['V4'],test_agg['V5'])) 62 | le = preprocessing.LabelEncoder() 63 | le.fit(train_agg['V2V4V5']) 64 | train_agg['V2V4V5'] = le.transform(train_agg['V2V4V5']) 65 | test_agg['V2V4V5'] = le.transform(test_agg['V2V4V5']) 66 | # 返回 67 | return train_agg,train_flg,train_log,test_agg,test_log 68 | 69 | def get_log_feat(dataset,EVT_LBL_1,EVT_LBL_2,EVT_LBL_3): 70 | data = dataset.copy() 71 | data['cnt'] = 1 72 | # 返回的特征 73 | feature = pd.DataFrame(columns = ['USRID']) 74 | 75 | ## 每天点击 76 | # pivot + unstack 77 | pivot = pd.pivot_table(data,index = ['USRID','OCC_DAY'],values = 'cnt',aggfunc = len) 78 | pivot = pivot.unstack(level = -1) 79 | pivot.fillna(0,downcast = 'infer',inplace = True) 80 | # 特征 81 | feat = pd.DataFrame() 82 | feat['USRID'] = pivot.index 83 | feat.index = pivot.index 84 | ## 统计特征 85 | # 总和 86 | feat['USRID_click_OCC_DAY_sum'] = pivot.sum(1) 87 | # 每一天的特征 88 | dates = list(set(data['OCC_DAY'].tolist())) 89 | for i in dates: 90 | for j in pivot.columns.tolist(): 91 | if i == j[-1]: 92 | feat['USRID_click_OCC_DAY_' + str(i)] = pivot[j] 93 | ## 添加进特征 94 | feature = pd.merge(feature,feat,on = ['USRID'],how = 'outer') 95 | print('USRID_OCC_DAY特征提取完毕!') 96 | 97 | ## 每种TCH_TYP点击 98 | # pivot + unstack 99 | pivot = pd.pivot_table(data,index = ['USRID','TCH_TYP'],values = 'cnt',aggfunc = len) 100 | pivot = pivot.unstack(level = -1) 101 | pivot.fillna(0,downcast = 'infer',inplace = True) 102 | # 特征 103 | feat = pd.DataFrame() 104 | feat['USRID'] = pivot.index 105 | feat.index = pivot.index 106 | # 每种TCH_TYP的特征 107 | tchtyps = list(set(data['TCH_TYP'].tolist())) 108 | for i in tchtyps: 109 | for j in pivot.columns.tolist(): 110 | if i == j[-1]: 111 | feat['USRID_click_TCH_TYP_' + str(i)] = pivot[j] 112 | ## 添加进特征 113 | feature = pd.merge(feature,feat,on = ['USRID'],how = 'outer') 114 | print('USRID_TCH_TYP特征提取完毕!') 115 | 116 | ## 每种EVT_LBL_1 117 | # pivot + unstack 118 | pivot = pd.pivot_table(data,index = ['USRID','EVT_LBL_1'],values = 
'cnt',aggfunc = len) 119 | pivot = pivot.unstack(level = -1) 120 | pivot.fillna(0,downcast = 'infer',inplace = True) 121 | # 特征 122 | feat = pd.DataFrame() 123 | feat['USRID'] = pivot.index 124 | feat.index = pivot.index 125 | # 每种EVT_LBL_1的特征 126 | for i in EVT_LBL_1: 127 | for j in pivot.columns.tolist(): 128 | if i == j[-1]: 129 | feat['USRID_click_EVT_LBL_1_' + str(i)] = pivot[j] 130 | ## 添加进特征 131 | feature = pd.merge(feature,feat,on = ['USRID'],how = 'outer') 132 | print('USRID_EVT_LBL_1特征提取完毕!') 133 | 134 | ## 每种EVT_LBL_2 135 | # pivot + unstack 136 | pivot = pd.pivot_table(data,index = ['USRID','EVT_LBL_2'],values = 'cnt',aggfunc = len) 137 | pivot = pivot.unstack(level = -1) 138 | pivot.fillna(0,downcast = 'infer',inplace = True) 139 | # 特征 140 | feat = pd.DataFrame() 141 | feat['USRID'] = pivot.index 142 | feat.index = pivot.index 143 | # 每种EVT_LBL_2的特征 144 | for i in EVT_LBL_2: 145 | for j in pivot.columns.tolist(): 146 | if i == j[-1]: 147 | feat['USRID_click_EVT_LBL_2_' + str(i)] = pivot[j] 148 | ## 添加进特征 149 | feature = pd.merge(feature,feat,on = ['USRID'],how = 'outer') 150 | print('USRID_EVT_LBL_2特征提取完毕!') 151 | 152 | ## 每种EVT_LBL_3 153 | # pivot + unstack 154 | pivot = pd.pivot_table(data,index = ['USRID','EVT_LBL_3'],values = 'cnt',aggfunc = len) 155 | pivot = pivot.unstack(level = -1) 156 | pivot.fillna(0,downcast = 'infer',inplace = True) 157 | # 特征 158 | feat = pd.DataFrame() 159 | feat['USRID'] = pivot.index 160 | feat.index = pivot.index 161 | # 每种EVT_LBL_3的特征 162 | for i in EVT_LBL_3: 163 | for j in pivot.columns.tolist(): 164 | if i == j[-1]: 165 | feat['USRID_click_EVT_LBL_3_' + str(i)] = pivot[j] 166 | ## 添加进特征 167 | feature = pd.merge(feature,feat,on = ['USRID'],how = 'outer') 168 | print('USRID_EVT_LBL_3特征提取完毕!') 169 | 170 | ## 返回 171 | return feature 172 | 173 | def create_data(agg,log,EVT_LBL_1,EVT_LBL_2,EVT_LBL_3): 174 | data = agg.copy() 175 | # USRID特征 176 | feat = get_log_feat(log,EVT_LBL_1,EVT_LBL_2,EVT_LBL_3) 177 | data = 
def model_xgb(tr, te):
    """Train a single xgboost binary classifier and score the test set.

    Args:
        tr: training DataFrame with 'USRID' plus the target column 'FLAG'.
        te: test DataFrame with 'USRID'.

    Returns:
        DataFrame ['USRID', 'RST'] with predicted probabilities, sorted
        descending by score.
    """
    train = tr.copy()
    test = te.copy()

    train_y = train['FLAG'].values
    train_x = train.drop(['USRID', 'FLAG'], axis=1).values
    test_x = test.drop(['USRID'], axis=1).values

    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x)

    # 模型参数 (hand-tuned; commented values are earlier trials)
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'error',
              'eta': 0.03,
              'max_depth': 6,  # 4 3
              'colsample_bytree': 0.8,  # 0.8
              'subsample': 0.8,
              'scale_pos_weight': 1,
              'min_child_weight': 18  # 2 3
              }
    # 训练
    print('开始训练!')
    bst = xgb.train(params, dtrain, num_boost_round=300)
    # 预测
    print('开始预测!')
    predict = bst.predict(dtest)
    # BUG FIX: take an explicit copy so the column assignment below does not
    # hit pandas' SettingWithCopy path on a slice of `test`
    test_xy = test[['USRID']].copy()
    test_xy['RST'] = predict
    test_xy.sort_values(['RST'], ascending=False, inplace=True)
    return test_xy
241 | 242 | if __name__ == '__main__': 243 | tr,te = get_dataset() 244 | result = model_xgb(tr,te) 245 | result.to_csv('gao.csv',index=None,sep='\t') 246 | 247 | -------------------------------------------------------------------------------- /code/zhao.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn import preprocessing 4 | import xgboost as xgb 5 | import time 6 | import warnings 7 | warnings.filterwarnings('ignore') 8 | 9 | # 读取个人信息 10 | train_agg = pd.read_csv('../data/train/train_agg.csv',sep='\t') 11 | test_agg = pd.read_csv('../data/test/test_agg.csv',sep='\t') 12 | 13 | # 日志信息 14 | train_log = pd.read_csv('../data/train/train_log.csv',sep='\t') 15 | test_log = pd.read_csv('../data/test/test_log.csv',sep='\t') 16 | # 用户唯一标识 17 | train_flg = pd.read_csv('../data/train/train_flg.csv',sep='\t') 18 | test_flg = pd.read_csv('../data/train/submit_sample.csv',sep='\t') 19 | del test_flg['RST'] 20 | 21 | # 训练集合测试集切分EVT_LBL为三列 22 | train_log['EVT_LBL_1'] = train_log['EVT_LBL'].map(lambda x:(str(x).split('-')[0])) 23 | train_log['EVT_LBL_2'] = train_log['EVT_LBL'].map(lambda x: str(x).split('-')[1]) 24 | train_log['EVT_LBL_3'] = train_log['EVT_LBL'].map(lambda x: str(x).split('-') 25 | [2]) 26 | test_log['EVT_LBL_1'] = test_log['EVT_LBL'].map(lambda x: str(x).split('-')[0]) 27 | test_log['EVT_LBL_2'] = test_log['EVT_LBL'].map(lambda x: str(x).split('-')[1]) 28 | test_log['EVT_LBL_3'] = test_log['EVT_LBL'].map(lambda x: str(x).split('-')[2]) 29 | # 训练集合测试集切分EVT_LBL1 EVT_LBL2 EVT_LBL交集 30 | EVT_LBL_1=list(set(train_log['EVT_LBL_1']) & set(test_log['EVT_LBL_1'])) 31 | EVT_LBL_2=list(set(train_log['EVT_LBL_2']) & set(test_log['EVT_LBL_2'])) 32 | EVT_LBL_3=list(set(train_log['EVT_LBL_3']) & set(test_log['EVT_LBL_3'])) 33 | 34 | def User_log_Fun(feature_data): 35 | 36 | feature_data['hour'] = [int(str(i)[11:13].replace('-', '')) for i in feature_data['OCC_TIM']] 37 | 
feature_data['day'] = [int(str(i)[8:11].replace('-', '')) for i in feature_data['OCC_TIM']] 38 | 39 | # 统计次数 40 | def Count_feature(para_datas, keys): 41 | prefixs = 'l_' 42 | for key in keys: 43 | prefixs = key + '_' 44 | data = para_datas[keys] 45 | data['temp'] = 1 46 | data = data.groupby(keys).agg('sum').reset_index() 47 | data.rename(columns={'temp': prefixs + 'cnt'}, inplace=True) 48 | para_datas = pd.merge(para_datas, data, on=keys, how='left') 49 | para_datas=para_datas.drop_duplicates(keys) 50 | return para_datas 51 | 52 | User_log_user_id = list(set(feature_data['USRID'])) 53 | User_log_user_id = pd.DataFrame(User_log_user_id, columns=['USRID']) 54 | User_log_user_id1 = pd.DataFrame(User_log_user_id, columns=['USRID']) 55 | 56 | # 统计USRID、EVT_LBL_1出现次数###################################################################################### 57 | keys=['USRID','EVT_LBL_1'] 58 | data = feature_data[keys].copy() 59 | data=Count_feature(data,keys) 60 | data= data[data['EVT_LBL_1'].map(lambda x:x in EVT_LBL_1)] 61 | data['EVT_LBL_1']=data['EVT_LBL_1'].map(lambda x:'EVT_LBL_1_'+str(x)) 62 | data.set_index(keys,inplace=True) 63 | data=data.unstack(level=-1) 64 | data.reset_index(inplace=True) 65 | User_log_user_id=pd.merge(User_log_user_id, data, on=['USRID'], how='left') 66 | 67 | # 统计USRID、EVT_LBL_2出现次数 68 | keys = ['USRID', 'EVT_LBL_2'] 69 | data = feature_data[keys].copy() 70 | data = Count_feature(data, keys) 71 | data = data[data['EVT_LBL_2'].map(lambda x: x in EVT_LBL_2)] 72 | data['EVT_LBL_2'] = data['EVT_LBL_2'].map(lambda x: 'EVT_LBL_2_' + str(x)) 73 | data.set_index(keys, inplace=True) 74 | data = data.unstack(level=-1) 75 | data.reset_index(inplace=True) 76 | User_log_user_id = pd.merge(User_log_user_id, data, on=['USRID'], how='left') 77 | 78 | # 统计USRID、EVT_LBL_3出现次数 79 | keys = ['USRID', 'EVT_LBL_3'] 80 | data = feature_data[keys].copy() 81 | data = Count_feature(data, keys) 82 | data = data[data['EVT_LBL_3'].map(lambda x: x in EVT_LBL_3)] 83 | 
data['EVT_LBL_3'] = data['EVT_LBL_3'].map(lambda x: 'EVT_LBL_3_' + str(x)) 84 | data.set_index(keys, inplace=True) 85 | data = data.unstack(level=-1) 86 | data.reset_index(inplace=True) 87 | User_log_user_id = pd.merge(User_log_user_id, data, on=['USRID'], how='left') 88 | ######################################### 用户每天出现次数########################################################## 89 | for i in range(1, 32): 90 | data = feature_data[['USRID', 'day']].copy() 91 | data = data[data['day'] == i] 92 | data['user_log_USRID_per_day_last' + str(i) + '_cnt'] = 1 93 | del data['day'] 94 | data = data.groupby(['USRID']).agg('sum').reset_index() 95 | User_log_user_id = pd.merge(User_log_user_id, data, on=['USRID'], how='left') 96 | 97 | # 用户出现方差、总和、平均数、最大值、最小值 98 | needs = [] 99 | for col in User_log_user_id.columns.tolist(): 100 | if 'user_log_USRID_per_day_last' in col: 101 | needs.append(col) 102 | User_log_user_id.fillna(0, inplace=True) 103 | User_log_user_id['user_log_USRID_per_day_cnt_var'] = User_log_user_id[needs].var(1) 104 | User_log_user_id['user_log_USRID_per_day_cnt_sum'] = User_log_user_id[needs].sum(1) 105 | User_log_user_id['user_log_USRID_per_day_cnt_avg'] = User_log_user_id[needs].mean(1) 106 | User_log_user_id['user_log_USRID_per_day_cnt_max'] = User_log_user_id[needs].max(1) 107 | User_log_user_id['user_log_USRID_per_day_cnt_min'] = User_log_user_id[needs].min(1) 108 | 109 | # ##差分计算每天出现次数###################################### 110 | a = np.diff(User_log_user_id[needs]) 111 | b = pd.DataFrame(a) 112 | b.columns = ['user_log_USRID_per_day_cnt_diff_1', 113 | 'user_log_USRID_per_day_cnt_diff_2', 'user_log_USRID_per_day_cnt_diff_3', 114 | 'user_log_USRID_per_day_cnt_diff_4', 'user_log_USRID_per_day_cnt_diff_5', 115 | 'user_log_USRID_per_day_cnt_diff_6', 'user_log_USRID_per_day_cnt_diff_7', 116 | 'user_log_USRID_per_day_cnt_diff_8', 'user_log_USRID_per_day_cnt_diff_9', 117 | 'user_log_USRID_per_day_cnt_diff_10', 'user_log_USRID_per_day_cnt_diff_11', 118 | 
'user_log_USRID_per_day_cnt_diff_12', 'user_log_USRID_per_day_cnt_diff_13', 119 | 'user_log_USRID_per_day_cnt_diff_14', 'user_log_USRID_per_day_cnt_diff_15', 120 | 'user_log_USRID_per_day_cnt_diff_16','user_log_USRID_per_day_cnt_diff_17', 121 | 'user_log_USRID_per_day_cnt_diff_18', 'user_log_USRID_per_day_cnt_diff_19', 122 | 'user_log_USRID_per_day_cnt_diff_20', 'user_log_USRID_per_day_cnt_diff_21', 123 | 'user_log_USRID_per_day_cnt_diff_22', 'user_log_USRID_per_day_cnt_diff_23', 124 | 'user_log_USRID_per_day_cnt_diff_24', 'user_log_USRID_per_day_cnt_diff_25', 125 | 'user_log_USRID_per_day_cnt_diff_26', 'user_log_USRID_per_day_cnt_diff_27', 126 | 'user_log_USRID_per_day_cnt_diff_28', 'user_log_USRID_per_day_cnt_diff_29', 127 | 'user_log_USRID_per_day_cnt_diff_30'] 128 | # 用户出现方差、总和、平均数、最大值、最小值 129 | needs = [] 130 | for col in b.columns.tolist(): 131 | if 'user_log_USRID_per_day_cnt_diff_' in col: 132 | needs.append(col) 133 | User_log_user_id['user_log_USRID_per_day_cnt_diff_var_cnt'] = b[needs].var(1) 134 | User_log_user_id['user_log_USRID_per_day_cnt_diff_sum_cnt'] = b[needs].sum(1) 135 | User_log_user_id['user_log_USRID_per_day_cnt_diff_avg_cnt'] = b[needs].mean(1) 136 | User_log_user_id['user_log_USRID_per_day_cnt_diff_max_cnt'] = b[needs].max(1) 137 | User_log_user_id['user_log_USRID_per_day_cnt_diff_min_cnt'] = b[needs].min(1) 138 | 139 | ## 时间间隔 140 | # 最近/远一次启动距离最近考察日的时间间隔 141 | data = feature_data[['USRID', 'day']].copy() 142 | data = data.groupby(['USRID'])['day'].agg({'user_log_USRID_min_day': np.min}).reset_index() 143 | User_log_user_id = pd.merge(User_log_user_id, data, on=['USRID'], how='left') 144 | User_log_user_id['furest_day_to_label']=User_log_user_id['user_log_USRID_min_day'].map(lambda x: 32 - x) 145 | 146 | 147 | data = feature_data[['USRID', 'day']].copy() 148 | data = data.groupby(['USRID'])['day'].agg({'user_log_USRID_max_day': np.max}).reset_index() 149 | User_log_user_id = pd.merge(User_log_user_id, data, on=['USRID'], how='left') 150 | 
User_log_user_id['near_day_to_label']=User_log_user_id['user_log_USRID_max_day'].map(lambda x: 32 - x) 151 | 152 | # # USRID TCH_TYP每个类型出现的次数################################################################################# 153 | for i in range(3): 154 | data = feature_data[['USRID', 'TCH_TYP']] 155 | data = data[data['TCH_TYP'] == i] 156 | data = data.groupby(['USRID']).agg('count').reset_index() 157 | data.rename(columns={'TCH_TYP': 'user_log_USRID_TCH_TYP_' + str(i) + '_cnt'}, inplace=True) 158 | User_log_user_id = pd.merge(User_log_user_id, data, on='USRID', how='left') 159 | 160 | # # USRID TCH_TYP每个类型每天出现的次数############################################################################# 161 | for ii in range(1,32): 162 | feature_datas=feature_data[['USRID', 'TCH_TYP', 'day']] 163 | feature_datas=feature_datas[feature_datas['day']==ii] 164 | del feature_datas['day'] 165 | for i in range(3): 166 | data = feature_datas[['USRID', 'TCH_TYP']] 167 | data = data[data['TCH_TYP'] == i] 168 | data = data.groupby(['USRID']).agg('count').reset_index() 169 | data.rename(columns={'TCH_TYP': 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_' + str(ii)+ ''}, inplace=True) 170 | User_log_user_id1 = pd.merge(User_log_user_id1, data, on='USRID', how='left') 171 | 172 | #TCH_TYP出现方差、总和、平均数、最大值、最小值 173 | User_log_user_id1.fillna(0, inplace=True) 174 | for i in range(3): 175 | needs = [] 176 | for col in User_log_user_id1.columns.tolist(): 177 | if 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_' in col: 178 | needs.append(col) 179 | User_log_user_id.fillna(0, inplace=True) 180 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_'+str(i)+'_cnt_var'] = User_log_user_id1[needs].var(1) 181 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_'+str(i)+'_cnt_sum'] = User_log_user_id1[needs].sum(1) 182 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_'+str(i)+'_cnt_mean'] = User_log_user_id1[needs].mean(1) 183 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_'+str(i)+'_cnt_max'] = 
User_log_user_id1[needs].max(1) 184 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_'+str(i)+'_cnt_min'] = User_log_user_id1[needs].min(1) 185 | 186 | # ##差分计算每天出现次数###################################### 187 | for i in range(3): 188 | needs = [] 189 | for col in User_log_user_id1.columns.tolist(): 190 | if 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_' in col: 191 | needs.append(col) 192 | a = np.diff(User_log_user_id1[needs]) 193 | b = pd.DataFrame(a) 194 | b.columns = ['user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_1', 195 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_2', 196 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_3', 197 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_4', 198 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_5', 199 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_6', 200 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_7', 201 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_8', 202 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_9', 203 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_10', 204 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_11', 205 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_12', 206 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_13', 207 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_14', 208 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_15', 209 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_16', 210 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_17', 211 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_18', 212 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_19', 213 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_20', 214 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_21', 215 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_22', 216 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + 
'_cnt_diff_23', 217 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_24', 218 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_25', 219 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_26', 220 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_27', 221 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_28', 222 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_29', 223 | 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_30'] 224 | # 用户出现方差、总和、平均数、最大值、最小值 225 | needs = [] 226 | for col in b.columns.tolist(): 227 | if 'user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_' in col: 228 | needs.append(col) 229 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_var_cnt'] = b[needs].var(1) 230 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_sum_cnt'] = b[needs].sum(1) 231 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_avg_cnt'] = b[needs].mean(1) 232 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_max_cnt'] = b[needs].max(1) 233 | User_log_user_id['user_log_USRID_TCH_TYP_per_day_' + str(i) + '_cnt_diff_min_cnt'] = b[needs].min(1) 234 | 235 | ## 时间间隔 236 | # 最近/远一次启动距离最近考察日的时间间隔 237 | for i in range(3): 238 | data = feature_data[['USRID', 'TCH_TYP','day']].copy() 239 | data=data[data['TCH_TYP']==i] 240 | del data['TCH_TYP'] 241 | data = data.groupby(['USRID'])['day'].agg({'user_log_USRID_TCH_TYP_'+str(i)+'_min_day': np.min}).reset_index() 242 | User_log_user_id = pd.merge(User_log_user_id, data, on=['USRID'], how='left') 243 | User_log_user_id['furest_TCH_TYP_day_to_label'] = User_log_user_id['user_log_USRID_TCH_TYP_'+str(i)+'_min_day'].map(lambda x: 32 - x) 244 | 245 | for i in range(3): 246 | data = feature_data[['USRID', 'TCH_TYP','day']].copy() 247 | data=data[data['TCH_TYP']==i] 248 | del data['TCH_TYP'] 249 | data = data.groupby(['USRID'])['day'].agg({'user_log_USRID_TCH_TYP_'+str(i)+'_max_day': 
np.max}).reset_index() 250 | User_log_user_id = pd.merge(User_log_user_id, data, on=['USRID'], how='left') 251 | User_log_user_id['near_TCH_TYP_day_to_label'] = User_log_user_id['user_log_USRID_TCH_TYP_'+str(i)+'_max_day'].map(lambda x: 32 - x) 252 | return User_log_user_id 253 | def User_agg_Fun(feature_data): 254 | User_agg_user_id = list(set(feature_data['USRID'])) 255 | User_agg_user_id = pd.DataFrame(User_agg_user_id, columns=['USRID']) 256 | 257 | # data = feature_data[['V2', 'V4', 'V5']].copy() 258 | # data['V2_V4'] = list(map(lambda x, y: str(x) + '_' + str(y), data['V2'], data['V4'])) 259 | # data['V2_V4_V5'] = list(map(lambda x, y: str(x) + '_' + str(y), data['V2_V4'], data['V5'])) 260 | # le = preprocessing.LabelEncoder() 261 | # le.fit(data['V2_V4_V5']) 262 | # data['V2_V4_V5_LabelEncoder'] = le.transform(data['V2_V4_V5']) 263 | # User_agg_user_id['V2_V4_V5_LabelEncoder'] = data['V2_V4_V5_LabelEncoder'] 264 | return User_agg_user_id 265 | 266 | def online_model(test,train): 267 | result = test[['USRID']] 268 | test.fillna(0, inplace=True) 269 | train.fillna(0, inplace=True) 270 | train_y = train.FLAG 271 | train_X = train.drop(['FLAG'], axis=1) 272 | 273 | # print(test) 274 | 275 | xgb_train = xgb.DMatrix(train_X, label=train_y) 276 | xgb_test = xgb.DMatrix(test) 277 | 278 | params = { 279 | 'booster': 'gbtree', 280 | 'objective': 'binary:logistic', 281 | 'eval_metric': 'auc', 282 | 'eta': 0.03, 283 | 'max_depth': 6, 284 | 'colsample_bytree': 0.8, 285 | 'subsample': 0.8, 286 | 'scale_pos_weight': 1, 287 | 'min_child_weight': 18, 288 | } 289 | num_rounds = 500# 迭代次数 290 | watchlist = [(xgb_train, 'train'), (xgb_train, 'verification')] 291 | 292 | # training model 293 | model = xgb.train(params, xgb_train, num_rounds) 294 | 295 | # 测试集 296 | preds = model.predict(xgb_test) 297 | test_pre_y = pd.DataFrame(preds) 298 | result['RST'] = test_pre_y 299 | 300 | result.to_csv('zhao.csv',index=False,sep='\t') 301 | 302 | 303 | return 304 | if __name__ == 
'__main__': 305 | start_time = time.time() 306 | print('begin', start_time) 307 | 308 | # 训练集构造特征 309 | 310 | train_agg_feature_data = User_agg_Fun(train_agg) 311 | train_log_feature_data=User_log_Fun(train_log) 312 | train_data_label = pd.merge(train_flg, train_agg, on=['USRID'], how='left') 313 | 314 | 315 | # 测试集构造特征 316 | test_agg_feature_data = User_agg_Fun(test_agg) 317 | test_log_feature_data=User_log_Fun(test_log) 318 | test_data_label = pd.merge(test_flg, test_agg, on=['USRID'], how='left') 319 | 320 | 321 | # 合并训练集 322 | train_data_label=pd.merge(train_data_label,train_agg_feature_data,on=['USRID'], how='left') 323 | train_data_label = pd.merge(train_data_label, train_log_feature_data, on=['USRID'], how='left') 324 | # print(train_data_label) 325 | # 合并测试集 326 | test_data_label = pd.merge(test_data_label, test_agg_feature_data, on=['USRID'], how='left') 327 | test_data_label = pd.merge(test_data_label, test_log_feature_data, on=['USRID'], how='left') 328 | # print(test_data_label) 329 | 330 | # 线上预测 331 | online_model(test_data_label,train_data_label) 332 | 333 | print('end tiem', time.time() - start_time) 334 | 335 | -------------------------------------------------------------------------------- /data/test/test_agg.csv: -------------------------------------------------------------------------------- 1 | V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 USRID 2 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 -1.03407 -0.29641 -0.18761 -0.50786 -0.60103 -0.36994 -0.53433 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.57729 -0.45963 0.57851 -0.22881 0.15815 -0.16201 0.4762 0.06979 -0.77398 -0.25708 -0.05001 -0.49396 -0.32438 92595 3 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 -1.03407 0.0398 -0.13652 -0.48351 -0.55402 -0.22361 -0.41609 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.50275 -0.28813 0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.39637 -0.77398 -0.25708 -0.42709 -0.5471 -0.32438 4572 4 | 
-1.92554 -0.90689 -1.26634 0.2892 -0.68454 -0.09311 -0.29641 -0.18761 -0.3374 -0.46 -0.17571 -0.23874 -0.3339 -0.13843 -0.22708 -0.188 -0.99764 -0.49412 -0.42279 0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.55917 -0.77398 -0.25708 -0.40814 -0.53054 -0.32438 64470 5 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 0.04132 -0.29641 -0.18761 -0.50786 -0.60103 -0.36994 -0.53433 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.57729 -0.45963 0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.55917 -0.77398 -0.25708 -0.5791 -0.54868 -0.32438 68958 6 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 0.04132 -0.29641 -0.18761 -0.48351 -0.53051 -0.36262 -0.35697 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.37212 0.01237 0.57851 -0.18041 0.15815 -0.16201 0.4762 -0.06183 -0.77398 -0.25708 -0.16073 -0.54884 0.07949 8839 7 | -------------------------------------------------------------------------------- /data/test/test_log.csv: -------------------------------------------------------------------------------- 1 | USRID EVT_LBL OCC_TIM TCH_TYP 2 | 1000 162-574-910 2018-03-23 11:48:51 0 3 | 1000 257-922-1523 2018-03-14 13:31:01 0 4 | 1000 257-922-1523 2018-03-23 10:14:31 0 5 | 1000 257-922-1523 2018-03-23 11:46:23 0 6 | 1000 259-924-1525 2018-03-23 11:46:25 0 7 | -------------------------------------------------------------------------------- /data/train/train_agg.csv: -------------------------------------------------------------------------------- 1 | V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 USRID 2 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 -1.70618 -0.29641 -0.18761 -0.48351 -0.53051 -0.18703 -0.41609 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.42763 -0.30845 0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.55917 -0.77398 -0.25708 -0.27221 -0.51336 -0.32438 14233 3 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 -1.70618 0.37601 -0.1003 -0.16694 -0.31897 0.13488 -0.41609 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.49415 -0.43058 
0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.50058 -0.77398 -0.25708 -0.37907 -0.52723 -0.32438 790 4 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 -1.57176 0.0398 -0.17089 -0.3861 -0.46 -0.36669 -0.17962 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.57339 -0.45064 0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.55127 -0.77398 -0.25708 -0.57113 -0.54758 -0.32438 82308 5 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 -1.30291 -0.29641 -0.18761 -0.45916 -0.53051 -0.11165 -0.35697 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.45451 -0.17717 0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.26154 -0.77398 0.2344 -0.32873 -0.51951 -0.32438 90325 6 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 -1.16849 -0.29641 -0.18761 -0.50786 -0.60103 -0.36994 -0.53433 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.57729 -0.45963 0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.55917 -0.77398 -0.25708 -0.5791 -0.54884 -0.32438 71893 7 | -1.92554 -0.90689 -1.26634 0.2892 -0.68454 -1.16849 -0.29641 -0.18761 -0.50786 -0.60103 -0.36994 -0.53433 -0.43194 -0.13843 -0.27218 -0.188 -0.99764 -0.57729 -0.45963 0.57851 -0.22881 0.15815 -0.16201 0.4762 -0.55917 -0.77398 -0.25708 -0.5791 -0.54884 -0.32438 83816 8 | -------------------------------------------------------------------------------- /data/train/train_flg.csv: -------------------------------------------------------------------------------- 1 | USRID FLAG 2 | 0 0 3 | 35 0 4 | 42 0 5 | 76 0 6 | 77 0 7 | -------------------------------------------------------------------------------- /data/train/train_log.csv: -------------------------------------------------------------------------------- 1 | USRID EVT_LBL OCC_TIM TCH_TYP 2 | 10002 163-577-913 2018-03-22 16:31:44 0 3 | 10002 163-578-914 2018-03-22 16:31:18 0 4 | 10002 259-924-1525 2018-03-22 16:31:15 0 5 | 10002 326-1040-1677 2018-03-06 12:08:51 0 6 | -------------------------------------------------------------------------------- /run_all.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd code 4 | 5 | python chenzhiliang.py 6 | 7 | python gao.py 8 | 9 | python zhao.py 10 | 11 | python fusion.py 12 | 13 | cd .. --------------------------------------------------------------------------------