├── A榜.png ├── B榜.png ├── README.md ├── features_list.xlsx ├── genfeature.py ├── lightGBM_JDD.py ├── lr_xgb_ensamble.py ├── stacking.py ├── useless.py └── 特征设计.numbers /A榜.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klyan/JDD_Loan_Forecasting/a0e03a767173d7a8d4233aa41ea0b94f8d4418ac/A榜.png -------------------------------------------------------------------------------- /B榜.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klyan/JDD_Loan_Forecasting/a0e03a767173d7a8d4233aa41ea0b94f8d4418ac/B榜.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JDD_Loan_Forecasting 2 | 京东借贷需求预测比赛 3 | A榜13名 4 | B榜17名 5 | 主要使用lightgbm的单模型,此题ensemble、stacking效果一般 6 | 比赛经历:http://m.blog.csdn.net/weixin_40275371/article/details/78791557 7 | -------------------------------------------------------------------------------- /features_list.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klyan/JDD_Loan_Forecasting/a0e03a767173d7a8d4233aa41ea0b94f8d4418ac/features_list.xlsx -------------------------------------------------------------------------------- /genfeature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import json 6 | import pandas as pd 7 | import numpy as np 8 | import os 9 | import datetime 10 | import time 11 | import math 12 | import random 13 | import copy 14 | from datetime import datetime, timedelta,date 15 | import sklearn.preprocessing, sklearn.decomposition, sklearn.linear_model, sklearn.pipeline, sklearn.metrics 16 | from sklearn.feature_extraction.text import CountVectorizer 17 | import xgboost as xgb 18 | import 
lightgbm as lgb 19 | import time 20 | import random 21 | from sklearn.metrics import mean_squared_error 22 | from math import sqrt 23 | import matplotlib.pyplot as plt 24 | 25 | 26 | def parseDate(df, col): 27 | df[col] = pd.to_datetime(df[col]) 28 | if col == "click_time" or col == "loan_time": 29 | df["date"] = df[col].apply(lambda x: x.date()) 30 | return df 31 | 32 | 33 | def get_windows_mask(df, time_col, window_size): 34 | valid_end_date = "2016-11-01" 35 | test_end_date = "2016-12-01" 36 | valid_start = pd.Timestamp(valid_end_date) - timedelta(days=window_size) 37 | valid_mask = (df[time_col] >= valid_start) & (df[time_col] < pd.Timestamp(valid_end_date)) 38 | test_start = pd.Timestamp(test_end_date) - timedelta(days=window_size) 39 | test_mask = (df[time_col] >= test_start) & (df[time_col] < pd.Timestamp(test_end_date)) 40 | return valid_mask, test_mask 41 | 42 | def click_percent(df, duser, tuser, window_size): 43 | valid_mask, test_mask = get_windows_mask(df, "click_time", window_size) 44 | for idx, mask in enumerate([valid_mask, test_mask]): 45 | tmp = df[mask] 46 | uid_clicks = tmp.groupby(["uid","pid"]).click_time.count().reset_index() 47 | uid_clicks.columns = ["uid","pid", "clicks"] 48 | pid_avg_clicks = uid_clicks.groupby(["pid"]).clicks.mean().reset_index() 49 | pid_avg_clicks.columns = ["pid","avg_clicks"] 50 | uid_clicks = uid_clicks.merge(pid_avg_clicks,how="left",on="pid") 51 | uid_clicks["clicks_percent"] = uid_clicks["clicks"]/uid_clicks["avg_clicks"] 52 | uid_clicks["pid"] = uid_clicks['pid'].astype(str) + "_pidcliks_" + str(window_size) + "days" 53 | if idx == 0: 54 | uid_clicks = uid_clicks.pivot(index='uid', columns='pid', values='clicks_percent').reset_index().fillna(0) 55 | duser = duser.merge(uid_clicks, how="left", on="uid").fillna(0) 56 | elif idx == 1: 57 | uid_clicks = uid_clicks.pivot(index='uid', columns='pid', values='clicks_percent').reset_index().fillna(0) 58 | tuser = tuser.merge(uid_clicks, how="left", on="uid").fillna(0) 59 
| return duser, tuser 60 | 61 | 62 | def click_days_pids(df, duser, tuser, window_size): 63 | valid_mask, test_mask = get_windows_mask(df, "click_time", window_size) 64 | for idx, mask in enumerate([valid_mask, test_mask]): 65 | tmp = df[mask] 66 | uid_clicks = tmp.groupby(["uid"])["date","pid"].nunique().reset_index() 67 | uid_clicks.columns = ["uid","clickdays_" + str(window_size) + "_days", "pids_" + str(window_size)+"days"] 68 | if idx == 0: 69 | duser = duser.merge(uid_clicks, how="left", on="uid").fillna(0) 70 | elif idx == 1: 71 | tuser = tuser.merge(uid_clicks, how="left", on="uid").fillna(0) 72 | return duser, tuser 73 | 74 | 75 | def getNearestClick(df, duser, tuser, window_size): 76 | offset = 1 77 | df['last_click_time'] = df.groupby(['uid'])[['click_time']].shift(offset) 78 | df["click_interval"] = (df["click_time"] - df['last_click_time']).apply(lambda x: x.total_seconds()).fillna(0) 79 | df['last_click_pid'] = df.groupby(['uid'])['pid'].shift(offset).fillna(0) 80 | uid_mean_click_interval = df.groupby(["uid"])["click_interval"].mean().reset_index() 81 | valid_mask, test_mask = get_windows_mask(df, "click_time", window_size) 82 | for idx, mask in enumerate([valid_mask, test_mask]): 83 | tmp = df[mask] 84 | uid_nearest_click = tmp.groupby("uid")["click_time"].max().reset_index() 85 | uid_click_interval = uid_mean_click_interval.merge(uid_nearest_click,how="left", on="uid") 86 | if idx == 0: 87 | uid_click_interval["click_nearest_interval"] = (pd.Timestamp(valid_end_date) - uid_click_interval["click_time"]).apply(lambda x:x.total_seconds()) 88 | elif idx == 1: 89 | uid_click_interval["click_nearest_interval"] = (pd.Timestamp(test_end_date) - uid_click_interval["click_time"]).apply(lambda x:x.total_seconds()) 90 | uid_click_interval["next_click"] = uid_click_interval["click_interval"] + uid_click_interval["click_nearest_interval"] 91 | uid_click_interval.drop("click_time",axis=1,inplace=True) 92 | uid_click_interval.columns = 
["uid","mean_click_interval", "click_nearest_interval", "nextclicktime"] 93 | if idx == 0: 94 | duser = duser.merge(uid_click_interval,how="left",on="uid") 95 | elif idx == 1: 96 | tuser = tuser.merge(uid_click_interval,how="left",on="uid") 97 | return duser, tuser 98 | 99 | 100 | def uid_order_status(df, duser, tuser, window_size): 101 | valid_mask, test_mask = get_windows_mask(df, "buy_time", window_size) 102 | for idx, mask in enumerate([valid_mask, test_mask]): 103 | tmp = df[mask] 104 | cate_total_sale_amt = tmp.groupby(["uid"])["order_amt"].sum().reset_index() 105 | cate_total_sale_cnt = tmp.groupby(["uid"])["buy_time"].count().reset_index() 106 | uid_order = cate_total_sale_amt.merge(cate_total_sale_cnt, how="left", on =["uid"]) 107 | uid_order["avg_order_amt"] = uid_order["order_amt"]/ uid_order["buy_time"] 108 | uid_order["mean_order_amt_percent"] = uid_order["order_amt"] / np.mean(uid_order["order_amt"]) 109 | uid_order["mean_buy_cnt_percent"] = uid_order["buy_time"] / np.mean(uid_order["buy_time"]) 110 | uid_order["mean_order_amt_percent_mean_buy_cnt_percent"] = uid_order["mean_order_amt_percent"] * uid_order["mean_buy_cnt_percent"] 111 | uid_order.columns = ['uid', 'order_amt'+str(window_size), 'buy_cnt'+str(window_size), "avg_order_amt" + str(window_size) , 'mean_order_amt_percent'+str(window_size), 'mean_buy_cnt_percent'+str(window_size), 'mean_order_amt_percent_mean_buy_cnt_percent'+str(window_size)] 112 | if idx == 0: 113 | duser = duser.merge(uid_order, how="left", on = 'uid') 114 | elif idx == 1: 115 | tuser = tuser.merge(uid_order, how="left", on = 'uid') 116 | return duser, tuser 117 | 118 | 119 | 120 | 121 | def getNearestOrder(df, duser, tuser, window_size): 122 | offset = 1 123 | df['last_buy_time'] = df.groupby(['uid'])[['buy_time']].shift(offset) 124 | df["buy_interval"] = (df["buy_time"] - df['last_buy_time']).apply(lambda x: x.days).fillna(0) 125 | uid_buy_interval = df.groupby(["uid"])["buy_interval"].mean().reset_index() 126 | 
#df.drop("buy_interval", inplace=True, axis=1) 127 | valid_mask, test_mask = get_windows_mask(df, "buy_time", window_size) 128 | for idx, mask in enumerate([valid_mask, test_mask]): 129 | tmp = df[mask] 130 | tmp = tmp.groupby(['uid','buy_time'])["order_amt"].sum().reset_index() #每个人一天内消费了多少金额 131 | maxtime_idx = tmp.groupby(['uid'])['buy_time'].transform(max) == tmp['buy_time'] #用户最近一天消费的情况 132 | uid_nearest_buy = tmp[maxtime_idx] 133 | uid_nearest_buy = uid_buy_interval.merge(uid_nearest_buy, on="uid", how="left") 134 | if idx == 0: 135 | uid_nearest_buy["buy_nearest_interval"] = (pd.Timestamp(valid_end_date) - uid_nearest_buy["buy_time"]).apply(lambda x:x.days) 136 | elif idx == 1: 137 | uid_nearest_buy["buy_nearest_interval"] = (pd.Timestamp(test_end_date) - uid_nearest_buy["buy_time"]).apply(lambda x:x.days) 138 | uid_nearest_buy["next_buytime"] = uid_nearest_buy["buy_interval"] + uid_nearest_buy["buy_nearest_interval"] 139 | uid_nearest_buy["buy_nearest_price_interval"] = uid_nearest_buy["order_amt"] / (uid_nearest_buy["buy_nearest_interval"]+1) 140 | #uid_nearest_buy.drop(["buy_interval"],axis=1,inplace=True) 141 | uid_nearest_buy.columns = ["uid", "mean_buy_interval", "nearest_buytime", "buy_nearest_price", "buy_nearest_interval", "next_buytime", "buy_nearest_price_interval"] 142 | if idx == 0: 143 | duser = duser.merge(uid_nearest_buy, how="left", on="uid") 144 | duser["nearest_buytime_to_active_date"] = (duser["nearest_buytime"] - duser["active_date"]).apply(lambda x: x.days)#最近一次购买距离用户激活的时间 145 | duser.drop("nearest_buytime", inplace=True, axis=1) 146 | elif idx == 1: 147 | tuser = tuser.merge(uid_nearest_buy, how="left", on="uid") 148 | tuser["nearest_buytime_to_active_date"] = (tuser["nearest_buytime"] - tuser["active_date"]).apply(lambda x: x.days)#最近一次购买距离用户激活的时间 149 | tuser.drop("nearest_buytime",inplace=True,axis=1) 150 | return duser, tuser 151 | 152 | def getMaxPriceOrder(df, duser, tuser, window_size): 153 | valid_mask, test_mask = 
get_windows_mask(df, "buy_time", window_size) 154 | for idx, mask in enumerate([valid_mask, test_mask]): 155 | tmp = df[mask] 156 | tmp = tmp.groupby(['uid','buy_time'])["order_amt"].sum().reset_index() #每个人一天内消费了多少金额 157 | tmp["max_order_amt"] = tmp.groupby(['uid'])['order_amt'].transform(max) #用户消费最大的情况 158 | uid_max_buy = tmp[(tmp["max_order_amt"] == tmp["order_amt"])] 159 | max_amt_idx = uid_max_buy.groupby("uid")['buy_time'].transform(max) == uid_max_buy['buy_time'] 160 | uid_max_buy = uid_max_buy[max_amt_idx] 161 | uid_max_buy.drop("max_order_amt",inplace=True,axis=1) 162 | if idx == 0: 163 | uid_max_buy["maxbuy_interval"] = (pd.Timestamp(valid_end_date) - uid_max_buy["buy_time"]).apply(lambda x:x.days) 164 | elif idx == 1: 165 | uid_max_buy["maxbuy_interval"] = (pd.Timestamp(test_end_date) - uid_max_buy["buy_time"]).apply(lambda x:x.days) 166 | uid_max_buy["maxbuy_price_interval"] = uid_max_buy["order_amt"] / (uid_max_buy["maxbuy_interval"]+1) 167 | uid_max_buy.drop(["buy_time"],axis=1,inplace=True) 168 | uid_max_buy.columns = ["uid","maxbuy_price", "maxbuy_interval", "maxbuy_price_interval"] 169 | if idx == 0: 170 | duser = duser.merge(uid_max_buy,how="left",on="uid") 171 | elif idx == 1: 172 | tuser = tuser.merge(uid_max_buy,how="left",on="uid") 173 | return duser, tuser 174 | 175 | 176 | 177 | def gen_fixedtw_features_for_loan(df, duser1, duser2, window_size): 178 | valid_mask, test_mask = get_windows_mask(df, "loan_time", window_size) 179 | for idx, mask in enumerate([valid_mask, test_mask]): 180 | tmp = df[mask] 181 | #贷款金额 loan_amount 182 | stat_loanAmt = tmp.groupby(["uid"])['loan_amount'].agg(['sum','count','mean','max','min']).reset_index() 183 | stat_loanAmt.columns=['uid']+ [i+ '_loanAmt_'+str(window_size) for i in list(stat_loanAmt.columns)[1:]] 184 | #贷款期数 plannum #平均值去掉? 
185 | stat_loanPlanNum=tmp.groupby(["uid"])['plannum'].agg(['mean','max','min']).reset_index() 186 | stat_loanPlanNum.columns=['uid']+ [i+ '_loanPlanNum_'+str(window_size) for i in list(stat_loanPlanNum.columns)[1:]] 187 | #每期贷款额 amt_per_plan 188 | stat_amtPerPlan=tmp.groupby(["uid"])['amt_per_plan'].agg(['sum','mean','max','min']).reset_index() 189 | stat_amtPerPlan.columns = ['uid']+ [i+ '_amtPerPlan_'+str(window_size) for i in list(stat_amtPerPlan.columns)[1:]] 190 | #频率最高的贷款期数和对应的贷款次数 191 | freq_plannum=tmp.groupby('uid').plannum.value_counts().rename('freq_plannum').reset_index() 192 | idx_mostxfreq=list(freq_plannum.groupby('uid').freq_plannum.idxmax()) 193 | most_freq=freq_plannum.loc[idx_mostxfreq] 194 | most_freq.columns=['uid','most_plannum_'+str(window_size),'freq_most_plannum_'+str(window_size)] 195 | ##每期的贷款期数pivot 196 | perPlanAmtCnt= tmp.groupby(["uid","plannum"])["loan_amount"].agg(['count','sum']).reset_index() 197 | perPlanAmtCnt["plannum"] = perPlanAmtCnt['plannum'].astype(str) + "_plannum_" + str(window_size) + "days" 198 | perPlanAmtCnt = perPlanAmtCnt.pivot(index='uid', columns='plannum').reset_index().fillna(0) 199 | new_list = ["uid"] 200 | for words in perPlanAmtCnt.columns.get_values(): 201 | if "uid" in words : 202 | continue 203 | new_list.append('_'.join(words)) 204 | perPlanAmtCnt.columns = new_list 205 | #贷款周期 loan_interval 206 | stat_loanInterval=tmp.groupby(["uid"])['loan_interval'].agg(['mean','median','max','min']).reset_index() 207 | stat_loanInterval.columns=['uid']+ [i+ '_loanInterval_'+str(window_size) for i in list(stat_loanInterval.columns)[1:]] 208 | loan_stat = stat_loanAmt.merge(stat_loanPlanNum, how="left", on="uid").merge(stat_amtPerPlan, how="left", on="uid").merge(stat_loanInterval, how="left", on="uid").merge(most_freq, how="left", on="uid").merge(perPlanAmtCnt, how="left", on="uid") 209 | if idx==0: 210 | duser1=duser1.merge(loan_stat, how="left", on="uid") 211 | duser1[new_list] = duser1[new_list].fillna(0.0) 212 | 
stat_daysLoan=(pd.Timestamp(valid_end_date)-tmp.loan_time).apply(lambda x:x.days+x.seconds/86400.0).groupby(tmp.uid).agg(['mean','max','min']).reset_index() #各次贷款离现在的时间相关的统计 213 | stat_daysLoan.columns = ['uid']+ [i+ '_nearestLoanInterval_'+str(window_size) for i in list(stat_daysLoan.columns)[1:]] 214 | duser1=duser1.merge(stat_daysLoan, how="left", on="uid") 215 | elif idx==1: 216 | duser2=duser2.merge(loan_stat, how="left", on="uid") 217 | duser2[new_list] = duser2[new_list].fillna(0.0) 218 | stat_daysLoan=(pd.Timestamp(test_end_date)-tmp.loan_time).apply(lambda x:x.days+x.seconds/86400.0).groupby(tmp.uid).agg(['mean','max','min']).reset_index() 219 | stat_daysLoan.columns = ['uid']+ [i+ '_nearestLoanInterval_'+str(window_size) for i in list(stat_daysLoan.columns)[1:]] 220 | duser2=duser2.merge(stat_daysLoan, how="left", on="uid") 221 | return duser1, duser2 222 | 223 | 224 | def getNearestLoan(df, duser1, duser2): 225 | valid_mask = df.month.isin([8, 9, 10]) 226 | test_mask = df.month.isin([8, 9, 10, 11]) 227 | for idx, mask in enumerate([valid_mask, test_mask]): 228 | tmp = df[mask] 229 | maxtime_idx = tmp.groupby(['uid'])['loan_time'].transform(max) == tmp['loan_time'] #用户最近一次贷款的情况 230 | uid_nearest_loan = tmp[maxtime_idx].reset_index(drop=True) 231 | if idx==0: 232 | uid_nearest_loan['nearest_loantime']=(pd.Timestamp(valid_end_date)-uid_nearest_loan.loan_time).apply(lambda x:x.days+x.seconds/86400.0) 233 | uid_nearest_loan['nearest_loan_amt_time'] = uid_nearest_loan['loan_amount'] / (1 + uid_nearest_loan['nearest_loantime']) 234 | uid_nearest_loan = uid_nearest_loan[["uid", "plannum", "amt_per_plan", "loan_amount","nearest_loan_amt_time", "nearest_loantime"]] 235 | uid_nearest_loan.columns = ["uid","nearest_plannum", "nearest_amt_per_plan", "nearest_loan_amount","nearest_loan_amt_time","nearest_loantime"] 236 | duser1 = duser1.merge(uid_nearest_loan, how="left", on="uid") 237 | elif idx==1: 238 | 
uid_nearest_loan['nearest_loantime']=(pd.Timestamp(test_end_date)-uid_nearest_loan.loan_time).apply(lambda x:x.days+x.seconds/86400.0) 239 | uid_nearest_loan['nearest_loan_amt_time'] = uid_nearest_loan['loan_amount'] / (1 + uid_nearest_loan['nearest_loantime']) 240 | uid_nearest_loan = uid_nearest_loan[["uid", "plannum", "amt_per_plan", "loan_amount","nearest_loan_amt_time","nearest_loantime"]] 241 | uid_nearest_loan.columns = ["uid","nearest_plannum", "nearest_amt_per_plan", "nearest_loan_amount","nearest_loan_amt_time","nearest_loantime"] 242 | duser2 = duser2.merge(uid_nearest_loan, how="left", on="uid") 243 | return duser1, duser2 244 | 245 | 246 | def current2PayAmt(df, duser1, duser2, window_size): 247 | valid_mask, test_mask = get_windows_mask(df, "loan_time", window_size) 248 | for idx, mask in enumerate([valid_mask, test_mask]): 249 | if idx == 0: 250 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(valid_end_date) 251 | tmp = df[mask & pay_date_mask] 252 | current_pay_amt = tmp.groupby("uid")["amt_per_plan"].agg(["sum"]).reset_index() 253 | current_pay_amt.columns = ["uid", "current_topay_amt"] 254 | duser1 = duser1.merge(current_pay_amt, on="uid", how="left") 255 | duser1["current_topay_amt"] = duser1["current_topay_amt"].fillna(0.0) 256 | elif idx == 1: 257 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(test_end_date) 258 | tmp = df[mask & pay_date_mask] 259 | current_pay_amt = tmp.groupby("uid")["amt_per_plan"].agg(["sum"]).reset_index() 260 | current_pay_amt.columns = ["uid", "current_topay_amt"] 261 | duser2 = duser2.merge(current_pay_amt, on="uid", how="left") 262 | duser2["current_topay_amt"] = duser2["current_topay_amt"].fillna(0.0) 263 | return duser1, duser2 264 | 265 | 266 | def currentDebtAmt(df, duser1, duser2, window_size): 267 | valid_mask, test_mask = get_windows_mask(df, "loan_time", window_size) 268 | for idx, mask in enumerate([valid_mask, test_mask]): 269 | if idx == 0: 270 | pay_date_mask = df["pay_end_date"] > 
pd.Timestamp(valid_end_date) 271 | tmp = df[mask & pay_date_mask].reset_index(drop=True) 272 | tmp["payed_num"] = (pd.Timestamp(valid_end_date) - tmp["loan_time"]).apply(lambda x: x.days/30.0) 273 | tmp["debtAmt"] = tmp["loan_amount"] - tmp["amt_per_plan"] * tmp["payed_num"] 274 | current_debtAmt = tmp.groupby("uid")["debtAmt"].sum().rename("current_debtAmt_" + str(window_size)).reset_index() 275 | duser1 = duser1.merge(current_debtAmt, on="uid", how="left") 276 | duser1["current_debtAmt_" + str(window_size)] = duser1["current_debtAmt_" + str(window_size)].fillna(0.0) 277 | duser1["remainingAmt_" + str(window_size)] = duser1["limit"]- duser1["current_debtAmt_" + str(window_size)] 278 | elif idx == 1: 279 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(valid_end_date) 280 | tmp = df[mask & pay_date_mask].reset_index(drop=True) 281 | tmp["payed_num"] = (pd.Timestamp(test_end_date) - tmp["loan_time"]).apply(lambda x: x.days/30.0) 282 | tmp["debtAmt"] = tmp["loan_amount"] - tmp["amt_per_plan"] * tmp["payed_num"] 283 | current_debtAmt = tmp.groupby("uid")["debtAmt"].sum().rename("current_debtAmt_" + str(window_size)).reset_index() 284 | duser2 = duser2.merge(current_debtAmt, on="uid", how="left") 285 | duser2["current_debtAmt_" + str(window_size)] = duser2["current_debtAmt_" + str(window_size)].fillna(0.0) 286 | duser2["remainingAmt_" + str(window_size)] = duser2["limit"]- duser2["current_debtAmt_" + str(window_size)] 287 | return duser1, duser2 288 | 289 | ##每人购买力,预测贷款金额 290 | def avgLoanAmt4orderAmt(df_loan, df_order, duser1, duser2): 291 | df_order_tmp = df_order[df_order["month"] < 11] 292 | df_loan_tmp = df_loan[df_loan["month"] < 11] 293 | month_orderAmt = df_order_tmp.groupby(["uid","month"])["order_amt"].sum().rename("uid_month_orderAmt").reset_index().groupby("uid")["uid_month_orderAmt"].mean().rename("uid_avg_month_orderAmt").reset_index() 294 | month_loanAmt = 
df_loan_tmp.groupby(["uid","month"])["loan_amount"].sum().rename("uid_month_loanAmt").reset_index().groupby("uid")["uid_month_loanAmt"].mean().rename("uid_avg_month_loanAmt").reset_index() 295 | month_order_loan = month_orderAmt.merge(month_loanAmt, on = "uid", how="left").fillna(0.0) 296 | month_order_loan["loanAmt_ratio"] = month_order_loan["uid_avg_month_loanAmt"]/ month_order_loan["uid_avg_month_orderAmt"] 297 | duser1 = duser1.merge(month_order_loan, on = 'uid', how="left") 298 | duser2 = duser2.merge(month_order_loan, on = 'uid', how="left") 299 | duser1["pred_loanAmt"] = duser1["loanAmt_ratio"] * duser1['order_amt30'] 300 | duser2["pred_loanAmt"] = duser2["loanAmt_ratio"] * duser2['order_amt30'] 301 | duser1.drop(["uid_avg_month_loanAmt","uid_avg_month_orderAmt"], axis=1, inplace=True) 302 | duser2.drop(["uid_avg_month_loanAmt","uid_avg_month_orderAmt"], axis=1, inplace=True) 303 | return duser1, duser2 304 | 305 | 306 | def getLoanAmtRemainingLimt(df_loan, duser1, duser2, before_month): 307 | valid_start = pd.Timestamp(valid_end_date) - timedelta(days=before_month*31) 308 | valid_end = pd.Timestamp(valid_end_date) - timedelta(days=(before_month-1)*31) 309 | test_start = pd.Timestamp(test_end_date) - timedelta(days=before_month*31) 310 | test_end = pd.Timestamp(test_end_date) - timedelta(days=(before_month-1)*31) 311 | valid_mask = (df_loan["loan_time"] >= valid_start) & (df_loan["loan_time"] < valid_end) 312 | test_mask = (df_loan["loan_time"] >= test_start) & (df_loan["loan_time"] < test_end) 313 | for idx, mask in enumerate([valid_mask, test_mask]): 314 | uid_month_loanamt = df_loan[mask].groupby("uid")["loan_amount"].sum().rename("month_sum_loanamt"+str(before_month)).reset_index() #分子 315 | if idx == 0: 316 | debt_mask = (df_loan["loan_time"] < valid_start) & (df_loan["pay_end_date"] > valid_start) 317 | tmp = df_loan[debt_mask].reset_index(drop=True) 318 | tmp["topay_num"] = (tmp["pay_end_date"] - valid_start).apply(lambda x: x.days/30.0) 319 | 
tmp["debtAmt"] = tmp["amt_per_plan"] * tmp["topay_num"] 320 | current_debtAmt = tmp.groupby("uid")["debtAmt"].sum().rename("debtAmt_monthBefore"+str(before_month)).reset_index() 321 | current_debtAmt = current_debtAmt.merge(uid_month_loanamt,on="uid",how="left").fillna(0.0) 322 | duser1 = duser1.merge(current_debtAmt, on="uid", how="left") #月初的负债额,当月借贷额度 323 | duser1["debtAmt_monthBefore"+str(before_month)] = duser1["debtAmt_monthBefore"+str(before_month)].fillna(0.0) 324 | duser1["remainingAmt_monthBefore" + str(before_month)] = duser1["limit"]- duser1["debtAmt_monthBefore" + str(before_month)] 325 | duser1["loansum_remainingAmt_ratio_monthBefore" + str(before_month)] = duser1["month_sum_loanamt"+str(before_month)]/(1+duser1["remainingAmt_monthBefore" + str(before_month)]) 326 | elif idx == 1: 327 | tmp = df_loan[(df_loan["loan_time"] < test_start) & (df_loan["pay_end_date"] > test_start)].reset_index(drop=True) 328 | tmp["topay_num"] = (tmp["pay_end_date"] - test_start).apply(lambda x: x.days/30.0) 329 | tmp["debtAmt"] = tmp["amt_per_plan"] * tmp["topay_num"] 330 | current_debtAmt = tmp.groupby("uid")["debtAmt"].sum().rename("debtAmt_monthBefore"+str(before_month)).reset_index() 331 | current_debtAmt = current_debtAmt.merge(uid_month_loanamt, on="uid",how="left").fillna(0.0) 332 | duser2 = duser2.merge(current_debtAmt, on="uid", how="left") #月初的负债额,当月借贷额度 333 | duser2["debtAmt_monthBefore"+str(before_month)] = duser2["debtAmt_monthBefore"+str(before_month)].fillna(0.0) 334 | duser2["remainingAmt_monthBefore" + str(before_month)] = duser2["limit"]- duser2["debtAmt_monthBefore" + str(before_month)] 335 | duser2["loansum_remainingAmt_ratio_monthBefore" + str(before_month)] = duser2["month_sum_loanamt"+str(before_month)]/(1+duser2["remainingAmt_monthBefore" + str(before_month)]) 336 | return duser1, duser2 337 | 338 | 339 | def getAvailableLoanAmtLimt(df_loan, duser1, duser2, before_month): 340 | if before_month == 1: 341 | valid_start = pd.Timestamp("2016-10-01") 
342 | valid_end = pd.Timestamp("2016-11-01") 343 | test_start = pd.Timestamp("2016-11-01") 344 | test_end = pd.Timestamp("2016-12-01") 345 | elif before_month == 2: 346 | valid_start = pd.Timestamp("2016-09-01") 347 | valid_end = pd.Timestamp("2016-10-01") 348 | test_start = pd.Timestamp("2016-10-01") 349 | test_end = pd.Timestamp("2016-11-01") 350 | valid_mask = (df_loan["loan_time"] >= valid_start) & (df_loan["loan_time"] < valid_end) 351 | test_mask = (df_loan["loan_time"] >= test_start) & (df_loan["loan_time"] < test_end) 352 | for idx, mask in enumerate([valid_mask, test_mask]): 353 | if idx == 0: 354 | debt_mask = (df_loan["loan_time"] < valid_end) & (df_loan["pay_end_date"] > valid_end) #仍然再还的贷款 355 | tmp = df_loan[debt_mask].reset_index(drop=True) 356 | tmp["debtRatio"] = (tmp["pay_end_date"] - valid_end).apply(lambda x: 1 if x.days/30.0 >= 1 else x.days/30.0) 357 | tmp["unAvailableAmt"] = tmp["loan_amount"] * tmp["debtRatio"] 358 | unAvailableAmt = tmp.groupby("uid")["unAvailableAmt"].sum().rename("unAvailableAmt_monthBefore"+str(before_month)).reset_index() 359 | duser1 = duser1.merge(unAvailableAmt, on="uid", how="left") #当月不可用额度 360 | duser1["unAvailableAmt_monthBefore"+str(before_month)] = duser1["unAvailableAmt_monthBefore"+str(before_month)].fillna(0.0) 361 | duser1["availableAmt_monthBefore" + str(before_month)] = duser1["limit"]- duser1["unAvailableAmt_monthBefore"+str(before_month)] 362 | elif idx == 1: 363 | tmp = df_loan[(df_loan["loan_time"] < test_end) & (df_loan["pay_end_date"] > test_end)].reset_index(drop=True) 364 | tmp["debtRatio"] = (tmp["pay_end_date"] - test_end).apply(lambda x: 1 if x.days/30.0 >= 1 else x.days/30.0) 365 | tmp["unAvailableAmt"] = tmp["loan_amount"] * tmp["debtRatio"] 366 | unAvailableAmt = tmp.groupby("uid")["unAvailableAmt"].sum().rename("unAvailableAmt_monthBefore"+str(before_month)).reset_index() 367 | duser2 = duser2.merge(unAvailableAmt, on="uid", how="left") #当月不可用额度 368 | 
duser2["unAvailableAmt_monthBefore"+str(before_month)] = duser2["unAvailableAmt_monthBefore"+str(before_month)].fillna(0.0) 369 | duser2["availableAmt_monthBefore" + str(before_month)] = duser2["limit"]- duser2["unAvailableAmt_monthBefore"+str(before_month)] 370 | return duser1, duser2 371 | 372 | def lastMonthPayedAmt(df, duser1, duser2): 373 | valid_start = pd.Timestamp(valid_end_date) - timedelta(days=30) 374 | valid_mask = (df["pay_end_date"] >= valid_start) & (df["loan_time"] < valid_start) 375 | test_start = pd.Timestamp(test_end_date) - timedelta(days=30) 376 | test_mask = (df["pay_end_date"] >= test_start) & (df["loan_time"] < test_start) 377 | for idx, mask in enumerate([valid_mask, test_mask]): 378 | tmp = df[mask].reset_index(drop=True) 379 | if idx==0: 380 | tmp["last_month_topay_num"] = tmp["pay_end_date"].apply(lambda x: 1 if (x- valid_start).days/30.0 >=1 else (x- valid_start).days/30.0) 381 | tmp["last_month_payed_amt"] = tmp["amt_per_plan"] * tmp["last_month_topay_num"] 382 | lastMonthPayedAmt= tmp.groupby("uid")["last_month_payed_amt"].sum().rename("lastMonthPayedAmt").reset_index() 383 | duser1 = duser1.merge(lastMonthPayedAmt, on="uid", how="left") 384 | duser1["lastMonthPayedAmt"] = duser1["lastMonthPayedAmt"].fillna(0.0) 385 | elif idx==1: 386 | tmp["last_month_topay_num"] = tmp["pay_end_date"].apply(lambda x: 1 if (x- test_start).days/30.0 >=1 else (x- test_start).days/30.0) 387 | tmp["last_month_payed_amt"] = tmp["amt_per_plan"] * tmp["last_month_topay_num"] 388 | lastMonthPayedAmt= tmp.groupby("uid")["last_month_payed_amt"].sum().rename("lastMonthPayedAmt").reset_index() 389 | duser2 = duser2.merge(lastMonthPayedAmt, on="uid", how="left") 390 | duser2["lastMonthPayedAmt"] = duser2["lastMonthPayedAmt"].fillna(0.0) 391 | return duser1, duser2 392 | 393 | def getPast3MonthLoanFeatures(df, duser1, duser2): 394 | valid_mask = df.month.isin([8, 9, 10]) 395 | test_mask = df.month.isin([9, 10, 11]) 396 | window_size = 92 397 | for idx, mask in 
enumerate([valid_mask, test_mask]): 398 | tmp = df[mask] 399 | #平均每月贷款金额 400 | month_loanAmt = tmp.groupby(["uid","month"])['loan_amount'].sum().rename("monthLoanAmt").reset_index().groupby(["uid"])["monthLoanAmt"].mean().rename("monthAvgLoanAmt").reset_index() 401 | #month_loanAmt = tmp.groupby(["uid","month"])['loan_amount'].sum().rename("monthLoanAmt").reset_index().groupby(["uid"])["monthLoanAmt"].agg(["count","mean","max","median","min","std"]).reset_index() 402 | #month_loanAmt.columns = ['uid']+ [i+ '_monthLoanAmt_'+str(window_size) for i in list(month_loanAmt.columns)[1:]] 403 | #贷款金额 loan_amount 404 | stat_loanAmt = tmp.groupby(["uid"])['loan_amount'].agg(['sum','count','mean','max','min']).reset_index() 405 | stat_loanAmt.columns=['uid']+ [i+ '_loanAmt_'+str(window_size) for i in list(stat_loanAmt.columns)[1:]] 406 | #贷款期数 plannum 407 | stat_loanPlanNum=tmp.groupby(["uid"])['plannum'].agg(['sum','mean','max','min']).reset_index() 408 | stat_loanPlanNum.columns=['uid']+ [i+ '_loanPlanNum_'+str(window_size) for i in list(stat_loanPlanNum.columns)[1:]] 409 | #每期贷款额 amt_per_plan 410 | stat_amtPerPlan=tmp.groupby(["uid"])['amt_per_plan'].agg(['sum','mean','max','min']).reset_index() 411 | stat_amtPerPlan.columns = ['uid']+ [i+ '_amtPerPlan_'+str(window_size) for i in list(stat_amtPerPlan.columns)[1:]] 412 | #贷款周期 loan_interval 413 | stat_loanInterval=tmp.groupby(["uid"])['loan_interval'].agg(['mean','median','max','min']).reset_index() 414 | stat_loanInterval.columns=['uid']+ [i+ '_loanInterval_'+str(window_size) for i in list(stat_loanInterval.columns)[1:]] 415 | loan3Month = month_loanAmt.merge(stat_loanAmt, on="uid", how="left").merge(stat_loanPlanNum, on="uid", how="left").merge(stat_amtPerPlan, on="uid", how="left").merge(stat_loanInterval, on="uid", how="left").fillna(0.0) 416 | if idx == 0: 417 | duser1 = duser1.merge(loan3Month, how="left", on="uid") 418 | elif idx == 1: 419 | duser2 = duser2.merge(loan3Month, how="left", on="uid") 420 | return duser1, 
duser2 421 | 422 | 423 | 424 | def loanTimeBetweenActivetime(df,duser1, duser2): 425 | valid_mask = df.month.isin([8, 9, 10]) 426 | test_mask = df.month.isin([8, 9, 10, 11]) 427 | for idx, mask in enumerate([valid_mask, test_mask]): 428 | tmp = df[mask] 429 | uid_nearest_loan = tmp[tmp.groupby(['uid'])['loan_time'].transform(max) == tmp['loan_time']][["uid","loan_time","loan_amount"]] #用户最近一次借贷的情况 430 | uid_first_loan = tmp[tmp.groupby(['uid'])['loan_time'].transform(min) == tmp['loan_time']][["uid","loan_time","loan_amount"]] #用户第一天借贷的情况 431 | uid_nearest_loan.columns = ["uid", "nearest_loan_time", "nearest_loan_amt"] 432 | uid_first_loan.columns = ["uid", "first_loan_time", "first_loan_amt"] 433 | uid_loan = uid_nearest_loan.merge(uid_first_loan, on="uid", how="left") 434 | if idx == 0: 435 | duser1 = duser1.merge(uid_loan, on="uid", how="left") 436 | duser1["first_loantime_active_days"] = (duser1["active_date"] - duser1["first_loan_time"]).apply(lambda x: x.days) #第一次借贷距离用户激活的时间 437 | duser1["first_loan_amount_limit"] = duser1["first_loan_amt"]/duser1["limit"] 438 | duser1["nearest_loantime_active_days"] = (duser1["active_date"] - duser1["nearest_loan_time"]).apply(lambda x: x.days) #最近一次借贷距离用户激活的时间 439 | duser1["nearest_loan_amount_limit"] = duser1["nearest_loan_amt"]/duser1["limit"] 440 | duser1.drop(["nearest_loan_time", "first_loan_time"], axis=1, inplace=True) 441 | elif idx == 1: 442 | duser2 = duser2.merge(uid_loan, on="uid", how="left") 443 | duser2["first_loantime_active_days"] = (duser2["active_date"] - duser2["first_loan_time"]).apply(lambda x: x.days) #第一次借贷距离用户激活的时间 444 | duser2["first_loan_amount_limit"] = duser2["first_loan_amt"]/duser2["limit"] 445 | duser2["nearest_loantime_active_days"] = (duser2["active_date"] - duser2["nearest_loan_time"]).apply(lambda x: x.days) #最近一次借贷距离用户激活的时间 446 | duser2["nearest_loan_amount_limit"] = duser2["nearest_loan_amt"]/duser2["limit"] 447 | duser2.drop(["nearest_loan_time", "first_loan_time"], axis=1, 
inplace=True) 448 | return duser1, duser2 449 | 450 | 451 | def getOrderClickRatio(df_click, df_order, duser1, duser2): 452 | click_valid_mask = df_click.month.isin([8,9,10]) 453 | click_test_mask = df_click.month.isin([9,10,11]) 454 | order_valid_mask = df_order.month.isin([8,9,10]) 455 | order_test_mask = df_order.month.isin([9,10,11]) 456 | uid_valid_clicks = df_click[click_valid_mask].groupby("uid")["click_time"].count().rename("total_clicks_3month").reset_index() 457 | uid_test_clicks = df_click[click_test_mask].groupby("uid")["click_time"].count().rename("total_clicks_3month").reset_index() 458 | uid_valid_orders = df_order[order_valid_mask].groupby("uid")["buy_time"].count().rename("total_order_3month").reset_index() 459 | uid_test_orders = df_order[order_test_mask].groupby("uid")["buy_time"].count().rename("total_order_3month").reset_index() 460 | uid_valid_click_order = uid_valid_clicks.merge(uid_valid_orders, on="uid", how="left") 461 | uid_test_click_order = uid_test_clicks.merge(uid_test_orders, on="uid", how="left") 462 | uid_valid_click_order["click_order_ratio"] = uid_valid_click_order["total_clicks_3month"]/ (uid_valid_click_order["total_order_3month"] + 1) 463 | uid_test_click_order["click_order_ratio"] = uid_test_click_order["total_clicks_3month"]/ (uid_test_click_order["total_order_3month"] + 1) 464 | duser1 = duser1.merge(uid_valid_click_order, on="uid", how="left") 465 | duser2 = duser2.merge(uid_valid_click_order, on="uid", how="left") 466 | return duser1, duser2 467 | 468 | 469 | def getNearest2LoanInterval(df_loan, duser1, duser2): 470 | valid_mask = df_loan.month.isin([8, 9, 10]) 471 | test_mask = df_loan.month.isin([8, 9, 10, 11]) 472 | for idx, mask in enumerate([valid_mask, test_mask]): 473 | tmp = df_loan[mask] 474 | nearestLastLoanInterval = tmp[tmp.groupby(['uid'])['loan_time'].transform(max) == tmp['loan_time']][["uid","loan_interval"]] #用户最近一次借贷的前一次借贷间隔 475 | nearestLastLoanInterval.columns = ["uid","nearestLastLoanInterval"] 476 | 
if idx == 0: 477 | duser1 = duser1.merge(nearestLastLoanInterval, on ="uid", how="left") 478 | elif idx == 1: 479 | duser2 = duser2.merge(nearestLastLoanInterval, on ="uid", how="left") 480 | return duser1, duser2 481 | 482 | ##用户折扣率,提升很小 483 | def userDiscountRatio(df_order, duser1, duser2): 484 | valid_mask = df_order.month.isin([8,9,10]) 485 | test_mask = df_order.month.isin([9,10,11]) 486 | for idx, mask in enumerate([valid_mask, test_mask]): 487 | tmp = df_order[mask].reset_index(drop=True) 488 | orderAmtDiscount = tmp.groupby("uid")["order_amt","discount"].sum().reset_index() 489 | orderAmtDiscount.columns = ["uid", "total_order_amt", "total_discount_amt"] 490 | orderAmtDiscount["discount_ratio"] = 1 - orderAmtDiscount["total_discount_amt"] /orderAmtDiscount["total_order_amt"] 491 | if idx == 0: 492 | duser1 = duser1.merge(orderAmtDiscount, on="uid", how="left") 493 | duser1["discount_ratio"] = duser1["discount_ratio"].fillna(1.0) 494 | elif idx ==1: 495 | duser2 = duser2.merge(orderAmtDiscount, on="uid", how="left") 496 | duser2["discount_ratio"] = duser2["discount_ratio"].fillna(1.0) 497 | return duser1, duser2 498 | 499 | 500 | def getMonthLoanShiftDiff(df_loan, duser1, duser2): 501 | valid_mask = df_loan.month.isin([8, 9, 10]) 502 | test_mask = df_loan.month.isin([8, 9, 10, 11]) 503 | for idx, mask in enumerate([valid_mask, test_mask]): 504 | uidMonthLoan = df_loan[mask].groupby(["uid","month"])["loan_amount"].sum().rename("month_loan_amt").reset_index() 505 | uidMonthLoan = uidMonthLoan.pivot(index='uid', columns='month', values='month_loan_amt').fillna(0) 506 | uidMonthLoan = uidMonthLoan.stack().reset_index() 507 | uidMonthLoan.columns =["uid","month","month_loan_amt"] 508 | uidMonthLoan = uidMonthLoan.groupby(["uid"]).apply(lambda x: x.sort_values(["month"], ascending=True)).reset_index(drop=True) 509 | uidMonthLoan["monthLoanAmtDiff"] = uidMonthLoan.groupby("uid")["month_loan_amt"].apply(lambda x: x - x.shift(1)) 510 | 
        # Second difference: how the month-over-month change itself changed.
        uidMonthLoan["monthLoanAmtDiff2"] = uidMonthLoan.groupby("uid")["monthLoanAmtDiff"].apply(lambda x: x - x.shift(1))
        if idx == 0:
            # Valid window: the feature is the diff observed at month 10.
            duser1 = duser1.merge(uidMonthLoan[uidMonthLoan.month == 10][["uid", "monthLoanAmtDiff","monthLoanAmtDiff2"]], on ="uid", how="left")
        elif idx == 1:
            # Test window: the feature is the diff observed at month 11.
            duser2 = duser2.merge(uidMonthLoan[uidMonthLoan.month == 11][["uid", "monthLoanAmtDiff","monthLoanAmtDiff2"]], on ="uid", how="left")
    return duser1, duser2


def getLoanShiftDiff(df_loan, duser1, duser2):
    # First/second difference between consecutive loan amounts per user,
    # reported at the user's most recent loan record.
    # NOTE(review): the shift assumes df_loan rows are already ordered by
    # loan_time within each uid — confirm the upstream sort.
    valid_mask = df_loan.month.isin([8, 9, 10])
    test_mask = df_loan.month.isin([8, 9, 10, 11])
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df_loan[mask].reset_index(drop=True)
        tmp["loanAmtDiff"] = tmp.groupby("uid")["loan_amount"].apply(lambda x: x - x.shift(1))
        tmp["loanAmtDiff2"] = tmp.groupby("uid")["loanAmtDiff"].apply(lambda x: x - x.shift(1))
        maxtime_idx = tmp.groupby(['uid'])['loan_time'].transform(max) == tmp['loan_time'] # rows of each user's most recent loan
        tmp = tmp[maxtime_idx]
        if idx == 0:
            duser1 = duser1.merge(tmp[["uid", "loanAmtDiff","loanAmtDiff2"]], on ="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(tmp[["uid", "loanAmtDiff","loanAmtDiff2"]], on ="uid", how="left")
    return duser1, duser2

def getMonthOrderShiftDiff(df_order, duser1, duser2):
    # Month-over-month first/second differences of each user's monthly order amount.
    valid_mask = df_order.month.isin([8, 9, 10])
    test_mask = df_order.month.isin([8, 9, 10, 11])
    for idx, mask in enumerate([valid_mask, test_mask]):
        uidMonthLoan = df_order[mask].groupby(["uid","month"])["order_amt"].sum().rename("month_order_amt").reset_index()
        # pivot + stack back-fills months with no orders as 0.
        uidMonthLoan = uidMonthLoan.pivot(index='uid', columns='month', values='month_order_amt').fillna(0)
        uidMonthLoan = uidMonthLoan.stack().reset_index()
        uidMonthLoan.columns =["uid","month","month_order_amt"]
        uidMonthLoan = uidMonthLoan.groupby(["uid"]).apply(lambda x: x.sort_values(["month"], ascending=True)).reset_index(drop=True)
        uidMonthLoan["monthOrderAmtDiff"] = uidMonthLoan.groupby("uid")["month_order_amt"].apply(lambda x: x - x.shift(1))
        uidMonthLoan["monthOrderAmtDiff2"] = uidMonthLoan.groupby("uid")["monthOrderAmtDiff"].apply(lambda x: x - x.shift(1))
        if idx == 0:
            duser1 = duser1.merge(uidMonthLoan[uidMonthLoan.month == 10][["uid", "monthOrderAmtDiff","monthOrderAmtDiff2"]], on ="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(uidMonthLoan[uidMonthLoan.month == 11][["uid", "monthOrderAmtDiff","monthOrderAmtDiff2"]], on ="uid", how="left")
    return duser1, duser2

def getLoanCntShiftDiff(df_loan, duser1, duser2):
    # Month-over-month first/second differences of each user's monthly loan COUNT
    # (same pivot/stack zero-fill scheme as the amount variants above).
    valid_mask = df_loan.month.isin([8, 9, 10])
    test_mask = df_loan.month.isin([8, 9, 10, 11])
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df_loan[mask].reset_index(drop=True)
        uidMonthLoan = tmp.groupby(["uid","month"])["loan_time"].count().rename("month_loan_cnt").reset_index()
        uidMonthLoan = uidMonthLoan.pivot(index='uid', columns='month', values='month_loan_cnt').fillna(0)
        uidMonthLoan = uidMonthLoan.stack().reset_index()
        uidMonthLoan.columns =["uid","month","month_loan_cnt"]
        uidMonthLoan = uidMonthLoan.groupby(["uid"]).apply(lambda x: x.sort_values(["month"], ascending=True)).reset_index(drop=True)
        uidMonthLoan["loanCntDiff"] = uidMonthLoan.groupby(["uid"])["month_loan_cnt"].apply(lambda x: x - x.shift(1))
        uidMonthLoan["loanCntDiff2"] = uidMonthLoan.groupby(["uid"])["loanCntDiff"].apply(lambda x: x - x.shift(1))
        if idx == 0:
            duser1 = duser1.merge(uidMonthLoan.loc[uidMonthLoan.month==10, ["uid", "loanCntDiff","loanCntDiff2"]], on ="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(uidMonthLoan.loc[uidMonthLoan.month==11, ["uid", "loanCntDiff","loanCntDiff2"]], on ="uid", how="left")
    return duser1, duser2

#math.ceil(34600/1000.0) * 1000
def
updateLimit(duser1, duser2): 570 | loan_validmask = t_loan.month < 11 571 | loan_testmask = t_loan.month < 12 572 | for idx, mask in enumerate([loan_validmask, loan_testmask]): 573 | uid_day_loan = t_loan[mask].groupby(["uid","date"])["loan_amount"].sum().rename("new_limit").reset_index() 574 | uid_newlimit = uid_day_loan.groupby("uid")["new_limit"].max().rename("new_limit").reset_index() 575 | uid_newlimit = uid_newlimit.merge(t_user[["uid","limit"]], on="uid", how="left") 576 | updateLimit = uid_newlimit[uid_newlimit["limit"] < uid_newlimit["new_limit"]][["uid","new_limit"]] 577 | if idx ==0: 578 | duser1 = duser1.merge(updateLimit, on="uid", how="left") 579 | duser1["limit"] = duser1.apply(lambda x: x["new_limit"] if x["limit"] < x["new_limit"] else x["limit"], axis=1) 580 | duser1.drop("new_limit",axis=1,inplace=True) 581 | elif idx == 1: 582 | duser2 = duser2.merge(updateLimit, on="uid", how="left") 583 | duser2["limit"] = duser2.apply(lambda x: x["new_limit"] if x["limit"] < x["new_limit"] else x["limit"], axis=1) 584 | duser2.drop("new_limit",axis=1,inplace=True) 585 | return duser1, duser2 586 | 587 | 588 | 589 | def updateLimit2(duser1, duser2): 590 | loan_validmask = (t_loan.month < 11) 591 | loan_testmask = (t_loan.month < 12) 592 | for idx, mask in enumerate([loan_validmask, loan_testmask]): 593 | uid_newlimit = t_loan[mask & (t_loan.plannum>6)].groupby(["uid","month"])["loan_amount"].sum().rename("loan12amt").reset_index().groupby("uid")["loan12amt"].max().rename("new_limit").reset_index() 594 | uid_day_loan = t_loan[mask].groupby(["uid","date"])["loan_amount"].sum().rename("new_limit").reset_index().groupby("uid")["new_limit"].max().rename("new_limit").reset_index() 595 | uid_newlimit_all = pd.concat([uid_newlimit,uid_day_loan]).groupby("uid")["new_limit"].max().rename("new_limit").reset_index() 596 | uid_newlimit = uid_newlimit_all.merge(t_user[["uid","limit"]], on="uid", how="left") 597 | updateLimit = uid_newlimit[uid_newlimit["limit"] < 
uid_newlimit["new_limit"]][["uid","new_limit"]] 598 | if idx ==0: 599 | duser1 = duser1.merge(updateLimit, on="uid", how="left") 600 | duser1["limit"] = duser1.apply(lambda x: x["new_limit"] if x["limit"] < x["new_limit"] else x["limit"], axis=1) 601 | duser1.drop("new_limit",axis=1,inplace=True) 602 | elif idx == 1: 603 | duser2 = duser2.merge(updateLimit, on="uid", how="left") 604 | duser2["limit"] = duser2.apply(lambda x: x["new_limit"] if x["limit"] < x["new_limit"] else x["limit"], axis=1) 605 | duser2.drop("new_limit",axis=1,inplace=True) 606 | return duser1, duser2 607 | 608 | 609 | 610 | 611 | def getSexAgeLimt(dt_user, duser1): 612 | ori_limit = dt_user[["uid","limit"]] 613 | ori_limit.columns = ["uid", "ori_limit"] 614 | duser1 = duser1.merge(ori_limit, on="uid", how="left") 615 | duser1["limit_increase"] = duser1["limit"]/duser1["ori_limit"] 616 | duser1["sex_age_limit"] = duser1["sex"].astype(str) + duser1["age"].astype(str) + duser1["ori_limit"].astype(str) 617 | duser1["sex_age_limit"] = duser1["sex_age_limit"].astype('category') 618 | duser1['sex_age_limit'].cat.categories= np.arange(1,duser1["sex_age_limit"].nunique()+1) 619 | duser1["sex_age_limit"] = duser1["sex_age_limit"].astype(int) 620 | return duser1 621 | 622 | def getAmtBeforeRatio(df, column ,duser1, duser2): 623 | valid_mask = df.month.isin([10,9,8]) 624 | test_mask = df.month.isin([11,10,9,8]) 625 | for idx, mask in enumerate([valid_mask, test_mask]): 626 | uid_months = df[mask].groupby(["uid","month"])[column].agg(["count","sum"]).reset_index() 627 | uid_months.rename({'count': column + '_cnt', 'sum': column + '_sum' }, axis='columns',inplace=True) 628 | if idx == 0: 629 | uid_valid = uid_months[uid_months.month==10].reset_index(drop=True) 630 | mean_before_month = uid_months[uid_months.month < 10].groupby("uid")[column + '_sum'].mean().rename("mean_before_month").reset_index() 631 | uid_valid = uid_valid.merge(mean_before_month, how="left", on="uid") 632 | uid_valid[column + 
'_sum_before_ratio'] = uid_valid[column + '_sum']/ (uid_valid['mean_before_month'] + 1) 633 | elif idx == 1: 634 | uid_valid = uid_months[uid_months.month==11].reset_index(drop=True) 635 | mean_before_month = uid_months[uid_months.month < 11].groupby("uid")[column + '_sum'].mean().rename("mean_before_month").reset_index() 636 | uid_valid = uid_valid.merge(mean_before_month, how="left", on="uid") 637 | uid_valid[column + '_sum_before_ratio'] = uid_valid[column + '_sum']/ (uid_valid['mean_before_month'] + 1) 638 | if idx == 0: 639 | duser1 = duser1.merge(uid_valid[["uid",column + '_sum_before_ratio']], how="left", on="uid") 640 | elif idx == 1: 641 | duser2 = duser2.merge(uid_valid[["uid",column + '_sum_before_ratio']], how="left", on="uid") 642 | return duser1, duser2 643 | 644 | def getActionDays(df, column ,duser1, duser2, window_size, pref): 645 | valid_mask, test_mask = get_windows_mask(df, column, window_size) 646 | for idx, mask in enumerate([valid_mask, test_mask]): 647 | uid_months = df[mask].groupby(["uid"])[column].nunique().rename(pref + "_actionDays").reset_index() 648 | if idx == 0: 649 | duser1 = duser1.merge(uid_months, how="left", on="uid") 650 | elif idx == 1: 651 | duser2 = duser2.merge(uid_months, how="left", on="uid") 652 | return duser1, duser2 653 | 654 | 655 | def currentMinDebtAmt(df, duser1, duser2, window_size): 656 | valid_mask, test_mask = get_windows_mask(df, "loan_time", window_size) 657 | for idx, mask in enumerate([valid_mask, test_mask]): 658 | if idx == 0: 659 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(valid_end_date) 660 | tmp = df[mask & pay_date_mask].reset_index(drop=True) 661 | tmp["payed_num"] = (pd.Timestamp(valid_end_date) - tmp["loan_time"]).apply(lambda x: math.ceil(x.days/30.0)) 662 | tmp["debtAmt"] = tmp["loan_amount"] - tmp["amt_per_plan"] * tmp["payed_num"] 663 | current_debtAmt = tmp.groupby("uid")["debtAmt"].sum().rename("current_MindebtAmt_" + str(window_size)).reset_index() 664 | duser1 = 
duser1.merge(current_debtAmt, on="uid", how="left") 665 | duser1["current_MindebtAmt_" + str(window_size)] = duser1["current_MindebtAmt_" + str(window_size)].fillna(0.0) 666 | elif idx == 1: 667 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(valid_end_date) 668 | tmp = df[mask & pay_date_mask].reset_index(drop=True) 669 | tmp["payed_num"] = (pd.Timestamp(test_end_date) - tmp["loan_time"]).apply(lambda x: math.ceil(x.days/30.0)) 670 | tmp["debtAmt"] = tmp["loan_amount"] - tmp["amt_per_plan"] * tmp["payed_num"] 671 | current_debtAmt = tmp.groupby("uid")["debtAmt"].sum().rename("current_MindebtAmt_" + str(window_size)).reset_index() 672 | duser2 = duser2.merge(current_debtAmt, on="uid", how="left") 673 | duser2["current_MindebtAmt_" + str(window_size)] = duser2["current_MindebtAmt_" + str(window_size)].fillna(0.0) 674 | return duser1, duser2 675 | 676 | 677 | def currentMaxDebtAmt(df, duser1, duser2, window_size): 678 | valid_mask, test_mask = get_windows_mask(df, "loan_time", window_size) 679 | for idx, mask in enumerate([valid_mask, test_mask]): 680 | if idx == 0: 681 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(valid_end_date) 682 | tmp = df[mask & pay_date_mask].reset_index(drop=True) 683 | tmp["payed_num"] = (pd.Timestamp(valid_end_date) - tmp["loan_time"]).apply(lambda x: math.floor(x.days/30.0)) 684 | tmp["debtAmt"] = tmp["loan_amount"] - tmp["amt_per_plan"] * tmp["payed_num"] 685 | current_debtAmt = tmp.groupby("uid")["debtAmt"].sum().rename("current_MaxdebtAmt_" + str(window_size)).reset_index() 686 | duser1 = duser1.merge(current_debtAmt, on="uid", how="left") 687 | duser1["current_MaxdebtAmt_" + str(window_size)] = duser1["current_MaxdebtAmt_" + str(window_size)].fillna(0.0) 688 | elif idx == 1: 689 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(valid_end_date) 690 | tmp = df[mask & pay_date_mask].reset_index(drop=True) 691 | tmp["payed_num"] = (pd.Timestamp(test_end_date) - tmp["loan_time"]).apply(lambda x: math.floor(x.days/30.0)) 692 | 
tmp["debtAmt"] = tmp["loan_amount"] - tmp["amt_per_plan"] * tmp["payed_num"] 693 | current_debtAmt = tmp.groupby("uid")["debtAmt"].sum().rename("current_MaxdebtAmt_" + str(window_size)).reset_index() 694 | duser2 = duser2.merge(current_debtAmt, on="uid", how="left") 695 | duser2["current_MaxdebtAmt_" + str(window_size)] = duser2["current_MaxdebtAmt_" + str(window_size)].fillna(0.0) 696 | return duser1, duser2 697 | 698 | 699 | ##下个月需要还多少钱 700 | def net2PayAmt(df, duser1, duser2, window_size): 701 | valid_mask, test_mask = get_windows_mask(df, "loan_time", window_size) 702 | for idx, mask in enumerate([valid_mask, test_mask]): 703 | if idx == 0: 704 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(valid_end_date) + timedelta(days=30) 705 | tmp = df[mask & pay_date_mask] 706 | net2PayAmt = tmp.groupby("uid")["amt_per_plan"].agg(["sum"]).reset_index() 707 | net2PayAmt.columns = ["uid", "net2PayAmt"] 708 | duser1 = duser1.merge(net2PayAmt, on="uid", how="left") 709 | duser1["net2PayAmt"] = duser1["net2PayAmt"].fillna(0.0) 710 | elif idx == 1: 711 | pay_date_mask = df["pay_end_date"] > pd.Timestamp(test_end_date) + timedelta(days=30) 712 | tmp = df[mask & pay_date_mask] 713 | net2PayAmt = tmp.groupby("uid")["amt_per_plan"].agg(["sum"]).reset_index() 714 | net2PayAmt.columns = ["uid", "net2PayAmt"] 715 | duser2 = duser2.merge(net2PayAmt, on="uid", how="left") 716 | duser2["net2PayAmt"] = duser2["net2PayAmt"].fillna(0.0) 717 | return duser1, duser2 718 | 719 | def orderAmtStaus(df, duser, tuser, window_size): 720 | valid_mask, test_mask = get_windows_mask(df, "buy_time", window_size) 721 | for idx, mask in enumerate([valid_mask, test_mask]): 722 | tmp = df[mask].reset_index(drop=True) 723 | if idx == 0: 724 | tmp["buy_time_interval"] = tmp["buy_time"]- pd.Timestamp(valid_end_date) 725 | tmp["buy_time_interval"] = tmp["buy_time_interval"].apply(lambda x: x.days) 726 | uid_order_his= 
tmp.groupby(["uid"])["buy_time_interval"].mean().rename("buy_time_interval_mean").reset_index()
            duser = duser.merge(uid_order_his, how="left", on = 'uid')
        elif idx == 1:
            tmp["buy_time_interval"] = tmp["buy_time"]- pd.Timestamp(test_end_date)
            tmp["buy_time_interval"] = tmp["buy_time_interval"].apply(lambda x: x.days)
            uid_order_his= tmp.groupby(["uid"])["buy_time_interval"].mean().rename("buy_time_interval_mean").reset_index()
            tuser = tuser.merge(uid_order_his, how="left", on = 'uid')
    return duser, tuser

def gen_fixed_tw_features_for_click_PidParamUnFold(df,duser1,duser2,col,window_size):
    # Unfold per-user (pid, param) click counts within a fixed time window into
    # one column per pair, plus each pair's share of the user's total clicks.
    valid_mask, test_mask = get_windows_mask(df, col, window_size)
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df[mask].reset_index(drop=True)
        # Composite "<pid>_<param>" key for every click row.
        tmp['pidParam']=pd.Series([str(i)+"_"+str(j) for i,j in zip(list(tmp.pid),list(tmp.param))],index=tmp.index)
        uid_pidParam_clicks = tmp.groupby(["uid","pidParam"]).click_time.count().reset_index()
        uid_pidParam_clicks.columns = ["uid","pidParam", "pidParam_clicks"]
        # Suffix the key with the window size so columns from different windows don't collide.
        uid_pidParam_clicks["pidParam"] = uid_pidParam_clicks['pidParam'].astype(str) + "_pidParamcliks_" + str(window_size) + "d"
        uid_pidParam_clicks = uid_pidParam_clicks.pivot(index='uid', columns='pidParam', values='pidParam_clicks').reset_index().fillna(0)
        if idx == 0:
            duser1 = duser1.merge(uid_pidParam_clicks, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(uid_pidParam_clicks, how="left", on="uid")
        # Rebuild the raw counts, then express each pair as a fraction of the
        # user's total clicks in the window.
        uid_pidParam_clicks = tmp.groupby(["uid","pidParam"]).click_time.count().reset_index()
        uid_pidParam_clicks.columns = ["uid","pidParam", "pidParam_clicks"]
        uid_clicks = tmp.groupby(["uid"]).click_time.count().rename('clicks').reset_index()
        uid_pidParam_clicks = uid_pidParam_clicks.merge(uid_clicks,how="left",on="uid")
        uid_pidParam_clicks["clicks_pidParam_ratio"] = uid_pidParam_clicks["pidParam_clicks"]/uid_pidParam_clicks["clicks"]
        uid_pidParam_clicks["pidParam"] = uid_pidParam_clicks['pidParam'].astype(str) + "_pidParamcliks_ratio_" + str(window_size) + "d"
        uid_pidParam_clicks = uid_pidParam_clicks.pivot(index='uid', columns='pidParam', values='clicks_pidParam_ratio').reset_index().fillna(0)
        if idx == 0:
            duser1 = duser1.merge(uid_pidParam_clicks, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(uid_pidParam_clicks, how="left", on="uid")
    return duser1, duser2


def getLoanFeaturesWinds(df, duser1, duser2, window_size):
    # Loan features over a fixed window: total loan amount plus summary
    # statistics of the interval between consecutive loans.
    valid_mask, test_mask = get_windows_mask(df, "loan_time", window_size)
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df[mask]
        # total loan amount within the window (despite the "month" in the name)
        month_loanAmt = tmp.groupby(["uid"])['loan_amount'].sum().rename("monthLoanAmt").reset_index()
        # loan interval (loan_interval) statistics
        stat_loanInterval=tmp.groupby(["uid"])['loan_interval'].agg(['mean','median','max','min']).reset_index()
        stat_loanInterval.columns=['uid']+ [i+ '_loanInterval_'+str(window_size) for i in list(stat_loanInterval.columns)[1:]]
        loan3Month = month_loanAmt.merge(stat_loanInterval, on="uid", how="left")
        if idx == 0:
            duser1 = duser1.merge(loan3Month, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(loan3Month, how="left", on="uid")
    return duser1, duser2


def gen_fixed_tw_features_for_loan_click_PidParamUnfold(df1,df2,duser1,duser2,col1,col2,window_size):
    # (pid, param) click counts and ratios restricted to the days on which the
    # user also took out a loan (df1 = loans, df2 = clicks).
    valid_mask_loan, test_mask_loan = get_windows_mask(df1, col1, window_size)
    valid_mask_click, test_mask_click = get_windows_mask(df2, col2, window_size)
    for idx, mask in enumerate([valid_mask_loan, test_mask_loan]):
        tmp_loan = df1[mask].reset_index(drop=True)
        if idx==0:
            mask_click=valid_mask_click
        elif idx==1:
            mask_click=test_mask_click
        tmp_click= df2[mask_click].reset_index(drop=True)
        tmp_click['pidParam']=pd.Series([str(i)+"_"+str(j) for i,j in zip(list(tmp_click.pid),list(tmp_click.param))],index=tmp_click.index)
        uid_cnt_pidparam_by_day=tmp_click.groupby(['uid','date','pidParam']).click_time.count().rename('cnt_PidParam_in_loan').reset_index()
        uid_cnt_loan_by_day=tmp_loan.groupby(['uid','date']).loan_time.count().rename('cnt_loan').reset_index()
        # Keep only the (uid, date) keys; the left-merge below then restricts
        # the click counts to loan days.
        del uid_cnt_loan_by_day['cnt_loan']
        uid_loan_click=uid_cnt_loan_by_day.merge(uid_cnt_pidparam_by_day,how='left',on=['uid','date'])
        uid_loan_click_tw=uid_loan_click.groupby(['uid','pidParam']).cnt_PidParam_in_loan.sum().rename('cnt_PidParam').reset_index()
        uid_loan_click_total=uid_loan_click.groupby('uid').cnt_PidParam_in_loan.sum().rename('cnt_total').reset_index()
        uid_loan_click_tw=uid_loan_click_tw.merge(uid_loan_click_total,how='left',on=['uid'])
        uid_loan_click_tw['ratio_pidParam']=uid_loan_click_tw.cnt_PidParam/uid_loan_click_tw.cnt_total
        # Build window-tagged column names, then pivot counts and ratios separately.
        uid_loan_click_tw['cnt_pidParam_in_loan']=uid_loan_click_tw['pidParam'].astype(str)+"_pidParamcliks_in_loan_" + str(window_size) + "d"
        uid_loan_click_tw['ratio_pidParam_in_loan']=uid_loan_click_tw['pidParam'].astype(str)+"_pidParamRatio_in_loan_" + str(window_size) + "d"
        uid_loan_click_cnt=uid_loan_click_tw.pivot(index='uid', columns='cnt_pidParam_in_loan', values='cnt_PidParam').reset_index().fillna(0)
        uid_loan_click_ratio=uid_loan_click_tw.pivot(index='uid', columns='ratio_pidParam_in_loan', values='ratio_pidParam').reset_index().fillna(0)
        if idx == 0:
            duser1 = duser1.merge(uid_loan_click_cnt, how="left", on="uid")
            duser1 = duser1.merge(uid_loan_click_ratio, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(uid_loan_click_cnt, how="left", on="uid")
            duser2 = duser2.merge(uid_loan_click_ratio, how="left", on="uid")
    return duser1, duser2

def
gen_fixed_tw_features_for_notloan_click_PidParamUnfold(df1,df2,duser1,duser2,col1,col2,window_size): 811 | valid_mask_loan, test_mask_loan = get_windows_mask(df1, col1, window_size) 812 | valid_mask_click, test_mask_click = get_windows_mask(df2, col2, window_size) 813 | for idx, mask in enumerate([valid_mask_loan, test_mask_loan]): 814 | tmp_loan = df1[mask].reset_index(drop=True) 815 | if idx==0: 816 | mask_click=valid_mask_click 817 | elif idx==1: 818 | mask_click=test_mask_click 819 | tmp_click= df2[mask_click].reset_index(drop=True) 820 | tmp_click['pidParam']=pd.Series([str(i)+"_"+str(j) for i,j in zip(list(tmp_click.pid),list(tmp_click.param))],index=tmp_click.index) 821 | uid_cnt_pidparam_by_day=tmp_click.groupby(['uid','date','pidParam']).click_time.count().rename('cnt_PidParam_notin_loan').reset_index() 822 | uid_cnt_loan_by_day=tmp_loan.groupby(['uid','date']).loan_time.count().rename('cnt_loan').reset_index() 823 | uid_loan_click= uid_cnt_pidparam_by_day.merge(uid_cnt_loan_by_day,how='left',on=['uid','date']) 824 | uid_loan_click=uid_loan_click[uid_loan_click.cnt_loan.isnull()] 825 | uid_loan_click_tw=uid_loan_click.groupby(['uid','pidParam']).cnt_PidParam_notin_loan.sum().rename('cnt_PidParam').reset_index() 826 | uid_loan_click_total=uid_loan_click.groupby('uid').cnt_PidParam_notin_loan.sum().rename('cnt_total').reset_index() 827 | uid_loan_click_tw=uid_loan_click_tw.merge(uid_loan_click_total,how='left',on=['uid']) 828 | uid_loan_click_tw['ratio_pidParam']=uid_loan_click_tw.cnt_PidParam/uid_loan_click_tw.cnt_total 829 | uid_loan_click_tw['cnt_pidParam_notin_loan']=uid_loan_click_tw['pidParam'].astype(str)+"_pidParamcliks_notin_loan_" + str(window_size) + "d" 830 | uid_loan_click_tw['ratio_pidParam_notin_loan']=uid_loan_click_tw['pidParam'].astype(str)+"_pidParamRatio_notin_loan_" + str(window_size) + "d" 831 | uid_loan_click_cnt=uid_loan_click_tw.pivot(index='uid', columns='cnt_pidParam_notin_loan', values='cnt_PidParam').reset_index().fillna(0) 832 | 
uid_loan_click_ratio=uid_loan_click_tw.pivot(index='uid', columns='ratio_pidParam_notin_loan', values='ratio_pidParam').reset_index().fillna(0) 833 | if idx == 0: 834 | duser1 = duser1.merge(uid_loan_click_cnt, how="left", on="uid") 835 | duser1 = duser1.merge(uid_loan_click_ratio, how="left", on="uid") 836 | elif idx == 1: 837 | duser2 = duser2.merge(uid_loan_click_cnt, how="left", on="uid") 838 | duser2 = duser2.merge(uid_loan_click_ratio, how="left", on="uid") 839 | return duser1, duser2 840 | 841 | def getCatePivotAmtCnt(df, duser1, duser2): 842 | valid_mask = df.month.isin([8, 9,10]) 843 | test_mask = df.month.isin([9, 10,11]) 844 | for idx, mask in enumerate([valid_mask, test_mask]): 845 | tmp = df[mask] 846 | uid_months = tmp.groupby(["uid","cate_id"])["order_amt"].agg(["count","sum"]).reset_index() 847 | uid_months.rename({'count': 'cate_id_cnt', 'sum': 'cate_id_sum' }, axis='columns',inplace=True) 848 | if idx == 0: 849 | uid_months["cate_id"] = "cate_id_" + uid_months['cate_id'].astype(str) 850 | elif idx == 1: 851 | uid_months["cate_id"] = "cate_id_" + uid_months['cate_id'].astype(str) 852 | uid_months = uid_months.pivot(index='uid', columns='cate_id').reset_index().fillna(0) 853 | new_list = ["uid"] 854 | for words in uid_months.columns.get_values(): 855 | if "uid" in words: 856 | continue 857 | new_list.append('_'.join(words)) 858 | uid_months.columns = new_list 859 | if idx == 0: 860 | duser1 = duser1.merge(uid_months, how="left", on="uid") 861 | elif idx == 1: 862 | duser2 = duser2.merge(uid_months, how="left", on="uid") 863 | return duser1, duser2 864 | 865 | def uidOrderAmtCntWinds(df, duser, tuser, window_size): 866 | valid_mask, test_mask = get_windows_mask(df, "buy_time", window_size) 867 | for idx, mask in enumerate([valid_mask, test_mask]): 868 | tmp = df[mask] 869 | order_status = tmp.groupby(["uid"])["order_amt"].agg(["count","sum"]).reset_index() 870 | order_status.columns = ['uid', 'order_cnt_'+str(window_size), "order_amt_" + 
str(window_size) ] 871 | if idx == 0: 872 | duser = duser.merge(order_status, how="left", on = 'uid') 873 | elif idx == 1: 874 | tuser = tuser.merge(order_status, how="left", on = 'uid') 875 | return duser, tuser 876 | 877 | def loanClickBehiviorSeries(duser1,duser2,window_size): 878 | valid_mask_loan, test_mask_loan = get_windows_mask(t_loan, "loan_time", window_size) 879 | valid_mask_click, test_mask_click = get_windows_mask(t_click, "click_time", window_size) 880 | for idx, mask in enumerate([valid_mask_loan, test_mask_loan]): 881 | tmp_loan = t_loan.loc[mask,['uid','date']].rename(columns={'date':'behavior_time'}) 882 | tmp_loan['behavior'] = 0 883 | if idx == 0: 884 | mask_click = valid_mask_click 885 | elif idx == 1: 886 | mask_click = test_mask_click 887 | tmp_click=t_click.loc[mask_click,['uid','date']].rename(columns={'date':'behavior_time'}) 888 | tmp_click['behavior']=1 889 | tmp=pd.concat([tmp_loan.drop_duplicates(),tmp_click.drop_duplicates()]).groupby(["uid"]).apply(lambda x: x.sort_values(["behavior_time"], ascending=True)).reset_index(drop=True) 890 | tmp1 = tmp.groupby("uid")["behavior"].apply(lambda x:list(x)).rename("clickLoanBehavior").reset_index() 891 | tmp1["clickLoanBehaviorCode"] = tmp1["clickLoanBehavior"].apply(lambda x: ''.join(map(str, x))) 892 | #tmp1["clickLoanBehaviorCode1"] = tmp1["clickLoanBehaviorCode"].apply(lambda x:len(x)) 893 | tmp1["clickLoanBehaviorCode"] = tmp1["clickLoanBehaviorCode"].astype('category') 894 | tmp1['clickLoanBehaviorCode'].cat.categories=range(tmp1["clickLoanBehaviorCode"].nunique()) #4563 895 | tmp1["clickLoanBehaviorCode"] = tmp1["clickLoanBehaviorCode"].astype(int) 896 | if idx == 0: 897 | duser1 = duser1.merge(tmp1[["uid","clickLoanBehaviorCode"]], on="uid", how="left") 898 | elif idx == 1: 899 | duser2 = duser2.merge(tmp1[["uid","clickLoanBehaviorCode"]], on="uid", how="left") 900 | return duser1, duser2 901 | 902 | def loanClickBehiviorSeries1(duser1,duser2,window_size): 903 | valid_mask_loan, 
test_mask_loan = get_windows_mask(t_loan, "loan_time", window_size) 904 | valid_mask_click, test_mask_click = get_windows_mask(t_click, "click_time", window_size) 905 | for idx, mask in enumerate([valid_mask_loan, test_mask_loan]): 906 | tmp_loan = t_loan.loc[mask,['uid','date']].rename(columns={'date':'behavior_time'}) 907 | tmp_loan['behavior'] = 0 908 | if idx == 0: 909 | mask_click = valid_mask_click 910 | elif idx == 1: 911 | mask_click = test_mask_click 912 | tmp_click=t_click.loc[mask_click,['uid','date']].rename(columns={'date':'behavior_time'}) 913 | tmp_click['behavior']=1 914 | tmp=pd.concat([tmp_loan.drop_duplicates(),tmp_click.drop_duplicates()]).groupby(["uid"]).apply(lambda x: x.sort_values(["behavior_time"], ascending=True)).reset_index(drop=True) 915 | tmp["pre_behavior"] = tmp.groupby("uid")["behavior"].apply(lambda x: x.shift(1)) 916 | tmp = tmp[tmp["behavior"] != tmp["pre_behavior"]] 917 | tmp1 = tmp.groupby("uid")["behavior"].apply(lambda x:list(x)).rename("clickLoanBehavior").reset_index() 918 | tmp1["clickLoanBehaviorCode"] = tmp1["clickLoanBehavior"].apply(lambda x: ''.join(map(str, x))) 919 | #tmp1["clickLoanBehaviorCode1"] = tmp1["clickLoanBehaviorCode"].apply(lambda x:len(x)) 920 | tmp1["clickLoanBehaviorCode"] = tmp1["clickLoanBehaviorCode"].astype('category') 921 | tmp1['clickLoanBehaviorCode'].cat.categories=range(tmp1["clickLoanBehaviorCode"].nunique()) #4563 922 | tmp1["clickLoanBehaviorCode"] = tmp1["clickLoanBehaviorCode"].astype(int) 923 | if idx == 0: 924 | duser1 = duser1.merge(tmp1[["uid","clickLoanBehaviorCode"]], on="uid", how="left") 925 | elif idx == 1: 926 | duser2 = duser2.merge(tmp1[["uid","clickLoanBehaviorCode"]], on="uid", how="left") 927 | return duser1, duser2 928 | 929 | #将购买表中cateId展开(cnt amt ratio_cnt ratio_amt) 930 | def gen_fixed_tw_features_for_order_cateIdUnFold(df,duser1,duser2, col,window_size): 931 | valid_mask, test_mask = get_windows_mask(df, col, window_size) 932 | for idx, mask in 
enumerate([valid_mask, test_mask]): 933 | tmp = df[mask] 934 | uid_orders_cnts = tmp.groupby(["uid","cate_id"]).buy_time.count().rename('cnt_cateId').reset_index() 935 | uid_orders_amt = tmp.groupby(["uid","cate_id"]).order_amt.sum().rename('sum_cateId').reset_index() 936 | uid_orders_cnts_total=tmp.groupby(["uid"]).buy_time.count().rename('cnt_cateId_total').reset_index() 937 | uid_orders_amt_total=tmp.groupby(["uid"]).order_amt.sum().rename('sum_cateId_total').reset_index() 938 | uid_orders_cnts=uid_orders_cnts.merge(uid_orders_cnts_total,how='left',on='uid') 939 | uid_orders_amt=uid_orders_amt.merge(uid_orders_amt_total,how='left',on='uid') 940 | uid_orders_cnts['ratio_cnt_cateId']=uid_orders_cnts.cnt_cateId/uid_orders_cnts.cnt_cateId_total 941 | uid_orders_amt['ratio_amt_cateId']=uid_orders_amt.sum_cateId/uid_orders_amt.sum_cateId_total 942 | uid_orders_cnts['cate_id_cnt_fold']=uid_orders_cnts['cate_id'].astype(str)+"_cnt_" + str(window_size) + "d" 943 | uid_orders_cnts['cate_id_cnt_ratio_fold']=uid_orders_cnts['cate_id'].astype(str)+"_cnt_ratio_" + str(window_size) + "d" 944 | uid_orders_amt['cate_id_amt_fold']=uid_orders_amt['cate_id'].astype(str)+"_amt_" + str(window_size) + "d" 945 | uid_orders_amt['cate_id_amt_ratio_fold']=uid_orders_amt['cate_id'].astype(str)+"_amt_ratio_" + str(window_size) + "d" 946 | uid_orders_cnts_cnt=uid_orders_cnts.pivot(index='uid', columns='cate_id_cnt_fold', values='cnt_cateId').reset_index().fillna(0) 947 | uid_orders_cnts_ratio=uid_orders_cnts.pivot(index='uid', columns='cate_id_cnt_ratio_fold', values='ratio_cnt_cateId').reset_index().fillna(0) 948 | uid_orders_amt_amt=uid_orders_amt.pivot(index='uid', columns='cate_id_amt_fold', values='sum_cateId').reset_index().fillna(0) 949 | uid_orders_amt_ratio=uid_orders_amt.pivot(index='uid', columns='cate_id_amt_ratio_fold', values='ratio_amt_cateId').reset_index().fillna(0) 950 | d_feature=pd.DataFrame({'uid':t_user.uid},index=t_user.index) 951 | 
        # (continuation of gen_fixed_tw_features_for_order_cateIdUnFold, whose
        # `def` header lies before this chunk)
        # Join the per-category count/amount pivots and their ratio pivots onto
        # the all-uids frame; counts/amounts default to 0, ratios stay NaN.
        d_feature=d_feature.merge(uid_orders_cnts_cnt,how="left", on="uid").fillna(0)
        d_feature=d_feature.merge(uid_orders_amt_amt,how="left", on="uid").fillna(0)
        d_feature=d_feature.merge(uid_orders_cnts_ratio,how="left", on="uid")
        d_feature=d_feature.merge(uid_orders_amt_ratio,how="left", on="uid")
        # idx 0 -> validation-window user table, idx 1 -> test-window user table
        if idx == 0:
            duser1 = duser1.merge(d_feature, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(d_feature, how="left", on="uid")
    return duser1,duser2


def getTotalPidStaytime(df_click, duser1, duser2):
    """Total click dwell time per uid.

    Sums `click_interval` over months 8-10 for the validation users (duser1)
    and months 9-11 for the test users (duser2) and merges it in as column
    'Staytime'.  Users with no clicks keep NaN.
    """
    valid_mask = df_click.month.isin([8,9,10])
    test_mask = df_click.month.isin([11,10,9])
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df_click[mask].groupby(["uid"])["click_interval"].sum().rename("Staytime").reset_index()
        if idx == 0:
            duser1 = duser1.merge(tmp, on ="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(tmp, on ="uid", how="left")
    return duser1, duser2

#### Unfold click pid_param counts for days on which the user made NO purchase
def gen_fixed_tw_features_for_notOrder_click_PidParamUnfold(df1,df2,duser1,duser2,col1,col2,window_size):
    """Pivot per-(pid,param) click counts/ratios restricted to no-purchase days.

    df1/col1: order table and its time column (buy_time); df2/col2: click table
    and its time column (click_time).  For each window (valid, then test) the
    clicks whose date has no order row for that uid are counted per
    pid_param, turned into count and within-user-ratio pivot columns suffixed
    with the window size, and merged onto duser1/duser2 respectively.
    """
    valid_mask_order, test_mask_order = get_windows_mask(df1, col1, window_size)
    valid_mask_click, test_mask_click = get_windows_mask(df2, col2, window_size)
    for idx, mask in enumerate([valid_mask_order, test_mask_order]):
        tmp_order = df1[mask].reset_index(drop=True)
        if idx==0:
            mask_click=valid_mask_click
        elif idx==1:
            mask_click=test_mask_click
        tmp_click= df2[mask_click].reset_index(drop=True)
        # composite key "<pid>_<param>" for the pivot columns
        tmp_click['pidParam']=pd.Series([str(i)+"_"+str(j) for i,j in zip(list(tmp_click.pid),list(tmp_click.param))],index=tmp_click.index)
        uid_cnt_pidparam_by_day=tmp_click.groupby(['uid','date','pidParam']).click_time.count().rename('cnt_PidParam_notin_order').reset_index()
        uid_cnt_order_by_day=tmp_order.groupby(['uid','buy_time']).buy_time.count().rename('cnt_order').reset_index()
        uid_cnt_order_by_day.columns = ["uid","date","cnt_order"]
        uid_order_click= uid_cnt_pidparam_by_day.merge(uid_cnt_order_by_day,how='left',on=['uid','date'])
        # keep only click-days with no matching order row (cnt_order is NaN)
        uid_order_click=uid_order_click[uid_order_click.cnt_order.isnull()]
        uid_order_click_tw=uid_order_click.groupby(['uid','pidParam']).cnt_PidParam_notin_order.sum().rename('cnt_PidParam').reset_index()
        uid_order_click_total=uid_order_click.groupby('uid').cnt_PidParam_notin_order.sum().rename('cnt_total').reset_index()
        uid_order_click_tw=uid_order_click_tw.merge(uid_order_click_total,how='left',on=['uid'])
        uid_order_click_tw['ratio_pidParam']=uid_order_click_tw.cnt_PidParam/uid_order_click_tw.cnt_total
        uid_order_click_tw['cnt_pidParam_notin_order']=uid_order_click_tw['pidParam'].astype(str)+"_pidParamcliks_notin_order_" + str(window_size) + "d"
        uid_order_click_tw['ratio_pidParam_notin_order']=uid_order_click_tw['pidParam'].astype(str)+"_pidParamRatio_notin_order_" + str(window_size) + "d"
        uid_order_click_cnt=uid_order_click_tw.pivot(index='uid', columns='cnt_pidParam_notin_order', values='cnt_PidParam').reset_index().fillna(0)
        uid_order_click_ratio=uid_order_click_tw.pivot(index='uid', columns='ratio_pidParam_notin_order', values='ratio_pidParam').reset_index().fillna(0)
        if idx == 0:
            duser1 = duser1.merge(uid_order_click_cnt, how="left", on="uid")
            duser1 = duser1.merge(uid_order_click_ratio, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(uid_order_click_cnt, how="left", on="uid")
            duser2 = duser2.merge(uid_order_click_ratio, how="left", on="uid")
    return duser1, duser2

def getLoanAmtWithinOrder(duser1, duser2, window_size):
    """Loan amount taken on days the user also purchased, within the window.

    Reads module-level t_loan/t_order.  Produces 'loanAmtIn<w>',
    'orderAmtIn<w>' (sums over days with BOTH a loan and an order — inner
    join on uid+date) and their ratio 'loanOrderAmtRatioIn<w>'.
    """
    valid_mask_loan, test_mask_loan = get_windows_mask(t_loan, "loan_time", window_size)
    valid_mask_buy, test_mask_buy = get_windows_mask(t_order, "buy_time", window_size)
    for idx, mask in enumerate([valid_mask_loan, test_mask_loan]):
        tmp_loan = t_loan[mask]
        loanAmtWinds = tmp_loan.groupby(["uid","date"])["loan_amount"].sum().rename('loanAmtIn').reset_index()
        if idx==0:
            tmp_order = t_order[valid_mask_buy]
        elif idx==1:
            tmp_order = t_order[test_mask_buy]
        orderAmtWinds = tmp_order.groupby(["uid","buy_time"])["order_amt"].sum().rename("orderAmtIn").reset_index()
        orderAmtWinds.columns = ["uid", "date", "orderAmtIn"]
        # inner join: only uid-days that have both an order and a loan
        orderLoanAmtWinds = orderAmtWinds.merge(loanAmtWinds, how="inner", on=["uid","date"])
        loanAmtWinds= orderLoanAmtWinds.groupby("uid")['loanAmtIn'].sum().rename('loanAmtIn' + str(window_size)).reset_index()
        orderAmtWinds= orderLoanAmtWinds.groupby("uid")['orderAmtIn'].sum().rename('orderAmtIn' + str(window_size)).reset_index()
        orderLoanAmtWinds = loanAmtWinds.merge(orderAmtWinds, how="left", on="uid")
        orderLoanAmtWinds['loanOrderAmtRatioIn' + str(window_size)] = orderLoanAmtWinds['loanAmtIn' + str(window_size)] /orderLoanAmtWinds["orderAmtIn" + str(window_size)]
        if idx == 0:
            duser1 = duser1.merge(orderLoanAmtWinds, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(orderLoanAmtWinds, how="left", on="uid")
    return duser1, duser2

def orderCntDays(duser1, duser2, window_size):
    """Purchase count and number of distinct purchase days in the window.

    Adds 'buyCnt<w>' (row count) and 'buyDays<w>' (nunique buy_time) from the
    module-level t_order table.
    """
    valid_mask, test_mask = get_windows_mask(t_order, "buy_time", window_size)
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = t_order[mask].groupby("uid")["buy_time"].agg(["count","nunique"]).reset_index()
        tmp.columns = ["uid", "buyCnt" + str(window_size), "buyDays"+ str(window_size)]
        if idx == 0:
            duser1 = duser1.merge(tmp, on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(tmp, on="uid", how="left")
    return duser1, duser2


def gen_fixed_tw_features_for_click_1m(df,col,duser1,duser2):
    """Click counts/ratios per 30-day bucket over the last 90 days.

    Buckets are defined by the module-level valid_cut_point/test_cut_point
    lists and `labels` (built near the end of this script), so this must be
    called after those are defined.  Pivots '<label>_click_count' and
    '<label>_click_ratio_cnt' onto duser1/duser2.
    """
    valid_mask, test_mask = get_windows_mask(df, col, 90)
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp=df[mask].reset_index(drop=True)
        if idx==0:
            tmp['cut_1m']=pd.cut(tmp.click_time,valid_cut_point,labels=labels,right=False,include_lowest=True)
            tmp=tmp[tmp.cut_1m.notnull()].reset_index(drop=True)
        elif idx==1:
            tmp['cut_1m']=pd.cut(tmp.click_time,test_cut_point,labels=labels,right=False,include_lowest=True)
            tmp=tmp[tmp.cut_1m.notnull()].reset_index(drop=True)
        stat_click_1m=tmp.groupby(['uid','cut_1m']).click_time.agg(['count']).reset_index()
        stat_click_1m.columns=['uid','cut_1m','click_count']
        cnt_total=stat_click_1m.groupby('uid').click_count.sum().rename('cnt_total').reset_index()
        stat_click_1m=stat_click_1m.merge(cnt_total,how='left',on='uid')
        stat_click_1m['ratio_cnt']=stat_click_1m.click_count/stat_click_1m.cnt_total
        stat_click_1m['count_1m']=stat_click_1m['cut_1m'].astype(str)+'_click_count'
        stat_click_1m['ratio_cnt_1m']=stat_click_1m['cut_1m'].astype(str)+'_click_ratio_cnt'

        stat_click_1m_cnt=stat_click_1m.pivot(index='uid', columns='count_1m', values='click_count').reset_index().fillna(0)
        stat_click_1m_ratio_cnt=stat_click_1m.pivot(index='uid', columns='ratio_cnt_1m', values='ratio_cnt').reset_index().fillna(0)

        d_feature=pd.DataFrame({'uid':t_user.uid},index=t_user.index)
        d_feature=d_feature.merge(stat_click_1m_cnt,how="left", on="uid").fillna(0)
        d_feature=d_feature.merge(stat_click_1m_ratio_cnt,how="left", on="uid")
        if idx == 0:
            duser1 = duser1.merge(d_feature, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(d_feature, how="left", on="uid")
    return duser1,duser2

def getLoanCntWithinOrder(duser1, duser2, window_size):
    """Share of the user's orders that fell on days with a loan.

    Adds 'loanOrderCntRatioIn<w>' = (orders on uid-days that also have a
    loan) / (all orders in the window), from module-level t_loan/t_order.
    """
    valid_mask_loan, test_mask_loan = get_windows_mask(t_loan, "loan_time", window_size)
    valid_mask_buy, test_mask_buy = get_windows_mask(t_order, "buy_time", window_size)
    for idx, mask in enumerate([valid_mask_loan, test_mask_loan]):
        tmp_loan = t_loan[mask]
        loanAmtWinds = tmp_loan.groupby(["uid","date"])["loan_amount"].sum().rename('loanAmtIn').reset_index()
        if idx==0:
            tmp_order = t_order[valid_mask_buy]
        elif idx==1:
            tmp_order = t_order[test_mask_buy]
        orderAmtWinds = tmp_order.groupby(["uid","buy_time"])["order_amt"].count().rename("orderCnt").reset_index()
        orderAmtWinds.columns = ["uid", "date", "orderCnt"]
        orderLoanAmtWinds = orderAmtWinds.merge(loanAmtWinds, how="left", on=["uid","date"])
        orderCntWinds= orderLoanAmtWinds[~orderLoanAmtWinds.loanAmtIn.isnull()].groupby("uid")['orderCnt'].sum().rename('loanOrderCnt' + str(window_size)).reset_index()
        orderTolCnt = orderAmtWinds.groupby("uid")['orderCnt'].sum().rename('orderTolCnt' + str(window_size)).reset_index()
        orderLoanAmtWinds = orderTolCnt.merge(orderCntWinds, how="left", on="uid")
        orderLoanAmtWinds['loanOrderCntRatioIn' + str(window_size)] = orderLoanAmtWinds['loanOrderCnt' + str(window_size)] /orderLoanAmtWinds['orderTolCnt' + str(window_size)]
        if idx == 0:
            duser1 = duser1.merge(orderLoanAmtWinds[["uid",'loanOrderCntRatioIn' + str(window_size)]], how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(orderLoanAmtWinds[["uid",'loanOrderCntRatioIn' + str(window_size)]], how="left", on="uid")
    return duser1, duser2

# ---- script body (runs at import; the __main__ guard is commented out) ----
#if __name__ == '__main__':
work_path = "/Users/zhangkai/code/JDD_data"
#work_path = "/data/kai.zhang/JDD"
pd.set_option('display.max_columns', None)
os.getcwd()
os.chdir(work_path)
t_click = pd.read_csv("t_click.csv")
t_user = pd.read_csv("t_user.csv")
t_loan = pd.read_csv("t_loan.csv")
t_order = pd.read_csv("t_order.csv")
t_loan_sum = pd.read_csv("t_loan_sum.csv")


t_click = parseDate(t_click, "click_time")
t_loan = parseDate(t_loan,"loan_time")
1108 | t_order = parseDate(t_order,"buy_time") 1109 | t_user = parseDate(t_user,"active_date") 1110 | 1111 | t_loan["date"] = pd.to_datetime(t_loan["date"]) 1112 | t_click["date"] = pd.to_datetime(t_click["date"]) 1113 | 1114 | 1115 | t_click = t_click.groupby(["uid"]).apply(lambda x: x.sort_values(["click_time"], ascending=True)).reset_index(drop=True) 1116 | t_loan = t_loan.groupby(["uid"]).apply(lambda x: x.sort_values(["loan_time"], ascending=True)).reset_index(drop=True) 1117 | t_order = t_order.groupby(["uid"]).apply(lambda x: x.sort_values(["buy_time"], ascending=True)).reset_index(drop=True) 1118 | t_order["month"] = t_order["buy_time"].apply(lambda x: x.month) 1119 | t_loan["month"] = t_loan["loan_time"].apply(lambda x: x.month) 1120 | t_click["month"] = t_click["click_time"].apply(lambda x: x.month) 1121 | 1122 | valid_end_date = "2016-11-01" 1123 | test_end_date = "2016-12-01" 1124 | 1125 | 1126 | ##price解敏, t_loan和t_loan_sum是同一个脱敏函数 1127 | t_loan["loan_amount"] = t_loan.loan_amount.apply(lambda x: round(5**x -1)) 1128 | t_order["price"] = t_order["price"].apply(lambda x: round(5**x -1)) 1129 | t_order["discount"] = t_order["discount"].apply(lambda x: round(5**x -1)) 1130 | t_user["limit"] = t_user["limit"].apply(lambda x: round(5**x -1)) 1131 | 1132 | ##点击 1133 | #每个页面点击次数占比人均点击次数 1134 | tr_user, ts_user = click_percent(t_click, t_user, t_user, 30) 1135 | 1136 | #用户点击的不同天数、页面id个数 1137 | tr_user, ts_user = click_days_pids(t_click, tr_user, ts_user, 30) 1138 | 1139 | #页面点击平均点击间隔,最近一次点击距离现在的时间,下次点击的时间 1140 | tr_user, ts_user = getNearestClick(t_click, tr_user, ts_user, 30) 1141 | 1142 | 1143 | ###购买 1144 | cate_mean_price = t_order[t_order.price != 0].groupby("cate_id")["price"].mean().rename("cate_mean_price").reset_index() 1145 | t_order = t_order.merge(cate_mean_price, on="cate_id", how="left") 1146 | t_order.loc[t_order.price<=0,"price"] = t_order.loc[t_order.price<=0,"cate_mean_price"] 1147 | 1148 | 1149 | #tmp = t_order[(t_order.qty > 10000)] 1150 | 
#t_order = t_order[t_order.price > 0].reset_index(drop=True)
t_order["order_amt"] = t_order["price"] * t_order["qty"] - t_order["discount"]
#t_order.loc[t_order["order_amt"]<0, "order_amt"] = 0
# if the discount drove the amount negative, add the discount back
# (i.e. fall back to price*qty for those rows)
t_order.loc[t_order["order_amt"]<0, "order_amt"] = t_order.loc[t_order["order_amt"]<0, "order_amt"] + t_order.loc[t_order["order_amt"]<0, "discount"]

#t_order = t_order[t_order["order_amt"]>=0].reset_index(drop=True)

# +1 so later ratio features never divide by zero
t_order["order_amt"] = t_order["order_amt"] + 1



# uid order count(amount) vs. population mean, mean amount per order
tr_user, ts_user = uid_order_status(t_order, tr_user, ts_user, 30)


# mean purchase interval; last purchase amount, recency, amount/recency;
# time from activation to last purchase
tr_user, ts_user = getNearestOrder(t_order, tr_user, ts_user, 30)

# largest purchase: amount, time, amount/time
tr_user, ts_user = getMaxPriceOrder(t_order, tr_user, ts_user, 30)



### loans
t_loan['amt_per_plan']= t_loan['loan_amount']/t_loan['plannum']
# interval (in fractional days) between consecutive loans of the same uid
t_loan['loan_interval'] = t_loan.groupby('uid')['loan_time'].diff().apply(lambda x:x.days+x.seconds/86400.0)
# repayment end date assuming 30 days per installment
t_loan["pay_end_date"] = t_loan.apply(lambda x: x["loan_time"] + timedelta(days=x["plannum"] * 30), axis=1)

## update limit: October's max single loan approximates November's limit,
## November's max single loan approximates December's limit
tr_user, ts_user = updateLimit(tr_user, ts_user)


# stats over loan amount, plannum, per-installment amount, loan cycle,
# and time-since-each-loan
tr_user, ts_user = gen_fixedtw_features_for_loan(t_loan, tr_user, ts_user, 30)

## days since account activation (relative to each split's end date)
tr_user["active_days"] = (pd.Timestamp(valid_end_date) - tr_user["active_date"]).apply(lambda x: x.days)
ts_user["active_days"] = (pd.Timestamp(test_end_date) - ts_user["active_date"]).apply(lambda x: x.days)

# latest loan: amount, amount/recency, plannum, per-installment amount
tr_user, ts_user = getNearestLoan(t_loan, tr_user, ts_user)

# amount due in the coming month
tr_user, ts_user = current2PayAmt(t_loan, tr_user, ts_user, 180)


## outstanding debt this month; remaining credit = limit - debt
tr_user, ts_user = currentDebtAmt(t_loan, tr_user, ts_user, 180)
tr_user, ts_user = currentDebtAmt(t_loan, tr_user, ts_user, 60)
#tr_user, ts_user = currentDebtAmt(t_loan, tr_user, ts_user, 30)

## join November's total loan amount (the training label)
t_loan_sum.drop("month", axis=1,inplace=True)
tr_user = tr_user.merge(t_loan_sum, on ="uid", how="left")
tr_user["loan_sum"] = tr_user["loan_sum"].fillna(0.0)


## monthly loan amount / remaining credit that month
# remaining = initial limit - credit used before the month + amount repaid
tr_user, ts_user = getLoanAmtRemainingLimt(t_loan, tr_user, ts_user, 1)
tr_user, ts_user = getLoanAmtRemainingLimt(t_loan, tr_user, ts_user, 2) # remaining-credit estimate is biased for this month

## amount repaid last month
tr_user, ts_user = lastMonthPayedAmt(t_loan, tr_user, ts_user)

## past 3 months: total loan amount/count, mean per loan, mean per month
## (improved score by ~0.0011)
tr_user, ts_user = getPast3MonthLoanFeatures(t_loan, tr_user, ts_user)

## time from activation to first/last loan; first/last loan amount vs. limit
tr_user, ts_user = loanTimeBetweenActivetime(t_loan, tr_user, ts_user)

## amount due this month / (1 + days since last loan)
tr_user["current_topay_amt_nearest_loantime"] = tr_user["current_topay_amt"]/(1+tr_user["nearest_loantime"])
ts_user["current_topay_amt_nearest_loantime"] = ts_user["current_topay_amt"]/(1+ts_user["nearest_loantime"])

## order-to-click ratio
tr_user, ts_user = getOrderClickRatio(t_click, t_order, tr_user, ts_user)

# interval between the user's two most recent loans
tr_user, ts_user = getNearest2LoanInterval(t_loan, tr_user, ts_user)

## user discount rate (re-computes the user's 3-month order amount)
tr_user, ts_user = userDiscountRatio(t_order, tr_user, ts_user)

## negligible offline gain
#tr_user, ts_user = getAvailableLoanAmtLimt(t_loan, tr_user, ts_user, 1)
#tr_user, ts_user = getAvailableLoanAmtLimt(t_loan, tr_user, ts_user, 2)


## per-user purchasing power; predicts loan amount
tr_user, ts_user = avgLoanAmt4orderAmt(t_loan, t_order, tr_user, ts_user)

## first/second order differences of loan amount
tr_user, ts_user = getLoanShiftDiff(t_loan, tr_user, ts_user)
#tr_user, ts_user = getMonthLoanShiftDiff(t_loan, tr_user, ts_user)

tr_user, ts_user = getMonthOrderShiftDiff(t_order, tr_user, ts_user)

## first/second order differences of loan counts
tr_user, ts_user = getLoanCntShiftDiff(t_loan, tr_user, ts_user)


#tr_user["count_loanAmt_90_active_days"] = tr_user["count_loanAmt_90"]/tr_user["active_days"]
#ts_user["count_loanAmt_90_active_days"] = ts_user["count_loanAmt_90"]/ts_user["active_days"]


## sex x age x limit cross feature
tr_user = getSexAgeLimt(t_user, tr_user)
ts_user = getSexAgeLimt(t_user, ts_user)

#tr_user, ts_user = currentMinDebtAmt(t_loan, tr_user, ts_user, 180)
#tr_user, ts_user = currentMaxDebtAmt(t_loan, tr_user, ts_user, 180)

tr_user, ts_user = getCatePivotAmtCnt(t_order, tr_user, ts_user)
tr_user, ts_user = uidOrderAmtCntWinds(t_order, tr_user, ts_user, 60)
tr_user, ts_user = uidOrderAmtCntWinds(t_order, tr_user, ts_user, 90)
tr_user, ts_user = getTotalPidStaytime(t_click, tr_user, ts_user)

## unfold clicked page ids
#tr_user, ts_user = gen_fixed_tw_features_for_click_PidParamUnFold(t_click, tr_user, ts_user, "click_time", 7)
#tr_user, ts_user = gen_fixed_tw_features_for_click_PidParamUnFold(t_click, tr_user, ts_user, "click_time", 14)
tr_user, ts_user = gen_fixed_tw_features_for_click_PidParamUnFold(t_click, tr_user, ts_user, "click_time", 30)
tr_user, ts_user = gen_fixed_tw_features_for_click_PidParamUnFold(t_click, tr_user, ts_user, "click_time", 60)
tr_user, ts_user = gen_fixed_tw_features_for_click_PidParamUnFold(t_click, tr_user, ts_user, "click_time", 90)

#tr_user,ts_user=gen_fixed_tw_features_for_loan_click_PidParamUnfold(t_loan,t_click,tr_user,ts_user,'loan_time',"click_time",30)
#tr_user,ts_user=gen_fixed_tw_features_for_loan_click_PidParamUnfold(t_loan,t_click,tr_user,ts_user,'loan_time',"click_time",60) 1277 | #tr_user,ts_user=gen_fixed_tw_features_for_loan_click_PidParamUnfold(t_loan,t_click,tr_user,ts_user,'loan_time',"click_time",90) 1278 | #tr_user,ts_user=gen_fixed_tw_features_for_loan_click_PidParamUnfold(t_loan,t_click,tr_user,ts_user,'loan_time',"click_time",15) 1279 | 1280 | ##没有贷款的日子里的点击情况 1281 | tr_user,ts_user=gen_fixed_tw_features_for_notloan_click_PidParamUnfold(t_loan,t_click,tr_user,ts_user,'loan_time',"click_time",30) 1282 | tr_user,ts_user=gen_fixed_tw_features_for_notloan_click_PidParamUnfold(t_loan,t_click,tr_user,ts_user,'loan_time',"click_time",60) 1283 | tr_user,ts_user=gen_fixed_tw_features_for_notloan_click_PidParamUnfold(t_loan,t_click,tr_user,ts_user,'loan_time',"click_time",90) 1284 | tr_user,ts_user=gen_fixed_tw_features_for_notloan_click_PidParamUnfold(t_loan,t_click,tr_user,ts_user,'loan_time',"click_time",15) 1285 | 1286 | 1287 | tr_user, ts_user = gen_fixed_tw_features_for_order_cateIdUnFold(t_order, tr_user, ts_user, "buy_time", 90) 1288 | tr_user, ts_user = gen_fixed_tw_features_for_order_cateIdUnFold(t_order, tr_user, ts_user, "buy_time", 60) 1289 | #tr_user, ts_user = gen_fixed_tw_features_for_order_cateIdUnFold(t_order, tr_user, ts_user, "buy_time", 30) 1290 | #tr_user, ts_user = gen_fixed_tw_features_for_order_cateIdUnFold(t_order, tr_user, ts_user, "buy_time", 15) 1291 | 1292 | 1293 | tr_user, ts_user = gen_fixed_tw_features_for_notOrder_click_PidParamUnfold(t_order, t_click, tr_user, ts_user,'buy_time',"click_time",30) 1294 | tr_user, ts_user = gen_fixed_tw_features_for_notOrder_click_PidParamUnfold(t_order, t_click, tr_user, ts_user,'buy_time',"click_time",60) 1295 | tr_user, ts_user = gen_fixed_tw_features_for_notOrder_click_PidParamUnfold(t_order, t_click, tr_user, ts_user,'buy_time',"click_time",90) 1296 | tr_user, ts_user = gen_fixed_tw_features_for_notOrder_click_PidParamUnfold(t_order, 
t_click, tr_user, ts_user,'buy_time',"click_time",15) 1297 | 1298 | 1299 | ##购买点击序列的特征万分位rmse下降1个点 1300 | tr_user, ts_user = loanClickBehiviorSeries(tr_user, ts_user, 60) 1301 | #tr_user, ts_user = loanClickBehiviorSeries1(tr_user, ts_user, 60) 1302 | 1303 | ##购买次数、天数 1304 | tr_user, ts_user = orderCntDays(tr_user, ts_user, 30) 1305 | tr_user, ts_user = orderCntDays(tr_user, ts_user, 60) 1306 | 1307 | 1308 | ###时间划分 1309 | valid_end_date = pd.Timestamp("2016-11-01") 1310 | valid_start_date= pd.Timestamp("2016-08-01") 1311 | valid_cut_point=[] 1312 | while valid_end_date>valid_start_date: 1313 | valid_cut_point=valid_cut_point+[valid_end_date] 1314 | valid_end_date=valid_end_date-timedelta(days=30) 1315 | valid_cut_point.sort() 1316 | 1317 | test_end_date = pd.Timestamp("2016-12-01") 1318 | test_start_date= pd.Timestamp("2016-09-01") 1319 | test_cut_point=[] 1320 | while test_end_date>=test_start_date: 1321 | test_cut_point=test_cut_point+[test_end_date] 1322 | test_end_date=test_end_date-timedelta(days=30) 1323 | test_cut_point.sort() 1324 | 1325 | l=len(valid_cut_point) 1326 | labels=['one_month_'+str(l-idx-1)for idx,j in enumerate(valid_cut_point) if idx+1 0: 46 | law_cate_feature = pd.get_dummies(lgbdata[category_f],sparse=True,columns=category_f).reset_index(drop=True) #原始分类型特征dummy 47 | pd_pred_feature = pd.get_dummies(pd_pred_leaf,sparse=True,columns=pd_pred_leaf.columns).reset_index(drop=True) #GBDT叶子dummy 48 | newdata = pd.concat([split_raw_data.reset_index(drop=True), pd_pred_feature, law_cate_feature], axis=1, ignore_index=True) 49 | newdata.columns = split_raw_data.columns.append(pd_pred_feature.columns).append(law_cate_feature.columns) 50 | else: 51 | print "do not contains category features" 52 | pd_pred_feature = pd.get_dummies(pd_pred_leaf,sparse=True,columns=pd_pred_leaf.columns).reset_index(drop=True) #GBDT叶子dummy 53 | newdata = pd.concat([split_raw_data.reset_index(drop=True), pd_pred_feature], axis=1, ignore_index=True) 54 | newdata.columns = 
split_raw_data.columns.append(pd_pred_feature.columns) 55 | return newdata.fillna(0) 56 | 57 | def split_raw_data(reg_data, dvalid_data, dtrain_alldata, dtest_data): 58 | tmp = pd.DataFrame() 59 | reg_data["type"] = "train" 60 | dvalid_data["type"] = "valid" 61 | dtrain_alldata["type"] = "train_all" 62 | dtest_data["type"] = "test" 63 | print "loging" 64 | all_data = reg_data.append(dvalid_data, ignore_index=True).append(dtrain_alldata, ignore_index=True).append(dtest_data, ignore_index=True) 65 | gbmreg = lgb.LGBMRegressor(num_leaves = 12, max_depth=3, n_estimators= 1) 66 | for col in reg_data.columns: 67 | if col in category_f or col == "type": 68 | continue 69 | print col 70 | gbmreg.fit(pd.DataFrame(dtrain_alldata[col].fillna(0)), tr_user["loan_sum"]) 71 | split = sorted(get_threshold(gbmreg.booster_.dump_model()["tree_info"][0]["tree_structure"])) 72 | categories = all_data[col].fillna(0).apply(lambda x: bisect.bisect_left(split, x)) 73 | tmp = pd.concat([tmp,categories],axis=1) 74 | tmp = pd.get_dummies(tmp,sparse=True,columns=tmp.columns).reset_index(drop=True) 75 | reg_data.drop("type",axis=1,inplace=True) 76 | dvalid_data.drop("type",axis=1,inplace=True) 77 | dtrain_alldata.drop("type",axis=1,inplace=True) 78 | dtest_data.drop("type",axis=1,inplace=True) 79 | return tmp, tmp[all_data["type"] == "train"], tmp[all_data["type"] == "valid"], tmp[all_data["type"]=="train_all"], tmp[all_data["type"] == "test"] 80 | 81 | 82 | def getPast3MonthLoanPlanInterval(df, duser1, duser2): 83 | valid_mask = df.month.isin([8, 9, 10]) 84 | test_mask = df.month.isin([9, 10, 11]) 85 | window_size = 90 86 | for idx, mask in enumerate([valid_mask, test_mask]): 87 | tmp = df[mask].groupby(["uid","plannum"]).apply(lambda x: x.sort_values(["loan_time"], ascending=True)).reset_index(drop=True) 88 | tmp["last_sameplan_loantime"] = tmp.groupby(["uid","plannum"])["loan_time"].shift(1) 89 | tmp["last_sameplan_loan_interval"] = (tmp["loan_time"] - 
tmp['last_sameplan_loantime']).apply(lambda x:x.days+x.seconds/86400.0) 90 | ##每期的贷款期数pivot 91 | perPlanInterval= tmp.groupby(["uid","plannum"])["last_sameplan_loan_interval"].agg(['max','min','mean','median']).reset_index() 92 | perPlanInterval["plannum"] = perPlanInterval['plannum'].astype(str) + "_plannum_" + str(window_size) 93 | perPlanInterval = perPlanInterval.pivot(index='uid', columns='plannum').reset_index() 94 | new_list = ["uid"] 95 | for words in perPlanInterval.columns.get_values(): 96 | if "uid" in words : 97 | continue 98 | new_list.append('_'.join(words)) 99 | perPlanInterval.columns = new_list 100 | if idx == 0: 101 | duser1 = duser1.merge(perPlanInterval, how="left", on="uid") 102 | elif idx == 1: 103 | duser2 = duser2.merge(perPlanInterval, how="left", on="uid") 104 | return duser1, duser2 105 | 106 | 107 | #过去一个月三个月,消费、贷款、点击数不为0的天数 108 | tr_user, ts_user = getActionDays(t_order, "buy_time", tr_user, ts_user, 30, "buy") 109 | tr_user, ts_user = getActionDays(t_loan, "date", tr_user, ts_user, 30, "loan") 110 | tr_user, ts_user = getActionDays(t_click, "date", tr_user, ts_user, 30, "click") 111 | tr_user, ts_user = getActionDays(t_loan, "date", tr_user, ts_user, 90, "loan") 112 | 113 | 114 | ##小额、大额借贷的频率(期数?)小期数借款间隔 115 | tr_user, ts_user = getPast3MonthLoanPlanInterval(t_loan, tr_user, ts_user) 116 | 117 | #最近一个月贷款比上过去三个月贷款金额的均值 118 | ##用户前三个月每个月消费,贷款、金额、次数 119 | tr_user, ts_user = getAmtBeforeRatio(t_loan, "loan_amount", tr_user, ts_user) 120 | 121 | #最近一个月消费比上过去三个月消费金额的均值 122 | tr_user, ts_user = getAmtBeforeRatio(t_order, "order_amt", tr_user, ts_user) 123 | 124 | 125 | to_drop = ["uid","active_date","loan_sum","pay_end_date"] 126 | features = list(np.setdiff1d(tr_user.columns.tolist(), to_drop)) 127 | features = tr_user[features].columns.intersection(ts_user.columns) 128 | category_f = ["sex_age_limit"]#,u'pid_param_pos1', u'pid_param_pos2',u'pid_param_pos3', u'pid_param_pos4', u'pid_param_pos5'] 129 | random.seed(888) 130 | select_rows = 
random.sample(tr_user.index, int(len(tr_user.index)*0.7)) 131 | train_df = tr_user.loc[select_rows] 132 | valid_df = tr_user.drop(select_rows) 133 | 134 | dtrain = lgb.Dataset(train_df[features], label=train_df["loan_sum"], free_raw_data=False) 135 | dvalid = lgb.Dataset(valid_df[features], label=valid_df["loan_sum"], free_raw_data=False) 136 | dtrain_all = lgb.Dataset(tr_user[features], label=tr_user["loan_sum"], free_raw_data=False) 137 | dtest = lgb.Dataset(ts_user[features], free_raw_data=False) 138 | 139 | 140 | 141 | param = {'num_leaves':8,'num_boost_round':150, 'objective':'regression_l2','metric':'rmse',"learning_rate" : 0.05, "boosting":"gbdt"} 142 | bst = lgb.train(param, dtrain, valid_sets=[dtrain, dvalid], verbose_eval=100) 143 | print('train mae: %g' % sqrt(mean_squared_error(train_df["loan_sum"], bst.predict(dtrain.data)))) 144 | print('valid mae: %g' % sqrt(mean_squared_error(valid_df["loan_sum"], bst.predict(dvalid.data)))) 145 | 146 | 147 | imp = bst.feature_importance(importance_type='gain', iteration=-1) 148 | feat_importance = pd.Series(imp,bst.feature_name()).to_dict() 149 | feat_importance = sorted(feat_importance.iteritems() ,key = lambda asd:asd[1],reverse=True) 150 | imp = pd.DataFrame(feat_importance) 151 | 152 | features = list(imp[imp[1] != 0][0]) 153 | 154 | all_tmp, split_dtrain, split_dvalid, split_all, split_dtest = split_raw_data(dtrain.data[features], dvalid.data[features], dtrain_all.data[features], dtest.data[features]) 155 | dtrain.data.drop("type",axis=1,inplace=True) 156 | dvalid.data.drop("type",axis=1,inplace=True) 157 | dtrain_all.data.drop("type",axis=1,inplace=True) 158 | dtest.data.drop("type",axis=1,inplace=True) 159 | 160 | lr_train_data = lgb_create_features(bst, dtrain.data, split_dtrain) #(63695, 66594) 161 | lr_valid_data = lgb_create_features(bst, dvalid.data, split_dvalid) #(27298, 30197) 162 | lr_all_data = lgb_create_features(bst, dtrain_all.data, split_all) #(90993, 93892) 163 | lr_test_data = 
lgb_create_features(bst, dtest.data, split_dtest) 164 | 165 | 166 | 167 | 168 | ##部分数据训练, 169 | lr = Lasso(alpha=0.003, normalize=False, copy_X=True, max_iter=100000, warm_start =True, precompute=True) 170 | union_feature = list(lr_train_data.columns.intersection(lr_valid_data.columns).intersection(lr_test_data.columns)) 171 | model = lr.fit(lr_train_data[union_feature], train_df["loan_sum"]) 172 | print "num_iter: ",lr.n_iter_ 173 | print "num_coef: ", sum(lr.coef_!=0) 174 | pred_lasso_train = lr.predict(lr_train_data[union_feature]) 175 | pred_lasso_valid = lr.predict(lr_valid_data[union_feature]) 176 | rmse_lasso_train = sqrt(mean_squared_error(train_df["loan_sum"], pred_lasso_train)) 177 | rmse_lasso_valid = sqrt(mean_squared_error(valid_df["loan_sum"], pred_lasso_valid)) 178 | print('train mae: %g' % rmse_lasso_train) 179 | print('valid mae: %g' % rmse_lasso_valid) 180 | 181 | 182 | 183 | ##全部数据训练 184 | all_features = list(lr_all_data.columns.intersection(lr_test_data.columns)) 185 | lr = Lasso(alpha=0.003, normalize=False, copy_X=True, max_iter=100000, warm_start =True, precompute=True) 186 | model = lr.fit(lr_all_data[all_features], tr_user["loan_sum"]) 187 | print "num_iter: ",lr.n_iter_ 188 | print "num_coef: ", sum(lr.coef_!=0) 189 | rmse_lasso_train = sqrt(mean_squared_error(tr_user["loan_sum"], lr.predict(lr_all_data[all_features]))) 190 | print('train mae: %g' % rmse_lasso_train) 191 | 192 | ##保存上传文件 193 | lr_pred = lr.predict(lr_test_data[all_features]) 194 | id_test = ts_user['uid'] 195 | lr_sub = pd.DataFrame({'uid': id_test, 'loan_sum_lr': lr_pred}) 196 | print(lr_sub.describe()) 197 | lr_sub.loc[lr_sub["loan_sum_lr"] < 0,"loan_sum_lr"] = 0 198 | now_time = time.strftime("%m-%d %H_%M_%S", time.localtime()) 199 | lr_sub.to_csv("./submission/" +now_time+'_gbdt_lr_Vscore_'+ str(rmse_lasso_valid) +'.csv', index=False, header=False) 200 | #lr_sub = pd.read_csv("/Users/zhangkai/Desktop/gbdt_lr.csv") 201 | 202 | 203 | ##模型融合 204 | final = 
lr_sub.merge(lgb_sub,on="uid",how="left") 205 | 206 | ##手动调参 207 | final["loan_sum"]= 0.3 * final["loan_sum_lr"] + 0.7* final["lgb_loan_sum"] 208 | final.loc[final["loan_sum"] < 0,"loan_sum"] = 0 209 | print(final.describe()) 210 | final[["uid","loan_sum"]].to_csv("./submission/" + now_time + 'lightgbm_gbdt_lr.csv', index=False, header=False) 211 | 212 | 213 | #######xgboost 214 | import xgboost as xgb 215 | import random 216 | random.seed(888) 217 | 218 | 219 | xgb_dtrain = xgb.DMatrix(train_df[features], label=train_df["loan_sum"]) 220 | xgb_dvalid = xgb.DMatrix(valid_df[features], label=valid_df["loan_sum"]) 221 | xgb_dtrain_all = xgb.DMatrix(tr_user[features], label=tr_user["loan_sum"]) 222 | xgb_dtest = xgb.DMatrix(ts_user[features]) 223 | 224 | watchlist = [(xgb_dtrain, 'train'), (xgb_dvalid, 'eval')] 225 | param = { 226 | 'booster': 'gbtree', 227 | 'objective': 'reg:linear', 228 | 'eval_metric': 'rmse', 229 | 'eta': 0.08, 230 | 'num_round': 500, #300 231 | 'max_depth': 3, 232 | 'nthread': -1, 233 | 'seed': 888, 234 | 'silent': 1, 235 | 'lambda':1500, 236 | 'min_child_weight': 4 237 | } 238 | #{'n_estimators': 100, 'max_depth': 5, } 239 | 240 | 241 | xgbmodel = xgb.train(param, xgb_dtrain, param['num_round'], watchlist, verbose_eval=100) 242 | rmse_xgb_train = sqrt(mean_squared_error(train_df["loan_sum"], xgbmodel.predict(xgb_dtrain))) 243 | print('train mae: %g' % rmse_xgb_train) 244 | rmse_xgb_valid = sqrt(mean_squared_error(valid_df["loan_sum"], xgbmodel.predict(xgb_dvalid))) 245 | print('valid mae: %g' % rmse_xgb_valid) 246 | 247 | xgbmodel = xgb.train(param, xgb_dtrain_all, param['num_round'], verbose_eval=1) 248 | print('valid mae: %g' % sqrt(mean_squared_error(tr_user["loan_sum"], xgbmodel.predict(xgb_dtrain_all)))) 249 | 250 | 251 | xgb_pred = xgbmodel.predict(xgb_dtest) 252 | id_test = ts_user['uid'] 253 | xgb_sub = pd.DataFrame({'uid': id_test, 'loan_sum_xgb': xgb_pred}) 254 | print(xgb_sub.describe()) 255 | xgb_sub.loc[xgb_sub["loan_sum_xgb"] < 
0,"loan_sum_xgb"] = 0 256 | now_time = time.strftime("%m-%d %H_%M_%S", time.localtime()) 257 | xgb_sub.to_csv("./submission/" +now_time+'_xgb_Vscore_'+ str(rmse_xgb_valid) +'.csv', index=False, header=False) 258 | 259 | 260 | ##模型融合 261 | final = lr_sub.merge(lgb_sub,on="uid",how="left").merge(xgb_sub, on="uid",how="left") 262 | 263 | final = lgb_sub.merge(xgb_sub, on="uid",how="left") 264 | 265 | final["loan_sum"]= 0.9* final["lgb_loan_sum"] + 0.1 * final["loan_sum_xgb"] 266 | 267 | ##手动调参 268 | final["loan_sum"]= 0.3 * final["loan_sum_lr"] + 0.5* final["lgb_loan_sum"] + 0.2 * final["loan_sum_xgb"] 269 | final.loc[final["loan_sum"] < 0,"loan_sum"] = 0 270 | print(final.describe()) 271 | final[["uid","loan_sum"]].to_csv("./submission/" + now_time + 'lightgbm_gbdt_lr_xgb.csv', index=False, header=False) 272 | 273 | 274 | 275 | ###自动学习融合参数 276 | ensemble_train = pd.DataFrame({'lasso': pred_lasso_train, 'lgb': pred_lgb_train, 'xgb': rmse_xgb_train}) 277 | ensemble_valid = pd.DataFrame({'lasso': pred_lasso_valid, 'lgb': pred_lgb_valid, 'xgb': rmse_xgb_valid}) 278 | ensemble_test = pd.DataFrame({'lasso': lr_pred, 'lgb': pred, 'xgb': xgb_pred}) 279 | ensemble_lr = Lasso(alpha=0.000001, normalize=False, copy_X=True, max_iter=100000, warm_start =True, precompute=True, fit_intercept=False,positive=True) 280 | ensemble_lr.fit(ensemble_train, train_df["loan_sum"]) 281 | print "num_iter: ",ensemble_lr.n_iter_ 282 | print "coef: ", ensemble_lr.coef_ 283 | rmse_ensemble_train = sqrt(mean_squared_error(train_df["loan_sum"], ensemble_lr.predict(ensemble_train))) 284 | rmse_ensemble_valid = sqrt(mean_squared_error(valid_df["loan_sum"], ensemble_lr.predict(ensemble_valid))) 285 | print('train mae: %g' % rmse_ensemble_train) 286 | print('valid mae: %g' % rmse_ensemble_valid) 287 | final["loan_sum"] = ensemble_lr.coef_[0] * final["loan_sum_lr"] + ensemble_lr.coef_[1] * final["lgb_loan_sum"] 288 | 289 | 290 | ensemble_model = lgb.LGBMRegressor(num_leaves = 3, learning_rate = 0.05, 
n_estimators= 100) 291 | ensemble_model.fit(ensemble_train, train_df["loan_sum"]) 292 | rmse_ensemble_train = sqrt(mean_squared_error(train_df["loan_sum"], ensemble_model.predict(ensemble_train))) 293 | rmse_ensemble_valid = sqrt(mean_squared_error(valid_df["loan_sum"], ensemble_model.predict(ensemble_valid))) 294 | print('train mae: %g' % rmse_ensemble_train) 295 | print('valid mae: %g' % rmse_ensemble_valid) 296 | 297 | ensemble_pred = ensemble_model.predict(ensemble_test) 298 | id_test = ts_user['uid'] 299 | stacking_sub = pd.DataFrame({'uid': id_test, 'ensemble_pred': ensemble_pred}) 300 | print(stacking_sub.describe()) 301 | stacking_sub.loc[stacking_sub["ensemble_pred"] < 0,"ensemble_pred"] = 0 302 | now_time = time.strftime("%m-%d %H_%M_%S", time.localtime()) 303 | stacking_sub[["uid","ensemble_pred"]].to_csv("./submission/" +now_time+'_ensemble_pred.csv', index=False, header=False) 304 | 305 | 306 | 307 | 308 | ##FM 309 | from fastFM import als 310 | cscMatrix = sparseDfToCsc(lr_train_data[union_feature]) 311 | validCscdMatrix = sparseDfToCsc(lr_valid_data[union_feature]) 312 | fm = als.FMRegression(n_iter=18, init_stdev=0.1, rank=8, l2_reg_w=40000, l2_reg_V=100000) 313 | fm = als.FMRegression(n_iter=18, init_stdev=0.1, rank=2, l2_reg_w=0.001, l2_reg_V=0.006) 314 | 315 | fm.fit(cscMatrix,train_df["loan_sum"]) 316 | print(sum(fm.w_ !=0)) 317 | print( sqrt(mean_squared_error(train_df["loan_sum"], fm.predict(cscMatrix)))) 318 | rmse_fm_valid= sqrt(mean_squared_error(valid_df["loan_sum"], fm.predict(validCscdMatrix))) 319 | print(rmse_fm_valid ) 320 | 321 | 322 | ##FM全部数据训练 323 | fm = als.FMRegression(n_iter=18, init_stdev=0.1, rank=2, l2_reg_w=40000, l2_reg_V=100000) 324 | all_features = list(lr_all_data.columns.intersection(lr_test_data.columns)) 325 | allcscMatrix = sparseDfToCsc(lr_all_data[all_features]) 326 | fm.fit(allcscMatrix, tr_user["loan_sum"]) 327 | rmse_fm_train = sqrt(mean_squared_error(tr_user["loan_sum"], fm.predict(allcscMatrix))) 328 | 
print('train rmse: %g' % rmse_fm_train) 329 | 330 | ##FM保存上传文件 331 | testcscMatrix = sparseDfToCsc(lr_test_data[all_features]) 332 | fm_pred = fm.predict(testcscMatrix) 333 | id_test = ts_user['uid'] 334 | fm_sub = pd.DataFrame({'uid': id_test, 'loan_sum_fm': fm_pred}) 335 | print(fm_sub.describe()) 336 | fm_sub.loc[fm_sub["loan_sum_fm"] < 0,"loan_sum_fm"] = 0 337 | now_time = time.strftime("%m-%d %H_%M_%S", time.localtime()) 338 | fm_sub[["uid","loan_sum_fm"]].to_csv("./submission/" +now_time+'_gbdt_fm_Vscore_'+ str(rmse_fm_valid)+'.csv', index=False, header=False) 339 | #lr_sub = pd.read_csv("/Users/zhangkai/Desktop/gbdt_lr.csv") 340 | 341 | 342 | 343 | #####Random Forest 344 | from sklearn.ensemble import RandomForestRegressor 345 | #[m/3] 346 | regr = RandomForestRegressor(n_estimators= 100, max_depth=3, criterion= "mae", random_state=0, n_jobs = 30, oob_score =False, max_features= "log2", warm_start = True) 347 | rf_feature = list(set(features) - set(category_f)) 348 | 349 | indices_to_keep = ~x_train[rf_feature].isin([np.nan, np.inf, -np.inf]).any(1) 350 | 351 | regr.fit(x_train[rf_feature].replace([np.inf,np.nan], 0), x_train["delivery_duration"]) 352 | 353 | rf_train = regr.predict(x_train[rf_feature].fillna(0)) 354 | rf_test = regr.predict(x_test[rf_feature].fillna(0)) 355 | rf_valid = regr.predict(x_valid[rf_feature].fillna(0)) 356 | 357 | 358 | print('train mae: %g' % np.mean(np.abs((np.power(2,x_train["delivery_duration"]) -1) - (np.power(2,rf_train) -1) ))) 359 | print('valid mae: %g' % np.mean(np.abs((np.power(2,x_valid["delivery_duration"])-1) - (np.power(2, rf_valid) -1) ))) 360 | test_mae1 = np.mean(np.abs((np.power(2,x_test["delivery_duration"])-1) - (np.power(2, rf_test) -1) )) 361 | print('test mae: %g' % test_mae1) 362 | 363 | 364 | regr.feature_importances_,rf_feature 365 | 366 | feat_importance = pd.Series(regr.feature_importances_,rf_feature).to_dict() 367 | feat_importance = sorted(feat_importance.iteritems() ,key = lambda 
asd:asd[1],reverse=True) 368 | imp = pd.DataFrame(feat_importance) 369 | 370 | 371 | 372 | 373 | 374 | 375 | #### 376 | ##zscore 377 | z_dtrain = copy.deepcopy(train_df[features]) 378 | z_dvalid = copy.deepcopy(valid_df[features]) 379 | z_dtrain_all = copy.deepcopy(tr_user[features]) 380 | z_dtest = copy.deepcopy(ts_user[features]) 381 | 382 | for col in set(z_dvalid.columns) - set(category_f) -set(["sex"]) - set(to_drop): 383 | col_zscore = col + '_zscore' 384 | z_dtrain_all[col_zscore] = ((z_dtrain_all[col] - z_dtrain_all[col].mean())/z_dtrain_all[col].std(ddof=0)).replace([np.inf, -np.inf, np.nan], 0) 385 | z_dtest[col_zscore] = ((z_dtest[col] - z_dtest[col].mean())/z_dtest[col].std(ddof=0)).replace([np.inf, -np.inf, np.nan], 0) 386 | z_dtrain[col_zscore] = ((z_dtrain[col] - z_dtrain_all[col].mean())/z_dtrain_all[col].std(ddof=0)).replace([np.inf, -np.inf, np.nan], 0) 387 | z_dvalid[col_zscore] = ((z_dvalid[col] - z_dtrain_all[col].mean())/z_dtrain_all[col].std(ddof=0)).replace([np.inf, -np.inf, np.nan], 0) 388 | 389 | col_zscore = [] 390 | for i in z_dtrain_all.columns: 391 | if '_zscore' in i: 392 | col_zscore.extend([i]) 393 | 394 | lr_zscore_train = pd.concat([lr_train_data[union_feature].reset_index(drop=True), z_dtrain[col_zscore].reset_index(drop=True)], axis=1) 395 | lr_zscore_valid = pd.concat([lr_valid_data[union_feature].reset_index(drop=True), z_dvalid[col_zscore].reset_index(drop=True)], axis=1) 396 | 397 | lr = Lasso(alpha=0.003, normalize=False, copy_X=True, max_iter=100000, warm_start =True, precompute=True) 398 | model = lr.fit(lr_zscore_train, train_df["loan_sum"]) 399 | print "num_iter: ",lr.n_iter_ 400 | print "num_coef: ", sum(lr.coef_!=0) 401 | print('train mae: %g' % sqrt(mean_squared_error(train_df["loan_sum"], lr.predict(lr_zscore_train)))) 402 | print('valid mae: %g' % sqrt(mean_squared_error(valid_df["loan_sum"], lr.predict(lr_zscore_valid)))) 403 | -------------------------------------------------------------------------------- 
# ---------------------------------------------------------------------------
# stacking.py -- hand-rolled vecstack-style stacking for the loan competition.
# ---------------------------------------------------------------------------
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from vecstack import stacking  # NOTE: shadowed by the local stacking() defined below
from math import sqrt

import time        # added: used to timestamp the submission filename below
import numpy as np
import pandas as pd  # added: used to build the submission DataFrame below
import scipy.stats as st
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score

# -------------------------------------------------------------------------------
# -------------------------------------------------------------------------------


def transformer(y, func=None):
    """Apply *func* to y, or return y unchanged when func is None."""
    if func is None:
        return y
    else:
        return func(y)

# -------------------------------------------------------------------------------
# -------------------------------------------------------------------------------


def stacking(models, X_train, y_train, X_test, regression=True,
             transform_target=None, transform_pred=None,
             metric=None, n_folds=4, stratified=False,
             shuffle=False, random_state=0, verbose=0):
    """Out-of-fold stacking of first-level *models*.

    Returns (S_train, S_test): one column of level-1 predictions per model.
    S_train holds out-of-fold predictions; S_test averages (regression) or
    takes the mode of (classification) the per-fold test predictions.
    """
    # Print type of task
    if regression and verbose > 0:
        print('task: [regression]')
    elif not regression and verbose > 0:
        print('task: [classification]')

    # Specify default metric for cross-validation
    if metric is None and regression:
        metric = mean_absolute_error
    elif metric is None and not regression:
        metric = accuracy_score

    # Print metric
    if verbose > 0:
        print('metric: [%s]\n' % metric.__name__)

    # Split indices to get folds (stratified can be used only for classification)
    if stratified and not regression:
        kf = StratifiedKFold(y_train, n_folds, shuffle=shuffle, random_state=random_state)
    else:
        kf = KFold(len(y_train), n_folds, shuffle=shuffle, random_state=random_state)

    # Create empty numpy arrays for stacking features
    S_train = np.zeros((X_train.shape[0], len(models)))
    S_test = np.zeros((X_test.shape[0], len(models)))

    # Loop across models
    for model_counter, model in enumerate(models):
        if verbose > 0:
            print('model %d: [%s]' % (model_counter, model.__class__.__name__))

        # Temporary predictions for the test set made in each fold
        S_test_temp = np.zeros((X_test.shape[0], len(kf)))

        # Loop across folds
        for fold_counter, (tr_index, te_index) in enumerate(kf):
            X_tr = X_train.loc[tr_index]
            y_tr = y_train.loc[tr_index]
            X_te = X_train.loc[te_index]
            y_te = y_train.loc[te_index]

            # Fit 1-st level model
            model = model.fit(X_tr, transformer(y_tr, func=transform_target))
            # Predict out-of-fold part of train set
            S_train[te_index, model_counter] = transformer(model.predict(X_te), func=transform_pred)
            # Predict full test set
            S_test_temp[:, fold_counter] = transformer(model.predict(X_test), func=transform_pred)

            if verbose > 1:
                print('    fold %d: [%.8f]' % (fold_counter, metric(y_te, S_train[te_index, model_counter])))

        # Compute mean or mode of predictions for test set
        if regression:
            S_test[:, model_counter] = np.mean(S_test_temp, axis=1)
        else:
            S_test[:, model_counter] = st.mode(S_test_temp, axis=1)[0].ravel()

        if verbose > 0:
            print('    ----')
            print('    MEAN: [%.8f]\n' % (metric(y_train, S_train[:, model_counter])))

    return (S_train, S_test)


X_train = tr_user[features].replace([np.inf, np.nan], 0).reset_index(drop=True)
X_test = ts_user[features].replace([np.inf, np.nan], 0).reset_index(drop=True)
y_train = tr_user["loan_sum"].reset_index(drop=True)


# Caution! All models and parameter values are just
# demonstrational and shouldn't be considered as recommended.
# Initialize 1-st level models.
models = [
    ExtraTreesRegressor(random_state=0, n_jobs=-1,
                        n_estimators=300, max_depth=3),

    RandomForestRegressor(random_state=0, n_jobs=-1,
                          n_estimators=300, max_depth=3),

    XGBRegressor(seed=0, learning_rate=0.05,
                 n_estimators=300, max_depth=3),

    LGBMRegressor(num_leaves=8, learning_rate=0.05, n_estimators=300)
]

# Compute stacking features
S_train, S_test = stacking(models, X_train, y_train, X_test, regression=True,
                           metric=mean_squared_error, n_folds=5, shuffle=True,
                           random_state=0, verbose=2)

# Fit 2-nd level model
model = LGBMRegressor(num_leaves=8, learning_rate=0.05, n_estimators=300)
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)

id_test = ts_user['uid']
stacking_sub = pd.DataFrame({'uid': id_test, 'stacking_loan_sum': y_pred})
print(stacking_sub.describe())
stacking_sub.loc[stacking_sub["stacking_loan_sum"] < 0, "stacking_loan_sum"] = 0
print('saving submission...')
now_time = time.strftime("%m-%d %H_%M_%S", time.localtime())
stacking_sub[["uid", "stacking_loan_sum"]].to_csv("./submission/" + now_time + '_stacking.csv', index=False, header=False)


# ---------------------------------------------------------------------------
# useless.py -- feature experiments that did not improve the score.
# ---------------------------------------------------------------------------
## Gap between window purchase amount and loan amount; made offline score
## slightly worse.
tr_user["gap_buy_loan_amt"] = tr_user["order_amt30"] - tr_user["sum_loanAmt_30"]
ts_user["gap_buy_loan_amt"] = ts_user["order_amt30"] - ts_user["sum_loanAmt_30"]
### Estimate remaining credit quota.
tr_user, ts_user = getRemainingLoanAmt(t_loan, tr_user, ts_user)
tr_user["remainingAmt2"] = tr_user["limit"] - tr_user["unpayedLoanAmt"]
ts_user["remainingAmt2"] = ts_user["limit"] - ts_user["unpayedLoanAmt"]


def getRemainingLoanAmt(df_loan, duser1, duser2):
    """Attach per-user unpaid loan amount (unpayedLoanAmt) to both user tables.

    idx 0 builds the validation view (months < 11), idx 1 the test view
    (months < 12).  A loan is considered still unpaid when the running
    cumulative amount exceeds the amount inferred as already repaid
    (total loaned minus the credit limit).
    """
    valid_mask = df_loan.month < 11
    test_mask = df_loan.month < 12
    for idx, mask in enumerate([valid_mask, test_mask]):
        df_loan_tmp = df_loan[mask].reset_index(drop=True)
        # loans ordered by repayment deadline so the cumsum walks oldest-due first
        payingLoan = df_loan_tmp.groupby("uid").apply(lambda x: x.sort_values("pay_end_date", ascending=True)).reset_index(drop=True)
        payingLoan["payedLoanAmt"] = payingLoan.groupby("uid")["loan_amount"].cumsum().rename("payedLoanAmt").reset_index(drop=True)
        historyLoanAmt = df_loan_tmp.groupby("uid")["loan_amount"].sum().rename("totalLoanAmt").reset_index()
        # both branches are identical except for which user table they read/update
        target = duser1 if idx == 0 else duser2
        historyLoanAmtLimit = historyLoanAmt.merge(target[["uid", "limit"]], on="uid", how="left")
        historyLoanAmtLimit["payedAmt"] = historyLoanAmtLimit["totalLoanAmt"] - historyLoanAmtLimit["limit"]
        payedAmt = historyLoanAmtLimit.loc[historyLoanAmtLimit["payedAmt"] > 0, ["uid", "payedAmt"]].reset_index(drop=True)
        payingLoan = payingLoan.merge(payedAmt, on="uid", how="left")
        unpayedLoan = payingLoan[payingLoan["payedLoanAmt"] > payingLoan["payedAmt"]].reset_index(drop=True)
        unpayedLoanAmt = unpayedLoan.groupby("uid")["loan_amount"].sum().rename("unpayedLoanAmt").reset_index()
        if idx == 0:
            duser1 = duser1.merge(unpayedLoanAmt, on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(unpayedLoanAmt, on="uid", how="left")
    return duser1, duser2


tr_user = encodeByuserProfile(tr_user, 10)
ts_user = encodeByuserProfile(ts_user, 11)


## Cohort encoding: mean monthly loan amount of users sharing a profile key.
def encodeByuserProfile(duser, monthlimit):
    """Add encode_mean = mean monthly loan amount of the user's profile cohort.

    The cohort key combines sex_age_limit with the rounded limit_increase;
    only months <= monthlimit contribute to the mean.
    """
    duser["encode_key"] = duser["sex_age_limit"].astype(str) + duser["limit_increase"].apply(lambda x: round(x, 1)).astype(str)
    monthLoanAmt = t_loan.groupby(["uid", "month"], as_index=False)["loan_amount"].agg(sum)
    encodeMean = monthLoanAmt[monthLoanAmt.month <= monthlimit].groupby(["uid"], as_index=False)["loan_amount"].mean()
    encodeMean.columns = ["uid", "monthLoanMeanAmt"]
    encodeMean = encodeMean.merge(duser[["uid", "encode_key"]], on="uid", how="left")
    encodeMean = encodeMean.groupby("encode_key")["monthLoanMeanAmt"].mean().rename("encode_mean").reset_index()
    duser = duser.merge(encodeMean, on="encode_key", how="left")
    duser.drop("encode_key", axis=1, inplace=True)
    return duser


## Confidence interval of the monthly loan amount over the past three months.
def getConfidenceInterval(duser1, duser2, degree):
    """Attach count/mean/std and a z-based confidence interval of monthly loans.

    Raises ValueError for an unsupported *degree* (the original if/elif chain
    silently left z undefined, causing a NameError later).
    """
    # mean +/- z * std / sqrt(n)
    z_table = {0.8: 1.282, 0.85: 1.440, 0.90: 1.645, 0.95: 1.960}
    try:
        z = z_table[degree]
    except KeyError:
        raise ValueError("unsupported confidence degree: %s" % degree)
    valid_mask = t_loan.month < 11
    test_mask = t_loan.month < 12
    for idx, mask in enumerate([valid_mask, test_mask]):
        month_loanAmt = t_loan[mask].groupby(["uid", "month"])['loan_amount'].sum().rename("monthLoanAmt").reset_index()
        month_loanAmt_status = month_loanAmt.groupby(["uid"])["monthLoanAmt"].agg(["count", "mean", "std"]).reset_index().fillna(0.0)
        month_loanAmt_status["upperConfidence"] = month_loanAmt_status["mean"] + z * month_loanAmt_status["std"] / month_loanAmt_status["count"].apply(lambda x: sqrt(x))
        month_loanAmt_status["lowerConfidence"] = month_loanAmt_status["mean"] - z * month_loanAmt_status["std"] / month_loanAmt_status["count"].apply(lambda x: sqrt(x))
        month_loanAmt_status.columns = ["uid", "loanMonthCount", "loanMonthAmtMean", "loanMonthAmtStd", "upperConfidence", "lowerConfidence"]
        if idx == 0:
            duser1 = duser1.merge(month_loanAmt_status, on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(month_loanAmt_status, on="uid", how="left")
    return duser1, duser2


def getOrderAmtWinds(df, duser, tuser, window_size):
    """Total order amount per user inside the sliding window."""
    valid_mask, test_mask = get_windows_mask(df, "buy_time", window_size)
    for idx, mask in enumerate([valid_mask, test_mask]):
        uid_order = df[mask].groupby(["uid"])["order_amt"].sum().rename("order_amt_" + str(window_size)).reset_index()
        if idx == 0:
            duser = duser.merge(uid_order, how="left", on='uid')
        elif idx == 1:
            tuser = tuser.merge(uid_order, how="left", on='uid')
    return duser, tuser


## Stay time on each page.
tr_user, ts_user = getStaytimePid(t_click, tr_user, ts_user)


def getStaytimePid(df_click, duser1, duser2):
    """One pivoted column per page id with the user's total stay time on it."""
    valid_mask = df_click.month.isin([10, 8, 9])
    test_mask = df_click.month.isin([11, 10, 9])
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df_click[mask].groupby(["uid", "last_click_pid"])["click_interval"].sum().rename("pidStaytime").reset_index()
        tmp1 = tmp.pivot(index='uid', columns='last_click_pid', values='pidStaytime').reset_index().fillna(0)
        tmp1.columns = ['uid'] + ['pid_' + str(i) + '_staytime' for i in list(tmp1.columns)[1:]]
        tmp1.drop("pid_0.0_staytime", inplace=True, axis=1)
        # BUG FIX: merge the pivoted frame tmp1 (one row per uid); the original
        # merged the long-format tmp, duplicating uid rows in the user table.
        if idx == 0:
            duser1 = duser1.merge(tmp1, on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(tmp1, on="uid", how="left")
    return duser1, duser2


tr_user, ts_user = getTotalPidStaytime(t_click, tr_user, ts_user)
def getTotalPidStaytime(df_click, duser1, duser2):
    """Total page stay time per user (valid window Aug-Oct, test window Sep-Nov)."""
    month_masks = (df_click.month.isin([8, 9, 10]), df_click.month.isin([11, 10, 9]))
    for idx, month_mask in enumerate(month_masks):
        stay = (df_click[month_mask]
                .groupby(["uid"])["click_interval"]
                .sum()
                .rename("Staytime")
                .reset_index())
        if idx == 0:
            duser1 = duser1.merge(stay, on="uid", how="left")
        else:
            duser2 = duser2.merge(stay, on="uid", how="left")
    return duser1, duser2


tr_user, ts_user = uidOrderAmtCntWinds(t_order, tr_user, ts_user, 90)

tr_user, ts_user = uidOrderAmtCntWinds(t_order, tr_user, ts_user, 60)

tr_user, ts_user = uidOrderAmtCntWinds(t_order, tr_user, ts_user, 30)


def uidOrderAmtCntWinds(df, duser, tuser, window_size):
    """Order count and total amount per user inside the sliding window."""
    valid_mask, test_mask = get_windows_mask(df, "buy_time", window_size)
    suffix = str(window_size)
    for idx, window_mask in enumerate([valid_mask, test_mask]):
        stats = (df[window_mask]
                 .groupby(["uid"])["order_amt"]
                 .agg(["count", "sum"])
                 .reset_index())
        stats.columns = ['uid', 'order_cnt_' + suffix, "order_amt_" + suffix]
        if idx == 0:
            duser = duser.merge(stats, how="left", on='uid')
        else:
            tuser = tuser.merge(stats, how="left", on='uid')
    return duser, tuser


tr_user, ts_user = uidOrderLoanAmtWinds(tr_user, ts_user, 30)

tr_user, ts_user = uidOrderLoanAmtWinds(tr_user, ts_user, 60)

tr_user, ts_user = uidOrderLoanAmtWinds(tr_user, ts_user, 90)


def uidOrderLoanAmtWinds(duser, tuser, window_size):
    """Gap between order spend and loan amount inside the sliding window."""
    valid_ordermask, test_ordermask = get_windows_mask(t_order, "buy_time", window_size)
    valid_loanmask, test_loanmask = get_windows_mask(t_loan, "loan_time", window_size)
    suffix = str(window_size)
    order_col, loan_col, gap_col = ("orderAmt" + suffix, "loanAmt" + suffix,
                                    "loanOrderGapAmt" + suffix)
    # pair each order-window mask with its matching loan-window mask
    for idx, (order_mask, loan_mask) in enumerate([(valid_ordermask, valid_loanmask),
                                                   (test_ordermask, test_loanmask)]):
        order_status = (t_order[order_mask]
                        .groupby(["uid"])["order_amt"]
                        .sum()
                        .rename(order_col)
                        .reset_index())
        loanstatus = (t_loan[loan_mask]
                      .groupby(["uid"])["loan_amount"]
                      .sum()
                      .rename(loan_col)
                      .reset_index())
        order_loan = order_status.merge(loanstatus, on="uid", how="left")
        order_loan[gap_col] = order_loan[order_col] - order_loan[loan_col]
        if idx == 0:
            duser = duser.merge(order_loan[["uid", gap_col]], how="left", on='uid')
        else:
            tuser = tuser.merge(order_loan[["uid", gap_col]], how="left", on='uid')
    return duser, tuser


def getLoanIntervalShiftDiff(df_loan, duser1, duser2):
    """First and second differences of the loan-interval series, taken at the
    user's most recent loan day."""
    month_masks = (df_loan.month.isin([8, 9, 10]), df_loan.month.isin([8, 9, 10, 11]))
    for idx, month_mask in enumerate(month_masks):
        window = df_loan[month_mask].reset_index(drop=True)
        window["loanIntervalDiff"] = window.groupby("uid")["loan_interval"].diff()
        window["loanIntervalDiff2"] = window.groupby("uid")["loanIntervalDiff"].diff()
        # keep only the rows of each user's latest loan day
        latest = window.groupby(['uid'])['loan_time'].transform(max) == window['loan_time']
        window = window[latest]
        cols = ["uid", "loanIntervalDiff", "loanIntervalDiff2"]
        if idx == 0:
            duser1 = duser1.merge(window[cols], on="uid", how="left")
        else:
            duser2 = duser2.merge(window[cols], on="uid", how="left")
    return duser1, duser2
def getMonthLoanShiftDiff_WRONG(df_loan, duser1, duser2):
    """Month-over-month loan-amount differences (kept for reference).

    NOTE(review): the _WRONG suffix marks this as a known-flawed feature --
    for the test table it merges the month==11 diff, which is computed from
    the full loan history including November; confirm before reusing.
    """
    uidMonthLoan = df_loan.groupby(["uid", "month"])["loan_amount"].sum().rename("month_loan_amt").reset_index()
    # pivot + stack fills the months a user did not borrow in with 0
    uidMonthLoan = uidMonthLoan.pivot(index='uid', columns='month', values='month_loan_amt').fillna(0)
    uidMonthLoan = uidMonthLoan.stack().reset_index()
    uidMonthLoan.columns = ["uid", "month", "month_loan_amt"]
    uidMonthLoan = uidMonthLoan.groupby(["uid"]).apply(lambda x: x.sort_values(["month"], ascending=True)).reset_index(drop=True)
    uidMonthLoan["monthLoanAmtDiff"] = uidMonthLoan.groupby("uid")["month_loan_amt"].apply(lambda x: x - x.shift(1))
    uidMonthLoan["monthLoanAmtDiff2"] = uidMonthLoan.groupby("uid")["monthLoanAmtDiff"].apply(lambda x: x - x.shift(1))
    duser1 = duser1.merge(uidMonthLoan[uidMonthLoan.month == 10][["uid", "monthLoanAmtDiff", "monthLoanAmtDiff2"]], on="uid", how="left")
    duser2 = duser2.merge(uidMonthLoan[uidMonthLoan.month == 11][["uid", "monthLoanAmtDiff", "monthLoanAmtDiff2"]], on="uid", how="left")
    return duser1, duser2


## Gap between the most recent loan time and the most recent purchase time.
tr_user, ts_user = getNearestLoanOrderInterval(t_loan, t_order, tr_user, ts_user)


def getNearestLoanOrderInterval(df_loan, df_order, duser1, duser2):
    """Day gap and amount gap between each user's latest loan and latest order."""
    window_size = 180
    loan_valid_mask, loan_test_mask = get_windows_mask(df_loan, "loan_time", window_size)
    order_valid_mask, order_test_mask = get_windows_mask(df_order, "buy_time", window_size)
    for i in [0, 1]:
        if i == 0:
            loan_tmp = df_loan[loan_valid_mask][["uid", "loan_time", "loan_amount"]]
            order_tmp = df_order[order_valid_mask][["uid", "buy_time", "order_amt"]]
        elif i == 1:
            loan_tmp = df_loan[loan_test_mask][["uid", "loan_time", "loan_amount"]]
            order_tmp = df_order[order_test_mask][["uid", "buy_time", "order_amt"]]
        # each user's most recent loan / most recent purchase
        uid_nearest_loan = loan_tmp[loan_tmp.groupby(['uid'])['loan_time'].transform(max) == loan_tmp['loan_time']]
        uid_nearest_order = order_tmp[order_tmp.groupby(['uid'])['buy_time'].transform(max) == order_tmp['buy_time']]
        uid_loan_order = uid_nearest_loan.merge(uid_nearest_order, on='uid', how='left')
        uid_loan_order["nearest_order_loan_interval"] = (uid_loan_order["loan_time"] - uid_loan_order["buy_time"]).apply(lambda x: x.days)
        uid_loan_order["nearest_order_loan_price"] = (uid_loan_order["loan_amount"] - uid_loan_order["order_amt"])
        if i == 0:
            duser1 = duser1.merge(uid_loan_order[["uid", "nearest_order_loan_interval", "nearest_order_loan_price"]], on="uid", how="left")
        elif i == 1:
            duser2 = duser2.merge(uid_loan_order[["uid", "nearest_order_loan_interval", "nearest_order_loan_price"]], on="uid", how="left")
    return duser1, duser2


## How much the user borrowed again before current loans were repaid.
tr_user, ts_user = loanAgainBeforePayed(tr_user, ts_user, 15)
tr_user, ts_user = loanAgainBeforePayed(tr_user, ts_user, 30)


def loanAgainBeforePayed(duser1, duser2, window_size):
    """Recent-window loan amount plus the principal still owed at window end.

    Relies on the module-level valid_end_date / test_end_date cut-offs.
    """
    valid_mask_loan, test_mask_loan = get_windows_mask(t_loan, "loan_time", window_size)
    for idx, mask in enumerate([valid_mask_loan, test_mask_loan]):
        tmp = t_loan[mask].groupby("uid")["loan_amount"].sum().rename("loanAmount").reset_index()
        if idx == 0:
            valid_before = pd.Timestamp(valid_end_date) - timedelta(days=window_size)
            still_mask = (t_loan["loan_time"] < valid_before) & (t_loan["pay_end_date"] >= pd.Timestamp(valid_end_date))
            stillLoan = t_loan[still_mask].reset_index(drop=True)
            stillLoan["topayed_num"] = (stillLoan["pay_end_date"] - pd.Timestamp(valid_end_date)).apply(lambda x: x.days / 30.0)
        elif idx == 1:
            test_before = pd.Timestamp(test_end_date) - timedelta(days=window_size)
            # BUG FIX: the original filtered on valid_before here (a leftover
            # from the idx == 0 branch), mixing the validation cut-off into
            # the test-window feature.
            still_mask = (t_loan["loan_time"] < test_before) & (t_loan["pay_end_date"] >= pd.Timestamp(test_end_date))
            stillLoan = t_loan[still_mask].reset_index(drop=True)
            stillLoan["topayed_num"] = (stillLoan["pay_end_date"] - pd.Timestamp(test_end_date)).apply(lambda x: x.days / 30.0)
        # remaining principal = per-instalment amount * instalments still to pay
        stillLoan["debtAmt"] = stillLoan["amt_per_plan"] * stillLoan["topayed_num"]
        stillLoan = stillLoan.groupby("uid")["debtAmt"].sum().rename("stillDebtAmt").reset_index()
        tmp = tmp.merge(stillLoan, on="uid", how="left")
        tmp["stillDebtAmt_" + str(window_size)] = tmp["loanAmount"] + tmp["stillDebtAmt"]
        if idx == 0:
            duser1 = duser1.merge(tmp[["uid", "stillDebtAmt_" + str(window_size)]], on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(tmp[["uid", "stillDebtAmt_" + str(window_size)]], on="uid", how="left")
    return duser1, duser2


tr_user, ts_user = loanOrderBehiviorSeries(tr_user, ts_user, 30)


def loanOrderBehiviorSeries(duser1, duser2, window_size):
    """Encode each user's loan(0)/order(1) event sequence as a categorical code."""
    valid_mask_loan, test_mask_loan = get_windows_mask(t_loan, "loan_time", window_size)
    valid_mask_order, test_mask_order = get_windows_mask(t_order, "buy_time", window_size)
    for idx, mask in enumerate([valid_mask_loan, test_mask_loan]):
        tmp_loan = t_loan.loc[mask, ['uid', 'date']].rename(columns={'date': 'behavior_time'})
        tmp_loan['behavior'] = 0
        if idx == 0:
            mask_order = valid_mask_order
        elif idx == 1:
            mask_order = test_mask_order
        tmp_order = t_order.loc[mask_order, ['uid', 'buy_time']].rename(columns={'buy_time': 'behavior_time'})
        tmp_order['behavior'] = 1
        tmp = pd.concat([tmp_loan.drop_duplicates(), tmp_order.drop_duplicates()]).groupby(["uid"]).apply(lambda x: x.sort_values(["behavior_time"], ascending=True)).reset_index(drop=True)
        tmp1 = tmp.groupby("uid")["behavior"].apply(lambda x: list(x)).rename("orderLoanBehavior").reset_index()
        tmp1["orderLoanBehavior"] = tmp1["orderLoanBehavior"].apply(lambda x: ''.join(map(str, x)))
        # turn each distinct 0/1 string into an integer label
        tmp1["orderLoanBehavior"] = tmp1["orderLoanBehavior"].astype('category')
        tmp1['orderLoanBehavior'].cat.categories = range(tmp1["orderLoanBehavior"].nunique())  # 4563
        tmp1["orderLoanBehavior"] = tmp1["orderLoanBehavior"].astype(int)
        if idx == 0:
            duser1 = duser1.merge(tmp1[["uid", "orderLoanBehavior"]], on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(tmp1[["uid", "orderLoanBehavior"]], on="uid", how="left")
    return duser1, duser2
tr_user, ts_user = getOrderLoanAmtdiff(tr_user, ts_user, 60)


## Mean time interval and price gap between purchases and loans.
# NOTE(review): getOrderLoanAmtdiff is defined TWICE below; the second
# definition silently shadows the first, so only the amount-gap variant is in
# effect when the call above runs (and it also runs before either def in this
# scratch file).  Keep both for reference, but only one can be live.
def getOrderLoanAmtdiff(duser1, duser2, window_size):
    """Variant 1: mean/max/min of the DAY interval between alternating
    order/loan events inside the window."""
    # .copy(): these are slices of the shared t_loan/t_order frames; adding
    # columns without a copy risks chained-assignment writes into the globals.
    tmp_loan = t_loan[["uid", "loan_time", "loan_amount"]].copy()
    tmp_order = t_order[["uid", "buy_time", "order_amt"]].copy()
    tmp_loan["type"] = "loan"
    tmp_order["type"] = "order"
    tmp_loan.columns = ["uid", "action_time", "amount", "type"]
    tmp_order.columns = ["uid", "action_time", "amount", "type"]
    loan_order = pd.concat([tmp_loan, tmp_order])
    loan_order = loan_order.sort_values(["action_time"], ascending=True).reset_index(drop=True)
    valid_mask, test_mask = get_windows_mask(loan_order, "action_time", window_size)
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = loan_order[mask].reset_index(drop=True)
        tmp['last_action_time'] = tmp.groupby(['uid'])[['action_time']].shift(1)
        tmp['last_type'] = tmp.groupby(['uid'])[['type']].shift(1)
        tmp["order_click_interval"] = (tmp["action_time"] - tmp['last_action_time']).apply(lambda x: x.days)
        # only keep transitions where the event type flips (order -> loan or loan -> order)
        uid_order_click = tmp[tmp["type"] != tmp["last_type"]].groupby("uid")["order_click_interval"].agg(["mean", "max", "min"]).reset_index()
        uid_order_click.columns = ["uid", "mean_order_click_interval", "max_order_click_interval", "min_order_click_interval"]
        if idx == 0:
            duser1 = duser1.merge(uid_order_click, on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(uid_order_click, on="uid", how="left")
    return duser1, duser2


def getOrderLoanAmtdiff(duser1, duser2, window_size):
    """Variant 2 (the one in effect): mean/max/min of the AMOUNT gap between
    alternating order/loan events.

    NOTE(review): the output columns are still named *_interval even though
    they hold amount gaps -- kept for backward compatibility with downstream
    feature lists.
    """
    tmp_loan = t_loan[["uid", "loan_time", "loan_amount"]].copy()   # .copy(): see variant 1
    tmp_order = t_order[["uid", "buy_time", "order_amt"]].copy()
    tmp_loan["type"] = "loan"
    tmp_order["type"] = "order"
    tmp_loan.columns = ["uid", "action_time", "amount", "type"]
    tmp_order.columns = ["uid", "action_time", "amount", "type"]
    loan_order = pd.concat([tmp_loan, tmp_order])
    loan_order = loan_order.sort_values(["action_time"], ascending=True).reset_index(drop=True)
    valid_mask, test_mask = get_windows_mask(loan_order, "action_time", window_size)
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = loan_order[mask].reset_index(drop=True)
        tmp['last_amount'] = tmp.groupby(['uid'])[['amount']].shift(1)
        tmp['last_type'] = tmp.groupby(['uid'])[['type']].shift(1)
        tmp["order_click_gap"] = (tmp["amount"] - tmp['last_amount'])
        uid_order_click = tmp[tmp["type"] != tmp["last_type"]].groupby("uid")["order_click_gap"].agg(["mean", "max", "min"]).reset_index()
        uid_order_click.columns = ["uid", "mean_order_click_interval", "max_order_click_interval", "min_order_click_interval"]
        if idx == 0:
            duser1 = duser1.merge(uid_order_click, on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(uid_order_click, on="uid", how="left")
    return duser1, duser2


## Per-user spend amount/count per category over the window (run three times
## for different time periods).
tr_user, ts_user = getCatePivotAmtCnt(t_order, tr_user, ts_user)


def getCatePivotAmtCnt(df, duser1, duser2):
    """Pivot per-category order count and amount into one column per category."""
    valid_mask = df.month.isin([8, 9, 10])
    test_mask = df.month.isin([9, 10, 11])
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df[mask]
        uid_months = tmp.groupby(["uid", "cate_id"])["order_amt"].agg(["count", "sum"]).reset_index()
        uid_months.rename({'count': 'cate_id_cnt', 'sum': 'cate_id_sum'}, axis='columns', inplace=True)
        if idx == 0:
            uid_months["cate_id"] = "cate_id_" + uid_months['cate_id'].astype(str)
        elif idx == 1:
            uid_months["cate_id"] = "cate_id_" + uid_months['cate_id'].astype(str)
        uid_months = uid_months.pivot(index='uid', columns='cate_id').reset_index().fillna(0)
        # flatten the (stat, category) MultiIndex into stat_category names;
        # iterating the Index directly replaces the deprecated .get_values()
        new_list = ["uid"]
        for words in uid_months.columns:
            if "uid" in words:
                continue
            new_list.append('_'.join(words))
        uid_months.columns = new_list
        if idx == 0:
            duser1 = duser1.merge(uid_months, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(uid_months, how="left", on="uid")
    return duser1, duser2
## Per-user spend/loan amount and count for each of the three history months.
def getPivotAmtCnt(df, column, duser1, duser2):
    """Pivot monthly count/sum of *column* into month1..month3 feature columns.

    For the test window the month labels are shifted down by one so that the
    validation and test tables share identical column names.
    """
    valid_mask = df.month.isin([10, 9, 8])
    test_mask = df.month.isin([11, 10, 9])
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df[mask]
        uid_months = tmp.groupby(["uid", "month"])[column].agg(["count", "sum"]).reset_index()
        uid_months.rename({'count': column + '_cnt', 'sum': column + '_sum'}, axis='columns', inplace=True)
        if idx == 0:
            uid_months["month"] = "month" + uid_months['month'].astype(str)
        elif idx == 1:
            # shift so month9..11 (test) aligns with month8..10 (valid)
            uid_months["month"] = "month" + (uid_months['month'] - 1).astype(str)
        uid_months = uid_months.pivot(index='uid', columns='month').reset_index().fillna(0)
        # flatten the (stat, month) MultiIndex; iterating the Index directly
        # replaces the deprecated .get_values()
        new_list = ["uid"]
        for words in uid_months.columns:
            if "uid" in words:
                continue
            new_list.append('_'.join(words))
        uid_months.columns = new_list
        if idx == 0:
            duser1 = duser1.merge(uid_months, how="left", on="uid")
        elif idx == 1:
            duser2 = duser2.merge(uid_months, how="left", on="uid")
    return duser1, duser2


## Number of distinct products a user clicked; hurt the score by ~0.001.
tr_user, ts_user = getClick(t_click, tr_user, ts_user)


def getClick(df_click, duser1, duser2):
    """Count distinct (pid, param) products each user clicked in the window.

    NOTE(review): this adds a pid_param column to the SHARED df_click frame
    as a side effect; later features see the extra column.
    """
    df_click["pid_param"] = df_click["pid"].astype(str) + "_" + df_click["param"].astype(str)
    valid_mask = df_click.month.isin([8, 9, 10])
    test_mask = df_click.month.isin([9, 10, 11])
    for idx, mask in enumerate([valid_mask, test_mask]):
        tmp = df_click[mask]
        product = tmp.groupby("uid")["pid_param"].nunique().rename("click_pdt_cnt").reset_index()
        if idx == 0:
            duser1 = duser1.merge(product, on="uid", how="left")
        elif idx == 1:
            duser2 = duser2.merge(product, on="uid", how="left")
    return duser1, duser2


## Loan amount aggregated per bucket of days: 1, 7, 15, 30.
tr_user, ts_user = getLoanAmtPerWindows(t_loan, tr_user, ts_user, 7)

tr_user, ts_user = getLoanAmtPerWindows(t_loan, tr_user, ts_user, 15)


def getLoanAmtPerWindows(df_loan, duser1, duser2, window_size):
    """Mean per-bucket loan amount, bucketing days of the year by window_size.

    NOTE(review): round(x / window_size) differs between Python 2 (floor
    division of ints, then round) and Python 3 (true division, banker's
    rounding) -- confirm which interpreter this was tuned on.
    Also mutates the shared df_loan by adding a dayofyear column.
    """
    valid_mask = df_loan.month < 11
    test_mask = df_loan.month < 12
    df_loan["dayofyear"] = df_loan["loan_time"].apply(lambda x: x.dayofyear)
    for idx, mask in enumerate([valid_mask, test_mask]):
        df_loan_tmp = df_loan[mask].reset_index(drop=True)
        df_loan_tmp["dayofwindows"] = df_loan_tmp["dayofyear"].apply(lambda x: round(x / window_size))
        uid_tmp = df_loan_tmp.groupby(["uid", "dayofwindows"])["loan_amount"].sum().rename("loanAmtIn" + str(window_size)).reset_index().groupby("uid")["loanAmtIn" + str(window_size)].mean().reset_index()
        if idx == 0:
            duser1 = duser1.merge(uid_tmp, on="uid", how="left")
            duser1["loanAmtIn" + str(window_size)] = duser1["loanAmtIn" + str(window_size)].fillna(0)
        elif idx == 1:
            duser2 = duser2.merge(uid_tmp, on="uid", how="left")
            duser2["loanAmtIn" + str(window_size)] = duser2["loanAmtIn" + str(window_size)].fillna(0)
    return duser1, duser2


## Amount, recency, amount/recency and instalment count of the last five loans.
tr_user, ts_user = getNthNearestLoan(tr_user, ts_user, 2)
tr_user, ts_user = getNthNearestLoan(tr_user, ts_user, 3)
tr_user, ts_user = getNthNearestLoan(tr_user, ts_user, 4)
tr_user, ts_user = getNthNearestLoan(tr_user, ts_user, 5)
t_loan_tmp.groupby(["uid"])["loan_time"].rank(ascending=False) 425 | t_loan_tmp = t_loan_tmp[t_loan_tmp["loan_rank"] == pos][["uid","loan_time","loan_amount","plannum"]] 426 | t_loan_tmp.columns = ["uid","loan_time", "pos_" + str(pos) + "_loan_amount", "plannum"] 427 | if idx ==0: 428 | t_loan_tmp["pos_"+str(pos)+"_loan_interval"] = (pd.Timestamp(valid_end_date)-t_loan_tmp["loan_time"]).apply(lambda x:x.total_seconds()) 429 | t_loan_tmp["loan_amount_interval_pos_" + str(pos)]= t_loan_tmp["pos_" + str(pos) + "_loan_amount"] / t_loan_tmp["pos_"+str(pos)+"_loan_interval"] 430 | t_loan_tmp = t_loan_tmp[["uid", "pos_" + str(pos) + "_loan_amount", "pos_"+str(pos)+"_loan_interval", "loan_amount_interval_pos_" + str(pos)]] 431 | duser1 = duser1.merge(t_loan_tmp, on="uid", how="left") 432 | elif idx == 1: 433 | t_loan_tmp["pos_"+str(pos)+"_loan_interval"] = (pd.Timestamp(test_end_date)-t_loan_tmp["loan_time"]).apply(lambda x:x.total_seconds()) 434 | t_loan_tmp["loan_amount_interval_pos_" + str(pos)]= t_loan_tmp["pos_" + str(pos) + "_loan_amount"] / t_loan_tmp["pos_"+str(pos)+"_loan_interval"] 435 | t_loan_tmp = t_loan_tmp[["uid", "pos_" + str(pos) + "_loan_amount", "pos_"+str(pos)+"_loan_interval", "loan_amount_interval_pos_" + str(pos)]] 436 | duser2 = duser2.merge(t_loan_tmp, on="uid", how="left") 437 | return duser1, duser2 438 | 439 | 440 | ##最近N的点击记录 441 | 442 | tr_user, ts_user = getNthNearestClick(tr_user, ts_user, 1) 443 | tr_user, ts_user = getNthNearestClick(tr_user, ts_user, 2) 444 | tr_user, ts_user = getNthNearestClick(tr_user, ts_user, 3) 445 | tr_user, ts_user = getNthNearestClick(tr_user, ts_user, 4) 446 | tr_user, ts_user = getNthNearestClick(tr_user, ts_user, 5) 447 | 448 | 449 | def getNthNearestClick(duser1, duser2, pos): 450 | click_validmask = t_click.month < 11 451 | click_testmask = t_click.month < 12 452 | for idx, mask in enumerate([click_validmask, click_testmask]): 453 | t_click_tmp = t_click[mask].reset_index(drop=True) 454 | 
t_click_tmp["pid_param"]= t_click_tmp["pid"].astype(str) + "_" + t_click_tmp["param"].astype(str) 455 | t_click_tmp["loan_rank"] = t_click_tmp.groupby(["uid"])["click_time"].rank(ascending=False) 456 | t_click_tmp = t_click_tmp[t_click_tmp["loan_rank"] == pos][["uid","pid_param"]] 457 | t_click_tmp.columns = ["uid", "pid_param_pos" + str(pos)] 458 | if idx ==0: 459 | duser1 = duser1.merge(t_click_tmp, on="uid", how="left") 460 | duser1["pid_param_pos" + str(pos)] = duser1["pid_param_pos" + str(pos)].astype('category') 461 | duser1["pid_param_pos" + str(pos)].cat.categories= np.arange(1,duser1["pid_param_pos" + str(pos)].nunique()+1) 462 | duser1["pid_param_pos" + str(pos)] = duser1["pid_param_pos" + str(pos)].astype(int) 463 | elif idx == 1: 464 | duser2 = duser2.merge(t_click_tmp, on="uid", how="left") 465 | duser2["pid_param_pos" + str(pos)] = duser2["pid_param_pos" + str(pos)].astype('category') 466 | duser2["pid_param_pos" + str(pos)].cat.categories= np.arange(1,duser2["pid_param_pos" + str(pos)].nunique()+1) 467 | duser2["pid_param_pos" + str(pos)] = duser2["pid_param_pos" + str(pos)].astype(int) 468 | return duser1, duser2 469 | 470 | 471 | ##每人购买力,预测贷款金额 472 | def getRealPurchasePower(df_loan, df_order, duser1, duser2): 473 | month_orderAmt = df_order.groupby(["uid","month"])["order_amt"].sum().rename("uid_month_orderAmt").reset_index() 474 | month_loanAmt = df_loan.groupby(["uid","month"])["loan_amount"].sum().rename("uid_month_loanAmt").reset_index() 475 | month_order_loan = month_orderAmt.merge(month_loanAmt, on = ["uid","month"], how="left").fillna(0.0) 476 | month_order_loan["realPurchasePower"] = month_order_loan["uid_month_orderAmt"] - month_order_loan["uid_month_loanAmt"] 477 | realPurchasePower = month_order_loan[month_order_loan.month < 11] 478 | tmp = realPurchasePower.groupby("uid")["realPurchasePower"].agg(["max","min","mean","median"]).reset_index() 479 | tmp.columns = ["uid","max_realPurchasePower", "min_realPurchasePower", 
"mean_realPurchasePower", "median_realPurchasePower"] 480 | duser1 = duser1.merge(tmp, on = 'uid', how="left") 481 | tmp = month_order_loan.groupby("uid")["realPurchasePower"].agg(["max","min","mean","median"]).reset_index() 482 | tmp.columns = ["uid","max_realPurchasePower", "min_realPurchasePower", "mean_realPurchasePower", "median_realPurchasePower"] 483 | duser2 = duser2.merge(tmp, on = 'uid', how="left") 484 | return duser1, duser2 485 | 486 | ##每人购买力,预测贷款金额 487 | def avgLoanAmt4orderAmt(df_loan, df_order, duser1, duser2): 488 | month_orderAmt = df_order.groupby(["uid","month"])["order_amt"].sum().rename("uid_month_orderAmt").reset_index() 489 | month_loanAmt = df_loan.groupby(["uid","month"])["loan_amount"].sum().rename("uid_month_loanAmt").reset_index() 490 | month_order_loan = month_orderAmt.merge(month_loanAmt, on = ["uid","month"], how="left").fillna(0.0) 491 | valid_mask = month_order_loan.month < 11 492 | test_mask = month_order_loan.month < 12 493 | for idx, mask in enumerate([valid_mask, test_mask]): 494 | monthOrderLoanTmp = month_order_loan[mask].reset_index(drop=True) 495 | monthOrderLoanTmp["orderLoanAmtGap"] = monthOrderLoanTmp["uid_month_loanAmt"] + monthOrderLoanTmp["uid_month_orderAmt"] 496 | meanOrderLoanAmtGap = monthOrderLoanTmp.groupby("uid")["orderLoanAmtGap"].mean().rename("meanOrderLoanAmtGap").reset_index() 497 | if idx == 0: 498 | duser1 = duser1.merge(meanOrderLoanAmtGap, on = 'uid', how="left") 499 | elif idx == 1: 500 | duser2 = duser2.merge(meanOrderLoanAmtGap, on = 'uid', how="left") 501 | return duser1, duser2 502 | 503 | ##每个人最近一次贷款之后购买金额,次数, rmse微弱上升 504 | def getOrderStatusAfterNearestLoan(df_loan, df_order, duser1, duser2, window_size): 505 | valid_mask, test_mask = get_windows_mask(df_loan, "loan_time", window_size) 506 | for idx, mask in enumerate([valid_mask, test_mask]): 507 | df_loan_tmp = df_loan[mask].reset_index(drop=True) 508 | maxtime_idx = df_loan_tmp.groupby(['uid'])['loan_time'].transform(max) == 
df_loan_tmp['loan_time'] #用户最近一次贷款的情况 509 | uid_nearest_loan = df_loan_tmp[maxtime_idx].reset_index(drop=True) 510 | order_loan = df_order.merge(uid_nearest_loan[["uid","loan_time"]], on="uid", how="left") 511 | if idx == 0: 512 | valid_idx = (order_loan["buy_time"] > order_loan["loan_time"]) & (order_loan["buy_time"] < pd.Timestamp("2016-11-01")) 513 | afterLoanOrderStatus = order_loan[valid_idx].groupby("uid")["order_amt"].agg(["count","sum"]).reset_index() 514 | afterLoanOrderStatus.columns = ["uid","afterNearestLoanOrderCnt", "afterNearestLoanOrderAmt"] 515 | duser1 = duser1.merge(afterLoanOrderStatus, on="uid", how="left") 516 | duser1[["afterNearestLoanOrderCnt", "afterNearestLoanOrderAmt"]] = duser1[["afterNearestLoanOrderCnt", "afterNearestLoanOrderAmt"]].fillna(0) 517 | elif idx ==1: 518 | valid_idx = (order_loan["buy_time"] > order_loan["loan_time"]) & (order_loan["buy_time"] < pd.Timestamp("2016-12-01")) 519 | afterLoanOrderStatus = order_loan[valid_idx].groupby("uid")["order_amt"].agg(["count","sum"]).reset_index() 520 | afterLoanOrderStatus.columns = ["uid","afterNearestLoanOrderCnt", "afterNearestLoanOrderAmt"] 521 | duser2 = duser2.merge(afterLoanOrderStatus, on="uid", how="left") 522 | duser2[["afterNearestLoanOrderCnt", "afterNearestLoanOrderAmt"]] = duser2[["afterNearestLoanOrderCnt", "afterNearestLoanOrderAmt"]].fillna(0) 523 | return duser1, duser2 524 | 525 | 526 | ##每个人最近一次贷款之后点击的次数, 527 | def getClickStatusAfterNearestLoan(df_loan, df_click, duser1, duser2, window_size): 528 | valid_mask, test_mask = get_windows_mask(df_loan, "loan_time", window_size) 529 | for idx, mask in enumerate([valid_mask, test_mask]): 530 | df_loan_tmp = df_loan[mask].reset_index(drop=True) 531 | maxtime_idx = df_loan_tmp.groupby(['uid'])['loan_time'].transform(max) == df_loan_tmp['loan_time'] #用户最近一次贷款的情况 532 | uid_nearest_loan = df_loan_tmp[maxtime_idx].reset_index(drop=True) 533 | click_loan = 
uid_nearest_loan[["uid","loan_time"]].merge(df_click[["uid","click_time"]], on="uid", how="left") 534 | if idx == 0: 535 | valid_idx = (click_loan["click_time"] > click_loan["loan_time"]) & (click_loan["click_time"] < pd.Timestamp("2016-11-01")) 536 | afterLoanOrderStatus = click_loan[valid_idx].groupby("uid")["click_time"].count().rename("afterNearestLoanClickCnt").reset_index() 537 | duser1 = duser1.merge(afterLoanOrderStatus, on="uid", how="left") 538 | duser1["afterNearestLoanClickCnt"] = duser1["afterNearestLoanClickCnt"].fillna(0) 539 | elif idx ==1: 540 | valid_idx = (click_loan["click_time"] > click_loan["loan_time"]) & (click_loan["click_time"] < pd.Timestamp("2016-12-01")) 541 | afterLoanOrderStatus = click_loan[valid_idx].groupby("uid")["click_time"].count().rename("afterNearestLoanClickCnt").reset_index() 542 | duser2 = duser2.merge(afterLoanOrderStatus, on="uid", how="left") 543 | duser2["afterNearestLoanClickCnt"] = duser2["afterNearestLoanClickCnt"].fillna(0) 544 | return duser1, duser2 545 | 546 | 547 | ##每个人最近一次购买之后贷款金额,次数, 548 | def getLoanStatusAfterNearestOrder(df_loan, df_order, duser1, duser2, window_size): 549 | valid_mask, test_mask = get_windows_mask(df_order, "buy_time", window_size) 550 | for idx, mask in enumerate([valid_mask, test_mask]): 551 | df_order_tmp = df_order[mask].reset_index(drop=True) 552 | maxtime_idx = df_order_tmp.groupby(['uid'])['buy_time'].transform(max) == df_order_tmp['buy_time'] #用户最近一次贷款的情况 553 | uid_nearest_order = df_order_tmp[maxtime_idx][["uid","buy_time"]].drop_duplicates().reset_index(drop=True) 554 | order_loan = df_loan.merge(uid_nearest_order, on="uid", how="left") 555 | if idx == 0: 556 | valid_idx = (order_loan["buy_time"] <= order_loan["loan_time"]) & (order_loan["loan_time"] < pd.Timestamp("2016-11-01")) 557 | afterLoanOrderStatus = order_loan[valid_idx].groupby("uid")["loan_amount"].agg(["count","sum"]).reset_index() 558 | afterLoanOrderStatus.columns = ["uid","afterNearestOrderLoanCnt", 
"afterNearestOrderLoanAmt"] 559 | duser1 = duser1.merge(afterLoanOrderStatus, on="uid", how="left") 560 | duser1[["afterNearestOrderLoanCnt", "afterNearestOrderLoanAmt"]] = duser1[["afterNearestOrderLoanCnt", "afterNearestOrderLoanAmt"]].fillna(0) 561 | elif idx ==1: 562 | valid_idx = (order_loan["buy_time"] <= order_loan["loan_time"]) & (order_loan["loan_time"] < pd.Timestamp("2016-12-01")) 563 | afterLoanOrderStatus = order_loan[valid_idx].groupby("uid")["loan_amount"].agg(["count","sum"]).reset_index() 564 | afterLoanOrderStatus.columns = ["uid","afterNearestOrderLoanCnt", "afterNearestOrderLoanAmt"] 565 | duser2 = duser2.merge(afterLoanOrderStatus, on="uid", how="left") 566 | duser2[["afterNearestOrderLoanCnt", "afterNearestOrderLoanAmt"]] = duser2[["afterNearestOrderLoanCnt", "afterNearestOrderLoanAmt"]].fillna(0) 567 | return duser1, duser2 568 | 569 | def gen_fixed_tw_features_for_order_01(df_order, df_loan, duser1, duser2, window_size): 570 | valid_mask, test_mask = get_windows_mask(df_order, "buy_time", window_size) 571 | loan_valid_mask, loan_test_mask = get_windows_mask(df_loan, "loan_time", window_size) 572 | for idx, mask in enumerate([valid_mask, test_mask]): 573 | tmp = df_order[mask].reset_index(drop=True) 574 | tmp['is_discount']=(tmp.discount!=0.0) * 1.0 575 | if idx==0: 576 | tmp['daysOrder']=(pd.Timestamp(valid_end_date)-tmp.buy_time).apply(lambda x:x.days+x.seconds/86400.0) 577 | loan_uid = df_loan[loan_valid_mask].groupby('uid').loan_amount.sum().rename('loan_amt').reset_index() 578 | elif idx==1: 579 | tmp['daysOrder']=(pd.Timestamp(test_end_date)-tmp.buy_time).apply(lambda x:x.days+x.seconds/86400.0) 580 | loan_uid = df_loan[loan_test_mask].groupby('uid').loan_amount.sum().rename('loan_amt').reset_index() 581 | code_cate_id = (tmp[['uid','cate_id']].drop_duplicates().merge(loan_uid, how="left", on="uid").groupby('cate_id').loan_amt.sum()/loan_uid.loan_amt.sum()).rename('code_cate_id').reset_index() 582 | tmp = tmp.merge(code_cate_id, 
how="left", on='cate_id') 583 | stat_codeCateId = tmp.groupby('uid').code_cate_id.agg(['sum','mean','max','min']).reset_index() 584 | stat_codeCateId.columns=['uid']+[i+'_codeCateId_'+str(window_size) for i in list(stat_codeCateId.columns)[1:]] 585 | #购买类别 586 | stat_codeId = tmp.groupby('uid').cate_id.agg('nunique').reset_index() 587 | stat_codeId.columns = ['uid']+['nunique'+'_CateId_'+str(window_size) for i in list(stat_codeId.columns)[1:]] 588 | #购买数量的sum 589 | stat_qty=tmp.groupby('uid').qty.agg('sum').reset_index() 590 | stat_qty.columns = ['uid']+['sum'+'_qty_'+str(window_size) for i in list(stat_qty.columns)[1:]] 591 | #金额最大的一次的code_cate_id和order_amt 592 | order_amt_idxmax = tmp.groupby('uid').order_amt.idxmax() 593 | stat_order_amt_max = tmp.loc[list(order_amt_idxmax)][['uid','code_cate_id','order_amt']] 594 | stat_order_amt_max.columns = ['uid']+[i+'_maxAmtOrder_'+str(window_size) for i in list(stat_order_amt_max.columns)[1:]] 595 | #最近的一次的code_cate_id和order_amt 596 | stat_daysOrder = tmp.groupby('uid').daysOrder.agg(['mean','max','min']).reset_index() 597 | stat_daysOrder.columns = ['uid']+[i+'_daysOrder_'+str(window_size) for i in list(stat_daysOrder.columns)[1:]] 598 | cur_order=tmp.groupby('uid').daysOrder.idxmin() 599 | stat_cur_order = tmp.loc[list(cur_order)][['uid','code_cate_id','order_amt','daysOrder']] 600 | stat_cur_order.columns = ['uid']+[i+'_curOrder_'+str(window_size) for i in list(stat_cur_order.columns)[1:]] 601 | #使用的discount的次数和占比 602 | stat_isDiscount = tmp.groupby('uid').is_discount.sum().reset_index() 603 | stat_isDiscount.columns = ['uid','cnt_discount_'+str(window_size)] 604 | stat_Discount = tmp.groupby('uid')['discount','order_amt'].sum().reset_index() 605 | stat_Discount['discount_ratio'] = stat_Discount.discount/(stat_Discount.discount+stat_Discount.order_amt) 606 | stat_Discount.columns = ['uid','sum_discount_'+str(window_size),'sum_orderAmt_'+str(window_size),'discount_ratio_'+str(window_size)] 607 | stat = 
stat_codeCateId.merge(stat_codeId, how="left", on="uid").merge(stat_qty, how="left", on="uid").merge(stat_order_amt_max, how="left", on="uid").merge(stat_daysOrder, how="left", on="uid").merge(stat_cur_order, how="left", on="uid").merge(stat_isDiscount, how="left", on="uid").merge(stat_Discount[['uid','discount_ratio_'+str(window_size)]], how="left", on="uid") 608 | if idx==0: 609 | duser1=duser1.merge(stat, how="left", on="uid") 610 | elif idx==1: 611 | duser2=duser2.merge(stat, how="left", on="uid") 612 | return duser1,duser2 613 | 614 | 615 | ##下次贷款金额比上间隔 616 | tr_user, ts_user = getLoanDaily(tr_user, ts_user) 617 | 618 | def getLoanDaily(duser1, duser2): 619 | valid_mask = t_loan.month.isin([8,9,10]) 620 | test_mask = t_loan.month.isin([9,10,11]) 621 | for idx, mask in enumerate([valid_mask, test_mask]): 622 | tmp = t_loan[mask].reset_index() 623 | tmp["dailyLoanAmt"] = tmp["loan_amount"]/ (tmp["loan_interval"]+1) 624 | meanDailyLoanAmt = tmp.groupby("uid")["dailyLoanAmt"].agg(["mean","max","min"]).reset_index() 625 | if idx == 0: 626 | duser1 = duser1.merge(meanDailyLoanAmt, how="left", on = 'uid') 627 | elif idx == 1: 628 | duser2 = duser2.merge(meanDailyLoanAmt, how="left", on = 'uid') 629 | return duser1, duser2 630 | 631 | tr_user, ts_user = getCatePivotAmtCnt(t_order, tr_user, ts_user) 632 | #类目消费金额的差分 633 | def getCatePivotAmtCnt(df, duser1, duser2): 634 | valid_mask = t_order.month.isin([8, 9,10]) 635 | test_mask = t_order.month.isin([9, 10,11]) 636 | for idx, mask in enumerate([valid_mask, test_mask]): 637 | tmp = t_order[mask] 638 | uid_months = tmp.groupby(["uid","month","cate_id"])["order_amt"].agg(["sum"]).reset_index() 639 | uid_months.rename({'sum': 'cate_id_sum' }, axis='columns',inplace=True) 640 | uid_months["cate_id"] = "cate_id_" + uid_months['cate_id'].astype(str) 641 | cols = ["uid","cate_id","cate_id_cnt","cate_id_sum"] 642 | months_cate = pd.DataFrame() 643 | for i in uid_months["month"].unique(): 644 | tmp = 
uid_months.loc[uid_months["month"]==i,cols].reset_index(drop=True) 645 | uid_months_cate = tmp.pivot(index='uid', columns='cate_id').reset_index().fillna(0) 646 | new_list = ["uid"] 647 | for words in uid_months_cate.columns.get_values(): 648 | if "uid" in words: 649 | continue 650 | new_list.append('_'.join(words)) 651 | uid_months_cate.columns = new_list 652 | uid_months_cate["month"] = i 653 | months_cate = pd.concat([months_cate,uid_months_cate]) 654 | months_cate1 = months_cate.pivot(index='uid', columns='month').fillna(0) 655 | months_cate1 = months_cate1.stack().reset_index() 656 | months_cate1 = months_cate1.groupby(["uid"]).apply(lambda x: x.sort_values(["month"], ascending=True)).reset_index(drop=True) 657 | fea = set(months_cate1.columns) - set(["uid","month"]) 658 | for i in fea: 659 | months_cate1[i + "_Diff"] = months_cate1.groupby("uid")[i].apply(lambda x: x - x.shift(1)) 660 | #months_cate1[i + "_Diff2"] = months_cate1.groupby("uid")[i + "_Diff"].apply(lambda x: x - x.shift(1)) 661 | col = ["uid"] 662 | for i in months_cate1.columns: 663 | if "Diff" in i: 664 | col.extend([i]) 665 | print col 666 | if idx == 0: 667 | duser1 = duser1.merge(months_cate1[months_cate1["month"] == 10, col], how="left", on="uid") 668 | elif idx == 1: 669 | duser2 = duser2.merge(months_cate1[months_cate1["month"] == 11, col], how="left", on="uid") 670 | return duser1, duser2 671 | #贷款序列编码 672 | tr_user, ts_user = codeLoanSeq(t_loan, tr_user, ts_user) 673 | 674 | def codeLoanSeq(df_loan, duser1, duser2): 675 | valid_month = 11 676 | test_month = 12 677 | for idx, s_month in enumerate(range(valid_month-3,valid_month)): 678 | monthmask = df_loan.month.isin([s_month]) 679 | uid_tmp = df_loan[monthmask].uid.unique() 680 | df_loan["isloan" + str(3-idx)+"monthago"] = df_loan.uid.isin(uid_tmp) * 1 681 | df_loan["loanSeq"] = df_loan["isloan3monthago"].astype(str)+df_loan["isloan2monthago"].astype(str)+df_loan["isloan1monthago"].astype(str) 682 | duser1 = 
duser1.merge(df_loan[["uid","loanSeq"]], on="uid", how="left") 683 | duser1["loanSeq"] = duser1["loanSeq"].fillna("000") 684 | #测试集 685 | for idx, s_month in enumerate(range(test_month-3,test_month)): 686 | monthmask = df_loan.month.isin([s_month]) 687 | uid_tmp = df_loan[monthmask].uid.unique() 688 | df_loan["isloan" + str(3-idx)+"monthago"] = df_loan.uid.isin(uid_tmp) * 1 689 | df_loan["loanSeq"] = df_loan["isloan3monthago"].astype(str)+df_loan["isloan2monthago"].astype(str)+df_loan["isloan1monthago"].astype(str) 690 | duser2 = duser2.merge(df_loan[["uid","loanSeq"]], on="uid", how="left") 691 | duser2["loanSeq"] = duser2["loanSeq"].fillna("000") 692 | return duser1, duser2 693 | 694 | tr_user["loanSeq"] = tr_user["loanSeq"].astype('category') 695 | tr_user['loanSeq'].cat.categories=range(tr_user["loanSeq"].nunique()) #148 696 | tr_user["loanSeq"] = tr_user["loanSeq"].astype(int) 697 | 698 | ts_user["loanSeq"] = ts_user["loanSeq"].astype('category') 699 | ts_user['loanSeq'].cat.categories=range(ts_user["loanSeq"].nunique()) #148 700 | ts_user["loanSeq"] = ts_user["loanSeq"].astype(int) 701 | 702 | 703 | 704 | tr_user, ts_user = getMaxLoanAmt(tr_user, ts_user, 60) 705 | tr_user, ts_user = getMaxLoanAmt(tr_user, ts_user, 90) 706 | 707 | 708 | def getMaxLoanAmt(duser, tuser, window_size): 709 | valid_mask, test_mask = get_windows_mask(t_loan, "loan_time", window_size) 710 | for idx, mask in enumerate([valid_mask, test_mask]): 711 | tmp = t_loan[mask] 712 | tmp = tmp.groupby(['uid','date'])["loan_amount"].sum().rename("dailyLoanAmt").reset_index() #每个人一天内贷款了多少金额 713 | tmp["maxDailyLoanAmt"] = tmp.groupby(['uid'])['dailyLoanAmt'].transform(max) #用户消费最大的情况 714 | uid_max_loan = tmp[(tmp["dailyLoanAmt"] == tmp["maxDailyLoanAmt"])] 715 | max_amt_idx = uid_max_loan.groupby("uid")['date'].transform(max) == uid_max_loan['date'] 716 | uid_max_loan = uid_max_loan[max_amt_idx] 717 | if idx == 0: 718 | uid_max_loan["maxLoan_interval"] = (pd.Timestamp(valid_end_date) - 
uid_max_loan["date"]).apply(lambda x:x.days) 719 | elif idx == 1: 720 | uid_max_loan["maxLoan_interval"] = (pd.Timestamp(test_end_date) - uid_max_loan["date"]).apply(lambda x:x.days) 721 | uid_max_loan["maxloan_price_interval"] = uid_max_loan["maxDailyLoanAmt"] / (uid_max_loan["maxLoan_interval"]+1) 722 | uid_max_loan.drop(["dailyLoanAmt","date"],axis=1,inplace=True) 723 | if idx == 0: 724 | duser = duser.merge(uid_max_loan,how="left",on="uid") 725 | elif idx == 1: 726 | tuser = tuser.merge(uid_max_loan,how="left",on="uid") 727 | return duser, tuser 728 | 729 | 730 | tr_user, ts_user = getAgeLoan(t_loan, tr_user, ts_user) 731 | 732 | 733 | tr_user, ts_user = getFixedSexLoan(t_loan, tr_user, ts_user, 30) 734 | ###从sex的角度#### 735 | def getFixedSexLoan(df,duser1,duser2,window_size): 736 | valid_mask, test_mask = get_windows_mask(df, "loan_time", window_size) 737 | for idx, mask in enumerate([valid_mask, test_mask]): 738 | tmp = df[mask].reset_index(drop=True) 739 | tmp = tmp.merge(t_user[["uid","sex"]], on ="uid", how="left") 740 | stat_loanAmt = tmp.groupby(["sex"])['loan_amount'].agg(['sum','mean','count','median']).reset_index() 741 | stat_loanAmt.columns=['sex']+ [i + '_sex'+'_loanAmt_'+str(window_size)+"d" for i in list(stat_loanAmt.columns)[1:]] 742 | if idx==0: 743 | duser1=duser1.merge(stat_loanAmt, how="left", on="sex") 744 | elif idx==1: 745 | duser2=duser2.merge(stat_loanAmt, how="left", on="sex") 746 | return duser1, duser2 747 | 748 | 749 | ###从age的角度的贷款金额 750 | def getAgeLoan(df, duser1, duser2): 751 | valid_mask = df.month.isin([10]) 752 | test_mask = df.month.isin([11]) 753 | for idx, mask in enumerate([valid_mask, test_mask]): 754 | tmp = df[mask].reset_index(drop=True) 755 | tmp = tmp.merge(t_user[["uid","age"]], on ="uid", how="left") 756 | stat_loanAmt = tmp.groupby(["age"])['loan_amount'].agg(['sum','count','mean','median']).reset_index() 757 | stat_loanAmt.columns=['age']+ [i+ '_AgeLoanAmt' for i in list(stat_loanAmt.columns)[1:]] 758 | if 
idx==0: 759 | duser1 = duser1.merge(stat_loanAmt, how="left", on="age") 760 | elif idx==1: 761 | duser2 = duser2.merge(stat_loanAmt, how="left", on="age") 762 | return duser1, duser2 763 | 764 | 765 | ##购买占比limit 766 | tr_user, ts_user = getOrderAmtLimit(tr_user, ts_user) 767 | 768 | def getOrderAmtLimit(duser1, duser2): 769 | valid_mask = t_order.month == 10 770 | test_mask = t_order.month == 11 771 | for idx, mask in enumerate([valid_mask, test_mask]): 772 | lastMonthOrderAmt = t_order[mask].groupby("uid")["order_amt"].sum().rename("lastMonthOrderAmt").reset_index() 773 | if idx == 0: 774 | duser1 = duser1.merge(lastMonthOrderAmt, on="uid", how="left") 775 | duser1["lastMonthOrderAmt_Limit"] = duser1["lastMonthOrderAmt"] /duser1["limit"] 776 | duser1.drop("lastMonthOrderAmt",inplace=True,axis=1) 777 | elif idx == 1: 778 | duser2 = duser2.merge(lastMonthOrderAmt, on="uid", how="left") 779 | duser2["lastMonthOrderAmt_Limit"] = duser2["lastMonthOrderAmt"] /duser2["limit"] 780 | duser2.drop("lastMonthOrderAmt",inplace=True,axis=1) 781 | return duser1, duser2 782 | 783 | 784 | 785 | ##最近N的点击记录序列 786 | 787 | tr_user, ts_user = getNthNearestClick(tr_user, ts_user) 788 | 789 | def getNthNearestClick(duser1, duser2): 790 | click_validmask = t_click.month < 11 791 | click_testmask = t_click.month < 12 792 | for idx, mask in enumerate([click_validmask, click_testmask]): 793 | t_click_tmp = t_click[mask].reset_index(drop=True) 794 | t_click_tmp["pid_param"]= t_click_tmp["pid"].astype(str) + "_" + t_click_tmp["param"].astype(str) 795 | t_click_tmp["loan_rank"] = t_click_tmp.groupby(["uid"])["click_time"].rank(ascending=False) 796 | for pos in [1,2,3,4,5]: 797 | t_click_tmp1 = t_click_tmp[t_click_tmp["loan_rank"] == pos][["uid","pid_param"]] 798 | t_click_tmp1.columns = ["uid", "pid_param_pos" + str(pos)] 799 | if idx ==0: 800 | duser1 = duser1.merge(t_click_tmp1, on="uid", how="left") 801 | elif idx == 1: 802 | duser2 = duser2.merge(t_click_tmp1, on="uid", how="left") 803 | 
dropcol= ["pid_param_pos" + str(1), "pid_param_pos" + str(2), "pid_param_pos" + str(3), "pid_param_pos" + str(4), "pid_param_pos" + str(5)] 804 | if idx ==0: 805 | duser1["pid_param_pos"] = duser1["pid_param_pos" + str(1)] + duser1["pid_param_pos" + str(2)] + duser1["pid_param_pos" + str(3)] + duser1["pid_param_pos" + str(4)] + duser1["pid_param_pos" + str(5)] 806 | duser1["pid_param_pos"] = duser1["pid_param_pos"].astype('category') 807 | duser1["pid_param_pos"].cat.categories= np.arange(1,duser1["pid_param_pos"].nunique()+1) 808 | duser1["pid_param_pos"] = duser1["pid_param_pos"].astype(int) 809 | duser1.drop(dropcol, axis=1 ,inplace=True) 810 | elif idx == 1: 811 | duser2["pid_param_pos"] = duser2["pid_param_pos" + str(1)] + duser2["pid_param_pos" + str(2)] + duser2["pid_param_pos" + str(3)] + duser2["pid_param_pos" + str(4)] + duser2["pid_param_pos" + str(5)] 812 | duser2["pid_param_pos"] = duser2["pid_param_pos"].astype('category') 813 | duser2["pid_param_pos"].cat.categories= np.arange(1,duser2["pid_param_pos"].nunique()+1) 814 | duser2["pid_param_pos"] = duser2["pid_param_pos"].astype(int) 815 | duser2.drop(dropcol, axis=1 ,inplace=True) 816 | return duser1, duser2 817 | 818 | 819 | 820 | def get_ordernum_window(df, gkey, window_size): 821 | grouped = df.groupby(gkey)['order_id', 'order_unix_time'] 822 | order_num = grouped.rolling(window = window_size, on = 'order_unix_time', closed = 'left').count().reset_index().fillna(0) 823 | df = df.merge(order_num[["level_1",newcol]], how='left', left_index=True, right_on='level_1').drop(['level_1'], axis = 1) 824 | return df 825 | 826 | 827 | 828 | selected_mask= tr_user.loan_sum<=7.5 829 | 830 | selected_tr_user = tr_user[selected_mask] 831 | 832 | select_rows1 = random.sample(selected_tr_user.index, int(len(selected_tr_user.index)*0.7)) 833 | selected_train_df = selected_tr_user.loc[select_rows1] 834 | selected_valid_df = selected_tr_user.drop(select_rows1) 835 | 836 | selected_dtrain = 
lgb.Dataset(selected_train_df[features], label=selected_train_df["loan_sum"], free_raw_data=False) 837 | selected_dvalid = lgb.Dataset(selected_valid_df[features], label=selected_valid_df["loan_sum"], free_raw_data=False) 838 | 839 | 840 | #"feature_fraction":0.66, "bagging_freq" : 1 , "bagging_fraction": 0.6 ,'lambda_l2':0.0 841 | param = {'num_leaves':8,'num_boost_round':300, 'objective':'regression_l2','metric':'rmse',"learning_rate" : 0.05, "boosting":"gbdt"} 842 | bst = lgb.train(param, selected_dtrain, verbose_eval=100) 843 | pred_lgb_train = bst.predict(selected_dtrain.data) 844 | pred_lgb_valid = bst.predict(selected_dvalid.data) 845 | print('train mae: %g' % sqrt(mean_squared_error(selected_train_df["loan_sum"], pred_lgb_train))) 846 | valid_score = sqrt(mean_squared_error(selected_valid_df["loan_sum"], pred_lgb_valid)) 847 | print('valid mae: %g' % valid_score) 848 | 849 | 850 | ##后向选择算法 851 | logfeatures = list(imp[imp[1] != 0][0]) 852 | 853 | bestre = 1.77239 854 | removes =[] 855 | for col in set(logfeatures): 856 | print "removing: ", col 857 | features.remove(col) 858 | dtrain = lgb.Dataset(train_df[features], label=train_df["loan_sum"],free_raw_data=False) 859 | dvalid = lgb.Dataset(valid_df[features], label=valid_df["loan_sum"], free_raw_data=False) 860 | dtrain_all = lgb.Dataset(tr_user[features], label=tr_user["loan_sum"], free_raw_data=False) 861 | dtest = lgb.Dataset(ts_user[features], free_raw_data=False) 862 | 863 | param = {'num_leaves':8,'num_boost_round':500, 'objective':'regression_l2','metric':'rmse',"learning_rate" : 0.05, "boosting":"gbdt", "lambda_l2":1500, "feature_fraction":0.9, "bagging_fraction":0.9, "bagging_freq" : 50} 864 | bst = lgb.train(param, dtrain, valid_sets=[dtrain, dvalid], verbose_eval=100) 865 | pred_lgb_train = bst.predict(dtrain.data) 866 | pred_lgb_valid = bst.predict(dvalid.data) 867 | print('train mae: %g' % sqrt(mean_squared_error(train_df["loan_sum"], pred_lgb_train))) 868 | valid_score = 
sqrt(mean_squared_error(valid_df["loan_sum"], pred_lgb_valid)) 869 | print('valid mae: %g' % valid_score) 870 | if bestre > valid_score: 871 | bestre = valid_score 872 | print "removed: ", col 873 | removes.extend([col]) 874 | else: 875 | features.extend([col]) 876 | 877 | 878 | 879 | -------------------------------------------------------------------------------- /特征设计.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klyan/JDD_Loan_Forecasting/a0e03a767173d7a8d4233aa41ea0b94f8d4418ac/特征设计.numbers --------------------------------------------------------------------------------