├── README.md
└── model-features.py

/README.md:
--------------------------------------------------------------------------------
# ppd_score
A loan-default prediction competition from PPDai (拍拍贷), using credit-scorecard techniques such as WOE, IV, chi-square (ChiMerge) binning, and the KS statistic.
--------------------------------------------------------------------------------
/model-features.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun May 20 12:53:49 2018

@author: AnswerLee
"""

#%%
import pandas as pd
import numpy as np
import importlib
import datetime
import collections
import numbers
import random
from itertools import combinations
import statsmodels as sm
from pandas import DataFrame,Series
import os
os.chdir(r'D:\give_me_five\githome\ppd-competition')
from scorecard_fucntions import *

#%%
# Load the data: master table, login-info table, and user-info-update table
data_master = pd.read_csv('./dataset/PPD_Training_Master_GBK_3_1_Training_Set.csv',encoding='gbk')
data_loginfo = pd.read_csv('./dataset/PPD_LogInfo_3_1_Training_Set.csv',encoding='gbk')
data_update = pd.read_csv('./dataset/PPD_Userupdate_Info_3_1_Training_Set.csv',encoding='gbk')
#%%
######################################################################################################################################################
# Derived variables
######################################################################################################################################################
# Four columns hold the user's city; strip the '市' suffix and build a feature
# indicating whether all four cities agree.
data_master.UserInfo_2 = data_master.UserInfo_2.map(lambda x:str(x).strip().replace('市',''))
data_master.UserInfo_4 = data_master.UserInfo_4.map(lambda x:str(x).strip().replace('市',''))
data_master.UserInfo_8 = data_master.UserInfo_8.map(lambda x:str(x).strip().replace('市',''))
data_master.UserInfo_20 = data_master.UserInfo_20.map(lambda x:str(x).strip().replace('市',''))
data_master['city_match'] = data_master.apply(lambda x:int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20),axis=1)
# Drop the four city columns
data_master.drop(['UserInfo_2','UserInfo_4','UserInfo_8','UserInfo_20'],axis=1,inplace=True)
data_master.shape
#%%
# Derived variables from the login-info table
# Compute the gap (in days) between each login and the listing (application) date
data_loginfo['LogInfo'] = data_loginfo.LogInfo3.map(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d'))
data_loginfo['ListingInfo'] = data_loginfo.Listinginfo1.map(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d'))
data_loginfo['LogGap'] = data_loginfo[['LogInfo','ListingInfo']].apply(lambda x:(x[1]-x[0]).days,axis=1)
#%%
# Time-window selection: based on how much of all gaps each window covers, 180 days is
# chosen as the maximum time span for the login-info features. Typical candidates are
# 7, 30, 60, 90, 120, 150 and 180 days. The choice is made by counting the total and
# the distinct samples that fall inside each time span.
timeWindows = TimeWindowSelection(data_loginfo,'LogGap',range(30,361,30))
timeWindows
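#%%
# TimeWindowSelection is imported from scorecard_fucntions and is not shown in this
# snapshot. The sketch below is only an assumption of what it computes, based on the
# comment above: for each candidate window, count the log records (and distinct users)
# whose gap falls inside that many days before the listing date, so the coverage curve
# can be inspected to pick a window (180 days here).
def TimeWindowSelection_sketch(df, gap_col, windows):
    """Hypothetical helper: records/users covered by each candidate time window."""
    rows = []
    for w in windows:
        covered = df[df[gap_col] <= w]
        rows.append({'window': w,
                     'record_count': len(covered),
                     'unique_idx': covered['Idx'].nunique(),
                     'record_coverage': len(covered) * 1.0 / len(df)})
    return DataFrame(rows)

# Example (assumed usage, mirroring the call above):
# TimeWindowSelection_sketch(data_loginfo, 'LogGap', range(30, 361, 30))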
#%%
# For LogInfo1 and LogInfo2, build per-time-window features: total count, number of
# distinct values, and average count per distinct value
time_window = [7,30,60,90,120,150,180]
var_list = ['LogInfo1','LogInfo2']
datal_Idx = DataFrame({'Idx':data_loginfo.Idx.drop_duplicates()})
for tw in time_window:
    data_loginfo['TruncatedLogInfo'] = data_loginfo['ListingInfo'].map(lambda x:x+datetime.timedelta(-tw))
    temp = data_loginfo.loc[data_loginfo['LogInfo'] >= data_loginfo['TruncatedLogInfo']]
    for var in var_list:
        # total count of LogInfo1 / LogInfo2 records inside the window
        col_count = str(var)+'_'+str(tw)+'_count'
        col_unique = str(var)+'_'+str(tw)+'_unique'
        col_avg_count = str(var)+'_'+str(tw)+'_avg_count'
        count_stats = temp.groupby(['Idx'])[var].count().to_dict()
        datal_Idx[col_count] = datal_Idx['Idx'].map(lambda x:count_stats.get(x,0))

        # number of distinct LogInfo1 / LogInfo2 values inside the window
        Idx_UserupdateInfo1 = temp[['Idx',var]].drop_duplicates()
        uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
        datal_Idx[col_unique] = datal_Idx['Idx'].map(lambda x:uniq_stats.get(x,0))

        # average count per distinct value
        datal_Idx[col_avg_count] = datal_Idx[[col_count,col_unique]].apply(lambda x:round(x[0]*1.0/x[1]),axis=1)
        # when the unique count is 0 the division yields NaN; fill with 0
        datal_Idx[col_avg_count] = datal_Idx[col_avg_count].fillna(0)
#%%
# Derived variables from the user-info-update table
# Compute the gap (in days) between each information update and the listing date
data_update['UpdateInfo'] = data_update.UserupdateInfo2.map(lambda x:datetime.datetime.strptime(x,'%Y/%m/%d'))
data_update['ListingInfo'] = data_update.ListingInfo1.map(lambda x:datetime.datetime.strptime(x,'%Y/%m/%d'))
data_update['UpdateGap'] = data_update[['UpdateInfo','ListingInfo']].apply(lambda x:(x[1]-x[0]).days,axis=1)
# Clean the update table: the same item in UserupdateInfo1 can appear with inconsistent
# letter case, so normalize the case first, and treat MobilePhone and Phone as the same item.
data_update['UpdateInfo_upper'] = data_update.UserupdateInfo1.map(ChangeContent)
#%%
datau_Idx = DataFrame({'Idx':data_update.Idx.drop_duplicates()})
time_window = [7,30,60,90,120,150,180]
for tw in time_window:
    col_freq = 'UpdateInfo_'+str(tw)+'_freq'
    col_unique = 'UpdateInfo_'+str(tw)+'_unique'
    col_avg_count = 'UpdateInfo_'+str(tw)+'_avg_count'

    data_update['TruncatedLogInfo'] = data_update['ListingInfo'].map(lambda x:x+datetime.timedelta(-tw))
    temp = data_update.loc[data_update.UpdateInfo >= data_update['TruncatedLogInfo']]

    # total number of updates inside the window
    freq_stats = temp.groupby(['Idx'])['UpdateInfo_upper'].count().to_dict()
    datau_Idx[col_freq] = datau_Idx['Idx'].map(lambda x:freq_stats.get(x,0))

    # number of distinct updated items inside the window
    Idx_UserupdateInfo1 = temp[['Idx','UpdateInfo_upper']].drop_duplicates()
    uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UpdateInfo_upper'].count().to_dict()
    datau_Idx[col_unique] = datau_Idx['Idx'].map(lambda x:uniq_stats.get(x,0))

    # average number of updates per distinct item
    datau_Idx[col_avg_count] = datau_Idx[[col_freq, col_unique]].apply(lambda x:round(x[0]*1.0/x[1]),axis=1)
    # when the unique count is 0 the division yields NaN; fill with 0
    datau_Idx.fillna(0,inplace=True)

    # flag whether _IDNUMBER, _HASBUYCAR, _MARRIAGESTATUSID or _PHONE was updated:
    # wrap every item in a list, then sum per Idx (summing lists concatenates them),
    # and finally test membership of each item in the concatenated list
    Idx_UserupdateInfo1['UpdateInfo_upper'] = Idx_UserupdateInfo1['UpdateInfo_upper'].map(lambda x:[x])
    Idx_UserupdateInfo1_V2=Idx_UserupdateInfo1.groupby(['Idx'])['UpdateInfo_upper'].sum()
    for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
        item_dict = Idx_UserupdateInfo1_V2.map(lambda x:int(item in x)).to_dict()
        datau_Idx['UpdateInfo_'+str(tw)+str(item)]=datau_Idx['Idx'].map(lambda x:item_dict.get(x,0))
#%%
# Save the table with the derived variables as all_data_0.csv
all_data_0 = pd.concat([data_master.set_index('Idx'),datal_Idx.set_index('Idx'),datau_Idx.set_index('Idx')],axis=1)
all_data_0.to_csv('./dataset/all_data_0.csv',encoding='gbk',index=True)
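#%%
# ChangeContent is imported from scorecard_fucntions and is not shown here. Based on
# the comment where it is used above (normalize case, and treat MobilePhone and Phone
# as the same item), a minimal sketch could look like this -- an assumption, not the
# repo's actual implementation:
def ChangeContent_sketch(x):
    """Hypothetical normalizer for UserupdateInfo1 values."""
    y = str(x).strip().upper()       # normalize case and strip whitespace
    if y == '_MOBILEPHONE':          # assumed raw value; merge with '_PHONE'
        y = '_PHONE'
    return y

# Example (assumed usage): data_update.UserupdateInfo1.map(ChangeContent_sketch)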
#%%
######################################################################################################################################################
# Missing-value handling
######################################################################################################################################################
# Read all_data_0.csv back in
data_all = pd.read_csv('./dataset/all_data_0.csv',encoding='gbk')
allFeatures = list(data_all.columns)
allFeatures.remove('Idx')
allFeatures.remove('target')
allFeatures.remove('ListingInfo')
#%%
# Clean UserInfo_9: '中国移动 ' and '中国移动' are the same category, so strip the whitespace
data_all.UserInfo_9 = data_all.UserInfo_9.map(lambda x:str(x).strip())
#%%
# Drop features that are constant
# (iterate over a copy so removing items does not skip elements)
for col in list(allFeatures):
    unique = set(data_all[col])
    if len(unique) == 1:
        print('{} is deleted'.format(col))
        data_all.drop([col],axis=1,inplace=True)
        allFeatures.remove(col)
#%%
# Split the features into categorical and numerical.
# Rule: a numeric column with fewer than 10 distinct values is treated as categorical,
# one with 10 or more distinct values as numerical.
numerical_cols = []
for col in allFeatures:
    unique = list(set(data_all[col]))
    if np.nan in unique:
        unique.remove(np.nan)
    if len(unique) >= 10 and isinstance(unique[0],numbers.Real):
        numerical_cols.append(col)
categorical_cols = [i for i in allFeatures if i not in numerical_cols]
#%%
# For categorical variables: drop the column if more than 50% of its values are missing,
# otherwise fill the missing values with a special state (str/upper turns NaN into 'NAN').
# (again iterate over a copy so removal inside the loop is safe)
missing_rate_threshold_c = 0.5
for col in list(categorical_cols):
    rate = MissingCategorial(data_all,col)
    print('{0} has missing rate as {1}'.format(col,rate))
    if rate > missing_rate_threshold_c:
        print('drop',col)
        data_all.drop([col],axis=1,inplace=True)
        categorical_cols.remove(col)
        allFeatures.remove(col)
    if 0 < rate <= missing_rate_threshold_c:
        data_all[col] = data_all[col].map(lambda x:str(x).upper())
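#%%
# MissingCategorial (used above) and MissingContinuous (used in the next cell) come
# from scorecard_fucntions. Judging by the printed messages they return a column's
# missing-value ratio; the sketch below is an assumed equivalent, not the original:
def missing_rate_sketch(df, col):
    """Hypothetical helper: share of rows where `col` is missing."""
    return df[col].isnull().sum() * 1.0 / df.shape[0]

# Example (assumed usage): missing_rate_sketch(data_all, 'UserInfo_9')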
#%%
# For continuous variables: drop the column if more than 30% of its values are missing,
# otherwise fill the missing values with the column median.
# Open questions: could multiple imputation be used here instead?
# Or a random value drawn from the observed values of the column?
missing_rate_threshold_n = 0.3
for col in list(numerical_cols):
    rate = MissingContinuous(data_all,col)
    print('{0} has missing rate as {1}'.format(col,rate))
    if rate > missing_rate_threshold_n:
        data_all.drop(col,axis=1,inplace=True)
        numerical_cols.remove(col)
        allFeatures.remove(col)
        print('we drop variable {} because of its high missing rate'.format(col))
    elif rate > 0:
        # alternative: fill with a random observed value
        # not_missing = data_all[data_all[col] == data_all[col]][col]
        # makeuped = data_all[col].map(lambda x: MakeupRandom(x, list(not_missing)))
        # data_all.drop(col,axis=1,inplace=True)
        data_all[col].fillna(data_all[col].dropna().median(),inplace=True)
        missingRate2 = MissingContinuous(data_all, col)
        print('missing rate after making up is:{}'.format(str(missingRate2)))

#%%
######################################################################################################################################################
# Variable binning, WOE and IV calculation
######################################################################################################################################################
deleted_features = []
encoded_features = {}
merged_features = {}
var_IV = {}
var_WOE = {}
# Categorical variables:
# - more than 5 categories: replace each category by its bad rate, turning the column
#   into a numerical variable that is binned later
# - 5 categories or fewer:
#   - if a single category covers more than 90% of the samples, drop the column
#   - if the smallest bad rate is 0, merge that category with the lowest-bad-rate
#     category (Func_MergeBad0)
#   - once no category has a bad rate of 0, compute WOE and IV and keep them
# (iterate over a copy of categorical_cols so removal inside the loop is safe)
for col in list(categorical_cols):
    if len(set(data_all[col])) > 5:
        print('{} is encoded with bad rate'.format(col))
        col0 = str(col) + '_encoding'
        encoding_result = Func_BadRateEncoding(data_all,col,'target')
        data_all[col0] = encoding_result['encoding']
        br_encoding = encoding_result['br_rate']
        numerical_cols.append(col0)
        encoded_features[col] = [col0,br_encoding]
        deleted_features.append(col)
    else:
        maxPcnt = Func_MaximumBinPcnt(data_all,col)
        if maxPcnt > 0.9:
            print('{} is deleted because of large percentage of single bin'.format(col))
            deleted_features.append(col)
            categorical_cols.remove(col)
            continue
        else:
            # number of bad samples per category
            bad_rate = data_all.groupby([col])['target'].sum()
            if bad_rate.min() == 0:
                print('{} has 0 bad sample!'.format(col))
                col1 = col + '_mergeByBadRate'
                mergeBin = Func_MergeBad0(data_all,col,'target')
                data_all[col1] = data_all[col].map(mergeBin)
                maxPcnt = Func_MaximumBinPcnt(data_all,col1)
                if maxPcnt > 0.9:
                    print('{} is deleted because of large percentage of single bin'.format(col1))
                    deleted_features.append(col)
                    categorical_cols.remove(col)
                    continue
                merged_features[col] = [col1,mergeBin]
                WOE_IV = Func_CalcWOE(data_all,col1,'target')
                var_WOE[col] = WOE_IV['WOE']
                var_IV[col] = WOE_IV['IV']
            else:
                WOE_IV = Func_CalcWOE(data_all,col,'target')
                var_WOE[col] = WOE_IV['WOE']
                var_IV[col] = WOE_IV['IV']
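#%%
# Func_CalcWOE / CalcWOE come from scorecard_fucntions and are used in the binning
# loops above and below; they return {'WOE': ..., 'IV': ...}. The sketch below shows
# the textbook WOE/IV calculation they presumably perform; the sign convention
# (bad/good vs. good/bad inside the log) may differ from the actual helper.
def calc_woe_iv_sketch(df, col, target):
    """Hypothetical WOE/IV per bin of `col` against a 0/1 `target` (1 = bad)."""
    total_bad = df[target].sum()
    total_good = df.shape[0] - total_bad
    grouped = df.groupby(col)[target].agg(['sum', 'count'])
    woe, iv = {}, 0.0
    for bin_value, row in grouped.iterrows():
        bad = row['sum']
        good = row['count'] - bad
        # small floor so bins with no good or no bad samples do not blow up the log
        pct_bad = max(bad, 0.5) * 1.0 / total_bad
        pct_good = max(good, 0.5) * 1.0 / total_good
        woe[bin_value] = np.log(pct_bad / pct_good)
        iv += (pct_bad - pct_good) * woe[bin_value]
    return {'WOE': woe, 'IV': iv}

# Example (assumed usage, any binned column): calc_woe_iv_sketch(data_all, some_binned_col, 'target')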
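#%%
# ChiMerge_MaxInterval, used in the numerical binning loop below, is imported from
# scorecard_fucntions. ChiMerge repeatedly merges the pair of adjacent bins with the
# smallest chi-square statistic until the requested number of intervals remains. The
# statistic for two adjacent bins could be computed as sketched below (an illustration,
# not the repo's code):
def chi2_adjacent_bins_sketch(bad_counts, total_counts):
    """Hypothetical chi-square for two adjacent bins given their bad/total counts."""
    overall_rate = sum(bad_counts) * 1.0 / sum(total_counts)
    chi2 = 0.0
    for bad, total in zip(bad_counts, total_counts):
        expected_bad = total * overall_rate
        expected_good = total * (1 - overall_rate)
        if expected_bad > 0:
            chi2 += (bad - expected_bad) ** 2 / expected_bad
        if expected_good > 0:
            chi2 += ((total - bad) - expected_good) ** 2 / expected_good
    return chi2

# Example: chi2_adjacent_bins_sketch([5, 9], [100, 120]) -> small value, so the two bins
# are candidates to be merged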
#%%
var_cutoff = {}
# (iterate over a copy of numerical_cols so removal inside the loop is safe; the
# bad-rate-encoded columns appended above are included here as well)
for col in list(numerical_cols):
    print("{} is in processing".format(col))
    col1 = str(col) + '_Bin'
    #(1) split the continuous variable and save the cutoff points. Particularly, -1 is
    #    a special case and we separate it into its own group
    if -1 in set(data_all[col]):
        special_attribute = [-1]
    else:
        special_attribute = []
    cutOffPoints = ChiMerge_MaxInterval(data_all, col, 'target',special_attribute=special_attribute)
    var_cutoff[col] = cutOffPoints
    data_all[col1] = data_all[col].map(lambda x: AssignBin(x, cutOffPoints,special_attribute=special_attribute))

    #(2) check whether the bad rate is monotone; if not, reduce the number of bins until it is
    BRM = BadRateMonotone(data_all, col1, 'target',special_attribute=special_attribute)
    if not BRM:
        for bins in range(4,1,-1):
            cutOffPoints = ChiMerge_MaxInterval(data_all, col, 'target',max_interval = bins,special_attribute=special_attribute)
            data_all[col1] = data_all[col].map(lambda x: AssignBin(x, cutOffPoints,special_attribute=special_attribute))
            BRM = BadRateMonotone(data_all, col1, 'target',special_attribute=special_attribute)
            if BRM:
                break
        var_cutoff[col] = cutOffPoints

    #(3) check whether any single bin occupies more than 90% of the total
    maxPcnt = MaximumBinPcnt(data_all, col1)
    if maxPcnt > 0.9:
        deleted_features.append(col)
        numerical_cols.remove(col)
        print('we delete {} because the maximum bin occupies more than 90%'.format(col))
        continue
    WOE_IV = CalcWOE(data_all, col1, 'target')
    var_IV[col] = WOE_IV['IV']
    var_WOE[col] = WOE_IV['WOE']

--------------------------------------------------------------------------------