├── README.md
└── model-features.py

/README.md:
--------------------------------------------------------------------------------
# ppd_score
A loan-default prediction competition from PPDai (拍拍贷), using credit-scorecard techniques such as WOE, IV, chi-square (ChiMerge) binning, and the KS statistic.
--------------------------------------------------------------------------------
/model-features.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun May 20 12:53:49 2018

@author: AnswerLee
"""

#%%
import pandas as pd
import numpy as np
import importlib
import datetime
import collections
import numbers
import random
from itertools import combinations
import statsmodels as sm
from pandas import DataFrame,Series
import os
os.chdir(r'D:\give_me_five\githome\ppd-competition')
from scorecard_fucntions import *

#%%
# Load the data: master table, login-info table, and user-info-update table
data_master = pd.read_csv('./dataset/PPD_Training_Master_GBK_3_1_Training_Set.csv',encoding='gbk')
data_loginfo = pd.read_csv('./dataset/PPD_LogInfo_3_1_Training_Set.csv',encoding='gbk')
data_update = pd.read_csv('./dataset/PPD_Userupdate_Info_3_1_Training_Set.csv',encoding='gbk')
#%%
######################################################################################################################################################
# Derived variables
######################################################################################################################################################
# Four columns hold the user's city; strip the '市' suffix and build a feature
# indicating whether all four cities agree.
data_master.UserInfo_2 = data_master.UserInfo_2.map(lambda x:str(x).strip().replace('市',''))
data_master.UserInfo_4 = data_master.UserInfo_4.map(lambda x:str(x).strip().replace('市',''))
data_master.UserInfo_8 = data_master.UserInfo_8.map(lambda x:str(x).strip().replace('市',''))
data_master.UserInfo_20 = data_master.UserInfo_20.map(lambda x:str(x).strip().replace('市',''))
data_master['city_match'] = data_master.apply(lambda x:int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20),axis=1)
# Drop the four city columns
data_master.drop(['UserInfo_2','UserInfo_4','UserInfo_8','UserInfo_20'],axis=1,inplace=True)
data_master.shape
#%%
# Derived variables from the login-info table
# Compute the gap (in days) between each login and the listing (application) date
data_loginfo['LogInfo'] = data_loginfo.LogInfo3.map(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d'))
data_loginfo['ListingInfo'] = data_loginfo.Listinginfo1.map(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d'))
data_loginfo['LogGap'] = data_loginfo[['LogInfo','ListingInfo']].apply(lambda x:(x[1]-x[0]).days,axis=1)
#%%
# Time-window selection: based on how much of all gaps each window covers, 180 days is
# chosen as the maximum time span for the login-info features. Typical candidates are
# 7, 30, 60, 90, 120, 150 and 180 days. The choice is made by counting the total and
# the distinct samples that fall inside each time span.
timeWindows = TimeWindowSelection(data_loginfo,'LogGap',range(30,361,30))
timeWindows
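#%%
# TimeWindowSelection is imported from scorecard_fucntions and is not shown in this
# snapshot. The sketch below is only an assumption of what it computes, based on the
# comment above: for each candidate window, count the log records (and distinct users)
# whose gap falls inside that many days before the listing date, so the coverage curve
# can be inspected to pick a window (180 days here).
def TimeWindowSelection_sketch(df, gap_col, windows):
    """Hypothetical helper: records/users covered by each candidate time window."""
    rows = []
    for w in windows:
        covered = df[df[gap_col] <= w]
        rows.append({'window': w,
                     'record_count': len(covered),
                     'unique_idx': covered['Idx'].nunique(),
                     'record_coverage': len(covered) * 1.0 / len(df)})
    return DataFrame(rows)

# Example (assumed usage, mirroring the call above):
# TimeWindowSelection_sketch(data_loginfo, 'LogGap', range(30, 361, 30))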
#%%
# For LogInfo1 and LogInfo2, build per-time-window features: total count, number of
# distinct values, and average count per distinct value
time_window = [7,30,60,90,120,150,180]
var_list = ['LogInfo1','LogInfo2']
datal_Idx = DataFrame({'Idx':data_loginfo.Idx.drop_duplicates()})
for tw in time_window:
    data_loginfo['TruncatedLogInfo'] = data_loginfo['ListingInfo'].map(lambda x:x+datetime.timedelta(-tw))
    temp = data_loginfo.loc[data_loginfo['LogInfo'] >= data_loginfo['TruncatedLogInfo']]
    for var in var_list:
        # total count of LogInfo1 / LogInfo2 records inside the window
        col_count = str(var)+'_'+str(tw)+'_count'
        col_unique = str(var)+'_'+str(tw)+'_unique'
        col_avg_count = str(var)+'_'+str(tw)+'_avg_count'
        count_stats = temp.groupby(['Idx'])[var].count().to_dict()
        datal_Idx[col_count] = datal_Idx['Idx'].map(lambda x:count_stats.get(x,0))

        # number of distinct LogInfo1 / LogInfo2 values inside the window
        Idx_UserupdateInfo1 = temp[['Idx',var]].drop_duplicates()
        uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
        datal_Idx[col_unique] = datal_Idx['Idx'].map(lambda x:uniq_stats.get(x,0))

        # average count per distinct value
        datal_Idx[col_avg_count] = datal_Idx[[col_count,col_unique]].apply(lambda x:round(x[0]*1.0/x[1]),axis=1)
        # when the unique count is 0 the division yields NaN; fill with 0
        datal_Idx[col_avg_count] = datal_Idx[col_avg_count].fillna(0)
#%%
# Derived variables from the user-info-update table
# Compute the gap (in days) between each information update and the listing date
data_update['UpdateInfo'] = data_update.UserupdateInfo2.map(lambda x:datetime.datetime.strptime(x,'%Y/%m/%d'))
data_update['ListingInfo'] = data_update.ListingInfo1.map(lambda x:datetime.datetime.strptime(x,'%Y/%m/%d'))
data_update['UpdateGap'] = data_update[['UpdateInfo','ListingInfo']].apply(lambda x:(x[1]-x[0]).days,axis=1)
# Clean the update table: the same item in UserupdateInfo1 can appear with inconsistent
# letter case, so normalize the case first, and treat MobilePhone and Phone as the same item.
data_update['UpdateInfo_upper'] = data_update.UserupdateInfo1.map(ChangeContent)
#%%
datau_Idx = DataFrame({'Idx':data_update.Idx.drop_duplicates()})
time_window = [7,30,60,90,120,150,180]
for tw in time_window:
    col_freq = 'UpdateInfo_'+str(tw)+'_freq'
    col_unique = 'UpdateInfo_'+str(tw)+'_unique'
    col_avg_count = 'UpdateInfo_'+str(tw)+'_avg_count'

    data_update['TruncatedLogInfo'] = data_update['ListingInfo'].map(lambda x:x+datetime.timedelta(-tw))
    temp = data_update.loc[data_update.UpdateInfo >= data_update['TruncatedLogInfo']]

    # total number of updates inside the window
    freq_stats = temp.groupby(['Idx'])['UpdateInfo_upper'].count().to_dict()
    datau_Idx[col_freq] = datau_Idx['Idx'].map(lambda x:freq_stats.get(x,0))

    # number of distinct updated items inside the window
    Idx_UserupdateInfo1 = temp[['Idx','UpdateInfo_upper']].drop_duplicates()
    uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UpdateInfo_upper'].count().to_dict()
    datau_Idx[col_unique] = datau_Idx['Idx'].map(lambda x:uniq_stats.get(x,0))

    # average number of updates per distinct item
    datau_Idx[col_avg_count] = datau_Idx[[col_freq, col_unique]].apply(lambda x:round(x[0]*1.0/x[1]),axis=1)
    # when the unique count is 0 the division yields NaN; fill with 0
    datau_Idx.fillna(0,inplace=True)

    # flag whether _IDNUMBER, _HASBUYCAR, _MARRIAGESTATUSID or _PHONE was updated:
    # wrap every item in a list, then sum per Idx (summing lists concatenates them),
    # and finally test membership of each item in the concatenated list
    Idx_UserupdateInfo1['UpdateInfo_upper'] = Idx_UserupdateInfo1['UpdateInfo_upper'].map(lambda x:[x])
    Idx_UserupdateInfo1_V2=Idx_UserupdateInfo1.groupby(['Idx'])['UpdateInfo_upper'].sum()
    for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
        item_dict = Idx_UserupdateInfo1_V2.map(lambda x:int(item in x)).to_dict()
        datau_Idx['UpdateInfo_'+str(tw)+str(item)]=datau_Idx['Idx'].map(lambda x:item_dict.get(x,0))
#%%
# Save the table with the derived variables as all_data_0.csv
all_data_0 = pd.concat([data_master.set_index('Idx'),datal_Idx.set_index('Idx'),datau_Idx.set_index('Idx')],axis=1)
all_data_0.to_csv('./dataset/all_data_0.csv',encoding='gbk',index=True)
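#%%
# ChangeContent is imported from scorecard_fucntions and is not shown here. Based on
# the comment where it is used above (normalize case, and treat MobilePhone and Phone
# as the same item), a minimal sketch could look like this -- an assumption, not the
# repo's actual implementation:
def ChangeContent_sketch(x):
    """Hypothetical normalizer for UserupdateInfo1 values."""
    y = str(x).strip().upper()       # normalize case and strip whitespace
    if y == '_MOBILEPHONE':          # assumed raw value; merge with '_PHONE'
        y = '_PHONE'
    return y

# Example (assumed usage): data_update.UserupdateInfo1.map(ChangeContent_sketch)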
#%%
######################################################################################################################################################
# Missing-value handling
######################################################################################################################################################
# Read all_data_0.csv back in
data_all = pd.read_csv('./dataset/all_data_0.csv',encoding='gbk')
allFeatures = list(data_all.columns)
allFeatures.remove('Idx')
allFeatures.remove('target')
allFeatures.remove('ListingInfo')
#%%
# Clean UserInfo_9: '中国移动 ' and '中国移动' are the same category, so strip the whitespace
data_all.UserInfo_9 = data_all.UserInfo_9.map(lambda x:str(x).strip())
#%%
# Drop features that are constant
# (iterate over a copy so removing items does not skip elements)
for col in list(allFeatures):
    unique = set(data_all[col])
    if len(unique) == 1:
        print('{} is deleted'.format(col))
        data_all.drop([col],axis=1,inplace=True)
        allFeatures.remove(col)
#%%
# Split the features into categorical and numerical.
# Rule: a numeric column with fewer than 10 distinct values is treated as categorical,
# one with 10 or more distinct values as numerical.
numerical_cols = []
for col in allFeatures:
    unique = list(set(data_all[col]))
    if np.nan in unique:
        unique.remove(np.nan)
    if len(unique) >= 10 and isinstance(unique[0],numbers.Real):
        numerical_cols.append(col)
categorical_cols = [i for i in allFeatures if i not in numerical_cols]
#%%
# For categorical variables: drop the column if more than 50% of its values are missing,
# otherwise fill the missing values with a special state (str/upper turns NaN into 'NAN').
# (again iterate over a copy so removal inside the loop is safe)
missing_rate_threshold_c = 0.5
for col in list(categorical_cols):
    rate = MissingCategorial(data_all,col)
    print('{0} has missing rate as {1}'.format(col,rate))
    if rate > missing_rate_threshold_c:
        print('drop',col)
        data_all.drop([col],axis=1,inplace=True)
        categorical_cols.remove(col)
        allFeatures.remove(col)
    if 0 < rate <= missing_rate_threshold_c:
        data_all[col] = data_all[col].map(lambda x:str(x).upper())
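#%%
# MissingCategorial (used above) and MissingContinuous (used in the next cell) come
# from scorecard_fucntions. Judging by the printed messages they return a column's
# missing-value ratio; the sketch below is an assumed equivalent, not the original:
def missing_rate_sketch(df, col):
    """Hypothetical helper: share of rows where `col` is missing."""
    return df[col].isnull().sum() * 1.0 / df.shape[0]

# Example (assumed usage): missing_rate_sketch(data_all, 'UserInfo_9')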
#%%
# For continuous variables: drop the column if more than 30% of its values are missing,
# otherwise fill the missing values with the column median.
# Open questions: could multiple imputation be used here instead?
# Or a random value drawn from the observed values of the column?
missing_rate_threshold_n = 0.3
for col in list(numerical_cols):
    rate = MissingContinuous(data_all,col)
    print('{0} has missing rate as {1}'.format(col,rate))
    if rate > missing_rate_threshold_n:
        data_all.drop(col,axis=1,inplace=True)
        numerical_cols.remove(col)
        allFeatures.remove(col)
        print('we drop variable {} because of its high missing rate'.format(col))
    elif rate > 0:
        # alternative: fill with a random observed value
        # not_missing = data_all[data_all[col] == data_all[col]][col]
        # makeuped = data_all[col].map(lambda x: MakeupRandom(x, list(not_missing)))
        # data_all.drop(col,axis=1,inplace=True)
        data_all[col].fillna(data_all[col].dropna().median(),inplace=True)
        missingRate2 = MissingContinuous(data_all, col)
        print('missing rate after making up is:{}'.format(str(missingRate2)))

#%%
######################################################################################################################################################
# Variable binning, WOE and IV calculation
######################################################################################################################################################
deleted_features = []
encoded_features = {}
merged_features = {}
var_IV = {}
var_WOE = {}
# Categorical variables:
# - more than 5 categories: replace each category by its bad rate, turning the column
#   into a numerical variable that is binned later
# - 5 categories or fewer:
#   - if a single category covers more than 90% of the samples, drop the column
#   - if the smallest bad rate is 0, merge that category with the lowest-bad-rate
#     category (Func_MergeBad0)
#   - once no category has a bad rate of 0, compute WOE and IV and keep them
# (iterate over a copy of categorical_cols so removal inside the loop is safe)
for col in list(categorical_cols):
    if len(set(data_all[col])) > 5:
        print('{} is encoded with bad rate'.format(col))
        col0 = str(col) + '_encoding'
        encoding_result = Func_BadRateEncoding(data_all,col,'target')
        data_all[col0] = encoding_result['encoding']
        br_encoding = encoding_result['br_rate']
        numerical_cols.append(col0)
        encoded_features[col] = [col0,br_encoding]
        deleted_features.append(col)
    else:
        maxPcnt = Func_MaximumBinPcnt(data_all,col)
        if maxPcnt > 0.9:
            print('{} is deleted because of large percentage of single bin'.format(col))
            deleted_features.append(col)
            categorical_cols.remove(col)
            continue
        else:
            # number of bad samples per category
            bad_rate = data_all.groupby([col])['target'].sum()
            if bad_rate.min() == 0:
                print('{} has 0 bad sample!'.format(col))
                col1 = col + '_mergeByBadRate'
                mergeBin = Func_MergeBad0(data_all,col,'target')
                data_all[col1] = data_all[col].map(mergeBin)
                maxPcnt = Func_MaximumBinPcnt(data_all,col1)
                if maxPcnt > 0.9:
                    print('{} is deleted because of large percentage of single bin'.format(col1))
                    deleted_features.append(col)
                    categorical_cols.remove(col)
                    continue
                merged_features[col] = [col1,mergeBin]
                WOE_IV = Func_CalcWOE(data_all,col1,'target')
                var_WOE[col] = WOE_IV['WOE']
                var_IV[col] = WOE_IV['IV']
            else:
                WOE_IV = Func_CalcWOE(data_all,col,'target')
                var_WOE[col] = WOE_IV['WOE']
                var_IV[col] = WOE_IV['IV']
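#%%
# Func_CalcWOE / CalcWOE come from scorecard_fucntions and are used in the binning
# loops above and below; they return {'WOE': ..., 'IV': ...}. The sketch below shows
# the textbook WOE/IV calculation they presumably perform; the sign convention
# (bad/good vs. good/bad inside the log) may differ from the actual helper.
def calc_woe_iv_sketch(df, col, target):
    """Hypothetical WOE/IV per bin of `col` against a 0/1 `target` (1 = bad)."""
    total_bad = df[target].sum()
    total_good = df.shape[0] - total_bad
    grouped = df.groupby(col)[target].agg(['sum', 'count'])
    woe, iv = {}, 0.0
    for bin_value, row in grouped.iterrows():
        bad = row['sum']
        good = row['count'] - bad
        # small floor so bins with no good or no bad samples do not blow up the log
        pct_bad = max(bad, 0.5) * 1.0 / total_bad
        pct_good = max(good, 0.5) * 1.0 / total_good
        woe[bin_value] = np.log(pct_bad / pct_good)
        iv += (pct_bad - pct_good) * woe[bin_value]
    return {'WOE': woe, 'IV': iv}

# Example (assumed usage, any binned column): calc_woe_iv_sketch(data_all, some_binned_col, 'target')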
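#%%
# ChiMerge_MaxInterval, used in the numerical binning loop below, is imported from
# scorecard_fucntions. ChiMerge repeatedly merges the pair of adjacent bins with the
# smallest chi-square statistic until the requested number of intervals remains. The
# statistic for two adjacent bins could be computed as sketched below (an illustration,
# not the repo's code):
def chi2_adjacent_bins_sketch(bad_counts, total_counts):
    """Hypothetical chi-square for two adjacent bins given their bad/total counts."""
    overall_rate = sum(bad_counts) * 1.0 / sum(total_counts)
    chi2 = 0.0
    for bad, total in zip(bad_counts, total_counts):
        expected_bad = total * overall_rate
        expected_good = total * (1 - overall_rate)
        if expected_bad > 0:
            chi2 += (bad - expected_bad) ** 2 / expected_bad
        if expected_good > 0:
            chi2 += ((total - bad) - expected_good) ** 2 / expected_good
    return chi2

# Example: chi2_adjacent_bins_sketch([5, 9], [100, 120]) -> small value, so the two bins
# are candidates to be merged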
#%%
var_cutoff = {}
# (iterate over a copy of numerical_cols so removal inside the loop is safe; the
# bad-rate-encoded columns appended above are included here as well)
for col in list(numerical_cols):
    print("{} is in processing".format(col))
    col1 = str(col) + '_Bin'
    #(1) split the continuous variable and save the cutoff points. Particularly, -1 is
    #    a special case and we separate it into its own group
    if -1 in set(data_all[col]):
        special_attribute = [-1]
    else:
        special_attribute = []
    cutOffPoints = ChiMerge_MaxInterval(data_all, col, 'target',special_attribute=special_attribute)
    var_cutoff[col] = cutOffPoints
    data_all[col1] = data_all[col].map(lambda x: AssignBin(x, cutOffPoints,special_attribute=special_attribute))

    #(2) check whether the bad rate is monotone; if not, reduce the number of bins until it is
    BRM = BadRateMonotone(data_all, col1, 'target',special_attribute=special_attribute)
    if not BRM:
        for bins in range(4,1,-1):
            cutOffPoints = ChiMerge_MaxInterval(data_all, col, 'target',max_interval = bins,special_attribute=special_attribute)
            data_all[col1] = data_all[col].map(lambda x: AssignBin(x, cutOffPoints,special_attribute=special_attribute))
            BRM = BadRateMonotone(data_all, col1, 'target',special_attribute=special_attribute)
            if BRM:
                break
        var_cutoff[col] = cutOffPoints

    #(3) check whether any single bin occupies more than 90% of the total
    maxPcnt = MaximumBinPcnt(data_all, col1)
    if maxPcnt > 0.9:
        deleted_features.append(col)
        numerical_cols.remove(col)
        print('we delete {} because the maximum bin occupies more than 90%'.format(col))
        continue
    WOE_IV = CalcWOE(data_all, col1, 'target')
    var_IV[col] = WOE_IV['IV']
    var_WOE[col] = WOE_IV['WOE']

--------------------------------------------------------------------------------