├── README.md └── scorecard_model.py /README.md: -------------------------------------------------------------------------------- 1 | # 评分卡的半自动建模脚本 2 | 直接运行get_scorecard_model即可 3 | 4 | ## 函数说明: 5 | * tree_split :决策树分箱 6 | * quantile_split :等频分箱 7 | * cal_woe :计算woe 8 | * monot_trim : woe调成单调递减或单调递增 9 | * judge_increasing :判断一个list是否单调递增 10 | * judge_decreasing :判断一个list是否单调递减 11 | * binning_var :特征分箱,计算iv 12 | * binning_trim :调整单调后的分箱,计算IV 13 | * forward_corr_delete :相关性筛选 14 | * vif_delete :多重共线性筛选 15 | * forward_pvalue_delete :显著性筛选,前向逐步回归 16 | * backward_pvalue_delete :显著性筛选,后向逐步回归 17 | * forward_delete_coef :系数一致筛选 18 | * get_map_df :得到特征woe映射集合表 19 | * var_mapping:特征映射 20 | * plot_roc :绘制roc 21 | * plot_model_ks : 绘制ks 22 | * cal_scale :计算分数校准的A,B值,基础分 23 | * get_score_map:得到特征score的映射集合表 24 | * plot_score_hist :绘制好坏用户得分分布图 25 | * cal_psi :计算psi 26 | * get_scorecard_model :评分卡建模 27 | -------------------------------------------------------------------------------- /scorecard_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | import math 10 | import statsmodels.api as sm 11 | from sklearn.model_selection import train_test_split 12 | from sklearn import metrics 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.model_selection import cross_val_score 15 | from sklearn.tree import DecisionTreeClassifier,_tree 16 | from statsmodels.stats.outliers_influence import variance_inflation_factor 17 | import warnings 18 | warnings.filterwarnings('ignore') 19 | 20 | 21 | 22 | 23 | def tree_split(df,col,target,max_bin,min_binpct,nan_value): 24 | """ 25 | 决策树分箱 26 | param: 27 | df -- 数据集 Dataframe 28 | col -- 分箱的字段名 string 29 | target -- 标签的字段名 string 30 | max_bin -- 最大分箱数 int 31 | min_binpct -- 箱体的最小占比 float 32 | nan_value -- 缺失的映射值 int/float 33 | return: 34 | split_list -- 分割点 list 35 | """ 36 | miss_value_rate = df[df[col]==nan_value].shape[0]/df.shape[0] 37 | # 如果缺失占比小于5%,则直接对特征进行分箱 38 | if miss_value_rate<0.05: 39 | x = np.array(df[col]).reshape(-1,1) 40 | y = np.array(df[target]) 41 | tree = DecisionTreeClassifier(max_leaf_nodes=max_bin, 42 | min_samples_leaf = min_binpct) 43 | tree.fit(x,y) 44 | thresholds = tree.tree_.threshold 45 | thresholds = thresholds[thresholds!=_tree.TREE_UNDEFINED] 46 | split_list = sorted(thresholds.tolist()) 47 | # 如果缺失占比大于5%,则把缺失单独分为一箱,剩余部分再进行决策树分箱 48 | else: 49 | max_bin2 = max_bin-1 50 | x = np.array(df[~(df[col]==nan_value)][col]).reshape(-1,1) 51 | y = np.array(df[~(df[col]==nan_value)][target]) 52 | tree = DecisionTreeClassifier(max_leaf_nodes=max_bin2, 53 | min_samples_leaf = min_binpct) 54 | tree.fit(x,y) 55 | thresholds = tree.tree_.threshold 56 | thresholds = thresholds[thresholds!=_tree.TREE_UNDEFINED] 57 | split_list = sorted(thresholds.tolist()) 58 | split_list.insert(0,nan_value) 59 | 60 | return split_list 61 | 62 | 63 | 64 | 65 | def quantile_split(df,col,target,max_bin,nan_value): 66 | """ 67 | 等频分箱 68 | param: 69 | df -- 数据集 Dataframe 70 | col -- 分箱的字段名 string 71 | target -- 标签的字段名 string 72 | max_bin -- 最大分箱数 int 73 | nan_value -- 缺失的映射值 int/float 74 | return: 75 | split_list -- 分割点 list 76 | """ 77 | miss_value_rate = df[df[col]==nan_value].shape[0]/df.shape[0] 78 | 79 | # 如果缺失占比小于5%,则直接对特征进行分箱 80 | if miss_value_rate<0.05: 81 | bin_series,bin_cut = pd.qcut(df[col],q=max_bin,duplicates='drop',retbins=True) 82 | split_list = bin_cut.tolist() 83 | split_list.remove(split_list[0]) 84 | # 如果缺失占比大于5%,则把缺失单独分为一箱,剩余部分再进行等频分箱 85 | else: 86 | df2 = df[~(df[col]==nan_value)] 87 | max_bin2 = max_bin-1 88 | bin_series,bin_cut = pd.qcut(df2[col],q=max_bin2,duplicates='drop',retbins=True) 89 | split_list = bin_cut.tolist() 90 | split_list[0] = nan_value 91 | 92 | split_list.remove(split_list[-1]) 93 | 94 | # 当出现某个箱体只有好用户或只有坏用户时,进行前向合并箱体 95 | var_arr = np.array(df[col]) 96 | target_arr = np.array(df[target]) 97 | bin_trans = np.digitize(var_arr,split_list,right=True) 98 | var_tuple = [(x,y) for x,y in zip(bin_trans,target_arr)] 99 | 100 | delete_cut_list = [] 101 | for i in set(bin_trans): 102 | target_list = [y for x,y in var_tuple if x==i] 103 | if target_list.count(1)==0 or target_list.count(0)==0: 104 | if i ==min(bin_trans): 105 | index=i 106 | else: 107 | index = i-1 108 | delete_cut_list.append(split_list[index]) 109 | split_list = [x for x in split_list if x not in delete_cut_list] 110 | 111 | return split_list 112 | 113 | 114 | 115 | 116 | def cal_woe(df,col,target,nan_value,cut=None): 117 | """ 118 | 计算woe 119 | param: 120 | df -- 数据集 Dataframe 121 | col -- 分箱的字段名 string 122 | target -- 标签的字段名 string 123 | nan_value -- 缺失的映射值 int/float 124 | cut -- 箱体分割点 list 125 | return: 126 | woe_list -- 每个箱体的woe list 127 | """ 128 | total = df[target].count() 129 | bad = df[target].sum() 130 | good = total-bad 131 | 132 | bucket = pd.cut(df[col],cut) 133 | group = df.groupby(bucket) 134 | 135 | bin_df = pd.DataFrame() 136 | bin_df['total'] = group[target].count() 137 | bin_df['bad'] = group[target].sum() 138 | bin_df['good'] = bin_df['total'] - bin_df['bad'] 139 | bin_df['badattr'] = bin_df['bad']/bad 140 | bin_df['goodattr'] = bin_df['good']/good 141 | bin_df['woe'] = np.log(bin_df['badattr']/bin_df['goodattr']) 142 | # 当cut里有缺失映射值时,说明是把缺失单独分为一箱的,后续在进行调成单调分箱时 143 | # 不考虑缺失的箱,故将缺失映射值剔除 144 | if nan_value in cut: 145 | woe_list = bin_df['woe'].tolist()[1:] 146 | else: 147 | woe_list = bin_df['woe'].tolist() 148 | return woe_list 149 | 150 | 151 | 152 | 153 | def monot_trim(df,col,target,nan_value,cut=None): 154 | """ 155 | woe调成单调递减或单调递增 156 | param: 157 | df -- 数据集 Dataframe 158 | col -- 分箱的字段名 string 159 | target -- 标签的字段名 string 160 | nan_value -- 缺失的映射值 int/float 161 | cut -- 箱体分割点 list 162 | return: 163 | new_cut -- 调整后的分割点 list 164 | """ 165 | woe_lst = cal_woe(df,col,target,nan_value,cut = cut) 166 | # 若第一个箱体大于0,说明特征整体上服从单调递减 167 | if woe_lst[0]>0: 168 | while not judge_decreasing(woe_lst): 169 | # 找出哪几个箱不服从单调递减的趋势 170 | judge_list = [x>y for x, y in zip(woe_lst, woe_lst[1:])] 171 | # 用前向合并箱体的方式,找出需要剔除的分割点的索引,如果有缺失映射值,则索引+1 172 | if nan_value in cut: 173 | index_list = [i+2 for i,j in enumerate(judge_list) if j==False] 174 | else: 175 | index_list = [i+1 for i,j in enumerate(judge_list) if j==False] 176 | new_cut = [j for i,j in enumerate(cut) if i not in index_list] 177 | woe_lst = cal_woe(df,col,target,nan_value,cut = new_cut) 178 | # 若第一个箱体小于0,说明特征整体上服从单调递增 179 | elif woe_lst[0]<0: 180 | while not judge_increasing(woe_lst): 181 | # 找出哪几个箱不服从单调递增的趋势 182 | judge_list = [xy for x, y in zip(L, L[1:])) 207 | 208 | 209 | 210 | 211 | def binning_var(df,col,target,bin_type='dt',max_bin=5,min_binpct=0.05,nan_value=-999): 212 | """ 213 | 特征分箱,计算iv 214 | param: 215 | df -- 数据集 Dataframe 216 | col -- 分箱的字段名 string 217 | target -- 标签的字段名 string 218 | bin_type -- 分箱方式 默认是'dt',还有'quantile'(等频分箱) 219 | max_bin -- 最大分箱数 int 220 | min_binpct -- 箱体的最小占比 float 221 | nan_value -- 缺失映射值 int/float 222 | return: 223 | bin_df -- 特征的分箱明细表 Dataframe 224 | cut -- 分割点 list 225 | """ 226 | total = df[target].count() 227 | bad = df[target].sum() 228 | good = total-bad 229 | 230 | # 离散型特征分箱,直接根据类别进行groupby 231 | if df[col].dtype == np.dtype('object') or df[col].dtype == np.dtype('bool') or df[col].nunique()<=max_bin: 232 | group = df.groupby([col],as_index=True) 233 | bin_df = pd.DataFrame() 234 | 235 | bin_df['total'] = group[target].count() 236 | bin_df['totalrate'] = bin_df['total']/total 237 | bin_df['bad'] = group[target].sum() 238 | bin_df['badrate'] = bin_df['bad']/bin_df['total'] 239 | bin_df['good'] = bin_df['total'] - bin_df['bad'] 240 | bin_df['goodrate'] = bin_df['good']/bin_df['total'] 241 | bin_df['badattr'] = bin_df['bad']/bad 242 | bin_df['goodattr'] = (bin_df['total']-bin_df['bad'])/good 243 | bin_df['woe'] = np.log(bin_df['badattr']/bin_df['goodattr']) 244 | bin_df['bin_iv'] = (bin_df['badattr']-bin_df['goodattr'])*bin_df['woe'] 245 | bin_df['IV'] = bin_df['bin_iv'].sum() 246 | cut = df[col].unique().tolist() 247 | # 连续型特征的分箱 248 | else: 249 | if bin_type=='dt': 250 | cut = tree_split(df,col,target,max_bin=max_bin,min_binpct=min_binpct,nan_value=nan_value) 251 | elif bin_type=='quantile': 252 | cut = quantile_split(df,col,target,max_bin=max_bin,nan_value=nan_value) 253 | cut.insert(0,float('-inf')) 254 | cut.append(float('inf')) 255 | 256 | bucket = pd.cut(df[col],cut) 257 | group = df.groupby(bucket) 258 | bin_df = pd.DataFrame() 259 | 260 | bin_df['total'] = group[target].count() 261 | bin_df['totalrate'] = bin_df['total']/total 262 | bin_df['bad'] = group[target].sum() 263 | bin_df['badrate'] = bin_df['bad']/bin_df['total'] 264 | bin_df['good'] = bin_df['total'] - bin_df['bad'] 265 | bin_df['goodrate'] = bin_df['good']/bin_df['total'] 266 | bin_df['badattr'] = bin_df['bad']/bad 267 | bin_df['goodattr'] = (bin_df['total']-bin_df['bad'])/good 268 | bin_df['woe'] = np.log(bin_df['badattr']/bin_df['goodattr']) 269 | bin_df['bin_iv'] = (bin_df['badattr']-bin_df['goodattr'])*bin_df['woe'] 270 | bin_df['IV'] = bin_df['bin_iv'].sum() 271 | 272 | return bin_df,cut 273 | 274 | 275 | 276 | 277 | def binning_trim(df,col,target,cut=None,right_border=True): 278 | """ 279 | 调整单调后的分箱,计算IV 280 | param: 281 | df -- 数据集 Dataframe 282 | col -- 分箱的字段名 string 283 | target -- 标签的字段名 string 284 | cut -- 分割点 list 285 | right_border -- 箱体的右边界是否闭合 bool 286 | return: 287 | bin_df -- 特征的分箱明细表 Dataframe 288 | """ 289 | total = df[target].count() 290 | bad = df[target].sum() 291 | good = total - bad 292 | bucket = pd.cut(df[col],cut,right=right_border) 293 | 294 | group = df.groupby(bucket) 295 | bin_df = pd.DataFrame() 296 | bin_df['total'] = group[target].count() 297 | bin_df['totalrate'] = bin_df['total']/total 298 | bin_df['bad'] = group[target].sum() 299 | bin_df['badrate'] = bin_df['bad']/bin_df['total'] 300 | bin_df['good'] = bin_df['total'] - bin_df['bad'] 301 | bin_df['goodrate'] = bin_df['good']/bin_df['total'] 302 | bin_df['badattr'] = bin_df['bad']/bad 303 | bin_df['goodattr'] = (bin_df['total']-bin_df['bad'])/good 304 | bin_df['woe'] = np.log(bin_df['badattr']/bin_df['goodattr']) 305 | bin_df['bin_iv'] = (bin_df['badattr']-bin_df['goodattr'])*bin_df['woe'] 306 | bin_df['IV'] = bin_df['bin_iv'].sum() 307 | 308 | return bin_df 309 | 310 | 311 | 312 | 313 | def forward_corr_delete(df,col_list): 314 | """ 315 | 相关性筛选,设定的阈值为0.65 316 | param: 317 | df -- 数据集 Dataframe 318 | col_list -- 需要筛选的特征集合,需要提前按IV值从大到小排序好 list 319 | return: 320 | select_corr_col -- 筛选后的特征集合 list 321 | """ 322 | corr_list=[] 323 | corr_list.append(col_list[0]) 324 | delete_col = [] 325 | # 根据IV值的大小进行遍历 326 | for col in col_list[1:]: 327 | corr_list.append(col) 328 | corr = df.loc[:,corr_list].corr() 329 | corr_tup = [(x,y) for x,y in zip(corr[col].index,corr[col].values)] 330 | corr_value = [y for x,y in corr_tup if x!=col] 331 | # 若出现相关系数大于0.65,则将该特征剔除 332 | if len([x for x in corr_value if abs(x)>=0.65])>0: 333 | delete_col.append(col) 334 | select_corr_col = [x for x in col_list if x not in delete_col] 335 | return select_corr_col 336 | 337 | 338 | 339 | 340 | def vif_delete(df,list_corr): 341 | """ 342 | 多重共线性筛选 343 | param: 344 | df -- 数据集 Dataframe 345 | list_corr -- 相关性筛选后的特征集合,按IV值从大到小排序 list 346 | return: 347 | col_list -- 筛选后的特征集合 list 348 | """ 349 | col_list = list_corr.copy() 350 | # 计算各个特征的方差膨胀因子 351 | vif_matrix=np.matrix(df[col_list]) 352 | vifs_list=[variance_inflation_factor(vif_matrix,i) for i in range(vif_matrix.shape[1])] 353 | # 筛选出系数>10的特征 354 | vif_high = [x for x,y in zip(col_list,vifs_list) if y>10] 355 | 356 | # 根据IV从小到大的顺序进行遍历 357 | if len(vif_high)>0: 358 | for col in reversed(vif_high): 359 | col_list.remove(col) 360 | vif_matrix=np.matrix(df[col_list]) 361 | vifs=[variance_inflation_factor(vif_matrix,i) for i in range(vif_matrix.shape[1])] 362 | # 当系数矩阵里没有>10的特征时,循环停止 363 | if len([x for x in vifs if x>10])==0: 364 | break 365 | return col_list 366 | 367 | 368 | 369 | 370 | def forward_pvalue_delete(x,y): 371 | """ 372 | 显著性筛选,前向逐步回归 373 | param: 374 | x -- 特征数据集,woe转化后,且字段顺序按IV值从大到小排列 Dataframe 375 | y -- 标签列 Series 376 | return: 377 | pvalues_col -- 筛选后的特征集合 list 378 | """ 379 | col_list = x.columns.tolist() 380 | pvalues_col=[] 381 | # 按IV值逐个引入模型 382 | for col in col_list: 383 | pvalues_col.append(col) 384 | # 每引入一个特征就做一次显著性检验 385 | x_const = sm.add_constant(x.loc[:,pvalues_col]) 386 | sm_lr = sm.Logit(y,x_const) 387 | sm_lr = sm_lr.fit() 388 | pvalue = sm_lr.pvalues[col] 389 | # 当引入的特征P值>=0.05时,则剔除,原先满足显著性检验的则保留,不再剔除 390 | if pvalue>=0.05: 391 | pvalues_col.remove(col) 392 | return pvalues_col 393 | 394 | 395 | 396 | 397 | def backward_pvalue_delete(x,y): 398 | """ 399 | 显著性筛选,后向逐步回归 400 | param: 401 | x -- 特征数据集,woe转化后,且字段顺序按IV值从大到小排列 Dataframe 402 | y -- 标签列 Series 403 | return: 404 | pvalues_col -- 筛选后的特征集合 list 405 | """ 406 | x_c = x.copy() 407 | # 所有特征引入模型,做显著性检验 408 | x_const = sm.add_constant(x_c) 409 | sm_lr = sm.Logit(y,x_const).fit() 410 | pvalue_tup = [(i,j) for i,j in zip(sm_lr.pvalues.index,sm_lr.pvalues.values)][1:] 411 | delete_count = len([i for i,j in pvalue_tup if j>=0.05]) 412 | # 当有P值>=0.05的特征时,执行循环 413 | while delete_count>0: 414 | # 按IV值从小到大的顺序依次逐个剔除 415 | remove_col = [i for i,j in pvalue_tup if j>=0.05][-1] 416 | del x_c[remove_col] 417 | # 每次剔除特征后都要重新做显著性检验,直到入模的特征P值都小于0.05 418 | x2_const = sm.add_constant(x_c) 419 | sm_lr2 = sm.Logit(y,x2_const).fit() 420 | pvalue_tup2 = [(i,j) for i,j in zip(sm_lr2.pvalues.index,sm_lr2.pvalues.values)][1:] 421 | delete_count = len([i for i,j in pvalue_tup2 if j>=0.05]) 422 | 423 | pvalues_col = x_c.columns.tolist() 424 | 425 | return pvalues_col 426 | 427 | 428 | 429 | 430 | def forward_delete_coef(x,y): 431 | """ 432 | 系数一致筛选 433 | param: 434 | x -- 特征数据集,woe转化后,且字段顺序按IV值从大到小排列 Dataframe 435 | y -- 标签列 Series 436 | return: 437 | coef_col -- 筛选后的特征集合 list 438 | """ 439 | col_list = list(x.columns) 440 | coef_col = [] 441 | # 按IV值逐个引入模型,输出系数 442 | for col in col_list: 443 | coef_col.append(col) 444 | x2 = x.loc[:,coef_col] 445 | sk_lr = LogisticRegression(random_state=0).fit(x2,y) 446 | coef_dict = {k:v for k,v in zip(coef_col,sk_lr.coef_[0])} 447 | # 当引入特征的系数为负,则将其剔除 448 | if coef_dict[col]<0: 449 | coef_col.remove(col) 450 | 451 | return coef_col 452 | 453 | 454 | 455 | 456 | def get_map_df(bin_df_list): 457 | """ 458 | 得到特征woe映射集合表 459 | param: 460 | bin_df_list -- 每个特征的woe映射表 list 461 | return: 462 | map_merge_df -- 特征woe映射集合表 Dataframe 463 | """ 464 | map_df_list=[] 465 | for dd in bin_df_list: 466 | # 添加特征名列 467 | map_df = dd.reset_index().assign(col=dd.index.name).rename(columns={dd.index.name:'bin'}) 468 | # 将特征名列移到第一列,便于查看 469 | temp1 = map_df['col'] 470 | temp2 = map_df.iloc[:,:-1] 471 | map_df2 = pd.concat([temp1,temp2],axis=1) 472 | map_df_list.append(map_df2) 473 | 474 | map_merge_df = pd.concat(map_df_list,axis=0) 475 | 476 | return map_merge_df 477 | 478 | 479 | 480 | 481 | def var_mapping(df,map_df,var_map,target): 482 | """ 483 | 特征映射 484 | param: 485 | df -- 原始数据集 Dataframe 486 | map_df -- 特征映射集合表 Dataframe 487 | var_map -- map_df里映射的字段名,如"woe","score" string 488 | target -- 标签字段名 string 489 | return: 490 | df2 -- 映射后的数据集 Dataframe 491 | """ 492 | df2 = df.copy() 493 | # 去掉标签字段,遍历特征 494 | for col in df2.drop([target],axis=1).columns: 495 | x = df2[col] 496 | # 找到特征的映射表 497 | bin_map = map_df[map_df.col==col] 498 | # 新建一个映射array,填充0 499 | bin_res = np.array([0]*x.shape[0],dtype=float) 500 | for i in bin_map.index: 501 | # 每个箱的最小值和最大值 502 | lower = bin_map['min_bin'][i] 503 | upper = bin_map['max_bin'][i] 504 | # 对于类别型特征,每个箱的lower和upper时一样的 505 | if lower == upper: 506 | x1 = x[np.where(x == lower)[0]] 507 | # 连续型特征,左开右闭 508 | else: 509 | x1 = x[np.where((x>lower)&(x<=upper))[0]] 510 | mask = np.in1d(x,x1) 511 | # 映射array里填充对应的映射值 512 | bin_res[mask] = bin_map[var_map][i] 513 | bin_res = pd.Series(bin_res,index=x.index) 514 | bin_res.name = x.name 515 | # 将原始值替换为映射值 516 | df2[col] = bin_res 517 | return df2 518 | 519 | 520 | 521 | 522 | def plot_roc(y_label,y_pred): 523 | """ 524 | 绘制roc曲线 525 | param: 526 | y_label -- 真实的y值 list/array 527 | y_pred -- 预测的y值 list/array 528 | return: 529 | roc曲线 530 | """ 531 | tpr,fpr,threshold = metrics.roc_curve(y_label,y_pred) 532 | AUC = metrics.roc_auc_score(y_label,y_pred) 533 | fig = plt.figure(figsize=(6,4)) 534 | ax = fig.add_subplot(1,1,1) 535 | ax.plot(tpr,fpr,color='blue',label='AUC=%.3f'%AUC) 536 | ax.plot([0,1],[0,1],'r--') 537 | ax.set_ylim(0,1) 538 | ax.set_xlim(0,1) 539 | ax.set_title('ROC') 540 | ax.legend(loc='best') 541 | return plt.show(ax) 542 | 543 | 544 | 545 | 546 | def plot_model_ks(y_label,y_pred): 547 | """ 548 | 绘制ks曲线 549 | param: 550 | y_label -- 真实的y值 list/array 551 | y_pred -- 预测的y值 list/array 552 | return: 553 | ks曲线 554 | """ 555 | pred_list = list(y_pred) 556 | label_list = list(y_label) 557 | total_bad = sum(label_list) 558 | total_good = len(label_list)-total_bad 559 | items = sorted(zip(pred_list,label_list),key=lambda x:x[0]) 560 | step = (max(pred_list)-min(pred_list))/200 561 | 562 | pred_bin=[] 563 | good_rate=[] 564 | bad_rate=[] 565 | ks_list = [] 566 | for i in range(1,201): 567 | idx = min(pred_list)+i*step 568 | pred_bin.append(idx) 569 | label_bin = [x[1] for x in items if x[0]=0.25] 729 | train = train_data.drop(psi_delete,axis=1) 730 | print('psi筛选特征完成') 731 | print('-------------') 732 | 733 | # 特征分箱,默认用的是决策树分箱 734 | train_col = [x for x in train.columns if x!=target] 735 | bin_df_list=[] 736 | cut_list=[] 737 | for col in train_col: 738 | try: 739 | bin_df,cut = binning_var(train,col,target) 740 | bin_df_list.append(bin_df) 741 | cut_list.append(cut) 742 | except: 743 | pass 744 | print('特征分箱完成') 745 | print('-------------') 746 | 747 | # 剔除iv无限大的特征 748 | bin_df_list = [x for x in bin_df_list if x.IV.iloc[0]!=float('inf')] 749 | # 保存每个特征的分割点list 750 | cut_dict={} 751 | for dd,cc in zip(bin_df_list,cut_list): 752 | col = dd.index.name 753 | cut_dict[col] = cc 754 | # 将IV从大到小进行排序 755 | iv_col = [x.index.name for x in bin_df_list] 756 | iv_value = [x.IV.iloc[0] for x in bin_df_list] 757 | iv_sort = sorted(zip(iv_col,iv_value),key=lambda x:x[1],reverse=True) 758 | 759 | # iv筛选,筛选iv大于0.02的特征 760 | iv_select_col = [x for x,y in iv_sort if y>=0.02] 761 | print('iv筛选特征完成') 762 | print('-------------') 763 | # 特征分类 764 | cate_col = [] 765 | num_col = [] 766 | for col in iv_select_col: 767 | if train[col].dtype==np.dtype('object') or train[col].dtype==np.dtype('bool') or train[col].nunique()<=5: 768 | cate_col.append(col) 769 | else: 770 | num_col.append(col) 771 | 772 | #相关性筛选,相关系数阈值0.65 773 | corr_select_col = forward_corr_delete(train,num_col) 774 | print('相关性筛选完成') 775 | print('-------------') 776 | 777 | # 多重共线性筛选,系数阈值10 778 | vif_select_col = vif_delete(train,corr_select_col) 779 | print('多重共线性筛选完成') 780 | print('-------------') 781 | 782 | # 自动调整单调分箱 783 | trim_var_dict = {k:v for k,v in cut_dict.items() if k in vif_select_col} 784 | trim_bin_list=[] 785 | for col in tqdm(trim_var_dict.keys()): 786 | bin_cut = trim_var_dict[col] 787 | df_bin = [x for x in bin_df_list if x.index.name==col][0] 788 | if nan_value in bin_cut: 789 | woe_lst = df_bin['woe'].tolist() 790 | else: 791 | woe_lst = df_bin['woe'].tolist()[1:] 792 | if not judge_decreasing(woe_lst) and not judge_increasing(woe_lst): 793 | monot_cut = monot_trim(train, col, target, nan_value=nan_value, cut=bin_cut) 794 | monot_bin_df = binning_trim(train, col, target, cut=monot_cut, right_border=True) 795 | trim_bin_list.append(monot_bin_df) 796 | else: 797 | trim_bin_list.append(df_bin) 798 | # 调整后的分箱再根据iv筛选一遍 799 | select_num_df = [] 800 | for dd in trim_bin_list: 801 | if dd.IV.iloc[0]>=0.02: 802 | select_num_df.append(dd) 803 | print('自动调整单调分箱完成') 804 | print('-------------') 805 | 806 | # 连续型特征的woe映射集合表 807 | woe_map_num = get_map_df(select_num_df) 808 | woe_map_num['bin'] = woe_map_num['bin'].map(lambda x:str(x)) 809 | woe_map_num['min_bin'] = woe_map_num['bin'].map(lambda x:x.split(',')[0][1:]) 810 | woe_map_num['max_bin'] = woe_map_num['bin'].map(lambda x:x.split(',')[1][:-1]) 811 | woe_map_num['min_bin'] = woe_map_num['min_bin'].map(lambda x:float(x)) 812 | woe_map_num['max_bin'] = woe_map_num['max_bin'].map(lambda x:float(x)) 813 | 814 | if len(cate_col)>0: 815 | bin_cate_list = [x for x in bin_df_list if x.index.name in cate_col] 816 | # 剔除woe不单调的离散形特征 817 | select_cate_df=[] 818 | for i,dd in enumerate(bin_cate_list): 819 | woe_lst = dd['woe'].tolist() 820 | if judge_decreasing(woe_lst) or judge_increasing(woe_lst): 821 | select_cate_df.append(dd) 822 | # 离散型特征的woe映射集合表 823 | if len(select_cate_df)>0: 824 | woe_map_cate = get_map_df(select_cate_df) 825 | woe_map_cate['min_bin'] = list(woe_map_cate['bin']) 826 | woe_map_cate['max_bin'] = list(woe_map_cate['bin']) 827 | woe_map_df = pd.concat([woe_map_cate,woe_map_num],axis=0).reset_index(drop=True) 828 | else: 829 | woe_map_df = woe_map_num.reset_index(drop=True) 830 | 831 | # 显著性筛选,前向逐步回归 832 | select_all_col = woe_map_df['col'].unique().tolist() 833 | select_sort_col = [x for x,y in iv_sort if x in select_all_col] 834 | 835 | train2 = train.loc[:,select_sort_col+[target]].reset_index(drop=True) 836 | # woe映射 837 | train_woe = var_mapping(train2,woe_map_df,'woe',target) 838 | X = train_woe.loc[:,select_sort_col] 839 | y = train_woe[target] 840 | 841 | pvalue_select_col = forward_pvalue_delete(X,y) 842 | print('显著性筛选完成') 843 | print('-------------') 844 | 845 | # 剔除系数为负数的特征 846 | X2 = X.loc[:,pvalue_select_col] 847 | coef_select_col = forward_delete_coef(X2,y) 848 | 849 | # LR建模 850 | X3 = X2.loc[:,coef_select_col] 851 | x_train,x_valid,y_train,y_valid = train_test_split(X3,y,test_size=0.2,random_state=0) 852 | # 保存验证集的index 853 | valid_index = x_valid.index.tolist() 854 | 855 | lr_model = LogisticRegression(C=1.0).fit(x_train,y_train) 856 | coe_dict = {x:y for x,y in zip(x_train.columns,lr_model.coef_[0])} 857 | print('建模完成') 858 | print('-------------') 859 | 860 | # 绘制验证集的auc,ks 861 | valid_pre = lr_model.predict_proba(x_valid)[:,1] 862 | print('验证集的AUC,KS:') 863 | plot_roc(y_valid,valid_pre) 864 | plot_model_ks(y_valid,valid_pre) 865 | 866 | woe_map_df2 = woe_map_df[woe_map_df.col.isin(coef_select_col)].reset_index(drop=True) 867 | # 绘制测试集的auc,ks 868 | test = test_data.loc[:,coef_select_col+[target]].reset_index(drop=True) 869 | test_woe = var_mapping(test,woe_map_df2,'woe',target) 870 | x_test = test_woe.drop([target],axis=1) 871 | y_test = test_woe[target] 872 | test_pre = lr_model.predict_proba(x_test)[:,1] 873 | print('测试集的AUC,KS:') 874 | plot_roc(y_test,test_pre) 875 | plot_model_ks(y_test,test_pre) 876 | 877 | # 评分转换 878 | A,B,base_score = cal_scale(score,odds,pdo,lr_model) 879 | score_map_df = get_score_map(woe_map_df2,coe_dict,B) 880 | # 分数映射 881 | valid_data = train2.iloc[valid_index,:].loc[:,coef_select_col+[target]].reset_index(drop=True) 882 | valid_score = var_mapping(valid_data,score_map_df,'score',target) 883 | valid_score['final_score'] = base_score 884 | for col in coef_select_col: 885 | valid_score['final_score']+=valid_score[col] 886 | valid_score['final_score'] = valid_score['final_score'].map(lambda x:int(x)) 887 | 888 | test_score = var_mapping(test,score_map_df,'score',target) 889 | test_score['final_score'] = base_score 890 | for col in coef_select_col: 891 | test_score['final_score']+=test_score[col] 892 | test_score['final_score'] = test_score['final_score'].map(lambda x:int(x)) 893 | print('评分转换完成') 894 | print('-------------') 895 | # 验证集的评分分布 896 | plot_score_hist(valid_score, target, 'final_score','vaild_score',plt_size=(6,4)) 897 | # 测试集的评分分布 898 | plot_score_hist(test_score, target, 'final_score','test_score',plt_size=(6,4)) 899 | 900 | return lr_model,score_map_df,valid_score,test_score,valid_pre,A,B,x_valid --------------------------------------------------------------------------------