├── 变量woe离散化.py
├── README.md
├── EDA分析.py
├── 模型评估.py
├── 变量筛选.py
├── 数据预处理.py
├── 评分卡监控.py
├── 评分卡实现和评估.py
└── 变量分箱.py

/变量woe离散化.py:
--------------------------------------------------------------------------------

# coding: utf-8

# WOE discretization of variables

import numpy as np
import pandas as pd


# WOE result table of the variables
def woe_df_concat(bin_df):
    """
    bin_df: list holding the binning result (a DataFrame) of each variable

    return: WOE result table
    """
    woe_df_list = []
    for df in bin_df:
        woe_df = df.reset_index().assign(col=df.index.name).rename(columns={df.index.name:'bin'})
        woe_df_list.append(woe_df)
    woe_result = pd.concat(woe_df_list,axis=0)
    # move the variable-name column to the first position for readability
    woe_result1 = woe_result['col']
    woe_result2 = woe_result.iloc[:,:-1]
    woe_result_df = pd.concat([woe_result1,woe_result2],axis=1)
    woe_result_df = woe_result_df.reset_index(drop=True)
    return woe_result_df


# WOE transformation
def woe_transform(df,target,df_woe):
    """
    df: dataset
    target: field name of the target variable
    df_woe: WOE result table

    return: dataset after the WOE transformation
    """
    df2 = df.copy()
    for col in df2.drop([target],axis=1).columns:
        x = df2[col]
        bin_map = df_woe[df_woe.col==col]
        bin_res = np.array([0]*x.shape[0],dtype=float)
        for i in bin_map.index:
            lower = bin_map['min_bin'][i]
            upper = bin_map['max_bin'][i]
            if lower == upper:
                x1 = x.iloc[np.where(x == lower)[0]]
            else:
                x1 = x.iloc[np.where((x>=lower)&(x<=upper))[0]]
            mask = np.in1d(x,x1)
            bin_res[mask] = bin_map['woe'][i]
        bin_res = pd.Series(bin_res,index=x.index)
        bin_res.name = x.name
        df2[col] = bin_res
    return df2
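
# A minimal usage sketch (hypothetical names): bin_df is assumed to be the list
# returned by the binning functions in 变量分箱.py, and train_df a frame holding
# the raw features plus a 'target' column.
# woe_df = woe_df_concat(bin_df)                         # one row per (variable, bin), with its woe
# train_woe = woe_transform(train_df, 'target', woe_df)  # replace raw values by their bin's woe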
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Function modules implementing a scorecard model


author: yuxinxin<br>
modify_date: 2018-11-27

Usage: put score_card.py in the same directory as your notebook, then run `import score_card as sc` in the notebook to call the functions.

## Function catalog:
### 1. EDA分析.py
1. Variable distributions (visualization)
    * plot_cate_var -- distribution of categorical variables
    * plot_num_col -- distribution of numerical variables
2. Default-rate analysis per variable (visualization):
    * plot_default_cate -- default-rate analysis for categorical variables
    * plot_default_num -- default-rate analysis for numerical variables

### 2. 数据预处理.py
1. Missing-value handling
    * missing_cal -- missing rate of every variable
    * plot_missing_var -- distribution plot of missing rates over all variables
    * plot_missing_user -- missingness analysis for individual samples
    * missing_delete_var -- drop variables by missing rate
    * missing_delete_user -- drop samples by missing count
    * fillna_cate_var -- fill missing values (categorical variables)
    * fillna_num_var -- fill missing values (numerical variables)
2. Constant / near-constant variables
    * const_delete -- drop constant / near-constant variables
3. Level reduction
    * descending_cate -- merge rare levels of categorical variables

### 3. 变量分箱.py
* binning_cate -- binning of categorical variables
* iv_cate -- IV detail table for categorical variables
* binning_num -- binning of numerical variables (chi-square binning)
* iv_num -- IV detail table for numerical variables
* binning_self -- user-defined binning
* plot_woe -- WOE visualization per variable
* woe_monoton -- check whether a variable's WOE changes monotonically
* woe_large -- check whether some bin's WOE is too large (above 1); bin WOE within (-1,1) is considered reasonable

### 4. 变量筛选.py
* select_xgboost -- variable selection with xgboost
* select_rf -- variable selection with random forest
* plot_corr -- correlation visualization
* corr_mapping -- mapping between strongly correlated variables
* forward_delete_corr -- iteratively drop highly correlated variables
* forward_delete_pvalue -- significance filtering (forward selection)
* forward_delete_coef -- filtering on logistic-regression coefficient signs (all signs must agree)

### 5. 变量woe离散化.py
* woe_df_concat -- WOE detail table of the variables
* woe_transform -- WOE transformation of the variables

### 6. 模型评估.py
* plot_roc -- plot the ROC curve
* plot_model_ks -- plot the model's KS curve
* plot_learning_curve -- plot the learning curve
* cross_verify -- cross validation
* plot_matrix_report -- confusion matrix / classification report

### 7. 评分卡实现和评估.py
* cal_scale -- scorecard scale
* score_df_concat -- score detail table of the variables
* score_transform -- score transformation of the variables
* plot_score_ks -- plot the KS curve of the scorecard
* plot_PR -- PR curve
* plot_score_hist -- score distributions of good and bad users
* score_info -- score detail table
* plot_lifting -- lift chart and Lorenz curve
* rule_verify -- set a cutoff point and compute the evaluation metrics

### 8. 评分卡监控.py
* score_psi -- PSI of the scores
* plot_score_compare -- score comparison plot
* var_stable -- variable stability analysis
* plot_var_shift -- variable shift analysis
--------------------------------------------------------------------------------
/EDA分析.py:
--------------------------------------------------------------------------------

# coding: utf-8

# EDA analysis

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Distribution of categorical variables
def plot_cate_var(df,col_list,hspace=0.4,wspace=0.4,plt_size=None,plt_num=None,x=None,y=None):
    """
    df: dataset
    col_list: list of variables
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid

    return: distribution plot of the variables (bar charts)
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace,wspace=wspace)
    plt.rcParams['font.sans-serif']=['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    for i,col in zip(range(1,plt_num+1,1),col_list):
        plt.subplot(x,y,i)
        plt.title(col)
        sns.countplot(data=df,y=col)
        plt.ylabel('')
    return plt.show()
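
# A hypothetical call (invented column names): plt_num should equal len(col_list)
# and the x-by-y grid must hold at least plt_num subplots.
# plot_cate_var(df, ['gender', 'education', 'marriage'], plt_size=(12, 4), plt_num=3, x=1, y=3)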
# Distribution of numerical variables
def plot_num_col(df,col_list,hspace=0.4,wspace=0.4,plt_type=None,plt_size=None,plt_num=None,x=None,y=None):
    """
    df: dataset
    col_list: list of variables
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    plt_type: 'hist' for histograms, 'box' for box plots
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid

    return: distribution plot of the variables (histogram / box plot)
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace,wspace=wspace)
    if plt_type=='hist':
        for i,col in zip(range(1,plt_num+1,1),col_list):
            plt.subplot(x,y,i)
            plt.title(col)
            sns.distplot(df[col].dropna())
            plt.xlabel('')
    if plt_type=='box':
        for i,col in zip(range(1,plt_num+1,1),col_list):
            plt.subplot(x,y,i)
            plt.title(col)
            sns.boxplot(data=df,x=col)
            plt.xlabel('')
    return plt.show()


# Default-rate analysis for categorical variables
def plot_default_cate(df,col_list,target,hspace=0.4,wspace=0.4,plt_size=None,plt_num=None,x=None,y=None):
    """
    df: dataset
    col_list: list of variables
    target: field name of the target variable
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid

    return: default-rate plot (bar charts)
    """
    all_bad = df[target].sum()
    total = df[target].count()
    all_default_rate = all_bad*1.0/total

    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace,wspace=wspace)
    plt.rcParams['font.sans-serif']=['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    for i,col in zip(range(1,plt_num+1,1),col_list):
        d1 = df.groupby(col)
        d2 = pd.DataFrame()
        d2['total'] = d1[target].count()
        d2['bad'] = d1[target].sum()
        d2['default_rate'] = d2['bad']/d2['total']
        d2 = d2.reset_index()
        plt.subplot(x,y,i)
        plt.title(col)
        plt.axvline(x=all_default_rate)   # reference line: overall default rate
        sns.barplot(data=d2,y=col,x='default_rate')
        plt.ylabel('')
    return plt.show()


# Default-rate analysis for numerical variables
def plot_default_num(df,col_list,target,hspace=0.4,wspace=0.4,q=None,plt_size=None,plt_num=None,x=None,y=None):
    """
    df: dataset
    col_list: list of variables
    target: field name of the target variable
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    q: number of equal-frequency bins
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid

    return: default-rate plot (line charts)
    """
    all_bad = df[target].sum()
    total = df[target].count()
    all_default_rate = all_bad*1.0/total

    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace,wspace=wspace)
    for i,col in zip(range(1,plt_num+1,1),col_list):
        bucket = pd.qcut(df[col],q=q,duplicates='drop')
        d1 = df.groupby(bucket)
        d2 = pd.DataFrame()
        d2['total'] = d1[target].count()
        d2['bad'] = d1[target].sum()
        d2['default_rate'] = d2['bad']/d2['total']
        d2 = d2.reset_index()
        plt.subplot(x,y,i)
        plt.title(col)
        plt.axhline(y=all_default_rate)   # reference line: overall default rate
        sns.pointplot(data=d2,x=col,y='default_rate',color='hotpink')
        plt.xticks(rotation=60)
        plt.xlabel('')
    return plt.show()
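
# A hypothetical call (invented names): split each numeric variable into deciles
# (q=10) and draw the default-rate lines in a 2-by-2 grid.
# num_cols = ['age', 'income', 'loan_amt', 'util_rate']
# plot_default_num(df, num_cols, 'target', q=10, plt_size=(10, 8), plt_num=4, x=2, y=2)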
--------------------------------------------------------------------------------
/模型评估.py:
--------------------------------------------------------------------------------

# coding: utf-8

# Model evaluation

import matplotlib.pyplot as plt
from sklearn import metrics


# AUC
def plot_roc(y_label,y_pred):
    """
    y_label: y of the test set
    y_pred: predicted probabilities on the test set

    return: ROC curve
    """
    fpr,tpr,threshold = metrics.roc_curve(y_label,y_pred)
    AUC = metrics.roc_auc_score(y_label,y_pred)
    fig = plt.figure(figsize=(6,4))
    ax = fig.add_subplot(1,1,1)
    ax.plot(fpr,tpr,color='blue',label='AUC=%.3f'%AUC)
    ax.plot([0,1],[0,1],'r--')
    ax.set_ylim(0,1)
    ax.set_xlim(0,1)
    ax.set_title('ROC')
    ax.legend(loc='best')
    return plt.show()


# KS
def plot_model_ks(y_label,y_pred):
    """
    y_label: y of the test set
    y_pred: predicted probabilities on the test set

    return: KS curve
    """
    pred_list = list(y_pred)
    label_list = list(y_label)
    total_bad = sum(label_list)
    total_good = len(label_list)-total_bad
    items = sorted(zip(pred_list,label_list),key=lambda x:x[0])
    step = (max(pred_list)-min(pred_list))/200

    pred_bin=[]
    good_rate=[]
    bad_rate=[]
    ks_list = []
    for i in range(1,201):
        idx = min(pred_list)+i*step
        pred_bin.append(idx)
        label_bin = [x[1] for x in items if x[0]<=idx]
        bad_num = sum(label_bin)
        good_num = len(label_bin)-bad_num
        goodrate = good_num/total_good   # cumulative share of goods up to this cutoff
        badrate = bad_num/total_bad      # cumulative share of bads up to this cutoff
        ks = abs(goodrate-badrate)
        good_rate.append(goodrate)
        bad_rate.append(badrate)
        ks_list.append(ks)
    fig = plt.figure(figsize=(8,4))
    ax = fig.add_subplot(1,1,1)
    ax.plot(pred_bin,good_rate,color='green',label='good_rate')
    ax.plot(pred_bin,bad_rate,color='red',label='bad_rate')
    ax.plot(pred_bin,ks_list,color='blue',label='good-bad')
    ax.set_title('KS:{:.3f}'.format(max(ks_list)))
    ax.legend(loc='best')
    return plt.show()

--------------------------------------------------------------------------------
/变量筛选.py:
--------------------------------------------------------------------------------

# coding: utf-8

# Variable selection

import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression


# Iteratively drop one variable of each highly correlated pair
def forward_delete_corr(df,col_list,threshold=None):
    """
    df: dataset
    col_list: list of variables
    threshold: correlation threshold

    return: variables kept after the correlation filtering
    """
    list_corr = col_list[:]
    for col in list_corr:
        corr = df.loc[:,list_corr].corr()[col]
        corr_index = [x for x in corr.index if x!=col]
        corr_values = [corr[x] for x in corr_index]
        for i,j in zip(corr_index,corr_values):
            if abs(j)>=threshold:
                list_corr.remove(i)
    return list_corr


# Mapping between strongly correlated variables
def corr_mapping(df,col_list,threshold=None):
    """
    df: dataset
    col_list: list of variables
    threshold: correlation threshold

    return: mapping table between strongly correlated variable pairs
    """
    corr_df = df.loc[:,col_list].corr()
    col_a = []
    col_b = []
    corr_value = []
    for col,i in zip(col_list[:-1],range(1,len(col_list),1)):
        high_corr_col=[]
        high_corr_value=[]
        corr_series = corr_df[col][i:]
        for i,j in zip(corr_series.index,corr_series.values):
            if abs(j)>=threshold:
                high_corr_col.append(i)
                high_corr_value.append(j)
        col_a.extend([col]*len(high_corr_col))
        col_b.extend(high_corr_col)
        corr_value.extend(high_corr_value)

    corr_map_df = pd.DataFrame({'col_A':col_a,
                                'col_B':col_b,
                                'corr':corr_value})
    return corr_map_df


# Significance filtering; run the WOE transformation before filtering
def forward_delete_pvalue(x_train,y_train):
    """
    x_train -- training x
    y_train -- training y

    return: variables kept after the significance filtering
    """
    col_list = list(x_train.columns)
    pvalues_col=[]
    for col in col_list:
        pvalues_col.append(col)
        x_train2 = sm.add_constant(x_train.loc[:,pvalues_col])
        sm_lr = sm.Logit(y_train,x_train2)
        sm_lr = sm_lr.fit()
        for i,j in zip(sm_lr.pvalues.index[1:],sm_lr.pvalues.values[1:]):
            if j>=0.05:
                pvalues_col.remove(i)

    x_new_train = x_train.loc[:,pvalues_col]
    x_new_train2 = sm.add_constant(x_new_train)
    lr = sm.Logit(y_train,x_new_train2)
    lr = lr.fit()
    print(lr.summary2())
    return pvalues_col


# Coefficient-sign filtering; run the WOE transformation before filtering
def forward_delete_coef(x_train,y_train):
    """
    x_train -- training x
    y_train -- training y

    return:
    coef_col: variables kept after the coefficient-sign filtering
    lr_coe: coefficient of each kept variable
    """
    col_list = list(x_train.columns)
    coef_col = []
    for col in col_list:
        coef_col.append(col)
        x_train2 = x_train.loc[:,coef_col]
        sk_lr = LogisticRegression(random_state=0).fit(x_train2,y_train)
        coef_df = pd.DataFrame({'col':coef_col,'coef':sk_lr.coef_[0]})
        if coef_df[coef_df.coef<0].shape[0]>0:
            coef_col.remove(col)

    x_new_train = x_train.loc[:,coef_col]
    lr = LogisticRegression(random_state=0).fit(x_new_train,y_train)
    lr_coe = pd.DataFrame({'col':coef_col,
                           'coef':lr.coef_[0]})
    return coef_col,lr_coe
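
# A hypothetical filtering sequence on a woe-transformed training set
# (names and the 0.65 threshold are illustrative only):
# keep = forward_delete_corr(x_train, list(x_train.columns), threshold=0.65)
# keep = forward_delete_pvalue(x_train.loc[:, keep], y_train)        # drops p-values >= 0.05
# keep, lr_coe = forward_delete_coef(x_train.loc[:, keep], y_train)  # enforces consistent signs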
--------------------------------------------------------------------------------
/数据预处理.py:
--------------------------------------------------------------------------------

# coding: utf-8

# Data preprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor


# Missing rate of each variable
def missing_cal(df):
    """
    df: dataset

    return: missing rate of every variable
    """
    missing_series = df.isnull().sum()/df.shape[0]
    missing_df = pd.DataFrame(missing_series).reset_index()
    missing_df = missing_df.rename(columns={'index':'col',
                                            0:'missing_pct'})
    missing_df = missing_df.sort_values('missing_pct',ascending=False).reset_index(drop=True)
    return missing_df

# Distribution of missing rates over the variables
def plot_missing_var(df,plt_size=None):
    """
    df: dataset
    plt_size: figure size

    return: distribution plot of missing rates (histogram)
    """
    missing_df = missing_cal(df)
    plt.figure(figsize=plt_size)
    plt.rcParams['font.sans-serif']=['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    x = missing_df['missing_pct']
    plt.hist(x=x,bins=np.arange(0,1.1,0.1),color='hotpink',ec='k',alpha=0.8)
    plt.ylabel('number of variables')
    plt.xlabel('missing rate')
    return plt.show()


# Missingness per sample
def plot_missing_user(df,plt_size=None):
    """
    df: dataset
    plt_size: figure size

    return: distribution plot of missing counts per sample (line chart)
    """
    missing_series = df.isnull().sum(axis=1)
    list_missing_num = sorted(list(missing_series.values))
    plt.figure(figsize=plt_size)
    plt.rcParams['font.sans-serif']=['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.plot(range(df.shape[0]),list_missing_num)
    plt.ylabel('number of missing variables')
    plt.xlabel('samples')
    return plt.show()


# Drop variables by missing rate
def missing_delete_var(df,threshold=None):
    """
    df: dataset
    threshold: missing-rate threshold for deletion

    return: dataset after deletion
    """
    df2 = df.copy()
    missing_df = missing_cal(df)
    missing_col_num = missing_df[missing_df.missing_pct>=threshold].shape[0]
    missing_col = list(missing_df[missing_df.missing_pct>=threshold].col)
    df2 = df2.drop(missing_col,axis=1)
    print('{} variables have a missing rate of {} or more'.format(missing_col_num,threshold))
    return df2


# Drop samples by missing count
def missing_delete_user(df,threshold=None):
    """
    df: dataset
    threshold: missing-count threshold for deletion

    return: dataset after deletion
    """
    df2 = df.copy()
    missing_series = df.isnull().sum(axis=1)
    missing_index_list = list(missing_series[missing_series>=threshold].index)
    df2 = df2[~(df2.index.isin(missing_index_list))]
    print('{} samples have {} or more missing variables'.format(len(missing_index_list),threshold))
    return df2


# Fill missing values (categorical variables)
def fillna_cate_var(df,col_list,fill_type=None):
    """
    df: dataset
    col_list: list of variables
    fill_type: filling method -- mode / treat missing as its own level

    return: dataset after filling
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type=='class':
            df2[col] = df2[col].fillna('unknown')
        if fill_type=='mode':
            df2[col] = df2[col].fillna(df2[col].mode()[0])
    return df2


# Fill missing values (numerical variables)
# Variables with a missing rate below 5%: fill with the median
# Missing rate 5%-15%: fill with a random forest; first fill the low-missing variables
# with the median, then use the complete samples to fit the random forest
# Missing rate above 15%: treat missing as its own class
def fillna_num_var(df,col_list,fill_type=None,filled_df=None):
    """
    df: dataset
    col_list: list of variables
    fill_type: filling method -- median / random forest / own class
    filled_df: already-filled dataset, used when the filling method is the random forest

    return: dataset after filling
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type=='median':
            df2[col] = df2[col].fillna(df2[col].median())
        if fill_type=='class':
            df2[col] = df2[col].fillna(-999)
        if fill_type=='rf':
            rf_df = pd.concat([df2[col],filled_df],axis=1)
            known = rf_df[rf_df[col].notnull()]
            unknown = rf_df[rf_df[col].isnull()]
            x_train = known.drop([col],axis=1)
            y_train = known[col]
            x_pre = unknown.drop([col],axis=1)
            rf = RandomForestRegressor(random_state=0)
            rf.fit(x_train,y_train)
            y_pre = rf.predict(x_pre)
            df2.loc[df2[col].isnull(),col] = y_pre
    return df2
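
# A hypothetical preprocessing order (thresholds are illustrative only):
# missing_df = missing_cal(df)                              # inspect missing rates first
# df = missing_delete_var(df, threshold=0.8)                # drop variables >= 80% missing
# df = missing_delete_user(df, threshold=50)                # drop samples missing >= 50 fields
# df = fillna_cate_var(df, cate_cols, fill_type='class')    # cate_cols/num_cols: hypothetical lists
# df = fillna_num_var(df, num_cols, fill_type='median')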
# Constant / near-constant variables
def const_delete(df,col_list,threshold=None):
    """
    df: dataset
    col_list: list of variables
    threshold: dominance threshold of the most frequent value

    return: dataset after deletion
    """
    df2 = df.copy()
    const_col = []
    for col in col_list:
        const_pct = df2[col].value_counts().iloc[0]/df2[df2[col].notnull()].shape[0]
        if const_pct>=threshold:
            const_col.append(col)
    df2 = df2.drop(const_col,axis=1)
    print('{} constant / near-constant variables dropped'.format(len(const_col)))
    return df2


# Level reduction for categorical variables
def descending_cate(df,col_list,threshold=None):
    """
    df: dataset
    col_list: list of variables
    threshold: minimum share a level must reach to be kept

    return: dataset after level reduction
    """
    df2 = df.copy()
    for col in col_list:
        value_series = df[col].value_counts()/df[df[col].notnull()].shape[0]
        small_value = []
        for value_name,value_pct in zip(value_series.index,value_series.values):
            if value_pct<=threshold:
                small_value.append(value_name)
        df2.loc[df2[col].isin(small_value),col]='other'
    return df2

--------------------------------------------------------------------------------
/评分卡监控.py:
--------------------------------------------------------------------------------

# coding: utf-8

# Scorecard monitoring

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Shift plot of the score shares of a variable's bins
def plot_var_shift(df,day_col,score_col,plt_size=None):
    """
    df: the score of a variable on each of its bins over a period of time
    day_col: field name of the time (day)
    score_col: field name of the score
    plt_size: figure size

    return: shift plot of the variable's bin scores
    """
    day_list = sorted(set(list(df[day_col])))
    score_list = sorted(set(list(df[score_col])))
    # share of each score bin per day
    prop_day_list = []
    for day in day_list:
        prop_list = []
        for score in score_list:
            prop = df[(df[day_col]==day)&(df[score_col]==score)].shape[0]/df[df[day_col]==day].shape[0]
            prop_list.append(prop)
        prop_day_list.append(prop_list)

    # reshape the shares into a plottable format
    sub_list = []
    for p in prop_day_list:
        p_cumsum = list(np.cumsum(p))
        p_cumsum = p_cumsum[:-1]
        p_cumsum.insert(0,0)
        bar1_list = [1]*int(len(p_cumsum))
        sub = [bar1_list[i]-p_cumsum[i] for i in range(len(p_cumsum))]
        sub_list.append(sub)
    array = np.array(sub_list)

    stack_prop_list = []  # y values of the area chart
    bar_prop_list = []    # y values of the stacked bar chart
    for i in range(len(score_list)):
        bar_prop = array[:,i]
        bar_prop_list.append(bar_prop)
        stack_prop = []
        for j in bar_prop:
            a = j
            b = j
            stack_prop.append(a)   # duplicate each value so the area chart
            stack_prop.append(b)   # forms a flat step across each bar
        stack_prop_list.append(stack_prop)

    # x axes of the two charts
    x_bar = list(range(1,len(day_list)*2,2))  # x values of the stacked bar chart
    x_stack = []                              # x values of the area chart
    for i in x_bar:
        c = i-0.5
        d = i+0.5
        x_stack.append(c)
        x_stack.append(d)

    # plotting
    fig = plt.figure(figsize=plt_size)
    ax1 = fig.add_subplot(1,1,1)
    # clear the x-axis tick labels first
    ax1.xaxis.set_major_formatter(plt.FuncFormatter(''.format))
    ax1.set_xticks(range(1,len(day_list)*2,2))
    # format the y-axis ticks as percentages
    def to_percent(temp, position):
        return '%1.0f'%(100*temp) + '%'
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(to_percent))
    # custom x-axis tick labels
    for a,b in zip(x_bar,day_list):
        ax1.text(a,-0.08,b,ha='center',va='bottom')
    # draw the area chart and the stacked bar chart
    for i,s in zip(range(len(score_list)),score_list):
        ax1.stackplot(x_stack,stack_prop_list[i],alpha=0.25)
        ax1.bar(x_bar,bar_prop_list[i],width=1,label='score: {}'.format(s))
    # dashed grid lines on the y axis
    ax1.grid(True, 'major', 'y', ls='--', lw=.5, c='black', alpha=.3)
    ax1.legend(loc='best')
    plt.show()


# PSI of the scores
def score_psi(df1,df2,id_col,score_col,x,y,step=None):
    """
    df1: scores of the modelling sample, with user id and score
    df2: scores of the online sample, with user id and score
    id_col: field name of the user id
    score_col: field name of the score
    x: left edge of the score range
    y: right edge of the score range
    step: bin width

    return: PSI table of the scores
    """
    df1['score_bin'] = pd.cut(df1[score_col],bins=np.arange(x,y,step))
    model_score_group = (df1.groupby('score_bin',as_index=False)[id_col].count()
                            .assign(pct=lambda x:x[id_col]/x[id_col].sum())
                            .rename(columns={id_col:'建模样本户数',
                                             'pct':'建模户数占比'}))
    df2['score_bin'] = pd.cut(df2[score_col],bins=np.arange(x,y,step))
    online_score_group = (df2.groupby('score_bin',as_index=False)[id_col].count()
                             .assign(pct=lambda x:x[id_col]/x[id_col].sum())
                             .rename(columns={id_col:'线上样本户数',
                                              'pct':'线上户数占比'}))
    score_compare = pd.merge(model_score_group,online_score_group,on='score_bin',how='inner')
    score_compare['占比差异'] = score_compare['线上户数占比'] - score_compare['建模户数占比']
    score_compare['占比权重'] = np.log(score_compare['线上户数占比']/score_compare['建模户数占比'])
    score_compare['Index']= score_compare['占比差异']*score_compare['占比权重']
    score_compare['PSI'] = score_compare['Index'].sum()
    return score_compare
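
# The Index column implements PSI = sum((p_online - p_model) * ln(p_online / p_model)).
# A self-contained toy check with invented shares; a common rule of thumb reads
# PSI < 0.1 as stable, 0.1-0.25 as worth watching, > 0.25 as shifted.
model_pct  = np.array([0.10, 0.20, 0.40, 0.20, 0.10])   # invented modelling-sample shares
online_pct = np.array([0.08, 0.18, 0.38, 0.24, 0.12])   # invented online-sample shares
psi = ((online_pct - model_pct) * np.log(online_pct / model_pct)).sum()
print(round(psi, 4))   # 0.0185 -> stable under the usual <0.1 rule of thumb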
# Score distribution comparison
def plot_score_compare(df,plt_size=None):
    """
    df: PSI table returned by score_psi
    plt_size: figure size

    return: comparison plot of the two score distributions
    """
    fig = plt.figure(figsize=plt_size)
    x = df.score_bin
    y1 = df.建模户数占比
    y2 = df.线上户数占比
    width=0.3
    plt.title('score distribution comparison')
    plt.xlabel('score bin')
    plt.ylabel('user share')
    plt.xticks(np.arange(len(x))+0.15,x)
    plt.bar(np.arange(len(y1)),y1,width=width,color='seagreen',label='modelling sample')
    plt.bar(np.arange(len(y2))+width,y2,width=width,color='hotpink',label='online sample')
    plt.legend()
    return plt.show()


# Variable stability analysis
def var_stable(score_result,df,var,id_col,score_col,bins):
    """
    score_result: score detail table of the scorecard, with bin, user count, user share and score
    var: name of the variable to analyse
    df: online-sample scores of the variable, with user id, variable value and variable score
    id_col: field name of the user id in df
    score_col: field name of the score in df
    bins: bin edges of the variable

    return: stability analysis table of the variable
    """
    model_var_group = (score_result.loc[score_result.col==var,
                                        ['bin','total','totalrate','score']].reset_index(drop=True)
                                   .rename(columns={'total':'建模用户数',
                                                    'totalrate':'建模用户占比',
                                                    'score':'得分'}))
    df['bin'] = pd.cut(df[score_col],bins=bins)
    online_var_group = (df.groupby('bin',as_index=False)[id_col].count()
                          .assign(pct=lambda x:x[id_col]/x[id_col].sum())
                          .rename(columns={id_col:'线上用户数',
                                           'pct':'线上用户占比'}))
    var_stable_df = pd.merge(model_var_group,online_var_group,on='bin',how='inner')
    var_stable_df = var_stable_df.iloc[:,[0,3,1,2,4,5]]
    var_stable_df['得分'] = var_stable_df['得分'].astype('int64')
    var_stable_df['建模样本权重'] = np.abs(var_stable_df['得分']*var_stable_df['建模用户占比'])
    var_stable_df['线上样本权重'] = np.abs(var_stable_df['得分']*var_stable_df['线上用户占比'])
    var_stable_df['权重差距'] = var_stable_df['线上样本权重'] - var_stable_df['建模样本权重']
    return var_stable_df
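
# A hypothetical monitoring pass (frames, ids and bin edges invented):
# compare = score_psi(model_df, online_df, 'user_id', 'score', x=300, y=900, step=50)
# plot_score_compare(compare, plt_size=(10, 5))
# stable = var_stable(score_result, online_age_df, 'age', 'user_id', 'age_score',
#                     bins=[0, 10, 20, 30, 40])   # bins cut the variable's score column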
--------------------------------------------------------------------------------
/评分卡实现和评估.py:
--------------------------------------------------------------------------------

# coding: utf-8

# Scorecard implementation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Scorecard scale
def cal_scale(score,odds,PDO,model):
    """
    odds: chosen bad-to-good odds
    score: the score pinned to that odds
    PDO: points to double the odds
    model: the logistic regression model

    return: A, B, base_score
    """
    B = PDO/(np.log(odds)-np.log(2*odds))   # = -PDO/ln(2)
    A = score-B*np.log(odds)
    base_score = A+B*model.intercept_[0]
    print('B: {:.2f}'.format(B))
    print('A: {:.2f}'.format(A))
    print('base score: {:.2f}'.format(base_score))
    return A,B,base_score


# Score table of the variables
def score_df_concat(woe_df,model,B):
    """
    woe_df: WOE result table
    model: the logistic regression model
    B: scale factor returned by cal_scale

    return: score result table of the variables
    """
    coe = list(model.coef_[0])
    columns = list(woe_df.col.unique())
    scores=[]
    for c,col in zip(coe,columns):
        score=[]
        for w in list(woe_df[woe_df.col==col].woe):
            s = round(c*w*B,0)
            score.append(s)
        scores.extend(score)
    woe_df['score'] = scores
    score_df = woe_df.copy()
    return score_df


# Score transformation
def score_transform(df,target,df_score):
    """
    df: dataset
    target: field name of the target variable
    df_score: score result table

    return: dataset after the score transformation
    """
    df2 = df.copy()
    for col in df2.drop([target],axis=1).columns:
        x = df2[col]
        bin_map = df_score[df_score.col==col]
        bin_res = np.array([0]*x.shape[0],dtype=float)
        for i in bin_map.index:
            lower = bin_map['min_bin'][i]
            upper = bin_map['max_bin'][i]
            if lower == upper:
                x1 = x.iloc[np.where(x == lower)[0]]
            else:
                x1 = x.iloc[np.where((x>=lower)&(x<=upper))[0]]
            mask = np.in1d(x,x1)
            bin_res[mask] = bin_map['score'][i]
        bin_res = pd.Series(bin_res,index=x.index)
        bin_res.name = x.name
        df2[col] = bin_res
    return df2


# KS of the scores
def plot_score_ks(df,score_col,target):
    """
    df: dataset
    target: field name of the target variable
    score_col: field name of the final score
    """
    total_bad = df[target].sum()
    total_good = df[target].count()-total_bad
    score_list = list(df[score_col])
    target_list = list(df[target])
    items = sorted(zip(score_list,target_list),key=lambda x:x[0])
    step = (max(score_list)-min(score_list))/200

    score_bin=[]
    good_rate=[]
    bad_rate=[]
    ks_list = []
    for i in range(1,201):
        idx = min(score_list)+i*step
        score_bin.append(idx)
        target_bin = [x[1] for x in items if x[0]<=idx]
        bad_num = sum(target_bin)
        good_num = len(target_bin)-bad_num
        goodrate = good_num/total_good   # cumulative share of goods up to this score
        badrate = bad_num/total_bad      # cumulative share of bads up to this score
        ks = abs(goodrate-badrate)
        good_rate.append(goodrate)
        bad_rate.append(badrate)
        ks_list.append(ks)
    fig = plt.figure(figsize=(8,4))
    ax = fig.add_subplot(1,1,1)
    ax.plot(score_bin,good_rate,color='green',label='good_rate')
    ax.plot(score_bin,bad_rate,color='red',label='bad_rate')
    ax.plot(score_bin,ks_list,color='blue',label='good-bad')
    ax.set_title('KS:{:.3f}'.format(max(ks_list)))
    ax.legend(loc='best')
    return plt.show()
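
# The scale in cal_scale solves score = A + B*ln(odds) so that doubling the odds
# moves the score by PDO points; with this file's convention
# B = PDO/(ln(odds) - ln(2*odds)) = -PDO/ln(2). A self-contained worked check
# with the usual textbook numbers (1:20 odds pinned to 600 points, PDO = 20):
odds, score, PDO = 1/20, 600, 20
B = PDO / (np.log(odds) - np.log(2 * odds))   # -PDO/ln(2), about -28.85
A = score - B * np.log(odds)                  # about 513.56
print(A + B * np.log(odds))                   # 600.0
print(A + B * np.log(2 * odds))               # 580.0 -> doubled odds costs exactly PDO points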
--------------------------------------------------------------------------------
/变量分箱.py:
--------------------------------------------------------------------------------

# coding: utf-8

# Variable binning

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Binning of categorical variables
def binning_cate(df,col_list,target):
    """
    df: dataset
    col_list: list of variables
    target: field name of the target variable

    return:
    bin_df: list holding the binning result (a DataFrame) of each variable
    iv_value: list holding the IV of each variable
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total-bad
    all_odds = good/bad
    bin_df=[]
    iv_value=[]
    for col in col_list:
        d1 = df.groupby(col)
        d2 = pd.DataFrame()
        d2['total'] = d1[target].count()
        d2['totalrate'] = d2['total']/total
        d2['bad'] = d1[target].sum()
        d2['badrate'] = d2['bad']/d2['total']
        d2['good'] = d2['total']-d2['bad']
        d2['goodrate'] = d2['good']/d2['total']
        d2['badattr'] = d2['bad']/bad
        d2['goodattr'] = (d2['total']-d2['bad'])/good
        d2['odds'] = d2['good']/d2['bad']
        GB_list=[]
        for i in d2.odds:
            if i>=all_odds:
                GB_index = str(round((i/all_odds)*100,0))+str('G')
            else:
                GB_index = str(round((all_odds/i)*100,0))+str('B')
            GB_list.append(GB_index)
        d2['GB_index'] = GB_list
        d2['woe'] = np.log(d2['badattr']/d2['goodattr'])
        d2['bin_iv'] = (d2['badattr']-d2['goodattr'])*d2['woe']
        d2['IV'] = d2['bin_iv'].sum()
        iv = d2['bin_iv'].sum().round(3)
        print('variable: {}'.format(col))
        print('IV: {}'.format(iv))
        print('\t')
        bin_df.append(d2)
        iv_value.append(iv)
    return bin_df,iv_value


# IV detail table for categorical variables
def iv_cate(df,col_list,target):
    """
    df: dataset
    col_list: list of variables
    target: field name of the target variable

    return: IV detail table of the variables
    """
    bin_df,iv_value = binning_cate(df,col_list,target)
    iv_df = pd.DataFrame({'col':col_list,
                          'iv':iv_value})
    iv_df = iv_df.sort_values('iv',ascending=False)
    return iv_df


# Binning of numerical variables

# First use chi-square binning to produce the split points of a variable
def split_data(df,col,split_num):
    """
    df: raw dataset
    col: variable to bin
    split_num: number of split points
    """
    df2 = df.copy()
    count = df2.shape[0]                             # total number of samples
    n = math.floor(count/split_num)                  # samples per group after equal splitting
    split_index = [i*n for i in range(1,split_num)]  # indices of the split points
    values = sorted(list(df2[col]))                  # sort the variable's values ascending
    split_value = [values[i] for i in split_index]   # values at the split points
    split_value = sorted(list(set(split_value)))     # deduplicate and sort the split values
    return split_value

def assign_group(x,split_bin):
    """
    x: a value of the variable
    split_bin: list of split points produced by split_data
    """
    n = len(split_bin)
    if x<=min(split_bin):
        return min(split_bin)   # x below the smallest split point maps to the smallest split point
    elif x>max(split_bin):      # x above the largest split point maps to a large sentinel value
        return 10e10
    else:
        for i in range(n-1):
            if split_bin[i]<x<=split_bin[i+1]:   # otherwise x maps to the upper edge of its interval
                return split_bin[i+1]

# Bad rate per group
def bin_bad_rate(df,col,target,grantRateIndicator=0):
    """
    df: dataset
    col: field name to group by
    target: field name of the target variable
    grantRateIndicator: 1 additionally returns the overall bad rate

    return: dict of bad rate per group, grouped stats DataFrame, (optionally) the overall bad rate
    """
    total_df = df.groupby([col])[target].count().to_frame('total')
    bad_df = df.groupby([col])[target].sum().to_frame('bad')
    regroup = pd.merge(total_df,bad_df,left_index=True,right_index=True).reset_index()
    regroup['bad_rate'] = regroup['bad']/regroup['total']
    dict_bad = dict(zip(regroup[col],regroup['bad_rate']))
    if grantRateIndicator==0:
        return (dict_bad,regroup)
    all_bad_rate = df[target].sum()/df.shape[0]
    return (dict_bad,regroup,all_bad_rate)

# Chi-square statistic of a group of bins against the overall bad rate
def cal_chi2(df,all_bad_rate):
    """
    df: rows of the regroup table for the candidate merged bins, with 'total' and 'bad'
    all_bad_rate: overall bad rate of the sample

    return: the chi-square value
    """
    df2 = df.copy()
    df2['expected'] = df2['total']*all_bad_rate   # expected bad count under the overall rate
    combined = zip(df2['expected'],df2['bad'])
    chi = [(bad-ex)**2/ex for ex,bad in combined]
    chi2 = sum(chi)
    return chi2

def assign_bin(x,cutoffpoints):
    """
    x: a (mapped) value of the variable
    cutoffpoints: list of cutoff points from the chi-square merging
    """
    bin_num = len(cutoffpoints)+1
    if x<=cutoffpoints[0]:      # x below the smallest cutoff maps to Bin 0
        return 'Bin 0'
    elif x>cutoffpoints[-1]:    # x above the largest cutoff maps to Bin(bin_num-1)
        return 'Bin {}'.format(bin_num-1)
    else:
        for i in range(0,bin_num-1):
            if cutoffpoints[i]<x<=cutoffpoints[i+1]:
                return 'Bin {}'.format(i+1)
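
# A self-contained toy check of cal_chi2 (invented counts): two adjacent bins of
# 100 samples each against an overall bad rate of 0.10.
toy_chi_df = pd.DataFrame({'total': [100, 100], 'bad': [5, 18]})
print(cal_chi2(toy_chi_df, 0.10))   # (5-10)**2/10 + (18-10)**2/10 = 2.5 + 6.4 = 8.9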
# Chi-square binning of numerical variables
def binning_num(df,target,col_list,max_bin=None,min_binpct=None):
    """
    df: dataset
    target: field name of the target variable
    col_list: list of variables
    max_bin: maximum number of bins
    min_binpct: minimum share of the sample a bin must hold

    return:
    bin_df: list holding the binning result (a DataFrame) of each variable
    iv_value: list holding the IV of each variable
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total-bad
    all_odds = good/bad
    bin_df=[]
    iv_value=[]
    for col in col_list:
        df2 = df.copy()
        n = df2.shape[0]
        if df2[col].nunique()>100:   # with more than 100 unique values, map x to the split values
            split_col = split_data(df2,col,100)   # via split_data/assign_group, capping the uniques at 100
            df2['col_map'] = df2[col].map(lambda x:assign_group(x,split_col))
        else:
            df2['col_map'] = df2[col]   # with at most 100 unique values no mapping is needed
        # tuple of dict_bad, regroup and all_bad_rate
        (dict_bad,regroup,all_bad_rate) = bin_bad_rate(df2,'col_map',target,grantRateIndicator=1)
        col_map_unique = sorted(list(set(df2['col_map'])))  # deduplicate and sort the mapped values
        group_interval = [[i] for i in col_map_unique]      # wrap each value into its own list

        while (len(group_interval)>max_bin):   # keep merging while there are more groups than max_bin
            chi_list=[]
            for i in range(len(group_interval)-1):
                temp_group = group_interval[i]+group_interval[i+1]  # candidate merged interval, e.g. [1,3]
                chi_df = regroup[regroup['col_map'].isin(temp_group)]
                chi_value = cal_chi2(chi_df,all_bad_rate)   # chi-square of each pair of adjacent intervals
                chi_list.append(chi_value)
            best_combined = chi_list.index(min(chi_list))   # index of the smallest chi-square
            # merge the pair of intervals with the smallest chi-square
            group_interval[best_combined] = group_interval[best_combined]+group_interval[best_combined+1]
            # delete the right interval of the merged pair
            group_interval.remove(group_interval[best_combined+1])
        # sort the values inside each merged interval
        group_interval = [sorted(i) for i in group_interval]
        # the cutoff points are the maxima of the intervals
        cutoffpoints = [max(i) for i in group_interval[:-1]]

        # check whether some bin holds only good or only bad samples
        df2['col_map_bin'] = df2['col_map'].apply(lambda x:assign_bin(x,cutoffpoints))  # map col_map to its bin
        # bad rate per bin
        (dict_bad,regroup) = bin_bad_rate(df2,'col_map_bin',target)
        # smallest and largest bad rates
        [min_bad_rate,max_bad_rate] = [min(dict_bad.values()),max(dict_bad.values())]
        # a bad rate of 0 means the bin holds only good samples, a bad rate of 1 only bad samples
        while min_bad_rate==0 or max_bad_rate==1:
            bad01_index = regroup[regroup['bad_rate'].isin([0,1])].col_map_bin.tolist()  # bins with bad rate 0 or 1
            bad01_bin = bad01_index[0]
            if bad01_bin==max(regroup.col_map_bin):
                cutoffpoints = cutoffpoints[:-1]   # drop the largest cutoff when bad01_bin is the last bin
            elif bad01_bin==min(regroup.col_map_bin):
                cutoffpoints = cutoffpoints[1:]    # drop the smallest cutoff when bad01_bin is the first bin
            else:
                bad01_bin_index = list(regroup.col_map_bin).index(bad01_bin)  # position of bad01_bin
                prev_bin = list(regroup.col_map_bin)[bad01_bin_index-1]       # bin before bad01_bin
                df3 = df2[df2.col_map_bin.isin([prev_bin,bad01_bin])]
                (dict_bad,regroup1) = bin_bad_rate(df3,'col_map_bin',target)
                chi1 = cal_chi2(regroup1,all_bad_rate)   # chi-square of the previous bin with bad01_bin
                later_bin = list(regroup.col_map_bin)[bad01_bin_index+1]      # bin after bad01_bin
                df4 = df2[df2.col_map_bin.isin([later_bin,bad01_bin])]
                (dict_bad,regroup2) = bin_bad_rate(df4,'col_map_bin',target)
                chi2 = cal_chi2(regroup2,all_bad_rate)   # chi-square of the next bin with bad01_bin
                if chi1<chi2:   # when chi1<chi2, merge with the previous bin: drop the cutoff before bad01_bin
                    cutoffpoints.remove(cutoffpoints[bad01_bin_index-1])
                else:           # when chi1>=chi2, drop the cutoff of bad01_bin itself
                    cutoffpoints.remove(cutoffpoints[bad01_bin_index])
            df2['col_map_bin'] = df2['col_map'].apply(lambda x:assign_bin(x,cutoffpoints))
            (dict_bad,regroup) = bin_bad_rate(df2,'col_map_bin',target)
            # re-map col_map to the bins and recompute the extreme bad rates;
            # the loop stops once no bin has a bad rate of 0 or 1
            [min_bad_rate,max_bad_rate] = [min(dict_bad.values()),max(dict_bad.values())]

        # check the minimum bin share after binning
        if min_binpct>0:
            group_values = df2['col_map'].apply(lambda x:assign_bin(x,cutoffpoints))
            df2['col_map_bin'] = group_values   # map col_map to its bin
            group_df = group_values.value_counts().to_frame('total')
            group_df['bin_pct'] = group_df['total']/n   # share of each bin
            min_pct = group_df.bin_pct.min()            # smallest bin share
            # loop while the smallest share is below min_binpct and more than 2 cutoffs remain;
            # the logic mirrors the good/bad-only check above
            while min_pct<min_binpct and len(cutoffpoints)>2:
                min_pct_index = group_df[group_df.bin_pct==min_pct].index.tolist()
                min_pct_bin = min_pct_index[0]
                if min_pct_bin == max(group_df.index):
                    cutoffpoints=cutoffpoints[:-1]
                elif min_pct_bin == min(group_df.index):
                    cutoffpoints=cutoffpoints[1:]
                else:
                    minpct_bin_index = list(group_df.index).index(min_pct_bin)
                    prev_pct_bin = list(group_df.index)[minpct_bin_index-1]
                    df5 = df2[df2['col_map_bin'].isin([min_pct_bin,prev_pct_bin])]
                    (dict_bad,regroup3) = bin_bad_rate(df5,'col_map_bin',target)
                    chi3 = cal_chi2(regroup3,all_bad_rate)
                    later_pct_bin = list(group_df.index)[minpct_bin_index+1]
                    df6 = df2[df2['col_map_bin'].isin([min_pct_bin,later_pct_bin])]
                    (dict_bad,regroup4) = bin_bad_rate(df6,'col_map_bin',target)
                    chi4 = cal_chi2(regroup4,all_bad_rate)
                    if chi3<chi4:
                        cutoffpoints.remove(cutoffpoints[minpct_bin_index-1])
                    else:
                        cutoffpoints.remove(cutoffpoints[minpct_bin_index])
                group_values = df2['col_map'].apply(lambda x:assign_bin(x,cutoffpoints))
                df2['col_map_bin'] = group_values
                group_df = group_values.value_counts().to_frame('total')
                group_df['bin_pct'] = group_df['total']/n
                min_pct = group_df.bin_pct.min()

        # bin the variable with the final cutoff points and compute woe/iv,
        # with the same calculation as binning_self
        df2['col_map_bin'] = df2['col_map'].apply(lambda x:assign_bin(x,cutoffpoints))
        d1 = df2.groupby('col_map_bin')
        d2 = pd.DataFrame()
        d2['min_bin'] = d1[col].min()
        d2['max_bin'] = d1[col].max()
        d2['total'] = d1[target].count()
        d2['totalrate'] = d2['total']/total
        d2['bad'] = d1[target].sum()
        d2['badrate'] = d2['bad']/d2['total']
        d2['good'] = d2['total'] - d2['bad']
        d2['goodrate'] = d2['good']/d2['total']
        d2['badattr'] = d2['bad']/bad
        d2['goodattr'] = (d2['total']-d2['bad'])/good
        d2['odds'] = d2['good']/d2['bad']
        d2.index.name = col   # downstream helpers read the variable name off the index
        GB_list=[]
        for i in d2.odds:
            if i>=all_odds:
                GB_index = str(round((i/all_odds)*100,0))+str('G')
            else:
                GB_index = str(round((all_odds/i)*100,0))+str('B')
            GB_list.append(GB_index)
        d2['GB_index'] = GB_list
        d2['woe'] = np.log(d2['badattr']/d2['goodattr'])
        d2['bin_iv'] = (d2['badattr']-d2['goodattr'])*d2['woe']
        d2['IV'] = d2['bin_iv'].sum()
        iv = d2['bin_iv'].sum().round(3)
        print('variable: {}'.format(col))
        print('IV: {}'.format(iv))
        print('\t')
        bin_df.append(d2)
        iv_value.append(iv)
    return bin_df,iv_value


# IV detail table for numerical variables
def iv_num(df,target,col_list,max_bin=None,min_binpct=None):
    """
    df: dataset
    target: field name of the target variable
    col_list: list of variables
    max_bin: maximum number of bins
    min_binpct: minimum share of the sample a bin must hold

    return: IV detail table of the variables
    """
    bin_df,iv_value = binning_num(df,target,col_list,max_bin=max_bin,min_binpct=min_binpct)
    iv_df = pd.DataFrame({'col':col_list,
                          'iv':iv_value})
    iv_df = iv_df.sort_values('iv',ascending=False)
    return iv_df
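
# A hypothetical call (invented frame/column names): chi-merge every numeric
# column into at most 5 bins, each holding at least 5% of the sample.
# bin_df, iv_value = binning_num(train_df, 'target', num_cols, max_bin=5, min_binpct=0.05)
# iv_table = iv_num(train_df, 'target', num_cols, max_bin=5, min_binpct=0.05)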
# User-defined binning
def binning_self(df,col,target,cut=None,right_border=True):
    """
    df: dataset
    col: name of the single variable to bin
    cut: list of bin edges
    right_border: whether intervals are closed on the right

    return:
    bin_df: DataFrame with the binning result of the variable
    iv_value: IV of the variable
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    all_odds = good/bad
    bucket = pd.cut(df[col],cut,right=right_border)
    d1 = df.groupby(bucket)
    d2 = pd.DataFrame()
    d2['min_bin'] = d1[col].min()
    d2['max_bin'] = d1[col].max()
    d2['total'] = d1[target].count()
    d2['totalrate'] = d2['total']/total
    d2['bad'] = d1[target].sum()
    d2['badrate'] = d2['bad']/d2['total']
    d2['good'] = d2['total'] - d2['bad']
    d2['goodrate'] = d2['good']/d2['total']
    d2['badattr'] = d2['bad']/bad
    d2['goodattr'] = (d2['total']-d2['bad'])/good
    d2['odds'] = d2['good']/d2['bad']
    GB_list=[]
    for i in d2.odds:
        if i>=all_odds:
            GB_index = str(round((i/all_odds)*100,0))+str('G')
        else:
            GB_index = str(round((all_odds/i)*100,0))+str('B')
        GB_list.append(GB_index)
    d2['GB_index'] = GB_list
    d2['woe'] = np.log(d2['badattr']/d2['goodattr'])
    d2['bin_iv'] = (d2['badattr']-d2['goodattr'])*d2['woe']
    d2['IV'] = d2['bin_iv'].sum()
    iv_value = d2['bin_iv'].sum().round(3)
    print('variable: {}'.format(col))
    print('IV: {}'.format(iv_value))
    bin_df = d2.copy()
    return bin_df,iv_value


# Checks on the binning result

# WOE visualization
def plot_woe(bin_df,hspace=0.4,wspace=0.4,plt_size=None,plt_num=None,x=None,y=None):
    """
    bin_df: list holding the binning result of each variable
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid

    return: WOE trend plot of every variable
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace,wspace=wspace)
    for i,df in zip(range(1,plt_num+1,1),bin_df):
        col_name = df.index.name
        df = df.reset_index()
        plt.subplot(x,y,i)
        plt.title(col_name)
        sns.barplot(data=df,x=col_name,y='woe')
        plt.xlabel('')
        plt.xticks(rotation=30)
    return plt.show()
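
# Plugging invented numbers into the woe/bin_iv formulas above: a bin holding
# 40 of the sample's 100 bads and 160 of its 900 goods.
badattr, goodattr = 40/100, 160/900        # the bin's share of all bads / all goods
woe = np.log(badattr/goodattr)             # ln(0.400/0.178), about 0.811
bin_iv = (badattr - goodattr)*woe          # about 0.222 * 0.811 = 0.180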
# Check whether the WOE is monotonic
def woe_monoton(bin_df):
    """
    bin_df: list holding the binning result of each variable

    return:
    woe_notmonoton_col: list of variables whose WOE is not monotonic
    woe_judge_df: DataFrame with the check result of every variable
    """
    woe_notmonoton_col =[]
    col_list = []
    woe_judge=[]
    for woe_df in bin_df:
        col_name = woe_df.index.name
        woe_list = list(woe_df.woe)
        if woe_df.shape[0]==2:
            # two bins are always monotonic
            col_list.append(col_name)
            woe_judge.append('True')
        else:
            # flag interior points that are a local minimum or maximum
            woe_not_monoton = [(woe_list[i]<woe_list[i+1] and woe_list[i]<woe_list[i-1]) or
                               (woe_list[i]>woe_list[i+1] and woe_list[i]>woe_list[i-1])
                               for i in range(1,len(woe_list)-1,1)]
            if True in woe_not_monoton:
                woe_notmonoton_col.append(col_name)
                col_list.append(col_name)
                woe_judge.append('False')
            else:
                col_list.append(col_name)
                woe_judge.append('True')
    woe_judge_df = pd.DataFrame({'col':col_list,
                                 'judge_monoton':woe_judge})
    return woe_notmonoton_col,woe_judge_df


# Check whether some bin's WOE is 1 or larger
def woe_large(bin_df):
    """
    bin_df: list holding the binning result of each variable

    return:
    woe_large_col: list of variables with some bin's WOE >= 1
    woe_judge_df: DataFrame with the check result of every variable
    """
    woe_large_col=[]
    col_list =[]
    woe_judge =[]
    for woe_df in bin_df:
        col_name = woe_df.index.name
        woe_list = list(woe_df.woe)
        woe_large = list(filter(lambda x:x>=1,woe_list))
        if len(woe_large)>0:
            col_list.append(col_name)
            woe_judge.append('True')
            woe_large_col.append(col_name)
        else:
            col_list.append(col_name)
            woe_judge.append('False')
    woe_judge_df = pd.DataFrame({'col':col_list,
                                 'judge_large':woe_judge})
    return woe_large_col,woe_judge_df

--------------------------------------------------------------------------------
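
# A hypothetical post-binning sanity pass over the bin_df list returned by the
# binning functions above:
# bad_monoton_cols, monoton_df = woe_monoton(bin_df)   # variables whose WOE is not monotonic
# large_woe_cols, large_df = woe_large(bin_df)         # variables with some bin's WOE >= 1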