├── .ipynb_checkpoints
│   └── credit_card-summarize-checkpoint.ipynb
├── README.md
├── code
│   ├── feature_bin.py
│   ├── helper.py
│   ├── preprocessing.py
│   ├── scorecard_functions_V3.py
│   └── visualization.py
└── report
    ├── proposal.pdf
    ├── report.html
    ├── report.ipynb
    └── report.pdf

/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lisa-wang1987/risk_model_machinelearning/3d9237cdd4290cc565f596ebdc2bd9059f86855d/README.md
--------------------------------------------------------------------------------
/code/feature_bin.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | import pandas as pd
4 | import scorecard_functions_V3 as sf
5 | from statsmodels.stats.outliers_influence import variance_inflation_factor
6 | 
7 | 
8 | 
9 | # categorical features with fewer than 5 levels are one-hot encoded with get_dummies
10 | def dummy_code(data, less_cat_col):
11 |     return pd.concat([data, pd.get_dummies(data[less_cat_col])], axis=1).drop(less_cat_col, axis=1)
12 | 
13 | # categorical features with more than 5 levels are encoded with their bad rate (log-odds)
14 | # helper: look up the encoding of a single raw value in the per-level mapping
15 | def val_transe(val, mapping):
16 |     if val in mapping.index:
17 |         return mapping[val]
18 |     return np.nan
19 | 
20 | 
21 | def bin_count_code(data, col, label):
22 |     total = pd.Series(data[col].value_counts(), name='total')
23 |     y_1 = pd.Series(data[data[label]==1][col].value_counts(), name='overdue')
24 |     y_0 = pd.Series(data[data[label]==0][col].value_counts(), name='normal')
25 |     counts = pd.DataFrame([y_1, y_0, total]).T.fillna(0)
26 |     counts['overdue_rate'] = counts['overdue']/counts['total']
27 |     counts['normal_rate'] = counts['normal']/counts['total']
28 |     counts['log_overdue'] = np.log(counts['overdue_rate']/counts['normal_rate'])
29 |     return counts['log_overdue']   # Series indexed by the original category values
30 | def get_more_cat_col_code(data, more_cat_col, label):
31 |     for col in more_cat_col:
32 |         temp = bin_count_code(data, col, label)
33 |         data[col + '_encode'] = data[col].apply(lambda v: val_transe(v, temp))
34 |     return data
35 | 
36 | # bin the high-cardinality categorical features and the continuous features
37 | def feature_bin(data, label, feature_set):
38 |     # feature_set: the features to be binned, both categorical and continuous
39 |     continues_merged_dict = {}
40 |     var_bin_list = []
41 |     for col in feature_set:
42 |         print('{} is in preprocessing'.format(col))
43 |         max_bin_set = 5
44 |         cutoff = sf.ChiMerge(data, col, label, max_interval=max_bin_set, special_attribute=[], minBinPcnt=0)
45 |         data[col + '_Bin'] = data[col].map(lambda x: sf.AssignBin(x, cutoff, special_attribute=[]))
46 |         monotone = sf.BadRateMonotone(data, col+'_Bin', label, ['Bin_1'])
47 |         while (not monotone):
48 |             max_bin_set -= 1
49 |             cutoff = sf.ChiMerge(data, col, label, max_interval=max_bin_set, special_attribute=[], minBinPcnt=0)
50 |             data[col + '_Bin'] = data[col].map(lambda x: sf.AssignBin(x, cutoff, special_attribute=[]))
51 |             if max_bin_set == 3:
52 |                 break
53 |             monotone = sf.BadRateMonotone(data, col+'_Bin', label, ['Bin_1'])
54 |         newvar = col + '_Bin'
55 |         data[newvar] = data[col].map(lambda x: sf.AssignBin(x, cutoff, special_attribute=[]))
56 |         var_bin_list.append(newvar)
57 |         continues_merged_dict[col] = cutoff
58 |     return continues_merged_dict, var_bin_list, data
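# Usage sketch (illustrative only): the toy DataFrame and column names below are
# made up for the example; in this project the functions above receive the
# preprocessed LendingClub data with the binary label column 'y'.
if __name__ == '__main__':
    toy = pd.DataFrame({
        'home_ownership': ['RENT', 'OWN', 'RENT', 'MORTGAGE', 'OWN', 'RENT', 'MORTGAGE', 'OWN'],
        'purpose': ['car', 'car', 'car', 'car', 'house', 'house', 'house', 'house'],
        'y': [0, 1, 0, 0, 1, 0, 1, 1]})
    # low-cardinality categorical column: one-hot encoding
    toy = dummy_code(toy, ['home_ownership'])
    # high-cardinality categorical column: bad-rate (log-odds) encoding
    toy = get_more_cat_col_code(toy, ['purpose'], 'y')
    print(toy.head())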
59 | 
60 | # after binning, compute WOE and IV for every binned variable
61 | def IV_WOE(data, label):
62 |     col_IV = {}
63 |     col_woe = {}
64 |     bin_col = [s for s in data.columns if 'Bin' in s]
65 |     for col in bin_col:
66 |         IV_woe = sf.CalcWOE(data, col, label)
67 |         col_IV[col] = IV_woe['IV']
68 |         col_woe[col] = IV_woe['WOE']
69 |     return col_IV, col_woe
70 | 
71 | # keep the features whose IV is greater than 0.01 and add their WOE-encoded columns
72 | def feature_select_iv(data, col_IV, col_woe):
73 |     high_IV = {k: v for k, v in col_IV.items() if v > 0.01}
74 |     high_IV_sorted = sorted(high_IV.items(), key=lambda x: x[1], reverse=True)
75 |     short_list = high_IV.keys()
76 |     short_list_2 = []
77 |     for var in short_list:
78 |         newvar = var + '_WOE'
79 |         data[newvar] = data[var].map(col_woe[var])
80 |         short_list_2.append(newvar)
81 |     return data, short_list_2, high_IV_sorted
82 | 
83 | 
84 | # compute pairwise correlations between the WOE features; when two are highly correlated, drop the one with the lower IV
85 | def cal_corr_del_IV_low(data, high_IV_sorted):
86 |     deleted_index = []
87 |     cnt_vars = len(high_IV_sorted)
88 |     for i in range(cnt_vars):
89 |         if i in deleted_index:
90 |             continue
91 |         x1 = high_IV_sorted[i][0] + '_WOE'
92 |         for j in range(cnt_vars):
93 |             if i == j or j in deleted_index:
94 |                 continue
95 |             y1 = high_IV_sorted[j][0] + '_WOE'
96 |             roh = np.corrcoef(data[x1], data[y1])[0, 1]
97 |             if abs(roh) > 0.7:
98 |                 x1_IV = high_IV_sorted[i][1]
99 |                 y1_IV = high_IV_sorted[j][1]
100 |                 if x1_IV > y1_IV:
101 |                     deleted_index.append(j)
102 |                 else:
103 |                     deleted_index.append(i)
104 |     multi_analysis_vars = [high_IV_sorted[i][0]+'_WOE' for i in range(cnt_vars) if i not in deleted_index]
105 |     return multi_analysis_vars
106 | 
107 | # VIF diagnostics on the selected WOE features
108 | def get_vif(data, cols_set):
109 |     vif = {}
110 |     X = data[cols_set]
111 |     for var in X.columns:
112 |         new_vif = variance_inflation_factor(X.values, X.columns.get_loc(var))
113 |         vif[var] = new_vif
114 |     vif_sorted = pd.Series(list(vif.values()), index=vif.keys()).sort_values(ascending=False)
115 |     return vif_sorted
--------------------------------------------------------------------------------
/code/helper.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import scorecard_functions_V3 as sf
3 | import numpy as np
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn.model_selection import ShuffleSplit, StratifiedKFold
6 | from sklearn.ensemble import RandomForestClassifier
7 | 
8 | import visualization as vs
9 | # get categorical columns, split by number of distinct levels
10 | def get_less_more_cat_col(data, cols_set):
11 |     less_col = []
12 |     more_col = []
13 |     for col in cols_set:
14 |         if len(set(data[col])) <= 5:
15 |             less_col.append(col)
16 |         else:
17 |             more_col.append(col)
18 |     return less_col, more_col
19 | 
20 | # define dummy code for low-cardinality categorical columns
21 | def dummy_code(data, less_cat_col):
22 |     return pd.concat([data, pd.get_dummies(data[less_cat_col])], axis=1).drop(less_cat_col, axis=1)
23 | 
24 | # encode high-cardinality categorical columns with their bin bad rate
25 | def get_more_cat_code(data, col_set, label):
26 |     for col in col_set:
27 |         data[col+'_Bin'] = data[col].map(sf.BinBadRate(data, col, label)[0])
28 |     return data
29 | # get ordered columns' cutoff points and the new data
30 | 
31 | # ordered-feature merging and bin encoding
32 | '''
33 | Exploration showed that some ordered features have levels with no bad samples and
34 | others have levels with no good samples, so the two cases are handled separately below.
35 | ''' 36 | def order_merge_encode(data,order_col):# order_col is sure 37 | ''' 38 | param data: input data 39 | param order_col: all order features need to be deal with 40 | ''' 41 | merge_bin_dict_bad={} 42 | merge_bin_dict_good={} 43 | order_list_bad=[] 44 | order_list_good=[] 45 | 46 | # test bad part 47 | print('starting bad part=========') 48 | for col in order_col: 49 | binBadRate = sf.BinBadRate(data, col, 'y')[0] 50 | if min(binBadRate.values()) == 0 : #由于某个取值没有坏样本而进行合并 51 | print ('{} need to be combined due to 0 bad rate'.format(col)) 52 | combine_bin = sf.MergeBad0(data, col, 'y') 53 | merge_bin_dict_bad[col] = combine_bin 54 | newVar = col + '_Merge' 55 | order_list_bad.append(newVar) 56 | data[newVar] = data[col].map(combine_bin) 57 | del_list_bad=[w.replace('_Merge','') for w in order_list_bad] 58 | data = data.drop(del_list_bad,axis=1) 59 | colmns_set= [w for w in data.columns if w != 'y'] 60 | 61 | # test good part 62 | print('starting good part==========') 63 | for col in colmns_set: 64 | binBadRate = sf.BinBadRate(data, col, 'y')[0] 65 | if min(binBadRate.values()) == 1 : #由于某个取值没有坏样本而进行合并 66 | print ('{} need to be combined due to 0 good rate'.format(col)) 67 | combine_bin = sf.MergeBad0(data, col, 'y') 68 | merge_bin_dict_good[col] = combine_bin 69 | newVar = col + '_Merge' 70 | order_list_good.append(newVar) 71 | data[newVar] = data[col].map(combine_bin) 72 | del_list_good=[w.replace('_Merge','') for w in order_list_good] 73 | data = data.drop(del_list_good,axis=1) 74 | return data 75 | 76 | # order encode 77 | def order_encode(data,order_col): 78 | ''' 79 | param data: input data 80 | param order_col: all order features need to be deal with 81 | ''' 82 | df = order_merge_encode(data,order_col)#use above function 83 | list_set = [w for w in df.columns if w !='y'] 84 | df_data = get_more_cat_code(df,list_set,'y') 85 | df_data = df_data.drop(list_set,axis=1) 86 | return df_data 87 | 88 | # get cutoff point and get new data 89 | def get_cutoff(data,cols_set,label): 90 | less_cols,more_cols = get_less_more_cat_col(data,cols_set) 91 | merge_bin_dict={} 92 | 93 | continues_merged_dict={} 94 | var_bin_list =[] 95 | for col in less_cols: 96 | binBadRate = sf.BinBadRate(data, col, 'y')[0] 97 | if min(binBadRate.values()) == 0 : #由于某个取值没有坏样本而进行合并 98 | print ('{} need to be combined due to 0 bad rate'.format(col)) 99 | combine_bin = sf.MergeBad0(data, col, 'y') 100 | merge_bin_dict[col] = combine_bin 101 | newVar = col + '_Bin' 102 | data[newVar] = data[col].map(combine_bin) 103 | var_bin_list.append(newVar) 104 | if max(binBadRate.values()) == 1: #由于某个取值没有好样本而进行合并 105 | print ('{} need to be combined due to 0 good rate'.format(col)) 106 | combine_bin = sf.MergeBad0(data, col, 'y',direction = 'good') 107 | merge_bin_dict[col] = combine_bin 108 | newVar = col + '_Bin' 109 | data[newVar] = data[col].map(combine_bin) 110 | order_list.append(newVar) 111 | var_bin_list.append(newVar) 112 | 113 | for col in more_cols: 114 | print('{} is in processing'.format(col)) 115 | max_interval = 5 116 | cutoff = sf.ChiMerge(data,col,label,max_interval=max_interval,minBinPcnt=0) 117 | data[col+'_Bin'] = data[col].map(lambda x: sf.AssignBin(x,cutoff,special_attribute=[])) 118 | monotone = sf.BadRateMonotone(data,col+'_Bin',label) 119 | while (not monotone): 120 | max_interval -=1 121 | cutoff = sf.ChiMerge(data,col,label,max_interval=max_interval,special_attribute=[],minBinPcnt=0) 122 | data[col +'_Bin'] = data[col].map(lambda x: sf.AssignBin(x,cutoff,special_attribute=[])) 123 | if max_interval 
== 2: 124 | break 125 | monotone = sf.BadRateMonotone(data,col+'_Bin',label) 126 | newVar = col +'_Bin' 127 | data[newVar] = data[col].map(lambda x: sf.AssignBin(x,cutoff,special_attribute=[])) 128 | var_bin_list.append(newVar) 129 | continues_merged_dict[col] = cutoff 130 | return continues_merged_dict,var_bin_list,data 131 | 132 | # get woe and iv value 133 | def get_woe_iv(data,cols_set,label): 134 | col_iv ={} 135 | col_woe={} 136 | for col in cols_set: 137 | temp = sf.CalcWOE(data,col,label) 138 | col_iv[col] = temp['IV'] 139 | col_woe[col] = temp['WOE'] 140 | return col_iv,col_woe 141 | 142 | #get top iv>0.01 143 | def choose_iv_feature(data,col_iv,col_woe): 144 | high_IV = {k:v for k, v in col_iv.items() if v >= 0.01} 145 | high_IV_sorted = sorted(high_IV.items(),key=lambda x:x[1],reverse=True) 146 | 147 | short_list = high_IV.keys() 148 | short_list_2 = [] 149 | for var in short_list: 150 | newVar = var + '_WOE' 151 | data[newVar] = data[var].map(col_woe[var]) 152 | short_list_2.append(newVar) 153 | return short_list_2 154 | 155 | 156 | # correlation 157 | def cor_feature(data,col_iv): 158 | deleted_index = [] 159 | high_IV = {k:v for k, v in col_iv.items() if v >= 0.01} 160 | high_IV_sorted = sorted(high_IV.items(),key=lambda x:x[1],reverse=True) 161 | cnt_vars = len(high_IV_sorted) 162 | for i in range(cnt_vars): 163 | if i in deleted_index: 164 | continue 165 | x1 = high_IV_sorted[i][0]+"_WOE" 166 | for j in range(cnt_vars): 167 | if i == j or j in deleted_index: 168 | continue 169 | y1 = high_IV_sorted[j][0]+"_WOE" 170 | roh = np.corrcoef(data[x1],data[y1])[0,1] 171 | if abs(roh)>0.7: 172 | x1_IV = high_IV_sorted[i][1] 173 | y1_IV = high_IV_sorted[j][1] 174 | if x1_IV > y1_IV: 175 | deleted_index.append(j) 176 | else: 177 | deleted_index.append(i) 178 | multi_analysis_vars_1 = [high_IV_sorted[i][0]+"_WOE" for i in range(cnt_vars) if i not in deleted_index] 179 | return multi_analysis_vars_1 180 | 181 | # train model 182 | 183 | # calculate ks value 184 | def KS(df, score, target): 185 | ''' 186 | :param df: 包含目标变量与预测值的数据集 187 | :param score: 得分或者概率 188 | :param target: 目标变量 189 | :return: KS值 190 | ''' 191 | total = df.groupby([score])[target].count() 192 | bad = df.groupby([score])[target].sum() 193 | all = pd.DataFrame({'total':total, 'bad':bad}) 194 | all['good'] = all['total'] - all['bad'] 195 | all[score] = all.index 196 | all = all.sort_values(by=score,ascending=False) 197 | all.index = range(len(all)) 198 | all['badCumRate'] = all['bad'].cumsum() / all['bad'].sum() 199 | all['goodCumRate'] = all['good'].cumsum() / all['good'].sum() 200 | KS = all.apply(lambda x: x.badCumRate - x.goodCumRate, axis=1) 201 | return max(KS) 202 | 203 | #lr training 204 | def lr_train(X,y,clf): 205 | clf.fit(X,y) 206 | pred = clf.predict_proba(X)[:,1] 207 | df = pd.DataFrame([pred,y]).T 208 | df.columns = ['score','y'] 209 | print('the roc curve is:') 210 | vs.model_roc_curve(X,y,clf) 211 | print('the ks value is :',KS(df,'score','y')) 212 | 213 | 214 | # grid research 215 | def grid_research(X,y,clf,paremeter): 216 | from sklearn.model_selection import GridSearchCV 217 | cv = StratifiedKFold(n_splits=5) 218 | clf = RandomForestClassifier() 219 | grid = GridSearchCV(clf,paremeter,cv=cv) 220 | grid.fit(X,y) 221 | return grid.best_params_ 222 | -------------------------------------------------------------------------------- /code/preprocessing.py: -------------------------------------------------------------------------------- 1 | # import api 2 | import pandas as pd 3 | import numpy as np 4 | 
import matplotlib.pyplot as plt 5 | from scipy.stats import mode 6 | from scipy.interpolate import lagrange 7 | import scorecard_functions_V3 as sf 8 | import datetime 9 | from sklearn.preprocessing import Imputer 10 | from sklearn.model_selection import train_test_split 11 | import pickle 12 | import itertools 13 | 14 | import warnings 15 | warnings.filterwarnings("ignore") 16 | 17 | 18 | 19 | #preprocessing function 20 | 21 | #home_ownership turn 'ANY' to 'MORTGAGE' 22 | def any_to_mort(val): 23 | if val=='ANY': 24 | return 'MORTGAGE' 25 | else: 26 | return val 27 | #delete '%' string function 28 | def int_rate(val): 29 | if val != 'nan': 30 | return round(float(str(val).replace('%',''))/100,4) 31 | elif val == str('nan'): 32 | return -1 33 | #delete‘year’string function 34 | def emp_length(val): 35 | if val =='10+ years': 36 | return 10 37 | elif val =='< 1 year': 38 | return 0 39 | elif val == 'n/a': 40 | return -1 41 | elif val =='1 year': 42 | return 1 43 | else: 44 | return float(str(val).replace('years','')) 45 | 46 | #date transformation function 47 | def ConvertDateStr(x): 48 | mth_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 49 | 'Nov': 11, 'Dec': 12} 50 | if str(x) == 'nan': 51 | return datetime.datetime.fromtimestamp(time.mktime(time.strptime('9900-1','%Y-%m'))) 52 | #time.mktime 不能读取1970年之前的日期 53 | else: 54 | yr = int(x[4:6]) 55 | if yr <=17: 56 | yr = 2000+yr 57 | else: 58 | yr = 1900 + yr 59 | mth = mth_dict[x[:3]] 60 | return datetime.datetime(yr,mth,1) 61 | 62 | #date to date long function 63 | def days_long(val): 64 | now = datetime.datetime.now() 65 | delta = now - val 66 | return delta.days 67 | 68 | 69 | #grade 70 | def grade_value(val): 71 | grade = val.replace('A',1) 72 | grade = grade.replace('B',2) 73 | grade = grade.replace('C',3) 74 | grade = grade.replace('D',4) 75 | grade = grade.replace('E',5) 76 | grade = grade.replace('F',6) 77 | grade = grade.replace('G',7) 78 | return grade 79 | 80 | #subgrade 81 | def subgrade_value(val): 82 | grade = val.replace('A1',11) 83 | grade = grade.replace('A2',12) 84 | grade = grade.replace('A3',13) 85 | grade = grade.replace('A4',14) 86 | grade = grade.replace('A5',15) 87 | grade = grade.replace('B1',21) 88 | grade = grade.replace('B2',22) 89 | grade = grade.replace('B3',23) 90 | grade = grade.replace('B4',24) 91 | grade = grade.replace('B5',25) 92 | grade = grade.replace('C1',31) 93 | grade = grade.replace('C2',32) 94 | grade = grade.replace('C3',33) 95 | grade = grade.replace('C4',34) 96 | grade = grade.replace('C5',35) 97 | grade = grade.replace('D1',41) 98 | grade = grade.replace('D2',42) 99 | grade = grade.replace('D3',43) 100 | grade = grade.replace('D4',44) 101 | grade = grade.replace('D5',45) 102 | grade = grade.replace('E1',51) 103 | grade = grade.replace('E2',52) 104 | grade = grade.replace('E3',53) 105 | grade = grade.replace('E4',54) 106 | grade = grade.replace('E5',55) 107 | grade = grade.replace('F1',61) 108 | grade = grade.replace('F2',62) 109 | grade = grade.replace('F3',63) 110 | grade = grade.replace('F4',64) 111 | grade = grade.replace('F5',65) 112 | grade = grade.replace('G1',71) 113 | grade = grade.replace('G2',72) 114 | grade = grade.replace('G3',73) 115 | grade = grade.replace('G4',74) 116 | grade = grade.replace('G5',75) 117 | return grade 118 | 119 | # missing fill function 120 | def mean_val(data,col): 121 | mean_col = data[col].mean() 122 | idx = data[data[col].isnull()==True].index 123 | data.loc[idx,col]=mean_col 124 | return data[col] 
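# Quick sanity check for the string-cleaning helpers above (illustrative only; the
# raw strings below are made-up examples of the LendingClub formats these helpers expect).
if __name__ == '__main__':
    print(int_rate('13.56%'))       # 0.1356
    print(emp_length('10+ years'))  # 10
    print(emp_length('3 years'))    # 3.0
    print(emp_length('< 1 year'))   # 0
    print(grade_value(pd.Series(['A', 'C', 'G'])).tolist())  # [1, 3, 7]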
125 | 126 | #label transformation function 127 | def label_transe(val): 128 | if val == 'Charged Off': 129 | return 1 130 | elif (val == 'Fully Paid' or val =='Current'): 131 | return 0 132 | else: 133 | return -1 134 | 135 | # read data and print statistic information 136 | def read_data(data): 137 | print('The data is the third quarter of 2017 borrower data of LendingClub opened on official website') 138 | df = pd.read_csv(data,header=1) 139 | df = df[df['term']==' 36 months'] 140 | print ('\n') 141 | print('top 5 line of data is :\n',df.head(2)) 142 | print('\n') 143 | print('data statistic information is ',df.describe()) 144 | print('\n') 145 | print('all data shape is:',df.shape) 146 | return df 147 | 148 | # separate training set and test set 149 | def split_train_test(data): 150 | trainData = data[(data['issue_d']!='Nov-2015') & (data['issue_d']!='Dec-2015')] 151 | testData = data[(data['issue_d']=='Nov-2015') | (data['issue_d']=='Dec-2015')] 152 | print('the train data shape is:',trainData.shape) 153 | print('the test data shape is:',testData.shape) 154 | return trainData,testData 155 | 156 | #data preprocessing 157 | def drop_afterloan_columns(data): 158 | #drop after loan feature 159 | after_col =['pymnt_plan', 'collection_recovery_fee', 'recoveries', 'hardship_flag','title', 160 | 'out_prncp_inv', 'out_prncp','total_rec_prncp','last_pymnt_amnt','last_pymnt_d', 161 | 'last_credit_pull_d','total_pymnt','total_pymnt_inv','total_rec_int', 162 | 'total_rec_late_fee','title','term'] 163 | df = data.drop(after_col,axis=1) 164 | print('after drop after loan data, the data shape is ',df.shape) 165 | return df 166 | 167 | # drop only one value columns 168 | def drop_unique1_col(data): 169 | cols =data.nunique()[data.nunique()>1].index.tolist() 170 | df =data.loc[:,cols] 171 | print('after drop only one value columns, the data shape is ',df.shape) 172 | return df 173 | 174 | # drop columns if it's missing greater than 60% 175 | def drop_missingmore60_col(data): 176 | miss_60_col = data.isnull().sum()[data.isnull().sum()>=0.40*data.shape[0]].index 177 | df = data.drop(miss_60_col,axis=1) 178 | print('after drop missing greater than 60% columns, the data shape is ',df.shape) 179 | return df 180 | 181 | #drop all null row and all null column 182 | def drop_row_col_miss(data): 183 | data = data.dropna(how='all',axis=1) 184 | df = data.dropna(how='all',axis=0) 185 | return df 186 | 187 | #delete 90% value same in one column 188 | def drop_90samevalue_col(data): 189 | colum=data.columns 190 | per=pd.DataFrame(colum,index=colum) 191 | max_valuecounts=[] 192 | for col in colum: 193 | max_valuecounts.append(data[col].value_counts().max()) 194 | per['mode']=max_valuecounts 195 | per['percentil'] =per['mode']/data.shape[0] 196 | same_value_col =per[per.sort_values(by='percentil',ascending=False)['percentil']>0.9].index 197 | df = data.drop(same_value_col,axis=1) 198 | print('after delete 90% values same in one column,the data shape is',df.shape) 199 | return df 200 | 201 | #get label function 202 | def get_label(data,label): 203 | data['y'] = data[label].apply(label_transe) 204 | data =data[((data['y']!=2) & (data['y']!=-1))].drop([label],axis=1) 205 | return data 206 | 207 | #character to value 208 | def string_to_value(data): 209 | data['int_rate'] = data.loc[:,'int_rate'].apply(int_rate) #int_rate 210 | data['emp_length'] = data.loc[:,'emp_length'].apply(emp_length) # emp_length 211 | data['revol_util']=data.loc[:,'revol_util'].astype(str).apply(int_rate) # revol_util 212 | data['grade'] = 
grade_value(data.loc[:,'grade'])#grade 213 | data['sub_grade'] = subgrade_value(data.loc[:,'sub_grade'])#sub_grade 214 | data['earliest_cr_line'] = data.loc[:,'earliest_cr_line'].apply(ConvertDateStr).apply(days_long) 215 | return data 216 | 217 | 218 | #get catogery columns and continues columns 219 | def class_feature(data): 220 | cat_col = list(data.columns.to_series().groupby(data.dtypes).groups.values())[2] 221 | continue_col = list(data.columns.to_series().groupby(data.dtypes).groups.values())[0].append(list(data.columns.to_series().groupby(data.dtypes).groups.values())[1]) 222 | 223 | return cat_col,continue_col 224 | 225 | #get word_col,cat_col,ordered_col,continue_col 226 | def get_word_cat_ordered_continue_col(data): 227 | # text feature columns 228 | word_col=['zip_code', 'addr_state','emp_title'] 229 | temp_col=['mo_sin_old_il_acct','mo_sin_old_rev_tl_op','mths_since_recent_bc', 230 | 'mths_since_recent_inq','pct_tl_nvr_dlq','percent_bc_gt_75','bc_util','revol_util','dti'] 231 | cat_col,value_col = class_feature(data) 232 | cat_col = [w for w in cat_col if w not in word_col] #catergory columns 233 | continue_col = [] 234 | for col in value_col: 235 | if len(set(data[col]))>500: 236 | continue_col.append(col) 237 | continue_col =[key for key in continue_col if key not in word_col] 238 | continue_col = [key for key in continue_col if key !='emp_length'] 239 | continue_col = [key for key in continue_col if key not in temp_col] #continue col 240 | 241 | ordered_col = [key for key in value_col if key not in continue_col] 242 | ordered_col = [key for key in ordered_col if key !='y'] 243 | ordered_col = [key for key in ordered_col if key !='term'] 244 | ordered_co = ordered_col + ['emp_length'] # ordered columns 245 | return word_col,cat_col,ordered_col,continue_col 246 | 247 | 248 | # filling columns has missing value 249 | def missing_fill(data): 250 | cat_col,continue_col = class_feature(data) # 区分分类特征和连续特征 251 | missing_col=list(data.isnull().sum()[data.isnull().sum()>0].index) 252 | for col in missing_col: 253 | if col in cat_col: 254 | fill_value = data[col].mode()[0] 255 | data[col]=data[col].fillna(fill_value) 256 | else: 257 | fill_value = data[col].mean() 258 | data[col] = data[col].fillna(fill_value) 259 | return data 260 | 261 | # outlier 262 | #del orded columns and continues top 10 greater samples 263 | def del_outlier_index(data,key): 264 | temp_index = data[data[key]>data[key].sort_values(ascending=False)[0:10].min()].index 265 | return list(temp_index) 266 | -------------------------------------------------------------------------------- /code/scorecard_functions_V3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def SplitData(df, col, numOfSplit, special_attribute=[]): 5 | ''' 6 | :param df: 按照col排序后的数据集 7 | :param col: 待分箱的变量 8 | :param numOfSplit: 切分的组别数 9 | :param special_attribute: 在切分数据集的时候,某些特殊值需要排除在外 10 | :return: 在原数据集上增加一列,把原始细粒度的col重新划分成粗粒度的值,便于分箱中的合并处理 11 | ''' 12 | df2 = df.copy() 13 | if special_attribute != []: 14 | df2 = df.loc[~df[col].isin(special_attribute)] 15 | N = df2.shape[0] 16 | n = N//numOfSplit 17 | splitPointIndex = [i*n for i in range(1,numOfSplit)] 18 | rawValues = sorted(list(df2[col])) 19 | splitPoint = [rawValues[i] for i in splitPointIndex] 20 | splitPoint = sorted(list(set(splitPoint))) 21 | return splitPoint 22 | 23 | 24 | 25 | # def Chi2(df, total_col, bad_col, overallRate): 26 | # ''' 27 | # :param df: 包含全部样本总计与坏样本总计的数据框 28 | # :param 
total_col: 全部样本的个数 29 | # :param bad_col: 坏样本的个数 30 | # :param overallRate: 全体样本的坏样本占比 31 | # :return: 卡方值 32 | # ''' 33 | # df2 = df.copy() 34 | # # 期望坏样本个数=全部样本个数*平均坏样本占比 35 | # df2['expected'] = df[total_col].apply(lambda x: x*overallRate) 36 | # combined = zip(df2['expected'], df2[bad_col]) 37 | # chi = [(i[0]-i[1])**2/i[0] for i in combined] 38 | # chi2 = sum(chi) 39 | # return chi2 40 | 41 | 42 | def Chi2(df, total_col, bad_col): 43 | ''' 44 | :param df: 包含全部样本总计与坏样本总计的数据框 45 | :param total_col: 全部样本的个数 46 | :param bad_col: 坏样本的个数 47 | :return: 卡方值 48 | ''' 49 | df2 = df.copy() 50 | # 求出df中,总体的坏样本率和好样本率 51 | badRate = sum(df2[bad_col])*1.0/sum(df2[total_col]) 52 | df2['good'] = df2.apply(lambda x: x[total_col] - x[bad_col], axis = 1) 53 | goodRate = sum(df2['good']) * 1.0 / sum(df2[total_col]) 54 | # 期望坏(好)样本个数=全部样本个数*平均坏(好)样本占比 55 | df2['badExpected'] = df[total_col].apply(lambda x: x*badRate) 56 | df2['goodExpected'] = df[total_col].apply(lambda x: x * goodRate) 57 | badCombined = zip(df2['badExpected'], df2[bad_col]) 58 | goodCombined = zip(df2['goodExpected'], df2['good']) 59 | badChi = [(i[0]-i[1])**2/i[0] for i in badCombined] 60 | goodChi = [(i[0] - i[1]) ** 2 / i[0] for i in goodCombined] 61 | chi2 = sum(badChi) + sum(goodChi) 62 | return chi2 63 | 64 | 65 | # Chi2 的另外一种计算方法 66 | # def Chi2(df, total_col, bad_col): 67 | # df2 = df.copy() 68 | # df2['good'] = df2[total_col] - df2[bad_col] 69 | # goodTotal = sum(df2['good']) 70 | # badTotal = sum(df2[bad_col]) 71 | # p1 = df2.loc[0]['good']*1.0/df2.loc[0][total_col] 72 | # p2 = df2.loc[1]['good']*1.0/df2.loc[1][total_col] 73 | # w1 = df2.loc[0]['good']*1.0/goodTotal 74 | # w2 = df2.loc[0][bad_col]*1.0/badTotal 75 | # N = sum(df2[total_col]) 76 | # return N*(p1-p2)*(w1-w2) 77 | 78 | 79 | def BinBadRate(df, col, target, grantRateIndicator=0): 80 | ''' 81 | :param df: 需要计算好坏比率的数据集 82 | :param col: 需要计算好坏比率的特征 83 | :param target: 好坏标签 84 | :param grantRateIndicator: 1返回总体的坏样本率,0不返回 85 | :return: 每箱的坏样本率,以及总体的坏样本率(当grantRateIndicator==1时) 86 | ''' 87 | total = df.groupby([col])[target].count() 88 | total = pd.DataFrame({'total': total}) 89 | bad = df.groupby([col])[target].sum() 90 | bad = pd.DataFrame({'bad': bad}) 91 | regroup = total.merge(bad, left_index=True, right_index=True, how='left') 92 | regroup.reset_index(level=0, inplace=True) 93 | regroup['bad_rate'] = regroup.apply(lambda x: x.bad * 1.0 / x.total, axis=1) 94 | dicts = dict(zip(regroup[col],regroup['bad_rate'])) 95 | if grantRateIndicator==0: 96 | return (dicts, regroup) 97 | N = sum(regroup['total']) 98 | B = sum(regroup['bad']) 99 | overallRate = B * 1.0 / N 100 | return (dicts, regroup, overallRate) 101 | 102 | 103 | 104 | ### ChiMerge_MaxInterval: split the continuous variable using Chi-square value by specifying the max number of intervals 105 | def ChiMerge(df, col, target, max_interval=5,special_attribute=[],minBinPcnt=0): 106 | ''' 107 | :param df: 包含目标变量与分箱属性的数据框 108 | :param col: 需要分箱的属性 109 | :param target: 目标变量,取值0或1 110 | :param max_interval: 最大分箱数。如果原始属性的取值个数低于该参数,不执行这段函数 111 | :param special_attribute: 不参与分箱的属性取值 112 | :param minBinPcnt:最小箱的占比,默认为0 113 | :return: 分箱结果 114 | ''' 115 | colLevels = sorted(list(set(df[col]))) 116 | N_distinct = len(colLevels) 117 | if N_distinct <= max_interval: #如果原始属性的取值个数低于max_interval,不执行这段函数 118 | print ("The number of original levels for {} is less than or equal to max intervals".format(col)) 119 | return colLevels[:-1] 120 | else: 121 | if len(special_attribute)>=1: 122 | df1 = df.loc[df[col].isin(special_attribute)] 
123 | df2 = df.loc[~df[col].isin(special_attribute)] 124 | else: 125 | df2 = df.copy() 126 | N_distinct = len(list(set(df2[col]))) 127 | 128 | # 步骤一: 通过col对数据集进行分组,求出每组的总样本数与坏样本数 129 | if N_distinct > 100: 130 | split_x = SplitData(df2, col, 100) 131 | df2['temp'] = df2[col].map(lambda x: AssignGroup(x, split_x)) 132 | else: 133 | df2['temp'] = df2[col] 134 | # 总体bad rate将被用来计算expected bad count 135 | (binBadRate, regroup, overallRate) = BinBadRate(df2, 'temp', target, grantRateIndicator=1) 136 | 137 | # 首先,每个单独的属性值将被分为单独的一组 138 | # 对属性值进行排序,然后两两组别进行合并 139 | colLevels = sorted(list(set(df2['temp']))) 140 | groupIntervals = [[i] for i in colLevels] 141 | 142 | # 步骤二:建立循环,不断合并最优的相邻两个组别,直到: 143 | # 1,最终分裂出来的分箱数<=预设的最大分箱数 144 | # 2,每箱的占比不低于预设值(可选) 145 | # 3,每箱同时包含好坏样本 146 | # 如果有特殊属性,那么最终分裂出来的分箱数=预设的最大分箱数-特殊属性的个数 147 | split_intervals = max_interval - len(special_attribute) 148 | while (len(groupIntervals) > split_intervals): # 终止条件: 当前分箱数=预设的分箱数 149 | # 每次循环时, 计算合并相邻组别后的卡方值。具有最小卡方值的合并方案,是最优方案 150 | chisqList = [] 151 | for k in range(len(groupIntervals)-1): 152 | temp_group = groupIntervals[k] + groupIntervals[k+1] 153 | df2b = regroup.loc[regroup['temp'].isin(temp_group)] 154 | #chisq = Chi2(df2b, 'total', 'bad', overallRate) 155 | chisq = Chi2(df2b, 'total', 'bad') 156 | chisqList.append(chisq) 157 | best_comnbined = chisqList.index(min(chisqList)) 158 | groupIntervals[best_comnbined] = groupIntervals[best_comnbined] + groupIntervals[best_comnbined+1] 159 | # after combining two intervals, we need to remove one of them 160 | groupIntervals.remove(groupIntervals[best_comnbined+1]) 161 | groupIntervals = [sorted(i) for i in groupIntervals] 162 | cutOffPoints = [max(i) for i in groupIntervals[:-1]] 163 | 164 | # 检查是否有箱没有好或者坏样本。如果有,需要跟相邻的箱进行合并,直到每箱同时包含好坏样本 165 | groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints)) 166 | df2['temp_Bin'] = groupedvalues 167 | (binBadRate,regroup) = BinBadRate(df2, 'temp_Bin', target) 168 | [minBadRate, maxBadRate] = [min(binBadRate.values()),max(binBadRate.values())] 169 | while minBadRate ==0 or maxBadRate == 1: 170 | # 找出全部为好/坏样本的箱 171 | indexForBad01 = regroup[regroup['bad_rate'].isin([0,1])].temp_Bin.tolist() 172 | bin=indexForBad01[0] 173 | # 如果是最后一箱,则需要和上一个箱进行合并,也就意味着分裂点cutOffPoints中的最后一个需要移除 174 | if bin == max(regroup.temp_Bin): 175 | cutOffPoints = cutOffPoints[:-1] 176 | # 如果是第一箱,则需要和下一个箱进行合并,也就意味着分裂点cutOffPoints中的第一个需要移除 177 | elif bin == min(regroup.temp_Bin): 178 | cutOffPoints = cutOffPoints[1:] 179 | # 如果是中间的某一箱,则需要和前后中的一个箱进行合并,依据是较小的卡方值 180 | else: 181 | # 和前一箱进行合并,并且计算卡方值 182 | currentIndex = list(regroup.temp_Bin).index(bin) 183 | prevIndex = list(regroup.temp_Bin)[currentIndex - 1] 184 | df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, bin])] 185 | (binBadRate, df2b) = BinBadRate(df3, 'temp_Bin', target) 186 | #chisq1 = Chi2(df2b, 'total', 'bad', overallRate) 187 | chisq1 = Chi2(df2b, 'total', 'bad') 188 | # 和后一箱进行合并,并且计算卡方值 189 | laterIndex = list(regroup.temp_Bin)[currentIndex + 1] 190 | df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, bin])] 191 | (binBadRate, df2b) = BinBadRate(df3b, 'temp_Bin', target) 192 | #chisq2 = Chi2(df2b, 'total', 'bad', overallRate) 193 | chisq2 = Chi2(df2b, 'total', 'bad') 194 | if chisq1 < chisq2: 195 | cutOffPoints.remove(cutOffPoints[currentIndex - 1]) 196 | else: 197 | cutOffPoints.remove(cutOffPoints[currentIndex]) 198 | # 完成合并之后,需要再次计算新的分箱准则下,每箱是否同时包含好坏样本 199 | groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints)) 200 | df2['temp_Bin'] = groupedvalues 201 | (binBadRate, 
regroup) = BinBadRate(df2, 'temp_Bin', target) 202 | [minBadRate, maxBadRate] = [min(binBadRate.values()), max(binBadRate.values())] 203 | # 需要检查分箱后的最小占比 204 | if minBinPcnt > 0: 205 | groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints)) 206 | df2['temp_Bin'] = groupedvalues 207 | valueCounts = groupedvalues.value_counts().to_frame() 208 | valueCounts['pcnt'] = valueCounts['temp'].apply(lambda x: x * 1.0 / N) 209 | valueCounts = valueCounts.sort_index() 210 | minPcnt = min(valueCounts['pcnt']) 211 | while minPcnt < minBinPcnt and len(cutOffPoints) > 2: 212 | # 找出占比最小的箱 213 | indexForMinPcnt = valueCounts[valueCounts['pcnt'] == minPcnt].index.tolist()[0] 214 | # 如果占比最小的箱是最后一箱,则需要和上一个箱进行合并,也就意味着分裂点cutOffPoints中的最后一个需要移除 215 | if indexForMinPcnt == max(valueCounts.index): 216 | cutOffPoints = cutOffPoints[:-1] 217 | # 如果占比最小的箱是第一箱,则需要和下一个箱进行合并,也就意味着分裂点cutOffPoints中的第一个需要移除 218 | elif indexForMinPcnt == min(valueCounts.index): 219 | cutOffPoints = cutOffPoints[1:] 220 | # 如果占比最小的箱是中间的某一箱,则需要和前后中的一个箱进行合并,依据是较小的卡方值 221 | else: 222 | # 和前一箱进行合并,并且计算卡方值 223 | currentIndex = list(valueCounts.index).index(indexForMinPcnt) 224 | prevIndex = list(valueCounts.index)[currentIndex - 1] 225 | df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, indexForMinPcnt])] 226 | (binBadRate, df2b) = BinBadRate(df3, 'temp_Bin', target) 227 | #chisq1 = Chi2(df2b, 'total', 'bad', overallRate) 228 | chisq1 = Chi2(df2b, 'total', 'bad') 229 | # 和后一箱进行合并,并且计算卡方值 230 | laterIndex = list(valueCounts.index)[currentIndex + 1] 231 | df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, indexForMinPcnt])] 232 | (binBadRate, df2b) = BinBadRate(df3b, 'temp_Bin', target) 233 | #chisq2 = Chi2(df2b, 'total', 'bad', overallRate) 234 | chisq2 = Chi2(df2b, 'total', 'bad') 235 | if chisq1 < chisq2: 236 | cutOffPoints.remove(cutOffPoints[currentIndex - 1]) 237 | else: 238 | cutOffPoints.remove(cutOffPoints[currentIndex]) 239 | cutOffPoints = special_attribute + cutOffPoints 240 | return cutOffPoints 241 | 242 | 243 | 244 | def UnsupervisedSplitBin(df,var,numOfSplit = 5, method = 'equal freq'): 245 | ''' 246 | :param df: 数据集 247 | :param var: 需要分箱的变量。仅限数值型。 248 | :param numOfSplit: 需要分箱个数,默认是5 249 | :param method: 分箱方法,'equal freq':,默认是等频,否则是等距 250 | :return: 251 | ''' 252 | if method == 'equal freq': 253 | N = df.shape[0] 254 | n = N / numOfSplit 255 | splitPointIndex = [i * n for i in range(1, numOfSplit)] 256 | rawValues = sorted(list(df[col])) 257 | splitPoint = [rawValues[i] for i in splitPointIndex] 258 | splitPoint = sorted(list(set(splitPoint))) 259 | return splitPoint 260 | else: 261 | var_max, var_min = max(df[var]), min(df[var]) 262 | interval_len = (var_max - var_min)*1.0/numOfSplit 263 | splitPoint = [var_min + i*interval_len for i in range(1,numOfSplit)] 264 | return splitPoint 265 | 266 | 267 | 268 | def AssignGroup(x, bin): 269 | ''' 270 | :param x: 某个变量的某个取值 271 | :param bin: 上述变量的分箱结果 272 | :return: x在分箱结果下的映射 273 | ''' 274 | N = len(bin) 275 | if x<=min(bin): 276 | return min(bin) 277 | elif x>max(bin): 278 | return 10e10 279 | else: 280 | for i in range(N-1): 281 | if bin[i] < x <= bin[i+1]: 282 | return bin[i+1] 283 | 284 | 285 | def BadRateEncoding(df, col, target): 286 | ''' 287 | :param df: dataframe containing feature and target 288 | :param col: the feature that needs to be encoded with bad rate, usually categorical type 289 | :param target: good/bad indicator 290 | :return: the assigned bad rate to encode the categorical feature 291 | ''' 292 | regroup = BinBadRate(df, col, target, 
grantRateIndicator=0)[1]
293 |     br_dict = regroup[[col,'bad_rate']].set_index([col]).to_dict(orient='index')
294 |     for k, v in br_dict.items():
295 |         br_dict[k] = v['bad_rate']
296 |     badRateEnconding = df[col].map(lambda x: br_dict[x])
297 |     return {'encoding':badRateEnconding, 'bad_rate':br_dict}
298 | 
299 | 
300 | def AssignBin(x, cutOffPoints,special_attribute=[]):
301 |     '''
302 |     :param x: 某个变量的某个取值
303 |     :param cutOffPoints: 上述变量的分箱结果,用切分点表示
304 |     :param special_attribute: 不参与分箱的特殊取值
305 |     :return: 分箱后的对应的第几个箱,从0开始
306 |     for example, if cutOffPoints = [10,20,30], if x = 7, return Bin 0. If x = 35, return Bin 3
307 |     '''
308 |     numBin = len(cutOffPoints) + 1 + len(special_attribute)
309 |     if x in special_attribute:
310 |         i = special_attribute.index(x)+1
311 |         return 'Bin {}'.format(0-i)
312 |     if x<=cutOffPoints[0]:
313 |         return 'Bin 0'
314 |     elif x > cutOffPoints[-1]:
315 |         return 'Bin {}'.format(numBin-1)
316 |     else:
317 |         for i in range(0,numBin-1):
318 |             if cutOffPoints[i] < x <= cutOffPoints[i+1]:
319 |                 return 'Bin {}'.format(i+1)
320 | 
321 | 
322 | 
323 | def CalcWOE(df, col, target):
324 |     '''
325 |     :param df: 包含需要计算WOE的变量和目标变量
326 |     :param col: 需要计算WOE、IV的变量,必须是分箱后的变量,或者不需要分箱的类别型变量
327 |     :param target: 目标变量,0、1表示好、坏
328 |     :return: 返回WOE和IV
329 |     '''
330 |     total = df.groupby([col])[target].count()
331 |     total = pd.DataFrame({'total': total})
332 |     bad = df.groupby([col])[target].sum()
333 |     bad = pd.DataFrame({'bad': bad})
334 |     regroup = total.merge(bad, left_index=True, right_index=True, how='left')
335 |     regroup.reset_index(level=0, inplace=True)
336 |     N = sum(regroup['total'])
337 |     B = sum(regroup['bad'])
338 |     regroup['good'] = regroup['total'] - regroup['bad']
339 |     G = N - B
340 |     regroup['bad_pcnt'] = regroup['bad'].map(lambda x: x*1.0/B)
341 |     regroup['good_pcnt'] = regroup['good'].map(lambda x: x * 1.0 / G)
342 |     regroup['WOE'] = regroup.apply(lambda x: np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
343 |     WOE_dict = regroup[[col,'WOE']].set_index(col).to_dict(orient='index')
344 |     for k, v in WOE_dict.items():
345 |         WOE_dict[k] = v['WOE']
346 |     IV = regroup.apply(lambda x: (x.good_pcnt-x.bad_pcnt)*np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
347 |     IV = sum(IV)
348 |     return {"WOE": WOE_dict, 'IV':IV}
349 | 
350 | 
351 | 
352 | ## 判断某变量的坏样本率是否单调
353 | def BadRateMonotone(df, sortByVar, target,special_attribute = []):
354 |     '''
355 |     :param df: 包含检验坏样本率的变量,和目标变量
356 |     :param sortByVar: 需要检验坏样本率的变量
357 |     :param target: 目标变量,0、1表示好、坏
358 |     :param special_attribute: 不参与检验的特殊值
359 |     :return: 坏样本率单调与否
360 |     '''
361 |     df2 = df.loc[~df[sortByVar].isin(special_attribute)]
362 |     if len(set(df2[sortByVar])) <= 2:
363 |         return True
364 |     regroup = BinBadRate(df2, sortByVar, target)[1]
365 |     combined = zip(regroup['total'],regroup['bad'])
366 |     badRate = [x[1]*1.0/x[0] for x in combined]
367 |     badRateNotMonotone = [badRate[i] < badRate[i+1] and badRate[i] < badRate[i-1] or badRate[i] > badRate[i+1] and badRate[i] > badRate[i-1]
368 |                           for i in range(1,len(badRate)-1)]
369 |     if True in badRateNotMonotone:
370 |         return False
371 |     else:
372 |         return True
373 | 
374 | 
375 | 
376 | def MergeBad0(df,col,target, direction='bad'):
377 |     '''
378 |     :param df: 包含检验0%或者100%坏样本率
379 |     :param col: 分箱后的变量或者类别型变量。检验其中是否有一组或者多组没有坏样本或者没有好样本。如果是,则需要进行合并
380 |     :param target: 目标变量,0、1表示好、坏
381 |     :return: 合并方案,使得每个组里同时包含好坏样本
382 |     '''
383 |     regroup = BinBadRate(df, col, target)[1]
384 |     if direction == 'bad':
385 |         # 如果是合并0坏样本率的组,则跟最小的非0坏样本率的组进行合并
386 |         regroup = regroup.sort_values(by = 'bad_rate')
387 |     else:
388 |         # 如果是合并0好样本样本率的组,则跟最小的非0好样本率的组进行合并
389 |         regroup = 
regroup.sort_values(by='bad_rate',ascending=False) 390 | regroup.index = range(regroup.shape[0]) 391 | col_regroup = [[i] for i in regroup[col]] 392 | del_index = [] 393 | for i in range(regroup.shape[0]-1): 394 | col_regroup[i+1] = col_regroup[i] + col_regroup[i+1] 395 | del_index.append(i) 396 | if direction == 'bad': 397 | if regroup['bad_rate'][i+1] > 0: 398 | break 399 | else: 400 | if regroup['bad_rate'][i+1] < 1: 401 | break 402 | col_regroup2 = [col_regroup[i] for i in range(len(col_regroup)) if i not in del_index] 403 | newGroup = {} 404 | for i in range(len(col_regroup2)): 405 | for g2 in col_regroup2[i]: 406 | newGroup[g2] = 'Bin '+str(i) 407 | return newGroup 408 | 409 | def Prob2Score(prob, basePoint, PDO): 410 | #将概率转化成分数且为正整数 411 | y = np.log(prob/(1-prob)) 412 | return int(basePoint+PDO/np.log(2)*(-y)) 413 | 414 | 415 | ### 计算KS值 416 | def KS(df, score, target): 417 | ''' 418 | :param df: 包含目标变量与预测值的数据集 419 | :param score: 得分或者概率 420 | :param target: 目标变量 421 | :return: KS值 422 | ''' 423 | total = df.groupby([score])[target].count() 424 | bad = df.groupby([score])[target].sum() 425 | all = pd.DataFrame({'total':total, 'bad':bad}) 426 | all['good'] = all['total'] - all['bad'] 427 | all[score] = all.index 428 | all = all.sort_values(by=score,ascending=False) 429 | all.index = range(len(all)) 430 | all['badCumRate'] = all['bad'].cumsum() / all['bad'].sum() 431 | all['goodCumRate'] = all['good'].cumsum() / all['good'].sum() 432 | KS = all.apply(lambda x: x.badCumRate - x.goodCumRate, axis=1) 433 | return max(KS) 434 | 435 | 436 | def MergeByCondition(x,condition_list): 437 | #condition_list是条件列表。满足第几个condition,就输出几 438 | s = 0 439 | for condition in condition_list: 440 | if eval(str(x)+condition): 441 | return s 442 | else: 443 | s+=1 444 | return s 445 | -------------------------------------------------------------------------------- /code/visualization.py: -------------------------------------------------------------------------------- 1 | # ignore warnings 2 | import numpy as np 3 | from scipy import interp 4 | import matplotlib.pyplot as plt 5 | from itertools import cycle 6 | import helper as lp 7 | 8 | from sklearn.naive_bayes import GaussianNB 9 | from sklearn.model_selection import learning_curve 10 | from sklearn.model_selection import ShuffleSplit,StratifiedKFold 11 | from sklearn.metrics import roc_curve, auc 12 | from sklearn.model_selection import StratifiedKFold 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.linear_model import LogisticRegression 15 | import warnings 16 | warnings.filterwarnings('ignore',category=UserWarning,module='matplotlib') 17 | 18 | #Display inline 19 | from IPython import get_ipython 20 | get_ipython().run_line_magic('matplotlib','inline') 21 | 22 | import matplotlib.pyplot as plt 23 | import numpy as np 24 | import pandas as pd 25 | from wordcloud import WordCloud 26 | from collections import Counter 27 | from numpy.random import beta 28 | import seaborn as sns 29 | plt.style.use('bmh') 30 | 31 | 32 | #plot borrower loan status distribution 33 | def plot_label(data,col): 34 | temp = data.groupby('y').count().iloc[:,0] 35 | bar_data = {'normal':temp[0],'overdue':temp[1]} 36 | names = list(bar_data.keys()) 37 | values = list(bar_data.values()) 38 | plt.bar(range(2),values) 39 | plt.xticks((0,1),('normal','overdue')) 40 | plt.title('borrower loan status distribution') 41 | plt.text(0.45,180000,r'normal:overdue mostly equal to 7',color='black') 42 | plt.text(0.45,170000,r'unbalanced dataset',color='black') 43 | 
plt.show() 44 | 45 | #plot category columns <5 46 | def plot_cat(data,key): 47 | plot_data=data[[key,'y']] 48 | plt.figure(figsize=(8,6)) 49 | if (key=='home_ownership'): 50 | values=['ANY','RENT','MORTGAGE','OWN'] 51 | if (key=='verification_status'): 52 | values=['Source Verified', 'Not Verified', 'Verified'] 53 | if (key=='initial_list_status'): 54 | values=['w', 'f'] 55 | if (key=='grade'): 56 | values=[1, 2, 3, 4, 5, 6, 7] 57 | if (key=='emp_length'): 58 | values=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.027167861525544, 8.0, 9.0, 10.0, 7.0] 59 | if (key=='purpose'): 60 | values=['home_improvement', 'medical', 'educational', 'other', 'debt_consolidation', 'vacation', 'house', 'wedding', 61 | 'major_purchase', 'moving', 'car', 'small_business', 'renewable_energy', 'credit_card'] 62 | if (key=='issue_d'): 63 | values=['Oct-2015', 'Dec-2015', 'Aug-2015', 'Apr-2015', 'May-2015', 'Nov-2015', 'Jan-2015', 'Sep-2015', 64 | 'Jun-2015', 'Feb-2015', 'Mar-2015', 'Jul-2015'] 65 | #create DataFrame containing categories and each of counts 66 | frame = pd.DataFrame(index=np.arange(len(values)),columns=(key,'normal','overdue')) 67 | for i,value in enumerate(values): 68 | frame.loc[i]=[value,len(plot_data[(plot_data['y'] == 0) & (plot_data[key] == value)]),len(plot_data[(plot_data['y'] == 1) & (plot_data[key] == value)])] 69 | #display each categrory's overdue rate 70 | bin_width = 0.4 71 | for i in np.arange(frame.shape[0]): 72 | overdue_bar = plt.bar(i-bin_width,frame.loc[i]['overdue'],width = bin_width,color='r') 73 | normal_bar = plt.bar(i,frame.loc[i]['normal'],width = bin_width,color='g') 74 | 75 | plt.xticks(np.arange(len(frame)),values) 76 | plt.legend((overdue_bar[0],normal_bar[0]),('overdue','normal'),framealpha=0.8) 77 | 78 | plt.xlabel(key) 79 | plt.ylabel('number of borrower') 80 | plt.xticks(rotation=90) 81 | plt.title('borrower Statistics With \'%s\' Feature'%(key)) 82 | plt.show() 83 | 84 | 85 | # Report number of passengers with missing values 86 | if sum(pd.isnull(plot_data[key])): 87 | nan_outcomes = plot_data[pd.isnull(plot_data[key])]['y'] 88 | print ("borrower with missing '{}' values: {} ({} overdue, {} normal)".format( \ 89 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0))) 90 | 91 | #plot word cloud 92 | def word_cloud(data,key): 93 | word_freq=Counter() 94 | word = data[key].astype(str) 95 | word_freq =Counter(word) 96 | words_cloud = WordCloud(scale=5,min_font_size=8,max_words=100,background_color='white').fit_words(word_freq) 97 | plt.imshow(words_cloud) 98 | 99 | # Report number of passengers with missing values 100 | if sum(pd.isnull(data[key])): 101 | nan_outcomes = data[pd.isnull(data[key])]['y'] 102 | print ("borrower with missing '{}' values: {} ({} overdue, {} normal)".format( \ 103 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0))) 104 | 105 | 106 | 107 | #plot youxu columns 108 | def plot_youxu_col(data,label,key): 109 | all_data=data[[key,label]] 110 | all_data = all_data[~np.isnan(all_data[key])] 111 | plt.figure(figsize=(8,6)) 112 | min_value = all_data[key].min() 113 | max_value = all_data[key].max() 114 | value_range = max_value - min_value 115 | overdue = all_data[all_data['y']==1][key] 116 | normal = all_data[all_data['y']==0][key] 117 | bins = np.arange(min_value-1,all_data[key].max()+1,1) 118 | plt.hist(overdue,bins=bins,histtype='stepfilled',alpha=0.6,color='red',label='overdue') 119 | plt.hist(normal,bins=bins,histtype='stepfilled',alpha=0.6,color='green',label='normal') 120 | plt.xlim(0,bins.max()) 121 | 
plt.legend(framealpha=0.8) 122 | plt.xlabel(key) 123 | plt.ylabel('Number of borrower') 124 | plt.title('borrower overdue Statistics With \'%s\' Feature'%(key)) 125 | plt.show() 126 | 127 | # Report number of passengers with missing values 128 | if sum(pd.isnull(all_data[key])): 129 | nan_outcomes = all_data[pd.isnull(all_data[key])]['Survived'] 130 | print ("borrower with missing '{}' values: {} ({} overdue, {} normal)".format( \ 131 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0))) 132 | 133 | #plot lianxu columns distribution 134 | def plot_lianxu_col(data,label,key): 135 | all_data=data[[key,label]] 136 | plt.figure(figsize=(8,6)) 137 | min_value = all_data[key].min() 138 | max_value = all_data[key].max() 139 | value_range = max_value - min_value 140 | overdue = all_data[all_data['y']==1][key] 141 | normal = all_data[all_data['y']==0][key] 142 | if (key=='installment'): 143 | bins = np.arange(min_value-1,all_data[key].max()+1,100) 144 | else: 145 | bins = np.arange(min_value-1,all_data[key].max()+1,1000) 146 | plt.hist(overdue,bins=bins,histtype='stepfilled',alpha=0.6,color='red',label='overdue') 147 | plt.hist(normal,bins=bins,histtype='stepfilled',alpha=0.6,color='green',label='normal') 148 | plt.xlim(0,bins.max()) 149 | plt.legend(framealpha=0.8) 150 | plt.xlabel(key) 151 | plt.ylabel('Number of borrower') 152 | plt.title('borrower overdue Statistics With \'%s\' Feature'%(key)) 153 | plt.show() 154 | 155 | # Report number of passengers with missing values 156 | if sum(pd.isnull(all_data[key])): 157 | nan_outcomes = all_data[pd.isnull(all_data[key])]['y'] 158 | print ("borrower with missing '{}' values: {} ({} overdue, {} normal)".format( \ 159 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0))) 160 | 161 | # plot iv sort bar 162 | def feature_IV_bar(col_IV): 163 | IV_dict_sorted = sorted(col_IV.items(),key=lambda x:x[1],reverse=True) 164 | IV_values =[i[1] for i in IV_dict_sorted] 165 | IV_name = [i[0] for i in IV_dict_sorted] 166 | plt.title('feature IV value bar') 167 | plt.bar(range(len(IV_values)),IV_values) 168 | plt.show() 169 | 170 | 171 | 172 | 173 | # define cross validation roc curve and split use 5 174 | def model_roc_curve(X,y,clf): 175 | import numpy as np 176 | from scipy import interp 177 | import matplotlib.pyplot as plt 178 | from itertools import cycle 179 | 180 | from sklearn import svm, datasets 181 | from sklearn.metrics import roc_curve, auc 182 | from sklearn.model_selection import StratifiedKFold 183 | 184 | # Run classifier with cross-validation and plot ROC curves 185 | cv = StratifiedKFold(n_splits=5) 186 | 187 | tprs = [] 188 | aucs = [] 189 | mean_fpr = np.linspace(0, 1, 100) 190 | 191 | i = 0 192 | for train, test in cv.split(X, y): 193 | probas_ = clf.fit(X[train], y[train]).predict_proba(X[test]) 194 | # Compute ROC curve and area the curve 195 | fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) 196 | tprs.append(interp(mean_fpr, fpr, tpr)) 197 | tprs[-1][0] = 0.0 198 | roc_auc = auc(fpr, tpr) 199 | aucs.append(roc_auc) 200 | plt.plot(fpr, tpr, lw=1, alpha=0.3, 201 | label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) 202 | 203 | i += 1 204 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', 205 | label='Luck', alpha=.8) 206 | 207 | mean_tpr = np.mean(tprs, axis=0) 208 | mean_tpr[-1] = 1.0 209 | mean_auc = auc(mean_fpr, mean_tpr) 210 | std_auc = np.std(aucs) 211 | plt.plot(mean_fpr, mean_tpr, color='b', 212 | label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), 213 | lw=2, 
alpha=.8) 214 | 215 | std_tpr = np.std(tprs, axis=0) 216 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 217 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 218 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, 219 | label=r'$\pm$ 1 std. dev.') 220 | 221 | plt.xlim([-0.05, 1.05]) 222 | plt.ylim([-0.05, 1.05]) 223 | plt.xlabel('False Positive Rate') 224 | plt.ylabel('True Positive Rate') 225 | plt.title('Receiver operating characteristic example') 226 | plt.legend(loc="lower right") 227 | plt.show() 228 | 229 | #corr heatmap 230 | def plot_corrmatrix_heatmap(data): 231 | df_data = data.corr() 232 | plt.subplots(figsize=(9,9)) 233 | sns.heatmap(df_data,annot = False,vmax=1, square=True, cmap="Blues") 234 | plt.show() 235 | 236 | 237 | #plot sorted iv value bar 238 | def iv_sorted(col_iv): 239 | IV_dict_sorted = sorted(col_iv.items(),key=lambda x:x[1],reverse=True) 240 | IV_values = [i[1] for i in IV_dict_sorted] 241 | IV_name = [i[0] for i in IV_dict_sorted] 242 | plt.title('sorted feature IV bar') 243 | #plt.bar(range(len(IV_values)),IV_values) 244 | plt.bar(IV_name,IV_values) 245 | plt.show() 246 | 247 | #plot learning curve 248 | def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, 249 | n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)): 250 | plt.figure() 251 | plt.title(title) 252 | if ylim is not None: 253 | plt.ylim(*ylim) 254 | plt.xlabel("Training examples") 255 | plt.ylabel("Score") 256 | train_sizes, train_scores, test_scores = learning_curve( 257 | estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) 258 | train_scores_mean = np.mean(train_scores, axis=1) 259 | train_scores_std = np.std(train_scores, axis=1) 260 | test_scores_mean = np.mean(test_scores, axis=1) 261 | test_scores_std = np.std(test_scores, axis=1) 262 | plt.grid() 263 | 264 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 265 | train_scores_mean + train_scores_std, alpha=0.1, 266 | color="r") 267 | plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 268 | test_scores_mean + test_scores_std, alpha=0.1, color="g") 269 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 270 | label="Training score") 271 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", 272 | label="Cross-validation score") 273 | 274 | plt.legend(loc="best") 275 | return plt 276 | 277 | #plot ks curve 278 | def ks_curve(pred_y_data): 279 | total = pred_y_data.groupby(['score'])['y'].count() 280 | bad = pred_y_data.groupby(['score'])['y'].sum() 281 | alls = pd.DataFrame({'total':total,'bad':bad}) 282 | alls['good'] = alls['total'] - alls['bad'] 283 | alls['score'] = alls.index 284 | alls = alls.sorted_values(by='score',ascending=False) 285 | alls.index = range(len(alls)) 286 | alls['badCumRate'] = alls['bad'].cumsum() / alls['bad'].sum() 287 | alls['goodCumRate'] = alls['good'].cumsum() / alls['good'].sum() 288 | plt.plot(alls['badCumRate']) 289 | plt.plot(alls['goodCumRate']) 290 | plt.show() 291 | -------------------------------------------------------------------------------- /report/proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lisa-wang1987/risk_model_machinelearning/3d9237cdd4290cc565f596ebdc2bd9059f86855d/report/proposal.pdf -------------------------------------------------------------------------------- /report/report.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lisa-wang1987/risk_model_machinelearning/3d9237cdd4290cc565f596ebdc2bd9059f86855d/report/report.pdf --------------------------------------------------------------------------------