├── .ipynb_checkpoints
│   └── credit_card-summarize-checkpoint.ipynb
├── README.md
├── code
│   ├── feature_bin.py
│   ├── helper.py
│   ├── preprocessing.py
│   ├── scorecard_functions_V3.py
│   └── visualization.py
└── report
    ├── proposal.pdf
    ├── report.html
    ├── report.ipynb
    └── report.pdf

/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lisa-wang1987/risk_model_machinelearning/3d9237cdd4290cc565f596ebdc2bd9059f86855d/README.md
--------------------------------------------------------------------------------
/code/feature_bin.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | import pandas as pd
4 | import scorecard_functions_V3 as sf
5 | from statsmodels.stats.outliers_influence import variance_inflation_factor
6 | 
7 | 
8 | 
9 | # categorical features with fewer than 5 levels are one-hot encoded with get_dummies
10 | def dummy_code(data, less_cat_col):
11 |     return pd.concat([data, pd.get_dummies(data[less_cat_col])], axis=1).drop(less_cat_col, axis=1)
12 | 
13 | # categorical features with more than 5 levels are encoded with their bad rate (log-odds)
14 | # helper: look up the encoding of a single raw value in the per-level mapping
15 | def val_transe(val, mapping):
16 |     if val in mapping.index:
17 |         return mapping[val]
18 |     return np.nan
19 | 
20 | 
21 | def bin_count_code(data, col, label):
22 |     total = pd.Series(data[col].value_counts(), name='total')
23 |     y_1 = pd.Series(data[data[label]==1][col].value_counts(), name='overdue')
24 |     y_0 = pd.Series(data[data[label]==0][col].value_counts(), name='normal')
25 |     counts = pd.DataFrame([y_1, y_0, total]).T.fillna(0)
26 |     counts['overdue_rate'] = counts['overdue']/counts['total']
27 |     counts['normal_rate'] = counts['normal']/counts['total']
28 |     counts['log_overdue'] = np.log(counts['overdue_rate']/counts['normal_rate'])
29 |     return counts['log_overdue']   # Series indexed by the original category values
30 | def get_more_cat_col_code(data, more_cat_col, label):
31 |     for col in more_cat_col:
32 |         temp = bin_count_code(data, col, label)
33 |         data[col + '_encode'] = data[col].apply(lambda v: val_transe(v, temp))
34 |     return data
35 | 
36 | # bin the high-cardinality categorical features and the continuous features
37 | def feature_bin(data, label, feature_set):
38 |     # feature_set: the features to be binned, both categorical and continuous
39 |     continues_merged_dict = {}
40 |     var_bin_list = []
41 |     for col in feature_set:
42 |         print('{} is in preprocessing'.format(col))
43 |         max_bin_set = 5
44 |         cutoff = sf.ChiMerge(data, col, label, max_interval=max_bin_set, special_attribute=[], minBinPcnt=0)
45 |         data[col + '_Bin'] = data[col].map(lambda x: sf.AssignBin(x, cutoff, special_attribute=[]))
46 |         monotone = sf.BadRateMonotone(data, col+'_Bin', label, ['Bin_1'])
47 |         while (not monotone):
48 |             max_bin_set -= 1
49 |             cutoff = sf.ChiMerge(data, col, label, max_interval=max_bin_set, special_attribute=[], minBinPcnt=0)
50 |             data[col + '_Bin'] = data[col].map(lambda x: sf.AssignBin(x, cutoff, special_attribute=[]))
51 |             if max_bin_set == 3:
52 |                 break
53 |             monotone = sf.BadRateMonotone(data, col+'_Bin', label, ['Bin_1'])
54 |         newvar = col + '_Bin'
55 |         data[newvar] = data[col].map(lambda x: sf.AssignBin(x, cutoff, special_attribute=[]))
56 |         var_bin_list.append(newvar)
57 |         continues_merged_dict[col] = cutoff
58 |     return continues_merged_dict, var_bin_list, data
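# Usage sketch (illustrative only): the toy DataFrame and column names below are
# made up for the example; in this project the functions above receive the
# preprocessed LendingClub data with the binary label column 'y'.
if __name__ == '__main__':
    toy = pd.DataFrame({
        'home_ownership': ['RENT', 'OWN', 'RENT', 'MORTGAGE', 'OWN', 'RENT', 'MORTGAGE', 'OWN'],
        'purpose': ['car', 'car', 'car', 'car', 'house', 'house', 'house', 'house'],
        'y': [0, 1, 0, 0, 1, 0, 1, 1]})
    # low-cardinality categorical column: one-hot encoding
    toy = dummy_code(toy, ['home_ownership'])
    # high-cardinality categorical column: bad-rate (log-odds) encoding
    toy = get_more_cat_col_code(toy, ['purpose'], 'y')
    print(toy.head())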
59 | 
60 | # after binning, compute WOE and IV for every binned variable
61 | def IV_WOE(data, label):
62 |     col_IV = {}
63 |     col_woe = {}
64 |     bin_col = [s for s in data.columns if 'Bin' in s]
65 |     for col in bin_col:
66 |         IV_woe = sf.CalcWOE(data, col, label)
67 |         col_IV[col] = IV_woe['IV']
68 |         col_woe[col] = IV_woe['WOE']
69 |     return col_IV, col_woe
70 | 
71 | # keep the features whose IV is greater than 0.01 and add their WOE-encoded columns
72 | def feature_select_iv(data, col_IV, col_woe):
73 |     high_IV = {k: v for k, v in col_IV.items() if v > 0.01}
74 |     high_IV_sorted = sorted(high_IV.items(), key=lambda x: x[1], reverse=True)
75 |     short_list = high_IV.keys()
76 |     short_list_2 = []
77 |     for var in short_list:
78 |         newvar = var + '_WOE'
79 |         data[newvar] = data[var].map(col_woe[var])
80 |         short_list_2.append(newvar)
81 |     return data, short_list_2, high_IV_sorted
82 | 
83 | 
84 | # compute pairwise correlations between the WOE features; when two are highly correlated, drop the one with the lower IV
85 | def cal_corr_del_IV_low(data, high_IV_sorted):
86 |     deleted_index = []
87 |     cnt_vars = len(high_IV_sorted)
88 |     for i in range(cnt_vars):
89 |         if i in deleted_index:
90 |             continue
91 |         x1 = high_IV_sorted[i][0] + '_WOE'
92 |         for j in range(cnt_vars):
93 |             if i == j or j in deleted_index:
94 |                 continue
95 |             y1 = high_IV_sorted[j][0] + '_WOE'
96 |             roh = np.corrcoef(data[x1], data[y1])[0, 1]
97 |             if abs(roh) > 0.7:
98 |                 x1_IV = high_IV_sorted[i][1]
99 |                 y1_IV = high_IV_sorted[j][1]
100 |                 if x1_IV > y1_IV:
101 |                     deleted_index.append(j)
102 |                 else:
103 |                     deleted_index.append(i)
104 |     multi_analysis_vars = [high_IV_sorted[i][0]+'_WOE' for i in range(cnt_vars) if i not in deleted_index]
105 |     return multi_analysis_vars
106 | 
107 | # VIF diagnostics on the selected WOE features
108 | def get_vif(data, cols_set):
109 |     vif = {}
110 |     X = data[cols_set]
111 |     for var in X.columns:
112 |         new_vif = variance_inflation_factor(X.values, X.columns.get_loc(var))
113 |         vif[var] = new_vif
114 |     vif_sorted = pd.Series(list(vif.values()), index=vif.keys()).sort_values(ascending=False)
115 |     return vif_sorted
--------------------------------------------------------------------------------
/code/helper.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import scorecard_functions_V3 as sf
3 | import numpy as np
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn.model_selection import ShuffleSplit, StratifiedKFold
6 | from sklearn.ensemble import RandomForestClassifier
7 | 
8 | import visualization as vs
9 | # get categorical columns, split by number of distinct levels
10 | def get_less_more_cat_col(data, cols_set):
11 |     less_col = []
12 |     more_col = []
13 |     for col in cols_set:
14 |         if len(set(data[col])) <= 5:
15 |             less_col.append(col)
16 |         else:
17 |             more_col.append(col)
18 |     return less_col, more_col
19 | 
20 | # define dummy code for low-cardinality categorical columns
21 | def dummy_code(data, less_cat_col):
22 |     return pd.concat([data, pd.get_dummies(data[less_cat_col])], axis=1).drop(less_cat_col, axis=1)
23 | 
24 | # encode high-cardinality categorical columns with their bin bad rate
25 | def get_more_cat_code(data, col_set, label):
26 |     for col in col_set:
27 |         data[col+'_Bin'] = data[col].map(sf.BinBadRate(data, col, label)[0])
28 |     return data
29 | # get ordered columns' cutoff points and the new data
30 | 
31 | # ordered-feature merging and bin encoding
32 | '''
33 | Exploration showed that some ordered features have levels with no bad samples and
34 | others have levels with no good samples, so the two cases are handled separately below.
35 | ''' 36 | def order_merge_encode(data,order_col):# order_col is sure 37 | ''' 38 | param data: input data 39 | param order_col: all order features need to be deal with 40 | ''' 41 | merge_bin_dict_bad={} 42 | merge_bin_dict_good={} 43 | order_list_bad=[] 44 | order_list_good=[] 45 | 46 | # test bad part 47 | print('starting bad part=========') 48 | for col in order_col: 49 | binBadRate = sf.BinBadRate(data, col, 'y')[0] 50 | if min(binBadRate.values()) == 0 : #由于某个取值没有坏样本而进行合并 51 | print ('{} need to be combined due to 0 bad rate'.format(col)) 52 | combine_bin = sf.MergeBad0(data, col, 'y') 53 | merge_bin_dict_bad[col] = combine_bin 54 | newVar = col + '_Merge' 55 | order_list_bad.append(newVar) 56 | data[newVar] = data[col].map(combine_bin) 57 | del_list_bad=[w.replace('_Merge','') for w in order_list_bad] 58 | data = data.drop(del_list_bad,axis=1) 59 | colmns_set= [w for w in data.columns if w != 'y'] 60 | 61 | # test good part 62 | print('starting good part==========') 63 | for col in colmns_set: 64 | binBadRate = sf.BinBadRate(data, col, 'y')[0] 65 | if min(binBadRate.values()) == 1 : #由于某个取值没有坏样本而进行合并 66 | print ('{} need to be combined due to 0 good rate'.format(col)) 67 | combine_bin = sf.MergeBad0(data, col, 'y') 68 | merge_bin_dict_good[col] = combine_bin 69 | newVar = col + '_Merge' 70 | order_list_good.append(newVar) 71 | data[newVar] = data[col].map(combine_bin) 72 | del_list_good=[w.replace('_Merge','') for w in order_list_good] 73 | data = data.drop(del_list_good,axis=1) 74 | return data 75 | 76 | # order encode 77 | def order_encode(data,order_col): 78 | ''' 79 | param data: input data 80 | param order_col: all order features need to be deal with 81 | ''' 82 | df = order_merge_encode(data,order_col)#use above function 83 | list_set = [w for w in df.columns if w !='y'] 84 | df_data = get_more_cat_code(df,list_set,'y') 85 | df_data = df_data.drop(list_set,axis=1) 86 | return df_data 87 | 88 | # get cutoff point and get new data 89 | def get_cutoff(data,cols_set,label): 90 | less_cols,more_cols = get_less_more_cat_col(data,cols_set) 91 | merge_bin_dict={} 92 | 93 | continues_merged_dict={} 94 | var_bin_list =[] 95 | for col in less_cols: 96 | binBadRate = sf.BinBadRate(data, col, 'y')[0] 97 | if min(binBadRate.values()) == 0 : #由于某个取值没有坏样本而进行合并 98 | print ('{} need to be combined due to 0 bad rate'.format(col)) 99 | combine_bin = sf.MergeBad0(data, col, 'y') 100 | merge_bin_dict[col] = combine_bin 101 | newVar = col + '_Bin' 102 | data[newVar] = data[col].map(combine_bin) 103 | var_bin_list.append(newVar) 104 | if max(binBadRate.values()) == 1: #由于某个取值没有好样本而进行合并 105 | print ('{} need to be combined due to 0 good rate'.format(col)) 106 | combine_bin = sf.MergeBad0(data, col, 'y',direction = 'good') 107 | merge_bin_dict[col] = combine_bin 108 | newVar = col + '_Bin' 109 | data[newVar] = data[col].map(combine_bin) 110 | order_list.append(newVar) 111 | var_bin_list.append(newVar) 112 | 113 | for col in more_cols: 114 | print('{} is in processing'.format(col)) 115 | max_interval = 5 116 | cutoff = sf.ChiMerge(data,col,label,max_interval=max_interval,minBinPcnt=0) 117 | data[col+'_Bin'] = data[col].map(lambda x: sf.AssignBin(x,cutoff,special_attribute=[])) 118 | monotone = sf.BadRateMonotone(data,col+'_Bin',label) 119 | while (not monotone): 120 | max_interval -=1 121 | cutoff = sf.ChiMerge(data,col,label,max_interval=max_interval,special_attribute=[],minBinPcnt=0) 122 | data[col +'_Bin'] = data[col].map(lambda x: sf.AssignBin(x,cutoff,special_attribute=[])) 123 | if max_interval 
== 2: 124 | break 125 | monotone = sf.BadRateMonotone(data,col+'_Bin',label) 126 | newVar = col +'_Bin' 127 | data[newVar] = data[col].map(lambda x: sf.AssignBin(x,cutoff,special_attribute=[])) 128 | var_bin_list.append(newVar) 129 | continues_merged_dict[col] = cutoff 130 | return continues_merged_dict,var_bin_list,data 131 | 132 | # get woe and iv value 133 | def get_woe_iv(data,cols_set,label): 134 | col_iv ={} 135 | col_woe={} 136 | for col in cols_set: 137 | temp = sf.CalcWOE(data,col,label) 138 | col_iv[col] = temp['IV'] 139 | col_woe[col] = temp['WOE'] 140 | return col_iv,col_woe 141 | 142 | #get top iv>0.01 143 | def choose_iv_feature(data,col_iv,col_woe): 144 | high_IV = {k:v for k, v in col_iv.items() if v >= 0.01} 145 | high_IV_sorted = sorted(high_IV.items(),key=lambda x:x[1],reverse=True) 146 | 147 | short_list = high_IV.keys() 148 | short_list_2 = [] 149 | for var in short_list: 150 | newVar = var + '_WOE' 151 | data[newVar] = data[var].map(col_woe[var]) 152 | short_list_2.append(newVar) 153 | return short_list_2 154 | 155 | 156 | # correlation 157 | def cor_feature(data,col_iv): 158 | deleted_index = [] 159 | high_IV = {k:v for k, v in col_iv.items() if v >= 0.01} 160 | high_IV_sorted = sorted(high_IV.items(),key=lambda x:x[1],reverse=True) 161 | cnt_vars = len(high_IV_sorted) 162 | for i in range(cnt_vars): 163 | if i in deleted_index: 164 | continue 165 | x1 = high_IV_sorted[i][0]+"_WOE" 166 | for j in range(cnt_vars): 167 | if i == j or j in deleted_index: 168 | continue 169 | y1 = high_IV_sorted[j][0]+"_WOE" 170 | roh = np.corrcoef(data[x1],data[y1])[0,1] 171 | if abs(roh)>0.7: 172 | x1_IV = high_IV_sorted[i][1] 173 | y1_IV = high_IV_sorted[j][1] 174 | if x1_IV > y1_IV: 175 | deleted_index.append(j) 176 | else: 177 | deleted_index.append(i) 178 | multi_analysis_vars_1 = [high_IV_sorted[i][0]+"_WOE" for i in range(cnt_vars) if i not in deleted_index] 179 | return multi_analysis_vars_1 180 | 181 | # train model 182 | 183 | # calculate ks value 184 | def KS(df, score, target): 185 | ''' 186 | :param df: 包含目标变量与预测值的数据集 187 | :param score: 得分或者概率 188 | :param target: 目标变量 189 | :return: KS值 190 | ''' 191 | total = df.groupby([score])[target].count() 192 | bad = df.groupby([score])[target].sum() 193 | all = pd.DataFrame({'total':total, 'bad':bad}) 194 | all['good'] = all['total'] - all['bad'] 195 | all[score] = all.index 196 | all = all.sort_values(by=score,ascending=False) 197 | all.index = range(len(all)) 198 | all['badCumRate'] = all['bad'].cumsum() / all['bad'].sum() 199 | all['goodCumRate'] = all['good'].cumsum() / all['good'].sum() 200 | KS = all.apply(lambda x: x.badCumRate - x.goodCumRate, axis=1) 201 | return max(KS) 202 | 203 | #lr training 204 | def lr_train(X,y,clf): 205 | clf.fit(X,y) 206 | pred = clf.predict_proba(X)[:,1] 207 | df = pd.DataFrame([pred,y]).T 208 | df.columns = ['score','y'] 209 | print('the roc curve is:') 210 | vs.model_roc_curve(X,y,clf) 211 | print('the ks value is :',KS(df,'score','y')) 212 | 213 | 214 | # grid research 215 | def grid_research(X,y,clf,paremeter): 216 | from sklearn.model_selection import GridSearchCV 217 | cv = StratifiedKFold(n_splits=5) 218 | clf = RandomForestClassifier() 219 | grid = GridSearchCV(clf,paremeter,cv=cv) 220 | grid.fit(X,y) 221 | return grid.best_params_ 222 | -------------------------------------------------------------------------------- /code/preprocessing.py: -------------------------------------------------------------------------------- 1 | # import api 2 | import pandas as pd 3 | import numpy as np 4 | 
import matplotlib.pyplot as plt 5 | from scipy.stats import mode 6 | from scipy.interpolate import lagrange 7 | import scorecard_functions_V3 as sf 8 | import datetime 9 | from sklearn.preprocessing import Imputer 10 | from sklearn.model_selection import train_test_split 11 | import pickle 12 | import itertools 13 | 14 | import warnings 15 | warnings.filterwarnings("ignore") 16 | 17 | 18 | 19 | #preprocessing function 20 | 21 | #home_ownership turn 'ANY' to 'MORTGAGE' 22 | def any_to_mort(val): 23 | if val=='ANY': 24 | return 'MORTGAGE' 25 | else: 26 | return val 27 | #delete '%' string function 28 | def int_rate(val): 29 | if val != 'nan': 30 | return round(float(str(val).replace('%',''))/100,4) 31 | elif val == str('nan'): 32 | return -1 33 | #delete‘year’string function 34 | def emp_length(val): 35 | if val =='10+ years': 36 | return 10 37 | elif val =='< 1 year': 38 | return 0 39 | elif val == 'n/a': 40 | return -1 41 | elif val =='1 year': 42 | return 1 43 | else: 44 | return float(str(val).replace('years','')) 45 | 46 | #date transformation function 47 | def ConvertDateStr(x): 48 | mth_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 49 | 'Nov': 11, 'Dec': 12} 50 | if str(x) == 'nan': 51 | return datetime.datetime.fromtimestamp(time.mktime(time.strptime('9900-1','%Y-%m'))) 52 | #time.mktime 不能读取1970年之前的日期 53 | else: 54 | yr = int(x[4:6]) 55 | if yr <=17: 56 | yr = 2000+yr 57 | else: 58 | yr = 1900 + yr 59 | mth = mth_dict[x[:3]] 60 | return datetime.datetime(yr,mth,1) 61 | 62 | #date to date long function 63 | def days_long(val): 64 | now = datetime.datetime.now() 65 | delta = now - val 66 | return delta.days 67 | 68 | 69 | #grade 70 | def grade_value(val): 71 | grade = val.replace('A',1) 72 | grade = grade.replace('B',2) 73 | grade = grade.replace('C',3) 74 | grade = grade.replace('D',4) 75 | grade = grade.replace('E',5) 76 | grade = grade.replace('F',6) 77 | grade = grade.replace('G',7) 78 | return grade 79 | 80 | #subgrade 81 | def subgrade_value(val): 82 | grade = val.replace('A1',11) 83 | grade = grade.replace('A2',12) 84 | grade = grade.replace('A3',13) 85 | grade = grade.replace('A4',14) 86 | grade = grade.replace('A5',15) 87 | grade = grade.replace('B1',21) 88 | grade = grade.replace('B2',22) 89 | grade = grade.replace('B3',23) 90 | grade = grade.replace('B4',24) 91 | grade = grade.replace('B5',25) 92 | grade = grade.replace('C1',31) 93 | grade = grade.replace('C2',32) 94 | grade = grade.replace('C3',33) 95 | grade = grade.replace('C4',34) 96 | grade = grade.replace('C5',35) 97 | grade = grade.replace('D1',41) 98 | grade = grade.replace('D2',42) 99 | grade = grade.replace('D3',43) 100 | grade = grade.replace('D4',44) 101 | grade = grade.replace('D5',45) 102 | grade = grade.replace('E1',51) 103 | grade = grade.replace('E2',52) 104 | grade = grade.replace('E3',53) 105 | grade = grade.replace('E4',54) 106 | grade = grade.replace('E5',55) 107 | grade = grade.replace('F1',61) 108 | grade = grade.replace('F2',62) 109 | grade = grade.replace('F3',63) 110 | grade = grade.replace('F4',64) 111 | grade = grade.replace('F5',65) 112 | grade = grade.replace('G1',71) 113 | grade = grade.replace('G2',72) 114 | grade = grade.replace('G3',73) 115 | grade = grade.replace('G4',74) 116 | grade = grade.replace('G5',75) 117 | return grade 118 | 119 | # missing fill function 120 | def mean_val(data,col): 121 | mean_col = data[col].mean() 122 | idx = data[data[col].isnull()==True].index 123 | data.loc[idx,col]=mean_col 124 | return data[col] 
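# Quick sanity check for the string-cleaning helpers above (illustrative only; the
# raw strings below are made-up examples of the LendingClub formats these helpers expect).
if __name__ == '__main__':
    print(int_rate('13.56%'))       # 0.1356
    print(emp_length('10+ years'))  # 10
    print(emp_length('3 years'))    # 3.0
    print(emp_length('< 1 year'))   # 0
    print(grade_value(pd.Series(['A', 'C', 'G'])).tolist())  # [1, 3, 7]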
125 | 126 | #label transformation function 127 | def label_transe(val): 128 | if val == 'Charged Off': 129 | return 1 130 | elif (val == 'Fully Paid' or val =='Current'): 131 | return 0 132 | else: 133 | return -1 134 | 135 | # read data and print statistic information 136 | def read_data(data): 137 | print('The data is the third quarter of 2017 borrower data of LendingClub opened on official website') 138 | df = pd.read_csv(data,header=1) 139 | df = df[df['term']==' 36 months'] 140 | print ('\n') 141 | print('top 5 line of data is :\n',df.head(2)) 142 | print('\n') 143 | print('data statistic information is ',df.describe()) 144 | print('\n') 145 | print('all data shape is:',df.shape) 146 | return df 147 | 148 | # separate training set and test set 149 | def split_train_test(data): 150 | trainData = data[(data['issue_d']!='Nov-2015') & (data['issue_d']!='Dec-2015')] 151 | testData = data[(data['issue_d']=='Nov-2015') | (data['issue_d']=='Dec-2015')] 152 | print('the train data shape is:',trainData.shape) 153 | print('the test data shape is:',testData.shape) 154 | return trainData,testData 155 | 156 | #data preprocessing 157 | def drop_afterloan_columns(data): 158 | #drop after loan feature 159 | after_col =['pymnt_plan', 'collection_recovery_fee', 'recoveries', 'hardship_flag','title', 160 | 'out_prncp_inv', 'out_prncp','total_rec_prncp','last_pymnt_amnt','last_pymnt_d', 161 | 'last_credit_pull_d','total_pymnt','total_pymnt_inv','total_rec_int', 162 | 'total_rec_late_fee','title','term'] 163 | df = data.drop(after_col,axis=1) 164 | print('after drop after loan data, the data shape is ',df.shape) 165 | return df 166 | 167 | # drop only one value columns 168 | def drop_unique1_col(data): 169 | cols =data.nunique()[data.nunique()>1].index.tolist() 170 | df =data.loc[:,cols] 171 | print('after drop only one value columns, the data shape is ',df.shape) 172 | return df 173 | 174 | # drop columns if it's missing greater than 60% 175 | def drop_missingmore60_col(data): 176 | miss_60_col = data.isnull().sum()[data.isnull().sum()>=0.40*data.shape[0]].index 177 | df = data.drop(miss_60_col,axis=1) 178 | print('after drop missing greater than 60% columns, the data shape is ',df.shape) 179 | return df 180 | 181 | #drop all null row and all null column 182 | def drop_row_col_miss(data): 183 | data = data.dropna(how='all',axis=1) 184 | df = data.dropna(how='all',axis=0) 185 | return df 186 | 187 | #delete 90% value same in one column 188 | def drop_90samevalue_col(data): 189 | colum=data.columns 190 | per=pd.DataFrame(colum,index=colum) 191 | max_valuecounts=[] 192 | for col in colum: 193 | max_valuecounts.append(data[col].value_counts().max()) 194 | per['mode']=max_valuecounts 195 | per['percentil'] =per['mode']/data.shape[0] 196 | same_value_col =per[per.sort_values(by='percentil',ascending=False)['percentil']>0.9].index 197 | df = data.drop(same_value_col,axis=1) 198 | print('after delete 90% values same in one column,the data shape is',df.shape) 199 | return df 200 | 201 | #get label function 202 | def get_label(data,label): 203 | data['y'] = data[label].apply(label_transe) 204 | data =data[((data['y']!=2) & (data['y']!=-1))].drop([label],axis=1) 205 | return data 206 | 207 | #character to value 208 | def string_to_value(data): 209 | data['int_rate'] = data.loc[:,'int_rate'].apply(int_rate) #int_rate 210 | data['emp_length'] = data.loc[:,'emp_length'].apply(emp_length) # emp_length 211 | data['revol_util']=data.loc[:,'revol_util'].astype(str).apply(int_rate) # revol_util 212 | data['grade'] = 
grade_value(data.loc[:,'grade'])#grade 213 | data['sub_grade'] = subgrade_value(data.loc[:,'sub_grade'])#sub_grade 214 | data['earliest_cr_line'] = data.loc[:,'earliest_cr_line'].apply(ConvertDateStr).apply(days_long) 215 | return data 216 | 217 | 218 | #get catogery columns and continues columns 219 | def class_feature(data): 220 | cat_col = list(data.columns.to_series().groupby(data.dtypes).groups.values())[2] 221 | continue_col = list(data.columns.to_series().groupby(data.dtypes).groups.values())[0].append(list(data.columns.to_series().groupby(data.dtypes).groups.values())[1]) 222 | 223 | return cat_col,continue_col 224 | 225 | #get word_col,cat_col,ordered_col,continue_col 226 | def get_word_cat_ordered_continue_col(data): 227 | # text feature columns 228 | word_col=['zip_code', 'addr_state','emp_title'] 229 | temp_col=['mo_sin_old_il_acct','mo_sin_old_rev_tl_op','mths_since_recent_bc', 230 | 'mths_since_recent_inq','pct_tl_nvr_dlq','percent_bc_gt_75','bc_util','revol_util','dti'] 231 | cat_col,value_col = class_feature(data) 232 | cat_col = [w for w in cat_col if w not in word_col] #catergory columns 233 | continue_col = [] 234 | for col in value_col: 235 | if len(set(data[col]))>500: 236 | continue_col.append(col) 237 | continue_col =[key for key in continue_col if key not in word_col] 238 | continue_col = [key for key in continue_col if key !='emp_length'] 239 | continue_col = [key for key in continue_col if key not in temp_col] #continue col 240 | 241 | ordered_col = [key for key in value_col if key not in continue_col] 242 | ordered_col = [key for key in ordered_col if key !='y'] 243 | ordered_col = [key for key in ordered_col if key !='term'] 244 | ordered_co = ordered_col + ['emp_length'] # ordered columns 245 | return word_col,cat_col,ordered_col,continue_col 246 | 247 | 248 | # filling columns has missing value 249 | def missing_fill(data): 250 | cat_col,continue_col = class_feature(data) # 区分分类特征和连续特征 251 | missing_col=list(data.isnull().sum()[data.isnull().sum()>0].index) 252 | for col in missing_col: 253 | if col in cat_col: 254 | fill_value = data[col].mode()[0] 255 | data[col]=data[col].fillna(fill_value) 256 | else: 257 | fill_value = data[col].mean() 258 | data[col] = data[col].fillna(fill_value) 259 | return data 260 | 261 | # outlier 262 | #del orded columns and continues top 10 greater samples 263 | def del_outlier_index(data,key): 264 | temp_index = data[data[key]>data[key].sort_values(ascending=False)[0:10].min()].index 265 | return list(temp_index) 266 | -------------------------------------------------------------------------------- /code/scorecard_functions_V3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def SplitData(df, col, numOfSplit, special_attribute=[]): 5 | ''' 6 | :param df: 按照col排序后的数据集 7 | :param col: 待分箱的变量 8 | :param numOfSplit: 切分的组别数 9 | :param special_attribute: 在切分数据集的时候,某些特殊值需要排除在外 10 | :return: 在原数据集上增加一列,把原始细粒度的col重新划分成粗粒度的值,便于分箱中的合并处理 11 | ''' 12 | df2 = df.copy() 13 | if special_attribute != []: 14 | df2 = df.loc[~df[col].isin(special_attribute)] 15 | N = df2.shape[0] 16 | n = N//numOfSplit 17 | splitPointIndex = [i*n for i in range(1,numOfSplit)] 18 | rawValues = sorted(list(df2[col])) 19 | splitPoint = [rawValues[i] for i in splitPointIndex] 20 | splitPoint = sorted(list(set(splitPoint))) 21 | return splitPoint 22 | 23 | 24 | 25 | # def Chi2(df, total_col, bad_col, overallRate): 26 | # ''' 27 | # :param df: 包含全部样本总计与坏样本总计的数据框 28 | # :param 
total_col: 全部样本的个数 29 | # :param bad_col: 坏样本的个数 30 | # :param overallRate: 全体样本的坏样本占比 31 | # :return: 卡方值 32 | # ''' 33 | # df2 = df.copy() 34 | # # 期望坏样本个数=全部样本个数*平均坏样本占比 35 | # df2['expected'] = df[total_col].apply(lambda x: x*overallRate) 36 | # combined = zip(df2['expected'], df2[bad_col]) 37 | # chi = [(i[0]-i[1])**2/i[0] for i in combined] 38 | # chi2 = sum(chi) 39 | # return chi2 40 | 41 | 42 | def Chi2(df, total_col, bad_col): 43 | ''' 44 | :param df: 包含全部样本总计与坏样本总计的数据框 45 | :param total_col: 全部样本的个数 46 | :param bad_col: 坏样本的个数 47 | :return: 卡方值 48 | ''' 49 | df2 = df.copy() 50 | # 求出df中,总体的坏样本率和好样本率 51 | badRate = sum(df2[bad_col])*1.0/sum(df2[total_col]) 52 | df2['good'] = df2.apply(lambda x: x[total_col] - x[bad_col], axis = 1) 53 | goodRate = sum(df2['good']) * 1.0 / sum(df2[total_col]) 54 | # 期望坏(好)样本个数=全部样本个数*平均坏(好)样本占比 55 | df2['badExpected'] = df[total_col].apply(lambda x: x*badRate) 56 | df2['goodExpected'] = df[total_col].apply(lambda x: x * goodRate) 57 | badCombined = zip(df2['badExpected'], df2[bad_col]) 58 | goodCombined = zip(df2['goodExpected'], df2['good']) 59 | badChi = [(i[0]-i[1])**2/i[0] for i in badCombined] 60 | goodChi = [(i[0] - i[1]) ** 2 / i[0] for i in goodCombined] 61 | chi2 = sum(badChi) + sum(goodChi) 62 | return chi2 63 | 64 | 65 | # Chi2 的另外一种计算方法 66 | # def Chi2(df, total_col, bad_col): 67 | # df2 = df.copy() 68 | # df2['good'] = df2[total_col] - df2[bad_col] 69 | # goodTotal = sum(df2['good']) 70 | # badTotal = sum(df2[bad_col]) 71 | # p1 = df2.loc[0]['good']*1.0/df2.loc[0][total_col] 72 | # p2 = df2.loc[1]['good']*1.0/df2.loc[1][total_col] 73 | # w1 = df2.loc[0]['good']*1.0/goodTotal 74 | # w2 = df2.loc[0][bad_col]*1.0/badTotal 75 | # N = sum(df2[total_col]) 76 | # return N*(p1-p2)*(w1-w2) 77 | 78 | 79 | def BinBadRate(df, col, target, grantRateIndicator=0): 80 | ''' 81 | :param df: 需要计算好坏比率的数据集 82 | :param col: 需要计算好坏比率的特征 83 | :param target: 好坏标签 84 | :param grantRateIndicator: 1返回总体的坏样本率,0不返回 85 | :return: 每箱的坏样本率,以及总体的坏样本率(当grantRateIndicator==1时) 86 | ''' 87 | total = df.groupby([col])[target].count() 88 | total = pd.DataFrame({'total': total}) 89 | bad = df.groupby([col])[target].sum() 90 | bad = pd.DataFrame({'bad': bad}) 91 | regroup = total.merge(bad, left_index=True, right_index=True, how='left') 92 | regroup.reset_index(level=0, inplace=True) 93 | regroup['bad_rate'] = regroup.apply(lambda x: x.bad * 1.0 / x.total, axis=1) 94 | dicts = dict(zip(regroup[col],regroup['bad_rate'])) 95 | if grantRateIndicator==0: 96 | return (dicts, regroup) 97 | N = sum(regroup['total']) 98 | B = sum(regroup['bad']) 99 | overallRate = B * 1.0 / N 100 | return (dicts, regroup, overallRate) 101 | 102 | 103 | 104 | ### ChiMerge_MaxInterval: split the continuous variable using Chi-square value by specifying the max number of intervals 105 | def ChiMerge(df, col, target, max_interval=5,special_attribute=[],minBinPcnt=0): 106 | ''' 107 | :param df: 包含目标变量与分箱属性的数据框 108 | :param col: 需要分箱的属性 109 | :param target: 目标变量,取值0或1 110 | :param max_interval: 最大分箱数。如果原始属性的取值个数低于该参数,不执行这段函数 111 | :param special_attribute: 不参与分箱的属性取值 112 | :param minBinPcnt:最小箱的占比,默认为0 113 | :return: 分箱结果 114 | ''' 115 | colLevels = sorted(list(set(df[col]))) 116 | N_distinct = len(colLevels) 117 | if N_distinct <= max_interval: #如果原始属性的取值个数低于max_interval,不执行这段函数 118 | print ("The number of original levels for {} is less than or equal to max intervals".format(col)) 119 | return colLevels[:-1] 120 | else: 121 | if len(special_attribute)>=1: 122 | df1 = df.loc[df[col].isin(special_attribute)] 
123 | df2 = df.loc[~df[col].isin(special_attribute)] 124 | else: 125 | df2 = df.copy() 126 | N_distinct = len(list(set(df2[col]))) 127 | 128 | # 步骤一: 通过col对数据集进行分组,求出每组的总样本数与坏样本数 129 | if N_distinct > 100: 130 | split_x = SplitData(df2, col, 100) 131 | df2['temp'] = df2[col].map(lambda x: AssignGroup(x, split_x)) 132 | else: 133 | df2['temp'] = df2[col] 134 | # 总体bad rate将被用来计算expected bad count 135 | (binBadRate, regroup, overallRate) = BinBadRate(df2, 'temp', target, grantRateIndicator=1) 136 | 137 | # 首先,每个单独的属性值将被分为单独的一组 138 | # 对属性值进行排序,然后两两组别进行合并 139 | colLevels = sorted(list(set(df2['temp']))) 140 | groupIntervals = [[i] for i in colLevels] 141 | 142 | # 步骤二:建立循环,不断合并最优的相邻两个组别,直到: 143 | # 1,最终分裂出来的分箱数<=预设的最大分箱数 144 | # 2,每箱的占比不低于预设值(可选) 145 | # 3,每箱同时包含好坏样本 146 | # 如果有特殊属性,那么最终分裂出来的分箱数=预设的最大分箱数-特殊属性的个数 147 | split_intervals = max_interval - len(special_attribute) 148 | while (len(groupIntervals) > split_intervals): # 终止条件: 当前分箱数=预设的分箱数 149 | # 每次循环时, 计算合并相邻组别后的卡方值。具有最小卡方值的合并方案,是最优方案 150 | chisqList = [] 151 | for k in range(len(groupIntervals)-1): 152 | temp_group = groupIntervals[k] + groupIntervals[k+1] 153 | df2b = regroup.loc[regroup['temp'].isin(temp_group)] 154 | #chisq = Chi2(df2b, 'total', 'bad', overallRate) 155 | chisq = Chi2(df2b, 'total', 'bad') 156 | chisqList.append(chisq) 157 | best_comnbined = chisqList.index(min(chisqList)) 158 | groupIntervals[best_comnbined] = groupIntervals[best_comnbined] + groupIntervals[best_comnbined+1] 159 | # after combining two intervals, we need to remove one of them 160 | groupIntervals.remove(groupIntervals[best_comnbined+1]) 161 | groupIntervals = [sorted(i) for i in groupIntervals] 162 | cutOffPoints = [max(i) for i in groupIntervals[:-1]] 163 | 164 | # 检查是否有箱没有好或者坏样本。如果有,需要跟相邻的箱进行合并,直到每箱同时包含好坏样本 165 | groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints)) 166 | df2['temp_Bin'] = groupedvalues 167 | (binBadRate,regroup) = BinBadRate(df2, 'temp_Bin', target) 168 | [minBadRate, maxBadRate] = [min(binBadRate.values()),max(binBadRate.values())] 169 | while minBadRate ==0 or maxBadRate == 1: 170 | # 找出全部为好/坏样本的箱 171 | indexForBad01 = regroup[regroup['bad_rate'].isin([0,1])].temp_Bin.tolist() 172 | bin=indexForBad01[0] 173 | # 如果是最后一箱,则需要和上一个箱进行合并,也就意味着分裂点cutOffPoints中的最后一个需要移除 174 | if bin == max(regroup.temp_Bin): 175 | cutOffPoints = cutOffPoints[:-1] 176 | # 如果是第一箱,则需要和下一个箱进行合并,也就意味着分裂点cutOffPoints中的第一个需要移除 177 | elif bin == min(regroup.temp_Bin): 178 | cutOffPoints = cutOffPoints[1:] 179 | # 如果是中间的某一箱,则需要和前后中的一个箱进行合并,依据是较小的卡方值 180 | else: 181 | # 和前一箱进行合并,并且计算卡方值 182 | currentIndex = list(regroup.temp_Bin).index(bin) 183 | prevIndex = list(regroup.temp_Bin)[currentIndex - 1] 184 | df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, bin])] 185 | (binBadRate, df2b) = BinBadRate(df3, 'temp_Bin', target) 186 | #chisq1 = Chi2(df2b, 'total', 'bad', overallRate) 187 | chisq1 = Chi2(df2b, 'total', 'bad') 188 | # 和后一箱进行合并,并且计算卡方值 189 | laterIndex = list(regroup.temp_Bin)[currentIndex + 1] 190 | df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, bin])] 191 | (binBadRate, df2b) = BinBadRate(df3b, 'temp_Bin', target) 192 | #chisq2 = Chi2(df2b, 'total', 'bad', overallRate) 193 | chisq2 = Chi2(df2b, 'total', 'bad') 194 | if chisq1 < chisq2: 195 | cutOffPoints.remove(cutOffPoints[currentIndex - 1]) 196 | else: 197 | cutOffPoints.remove(cutOffPoints[currentIndex]) 198 | # 完成合并之后,需要再次计算新的分箱准则下,每箱是否同时包含好坏样本 199 | groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints)) 200 | df2['temp_Bin'] = groupedvalues 201 | (binBadRate, 
regroup) = BinBadRate(df2, 'temp_Bin', target) 202 | [minBadRate, maxBadRate] = [min(binBadRate.values()), max(binBadRate.values())] 203 | # 需要检查分箱后的最小占比 204 | if minBinPcnt > 0: 205 | groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints)) 206 | df2['temp_Bin'] = groupedvalues 207 | valueCounts = groupedvalues.value_counts().to_frame() 208 | valueCounts['pcnt'] = valueCounts['temp'].apply(lambda x: x * 1.0 / N) 209 | valueCounts = valueCounts.sort_index() 210 | minPcnt = min(valueCounts['pcnt']) 211 | while minPcnt < minBinPcnt and len(cutOffPoints) > 2: 212 | # 找出占比最小的箱 213 | indexForMinPcnt = valueCounts[valueCounts['pcnt'] == minPcnt].index.tolist()[0] 214 | # 如果占比最小的箱是最后一箱,则需要和上一个箱进行合并,也就意味着分裂点cutOffPoints中的最后一个需要移除 215 | if indexForMinPcnt == max(valueCounts.index): 216 | cutOffPoints = cutOffPoints[:-1] 217 | # 如果占比最小的箱是第一箱,则需要和下一个箱进行合并,也就意味着分裂点cutOffPoints中的第一个需要移除 218 | elif indexForMinPcnt == min(valueCounts.index): 219 | cutOffPoints = cutOffPoints[1:] 220 | # 如果占比最小的箱是中间的某一箱,则需要和前后中的一个箱进行合并,依据是较小的卡方值 221 | else: 222 | # 和前一箱进行合并,并且计算卡方值 223 | currentIndex = list(valueCounts.index).index(indexForMinPcnt) 224 | prevIndex = list(valueCounts.index)[currentIndex - 1] 225 | df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, indexForMinPcnt])] 226 | (binBadRate, df2b) = BinBadRate(df3, 'temp_Bin', target) 227 | #chisq1 = Chi2(df2b, 'total', 'bad', overallRate) 228 | chisq1 = Chi2(df2b, 'total', 'bad') 229 | # 和后一箱进行合并,并且计算卡方值 230 | laterIndex = list(valueCounts.index)[currentIndex + 1] 231 | df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, indexForMinPcnt])] 232 | (binBadRate, df2b) = BinBadRate(df3b, 'temp_Bin', target) 233 | #chisq2 = Chi2(df2b, 'total', 'bad', overallRate) 234 | chisq2 = Chi2(df2b, 'total', 'bad') 235 | if chisq1 < chisq2: 236 | cutOffPoints.remove(cutOffPoints[currentIndex - 1]) 237 | else: 238 | cutOffPoints.remove(cutOffPoints[currentIndex]) 239 | cutOffPoints = special_attribute + cutOffPoints 240 | return cutOffPoints 241 | 242 | 243 | 244 | def UnsupervisedSplitBin(df,var,numOfSplit = 5, method = 'equal freq'): 245 | ''' 246 | :param df: 数据集 247 | :param var: 需要分箱的变量。仅限数值型。 248 | :param numOfSplit: 需要分箱个数,默认是5 249 | :param method: 分箱方法,'equal freq':,默认是等频,否则是等距 250 | :return: 251 | ''' 252 | if method == 'equal freq': 253 | N = df.shape[0] 254 | n = N / numOfSplit 255 | splitPointIndex = [i * n for i in range(1, numOfSplit)] 256 | rawValues = sorted(list(df[col])) 257 | splitPoint = [rawValues[i] for i in splitPointIndex] 258 | splitPoint = sorted(list(set(splitPoint))) 259 | return splitPoint 260 | else: 261 | var_max, var_min = max(df[var]), min(df[var]) 262 | interval_len = (var_max - var_min)*1.0/numOfSplit 263 | splitPoint = [var_min + i*interval_len for i in range(1,numOfSplit)] 264 | return splitPoint 265 | 266 | 267 | 268 | def AssignGroup(x, bin): 269 | ''' 270 | :param x: 某个变量的某个取值 271 | :param bin: 上述变量的分箱结果 272 | :return: x在分箱结果下的映射 273 | ''' 274 | N = len(bin) 275 | if x<=min(bin): 276 | return min(bin) 277 | elif x>max(bin): 278 | return 10e10 279 | else: 280 | for i in range(N-1): 281 | if bin[i] < x <= bin[i+1]: 282 | return bin[i+1] 283 | 284 | 285 | def BadRateEncoding(df, col, target): 286 | ''' 287 | :param df: dataframe containing feature and target 288 | :param col: the feature that needs to be encoded with bad rate, usually categorical type 289 | :param target: good/bad indicator 290 | :return: the assigned bad rate to encode the categorical feature 291 | ''' 292 | regroup = BinBadRate(df, col, target, 
grantRateIndicator=0)[1]
293 |     br_dict = regroup[[col,'bad_rate']].set_index([col]).to_dict(orient='index')
294 |     for k, v in br_dict.items():
295 |         br_dict[k] = v['bad_rate']
296 |     badRateEnconding = df[col].map(lambda x: br_dict[x])
297 |     return {'encoding':badRateEnconding, 'bad_rate':br_dict}
298 | 
299 | 
300 | def AssignBin(x, cutOffPoints,special_attribute=[]):
301 |     '''
302 |     :param x: 某个变量的某个取值
303 |     :param cutOffPoints: 上述变量的分箱结果,用切分点表示
304 |     :param special_attribute: 不参与分箱的特殊取值
305 |     :return: 分箱后的对应的第几个箱,从0开始
306 |     for example, if cutOffPoints = [10,20,30], if x = 7, return Bin 0. If x = 35, return Bin 3
307 |     '''
308 |     numBin = len(cutOffPoints) + 1 + len(special_attribute)
309 |     if x in special_attribute:
310 |         i = special_attribute.index(x)+1
311 |         return 'Bin {}'.format(0-i)
312 |     if x<=cutOffPoints[0]:
313 |         return 'Bin 0'
314 |     elif x > cutOffPoints[-1]:
315 |         return 'Bin {}'.format(numBin-1)
316 |     else:
317 |         for i in range(0,numBin-1):
318 |             if cutOffPoints[i] < x <= cutOffPoints[i+1]:
319 |                 return 'Bin {}'.format(i+1)
320 | 
321 | 
322 | 
323 | def CalcWOE(df, col, target):
324 |     '''
325 |     :param df: 包含需要计算WOE的变量和目标变量
326 |     :param col: 需要计算WOE、IV的变量,必须是分箱后的变量,或者不需要分箱的类别型变量
327 |     :param target: 目标变量,0、1表示好、坏
328 |     :return: 返回WOE和IV
329 |     '''
330 |     total = df.groupby([col])[target].count()
331 |     total = pd.DataFrame({'total': total})
332 |     bad = df.groupby([col])[target].sum()
333 |     bad = pd.DataFrame({'bad': bad})
334 |     regroup = total.merge(bad, left_index=True, right_index=True, how='left')
335 |     regroup.reset_index(level=0, inplace=True)
336 |     N = sum(regroup['total'])
337 |     B = sum(regroup['bad'])
338 |     regroup['good'] = regroup['total'] - regroup['bad']
339 |     G = N - B
340 |     regroup['bad_pcnt'] = regroup['bad'].map(lambda x: x*1.0/B)
341 |     regroup['good_pcnt'] = regroup['good'].map(lambda x: x * 1.0 / G)
342 |     regroup['WOE'] = regroup.apply(lambda x: np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
343 |     WOE_dict = regroup[[col,'WOE']].set_index(col).to_dict(orient='index')
344 |     for k, v in WOE_dict.items():
345 |         WOE_dict[k] = v['WOE']
346 |     IV = regroup.apply(lambda x: (x.good_pcnt-x.bad_pcnt)*np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
347 |     IV = sum(IV)
348 |     return {"WOE": WOE_dict, 'IV':IV}
349 | 
350 | 
351 | 
352 | ## 判断某变量的坏样本率是否单调
353 | def BadRateMonotone(df, sortByVar, target,special_attribute = []):
354 |     '''
355 |     :param df: 包含检验坏样本率的变量,和目标变量
356 |     :param sortByVar: 需要检验坏样本率的变量
357 |     :param target: 目标变量,0、1表示好、坏
358 |     :param special_attribute: 不参与检验的特殊值
359 |     :return: 坏样本率单调与否
360 |     '''
361 |     df2 = df.loc[~df[sortByVar].isin(special_attribute)]
362 |     if len(set(df2[sortByVar])) <= 2:
363 |         return True
364 |     regroup = BinBadRate(df2, sortByVar, target)[1]
365 |     combined = zip(regroup['total'],regroup['bad'])
366 |     badRate = [x[1]*1.0/x[0] for x in combined]
367 |     badRateNotMonotone = [badRate[i] < badRate[i+1] and badRate[i] < badRate[i-1] or badRate[i] > badRate[i+1] and badRate[i] > badRate[i-1]
368 |                           for i in range(1,len(badRate)-1)]
369 |     if True in badRateNotMonotone:
370 |         return False
371 |     else:
372 |         return True
373 | 
374 | 
375 | 
376 | def MergeBad0(df,col,target, direction='bad'):
377 |     '''
378 |     :param df: 包含检验0%或者100%坏样本率
379 |     :param col: 分箱后的变量或者类别型变量。检验其中是否有一组或者多组没有坏样本或者没有好样本。如果是,则需要进行合并
380 |     :param target: 目标变量,0、1表示好、坏
381 |     :return: 合并方案,使得每个组里同时包含好坏样本
382 |     '''
383 |     regroup = BinBadRate(df, col, target)[1]
384 |     if direction == 'bad':
385 |         # 如果是合并0坏样本率的组,则跟最小的非0坏样本率的组进行合并
386 |         regroup = regroup.sort_values(by = 'bad_rate')
387 |     else:
388 |         # 如果是合并0好样本样本率的组,则跟最小的非0好样本率的组进行合并
389 |         regroup = 
regroup.sort_values(by='bad_rate',ascending=False) 390 | regroup.index = range(regroup.shape[0]) 391 | col_regroup = [[i] for i in regroup[col]] 392 | del_index = [] 393 | for i in range(regroup.shape[0]-1): 394 | col_regroup[i+1] = col_regroup[i] + col_regroup[i+1] 395 | del_index.append(i) 396 | if direction == 'bad': 397 | if regroup['bad_rate'][i+1] > 0: 398 | break 399 | else: 400 | if regroup['bad_rate'][i+1] < 1: 401 | break 402 | col_regroup2 = [col_regroup[i] for i in range(len(col_regroup)) if i not in del_index] 403 | newGroup = {} 404 | for i in range(len(col_regroup2)): 405 | for g2 in col_regroup2[i]: 406 | newGroup[g2] = 'Bin '+str(i) 407 | return newGroup 408 | 409 | def Prob2Score(prob, basePoint, PDO): 410 | #将概率转化成分数且为正整数 411 | y = np.log(prob/(1-prob)) 412 | return int(basePoint+PDO/np.log(2)*(-y)) 413 | 414 | 415 | ### 计算KS值 416 | def KS(df, score, target): 417 | ''' 418 | :param df: 包含目标变量与预测值的数据集 419 | :param score: 得分或者概率 420 | :param target: 目标变量 421 | :return: KS值 422 | ''' 423 | total = df.groupby([score])[target].count() 424 | bad = df.groupby([score])[target].sum() 425 | all = pd.DataFrame({'total':total, 'bad':bad}) 426 | all['good'] = all['total'] - all['bad'] 427 | all[score] = all.index 428 | all = all.sort_values(by=score,ascending=False) 429 | all.index = range(len(all)) 430 | all['badCumRate'] = all['bad'].cumsum() / all['bad'].sum() 431 | all['goodCumRate'] = all['good'].cumsum() / all['good'].sum() 432 | KS = all.apply(lambda x: x.badCumRate - x.goodCumRate, axis=1) 433 | return max(KS) 434 | 435 | 436 | def MergeByCondition(x,condition_list): 437 | #condition_list是条件列表。满足第几个condition,就输出几 438 | s = 0 439 | for condition in condition_list: 440 | if eval(str(x)+condition): 441 | return s 442 | else: 443 | s+=1 444 | return s 445 | -------------------------------------------------------------------------------- /code/visualization.py: -------------------------------------------------------------------------------- 1 | # ignore warnings 2 | import numpy as np 3 | from scipy import interp 4 | import matplotlib.pyplot as plt 5 | from itertools import cycle 6 | import helper as lp 7 | 8 | from sklearn.naive_bayes import GaussianNB 9 | from sklearn.model_selection import learning_curve 10 | from sklearn.model_selection import ShuffleSplit,StratifiedKFold 11 | from sklearn.metrics import roc_curve, auc 12 | from sklearn.model_selection import StratifiedKFold 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.linear_model import LogisticRegression 15 | import warnings 16 | warnings.filterwarnings('ignore',category=UserWarning,module='matplotlib') 17 | 18 | #Display inline 19 | from IPython import get_ipython 20 | get_ipython().run_line_magic('matplotlib','inline') 21 | 22 | import matplotlib.pyplot as plt 23 | import numpy as np 24 | import pandas as pd 25 | from wordcloud import WordCloud 26 | from collections import Counter 27 | from numpy.random import beta 28 | import seaborn as sns 29 | plt.style.use('bmh') 30 | 31 | 32 | #plot borrower loan status distribution 33 | def plot_label(data,col): 34 | temp = data.groupby('y').count().iloc[:,0] 35 | bar_data = {'normal':temp[0],'overdue':temp[1]} 36 | names = list(bar_data.keys()) 37 | values = list(bar_data.values()) 38 | plt.bar(range(2),values) 39 | plt.xticks((0,1),('normal','overdue')) 40 | plt.title('borrower loan status distribution') 41 | plt.text(0.45,180000,r'normal:overdue mostly equal to 7',color='black') 42 | plt.text(0.45,170000,r'unbalanced dataset',color='black') 43 | 
plt.show() 44 | 45 | #plot category columns <5 46 | def plot_cat(data,key): 47 | plot_data=data[[key,'y']] 48 | plt.figure(figsize=(8,6)) 49 | if (key=='home_ownership'): 50 | values=['ANY','RENT','MORTGAGE','OWN'] 51 | if (key=='verification_status'): 52 | values=['Source Verified', 'Not Verified', 'Verified'] 53 | if (key=='initial_list_status'): 54 | values=['w', 'f'] 55 | if (key=='grade'): 56 | values=[1, 2, 3, 4, 5, 6, 7] 57 | if (key=='emp_length'): 58 | values=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.027167861525544, 8.0, 9.0, 10.0, 7.0] 59 | if (key=='purpose'): 60 | values=['home_improvement', 'medical', 'educational', 'other', 'debt_consolidation', 'vacation', 'house', 'wedding', 61 | 'major_purchase', 'moving', 'car', 'small_business', 'renewable_energy', 'credit_card'] 62 | if (key=='issue_d'): 63 | values=['Oct-2015', 'Dec-2015', 'Aug-2015', 'Apr-2015', 'May-2015', 'Nov-2015', 'Jan-2015', 'Sep-2015', 64 | 'Jun-2015', 'Feb-2015', 'Mar-2015', 'Jul-2015'] 65 | #create DataFrame containing categories and each of counts 66 | frame = pd.DataFrame(index=np.arange(len(values)),columns=(key,'normal','overdue')) 67 | for i,value in enumerate(values): 68 | frame.loc[i]=[value,len(plot_data[(plot_data['y'] == 0) & (plot_data[key] == value)]),len(plot_data[(plot_data['y'] == 1) & (plot_data[key] == value)])] 69 | #display each categrory's overdue rate 70 | bin_width = 0.4 71 | for i in np.arange(frame.shape[0]): 72 | overdue_bar = plt.bar(i-bin_width,frame.loc[i]['overdue'],width = bin_width,color='r') 73 | normal_bar = plt.bar(i,frame.loc[i]['normal'],width = bin_width,color='g') 74 | 75 | plt.xticks(np.arange(len(frame)),values) 76 | plt.legend((overdue_bar[0],normal_bar[0]),('overdue','normal'),framealpha=0.8) 77 | 78 | plt.xlabel(key) 79 | plt.ylabel('number of borrower') 80 | plt.xticks(rotation=90) 81 | plt.title('borrower Statistics With \'%s\' Feature'%(key)) 82 | plt.show() 83 | 84 | 85 | # Report number of passengers with missing values 86 | if sum(pd.isnull(plot_data[key])): 87 | nan_outcomes = plot_data[pd.isnull(plot_data[key])]['y'] 88 | print ("borrower with missing '{}' values: {} ({} overdue, {} normal)".format( \ 89 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0))) 90 | 91 | #plot word cloud 92 | def word_cloud(data,key): 93 | word_freq=Counter() 94 | word = data[key].astype(str) 95 | word_freq =Counter(word) 96 | words_cloud = WordCloud(scale=5,min_font_size=8,max_words=100,background_color='white').fit_words(word_freq) 97 | plt.imshow(words_cloud) 98 | 99 | # Report number of passengers with missing values 100 | if sum(pd.isnull(data[key])): 101 | nan_outcomes = data[pd.isnull(data[key])]['y'] 102 | print ("borrower with missing '{}' values: {} ({} overdue, {} normal)".format( \ 103 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0))) 104 | 105 | 106 | 107 | #plot youxu columns 108 | def plot_youxu_col(data,label,key): 109 | all_data=data[[key,label]] 110 | all_data = all_data[~np.isnan(all_data[key])] 111 | plt.figure(figsize=(8,6)) 112 | min_value = all_data[key].min() 113 | max_value = all_data[key].max() 114 | value_range = max_value - min_value 115 | overdue = all_data[all_data['y']==1][key] 116 | normal = all_data[all_data['y']==0][key] 117 | bins = np.arange(min_value-1,all_data[key].max()+1,1) 118 | plt.hist(overdue,bins=bins,histtype='stepfilled',alpha=0.6,color='red',label='overdue') 119 | plt.hist(normal,bins=bins,histtype='stepfilled',alpha=0.6,color='green',label='normal') 120 | plt.xlim(0,bins.max()) 121 | 
plt.legend(framealpha=0.8) 122 | plt.xlabel(key) 123 | plt.ylabel('Number of borrower') 124 | plt.title('borrower overdue Statistics With \'%s\' Feature'%(key)) 125 | plt.show() 126 | 127 | # Report number of passengers with missing values 128 | if sum(pd.isnull(all_data[key])): 129 | nan_outcomes = all_data[pd.isnull(all_data[key])]['Survived'] 130 | print ("borrower with missing '{}' values: {} ({} overdue, {} normal)".format( \ 131 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0))) 132 | 133 | #plot lianxu columns distribution 134 | def plot_lianxu_col(data,label,key): 135 | all_data=data[[key,label]] 136 | plt.figure(figsize=(8,6)) 137 | min_value = all_data[key].min() 138 | max_value = all_data[key].max() 139 | value_range = max_value - min_value 140 | overdue = all_data[all_data['y']==1][key] 141 | normal = all_data[all_data['y']==0][key] 142 | if (key=='installment'): 143 | bins = np.arange(min_value-1,all_data[key].max()+1,100) 144 | else: 145 | bins = np.arange(min_value-1,all_data[key].max()+1,1000) 146 | plt.hist(overdue,bins=bins,histtype='stepfilled',alpha=0.6,color='red',label='overdue') 147 | plt.hist(normal,bins=bins,histtype='stepfilled',alpha=0.6,color='green',label='normal') 148 | plt.xlim(0,bins.max()) 149 | plt.legend(framealpha=0.8) 150 | plt.xlabel(key) 151 | plt.ylabel('Number of borrower') 152 | plt.title('borrower overdue Statistics With \'%s\' Feature'%(key)) 153 | plt.show() 154 | 155 | # Report number of passengers with missing values 156 | if sum(pd.isnull(all_data[key])): 157 | nan_outcomes = all_data[pd.isnull(all_data[key])]['y'] 158 | print ("borrower with missing '{}' values: {} ({} overdue, {} normal)".format( \ 159 | key, len(nan_outcomes), sum(nan_outcomes == 1), sum(nan_outcomes == 0))) 160 | 161 | # plot iv sort bar 162 | def feature_IV_bar(col_IV): 163 | IV_dict_sorted = sorted(col_IV.items(),key=lambda x:x[1],reverse=True) 164 | IV_values =[i[1] for i in IV_dict_sorted] 165 | IV_name = [i[0] for i in IV_dict_sorted] 166 | plt.title('feature IV value bar') 167 | plt.bar(range(len(IV_values)),IV_values) 168 | plt.show() 169 | 170 | 171 | 172 | 173 | # define cross validation roc curve and split use 5 174 | def model_roc_curve(X,y,clf): 175 | import numpy as np 176 | from scipy import interp 177 | import matplotlib.pyplot as plt 178 | from itertools import cycle 179 | 180 | from sklearn import svm, datasets 181 | from sklearn.metrics import roc_curve, auc 182 | from sklearn.model_selection import StratifiedKFold 183 | 184 | # Run classifier with cross-validation and plot ROC curves 185 | cv = StratifiedKFold(n_splits=5) 186 | 187 | tprs = [] 188 | aucs = [] 189 | mean_fpr = np.linspace(0, 1, 100) 190 | 191 | i = 0 192 | for train, test in cv.split(X, y): 193 | probas_ = clf.fit(X[train], y[train]).predict_proba(X[test]) 194 | # Compute ROC curve and area the curve 195 | fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) 196 | tprs.append(interp(mean_fpr, fpr, tpr)) 197 | tprs[-1][0] = 0.0 198 | roc_auc = auc(fpr, tpr) 199 | aucs.append(roc_auc) 200 | plt.plot(fpr, tpr, lw=1, alpha=0.3, 201 | label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) 202 | 203 | i += 1 204 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', 205 | label='Luck', alpha=.8) 206 | 207 | mean_tpr = np.mean(tprs, axis=0) 208 | mean_tpr[-1] = 1.0 209 | mean_auc = auc(mean_fpr, mean_tpr) 210 | std_auc = np.std(aucs) 211 | plt.plot(mean_fpr, mean_tpr, color='b', 212 | label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), 213 | lw=2, 
alpha=.8) 214 | 215 | std_tpr = np.std(tprs, axis=0) 216 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 217 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 218 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, 219 | label=r'$\pm$ 1 std. dev.') 220 | 221 | plt.xlim([-0.05, 1.05]) 222 | plt.ylim([-0.05, 1.05]) 223 | plt.xlabel('False Positive Rate') 224 | plt.ylabel('True Positive Rate') 225 | plt.title('Receiver operating characteristic example') 226 | plt.legend(loc="lower right") 227 | plt.show() 228 | 229 | #corr heatmap 230 | def plot_corrmatrix_heatmap(data): 231 | df_data = data.corr() 232 | plt.subplots(figsize=(9,9)) 233 | sns.heatmap(df_data,annot = False,vmax=1, square=True, cmap="Blues") 234 | plt.show() 235 | 236 | 237 | #plot sorted iv value bar 238 | def iv_sorted(col_iv): 239 | IV_dict_sorted = sorted(col_iv.items(),key=lambda x:x[1],reverse=True) 240 | IV_values = [i[1] for i in IV_dict_sorted] 241 | IV_name = [i[0] for i in IV_dict_sorted] 242 | plt.title('sorted feature IV bar') 243 | #plt.bar(range(len(IV_values)),IV_values) 244 | plt.bar(IV_name,IV_values) 245 | plt.show() 246 | 247 | #plot learning curve 248 | def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, 249 | n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)): 250 | plt.figure() 251 | plt.title(title) 252 | if ylim is not None: 253 | plt.ylim(*ylim) 254 | plt.xlabel("Training examples") 255 | plt.ylabel("Score") 256 | train_sizes, train_scores, test_scores = learning_curve( 257 | estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) 258 | train_scores_mean = np.mean(train_scores, axis=1) 259 | train_scores_std = np.std(train_scores, axis=1) 260 | test_scores_mean = np.mean(test_scores, axis=1) 261 | test_scores_std = np.std(test_scores, axis=1) 262 | plt.grid() 263 | 264 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 265 | train_scores_mean + train_scores_std, alpha=0.1, 266 | color="r") 267 | plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 268 | test_scores_mean + test_scores_std, alpha=0.1, color="g") 269 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 270 | label="Training score") 271 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", 272 | label="Cross-validation score") 273 | 274 | plt.legend(loc="best") 275 | return plt 276 | 277 | #plot ks curve 278 | def ks_curve(pred_y_data): 279 | total = pred_y_data.groupby(['score'])['y'].count() 280 | bad = pred_y_data.groupby(['score'])['y'].sum() 281 | alls = pd.DataFrame({'total':total,'bad':bad}) 282 | alls['good'] = alls['total'] - alls['bad'] 283 | alls['score'] = alls.index 284 | alls = alls.sorted_values(by='score',ascending=False) 285 | alls.index = range(len(alls)) 286 | alls['badCumRate'] = alls['bad'].cumsum() / alls['bad'].sum() 287 | alls['goodCumRate'] = alls['good'].cumsum() / alls['good'].sum() 288 | plt.plot(alls['badCumRate']) 289 | plt.plot(alls['goodCumRate']) 290 | plt.show() 291 | -------------------------------------------------------------------------------- /report/proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lisa-wang1987/risk_model_machinelearning/3d9237cdd4290cc565f596ebdc2bd9059f86855d/report/proposal.pdf -------------------------------------------------------------------------------- /report/report.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lisa-wang1987/risk_model_machinelearning/3d9237cdd4290cc565f596ebdc2bd9059f86855d/report/report.pdf --------------------------------------------------------------------------------