# -*- coding: utf-8 -*-
# Repository layout (from the original dump):
#   ├── Credit Risk Assessment Model.py
#   ├── README.md
#   └── credit_risk.csv
#
# ---- /Credit Risk Assessment Model.py ----
"""
Credit risk assessment: logistic regression and SVM on ``credit_risk.csv``.

Created on Sun Mar 6 14:23:54 2022

Author: Fu Yangyang

The script is organised as ``#%%`` cells, one per question:

* 2.1  simple logistic model on ``income`` and ``past_bad_credit``
* 2.2  full logistic model on the final variable pool (SMOTE-balanced)
* 2.3  SVM on the same SMOTE-balanced sample
* 2.4  out-of-sample test of a plain logistic model

@Our Final variables:
pool=['gender',
      'housing',
      'income',
      'std_age',
      'past_bad_credit',
      'married',
      'jo_0',
      'jo_1',
      'edu_0',
      'edu_1',
      'edu_2',
      'edu_3']
"""

import csv
import os

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
#from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# Single seed shared by every stochastic step (SMOTE, SVC probability
# calibration, train/test split) so that reruns give the same figures.
RANDOM_STATE = 1

path = r'C:\Users\hp\Desktop\MFIN7034 Problem set2'
#df=pd.read_csv(path+os.sep+'credit_risk.csv')

# Read the CSV by hand: first non-consumed row is the header, every other
# non-empty row is parsed as floats.  ``newline=''`` is what the csv module
# documentation asks for when handing a file object to ``csv.reader``.
with open(path + os.sep + 'credit_risk.csv', mode='r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    var_names = next(csv_reader, None)
    if var_names is None:
        # An empty file would otherwise surface later as a NameError.
        raise ValueError('credit_risk.csv is empty: no header row found')
    print("variable names", ",".join(var_names))
    # Blank lines come back from csv.reader as [] and would make the
    # array ragged, so they are skipped.
    data_temp = [list(map(float, row)) for row in csv_reader if row]

# Not used below; kept so the name is still available when the script is
# run cell-by-cell in an interactive session.
data_temp_m = np.asmatrix(data_temp)
df = pd.DataFrame(np.array(data_temp), columns=var_names)


def draw_curve(fpr, tpr, roc_auc, save_name):
    """Plot a ROC curve, save it as ``<path>/<save_name>.jpg`` and show it.

    Parameters
    ----------
    fpr, tpr : array-like
        False/true positive rates as returned by ``roc_curve``.
    roc_auc : float
        Area under the curve, shown in the legend with two decimals.
    save_name : str
        Used both as the plot title and as the output file stem.
    """
    plt.figure(dpi=150)
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line drawn from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(save_name)
    plt.legend(loc="lower right")
    # Save before show(): show() may leave an empty canvas behind.
    plt.savefig(path + os.sep + save_name + '.jpg')
    plt.show()
    print('Figure was saved to ' + path)


#%%
#Question1 //Simple Logistic model
LR = LogisticRegression()

###simple example: predictors include income and past_bad_credit
X = df[['income', 'past_bad_credit']]
y = df['default_label']

###run logistic regression
lr_model = LR.fit(X, y)

###another way to run logistic regression
lr_model1 = sm.Logit(y, sm.add_constant(X)).fit()
###get a summary result of lr
print(lr_model1.summary())

###this is a two dimensional vector, prob d=0 and prob d=1, use the second one
predicted_prob = lr_model.predict_proba(X)
predicted_default_prob = predicted_prob[:, 1]

###compute false positive rate and true positive rate using roc_curve function
fpr, tpr, _ = roc_curve(y, predicted_default_prob)
roc_auc = auc(fpr, tpr)
draw_curve(fpr, tpr, roc_auc, '2.1 Receiver operating characteristic example')

#%%
#Question2 // 2.2 Full Logistic Model
LR = LogisticRegression(penalty="l1", solver='liblinear',
                        class_weight='balanced', tol=0.008, max_iter=100000)

#convert the gender, age and income
df['gender'] = preprocessing.scale(df['gender'])
df['std_age'] = preprocessing.scale(df['Age'])
df['std_income'] = preprocessing.scale(df['income'])

# One-hot encode once per source column.  ``dtype=float`` keeps the dummies
# numeric: pandas >= 2.0 returns booleans by default, which SMOTE would have
# to interpolate and which can make the design matrix passed to sm.Logit
# cast to ``object``.
#change the job_occupation to dummy
job_dummies = pd.get_dummies(df['job_occupation'], dtype=float)
df['jo_0'] = job_dummies[0]
df['jo_1'] = job_dummies[1]

##change the edu to dummy
edu_dummies = pd.get_dummies(df['edu'], dtype=float)
df['edu_0'] = edu_dummies[0]
df['edu_1'] = edu_dummies[1]
df['edu_2'] = edu_dummies[2]
df['edu_3'] = edu_dummies[3]

# Notes on the variable search, kept as an inert string.  Running it would
# additionally need ``itertools``, ``tqdm`` and a ``smote`` instance, plus
# ``pool`` defined beforehand.
'''
#variables that we have tried
#df['dummy_edu']=list(map(lambda x: np.log(x),df['edu']))
#df['gender']=list(map(lambda x: np.log(x),df['gender']))
#df['ln_income']=list(map(lambda x: np.log(x),df['income']))
#df['std_ln_age']=preprocessing.scale(list(map(lambda x: np.log(x),df['Age'])))
#df['std_ln_income']=preprocessing.scale(list(map(lambda x: np.log(x),df['income'])))
#df['Age'] = df['Age']//10
#df['std_edu']=preprocessing.scale(df['edu'])

#Use Exhaustive method to try every combination,but at last, we find the best combination is the pool list now

LR = LogisticRegression(penalty="l1",solver= 'liblinear',class_weight='balanced',tol=0.008,max_iter=100000)
df2=pd.DataFrame()
cbna_list=[] #save variables
auc_list=[] #save auc
variables=[] #save number of variables

for i in tqdm(range(7,len(pool)+1)): #Use up to len(pool) variables
    for cbna in itertools.combinations(pool, i):

        X=df[list(cbna)]
        y=df['default_label']
        x_smote, y_smote = smote.fit_resample(X, y)
        lr_model = LR.fit(x_smote,y_smote)
        predicted_prob = lr_model.predict_proba(x_smote)
        predicted_default_prob= predicted_prob[:,1]
        fpr, tpr, _ = roc_curve(y_smote, predicted_default_prob)
        roc_auc = auc(fpr, tpr)

        #save results
        cbna_list.append(list(cbna))
        variables.append(len(list(cbna)))
        auc_list.append(roc_auc)


df2['Varibles']=cbna_list
df2['No. of variables used'] = variables
df2['auc value'] = auc_list
'''

#%%
#Continue Question 2
#Choose the combination that achieves the highest auc
# NOTE(review): the pool contains every edu_* level (and both jo_* levels)
# while sm.Logit below also adds a constant - confirm this is intended.
pool = ['gender',
        'housing',
        'income',
        'std_age',
        'past_bad_credit',
        'married',
        'jo_0',
        'jo_1',
        'edu_0',
        'edu_1',
        'edu_2',
        'edu_3']

#use smote function to balance our data sample (seeded for reproducibility)
smote = SMOTE(random_state=RANDOM_STATE)
X = df[pool]
y = df['default_label']
x_smote, y_smote = smote.fit_resample(X, y)
lr_model = LR.fit(x_smote, y_smote)
lr_model1 = sm.Logit(y_smote, sm.add_constant(x_smote)).fit()
# The AUC below is computed on the same resampled data the model was fit on.
predicted_prob = lr_model.predict_proba(x_smote)
predicted_default_prob = predicted_prob[:, 1]
fpr, tpr, _ = roc_curve(y_smote, predicted_default_prob)
roc_auc = auc(fpr, tpr)
print(lr_model1.summary())
print('the best combination: ', list(X.columns))
print('used variables: ', len(X.columns))
print('the auc value: ', roc_auc)

draw_curve(fpr, tpr, roc_auc, '2.2 Full Logistic Model')

#%%
#Question3 // 2.3 SVM
# probability=True enables predict_proba; its internal calibration is
# randomised, hence the explicit seed.
regressor = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape='ovr', degree=3, gamma='auto',
                kernel='rbf', max_iter=-1, probability=True,
                random_state=RANDOM_STATE, shrinking=True,
                tol=0.001, verbose=False)

X = x_smote
y = y_smote
regressor.fit(X, y)
predicted_prob = regressor.predict_proba(X)
predicted_default_prob = predicted_prob[:, 1]
fpr, tpr, _ = roc_curve(y, predicted_default_prob)
roc_auc = auc(fpr, tpr)
draw_curve(fpr, tpr, roc_auc, '2.3 SVM')

#%%
#Question4 // 2.4 Out-of-Sample Test
# NOTE(review): X and y here are the SMOTE-resampled sample from the cell
# above, so synthetic rows generated from "test" observations can land in the
# training split.  Splitting the original data first and resampling only the
# training part would avoid that leakage; left unchanged because
# train_size=10000 depends on the (unknown here) size of the raw data.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=10000,
                                                    random_state=RANDOM_STATE)

LR = LogisticRegression()
lr_model = LR.fit(X_train, y_train)
predicted_prob = lr_model.predict_proba(X_test)
predicted_default_prob = predicted_prob[:, 1]
fpr, tpr, _ = roc_curve(y_test, predicted_default_prob)
roc_auc = auc(fpr, tpr)
draw_curve(fpr, tpr, roc_auc, '2.4 Out-of-Sample Test')

# ---- /README.md ----
# # Machine-learning-based-credit-risk-assessment-model
# 基于机器学习的信用风险评估模型,主要使用了Sklearn库,通过逻辑回归,向量机等模型,根据借款人的个人身份信息评估是否应当发放贷款。
# (English: A machine-learning-based credit risk assessment model. It mainly
# uses the scikit-learn library and, through models such as logistic
# regression and support vector machines, evaluates from a borrower's
# personal identity information whether a loan should be granted.)