├── Credit Risk Assessment Model.py
├── README.md
└── credit_risk.csv


/Credit Risk Assessment Model.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sun Mar  6 14:23:54 2022
  4 | 
  5 | Author: Fu Yangyang
  6 | 
  7 | @Our Final variables:
  8 | pool=['gender',
  9 |  'housing',
 10 |  'income',
 11 |  'std_age',
 12 |  'past_bad_credit',
 13 |  'married',
 14 |  'jo_0',
 15 |  'jo_1',
 16 |  'edu_0',
 17 |  'edu_1',
 18 |  'edu_2',
 19 |  'edu_3']
 20 | """
 21 | 
 22 | import os,csv
 23 | import pandas as pd
 24 | import numpy as np
 25 | from sklearn.linear_model import LogisticRegression
 26 | import statsmodels.api as sm
 27 | from sklearn.metrics import roc_curve, auc
 28 | import matplotlib.pyplot as plt
 29 | #from tqdm import tqdm
 30 | from sklearn import preprocessing
 31 | from imblearn.over_sampling import SMOTE
 32 | 
 33 | path = r'C:\Users\hp\Desktop\MFIN7034 Problem set2'
 34 | #df=pd.read_csv(path+os.sep+'credit_risk.csv')
 35 | with open(path+os.sep+'credit_risk.csv', mode='r') as csv_file:
 36 |     csv_reader = csv.reader(csv_file, delimiter=',')
 37 |     line = 0
 38 |     data_temp = []
 39 |     for row in csv_reader:
 40 |         if line == 0:
 41 |             print("variable names",",".join(row))
 42 |             var_names = row
 43 |             line=line+1
 44 |         else:
 45 |             data_temp.append(list(map(float,row)))
 46 |             line=line+1
 47 | 
 48 | csv_file.close()
 49 | 
 50 | data_temp_m = np.asmatrix(data_temp)
 51 | df = pd.DataFrame(np.array(data_temp), columns=var_names)
 52 | 
 53 | def draw_curve(fpr,tpr,roc_auc,save_name):
 54 | ###make a plot of roc curve
 55 |     plt.figure(dpi=150)
 56 |     lw = 2
 57 |     plt.plot(fpr, tpr, color='darkorange',
 58 |              lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
 59 |     plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
 60 |     plt.xlim([0.0, 1.0])
 61 |     plt.ylim([0.0, 1.05])
 62 |     plt.xlabel('False Positive Rate')
 63 |     plt.ylabel('True Positive Rate')
 64 |     plt.title(save_name)
 65 |     plt.legend(loc="lower right")
 66 |     plt.savefig(path+os.sep+save_name+'.jpg')
 67 |     plt.show()
 68 |     print('Figure was saved to ' + path)
 69 | 
 70 | #%%
 71 | #Question1 //Simple Logistic model
 72 | LR = LogisticRegression()
 73 | 
 74 | ###simple example: predictors include income and past_bad_credit
 75 | X=df[['income','past_bad_credit']]
 76 | y=df['default_label']
 77 | 
 78 | ###run logistic regression
 79 | lr_model = LR.fit(X,y)
 80 | 
 81 | ###another way to run logistic regression
 82 | lr_model1 = sm.Logit(y,sm.add_constant(X)).fit()
 83 | ###get a summary result of lr
 84 | print(lr_model1.summary())
 85 | 
 86 | ###this is a two dimensional vector, prob d=0 and prob d=1, use the second one
 87 | predicted_prob = lr_model.predict_proba(X)
 88 | predicted_default_prob= predicted_prob[:,1]
 89 | 
 90 | ###compute false positive rate and true positive rate using roc_curve function
 91 | fpr, tpr, _ = roc_curve(y, predicted_default_prob)
 92 | roc_auc = auc(fpr, tpr)
 93 | draw_curve(fpr,tpr,roc_auc,'2.1 Receiver operating characteristic example')
 94 | #%%
 95 | #Question2 // 2.2 Full Logistic Model
 96 | LR = LogisticRegression(penalty="l1",solver= 'liblinear',class_weight='balanced',tol=0.008,max_iter=100000)
 97 | 
 98 | #convert the gender, age and income
 99 | df['gender']=preprocessing.scale(df['gender'])
100 | df['std_age']=preprocessing.scale(df['Age'])
101 | df['std_income']=preprocessing.scale(df['income'])
102 | 
103 | #change the job_occupation to dummy
104 | df['jo_0'] = pd.get_dummies(df['job_occupation'])[0]
105 | df['jo_1'] = pd.get_dummies(df['job_occupation'])[1]
106 | 
107 | ##change the edu to dummy
108 | df['edu_0'] = pd.get_dummies(df['edu'])[0]
109 | df['edu_1'] = pd.get_dummies(df['edu'])[1]
110 | df['edu_2'] = pd.get_dummies(df['edu'])[2]
111 | df['edu_3'] = pd.get_dummies(df['edu'])[3]
112 | 
113 | '''
114 | #variables that we have tried
115 | #df['dummy_edu']=list(map(lambda x: np.log(x),df['edu']))
116 | #df['gender']=list(map(lambda x: np.log(x),df['gender']))
117 | #df['ln_income']=list(map(lambda x: np.log(x),df['income']))
118 | #df['std_ln_age']=preprocessing.scale(list(map(lambda x: np.log(x),df['Age'])))
119 | #df['std_ln_income']=preprocessing.scale(list(map(lambda x: np.log(x),df['income'])))
120 | #df['Age'] = df['Age']//10
121 | #df['std_edu']=preprocessing.scale(df['edu'])
122 | 
123 | #Exhaustive search: try every combination of 7 or more variables from pool; the best combination found is the current pool list
124 | 
125 | LR = LogisticRegression(penalty="l1",solver= 'liblinear',class_weight='balanced',tol=0.008,max_iter=100000)
126 | df2=pd.DataFrame()
127 | cbna_list=[] #save variables
128 | auc_list=[] #save auc
129 | variables=[] #save number of variables
130 | 
131 | for i in tqdm(range(7,len(pool)+1)): #Use up to len（pool） variables
132 |     for cbna in itertools.combinations(pool, i):
133 |         
134 |         X=df[list(cbna)]
135 |         y=df['default_label']
136 |         x_smote, y_smote = smote.fit_resample(X, y)
137 |         lr_model = LR.fit(x_smote,y_smote)
138 |         predicted_prob = lr_model.predict_proba(x_smote)
139 |         predicted_default_prob= predicted_prob[:,1]
140 |         fpr, tpr, _ = roc_curve(y_smote, predicted_default_prob)
141 |         roc_auc = auc(fpr, tpr)
142 |             
143 |         #save results
144 |         cbna_list.append(list(cbna))
145 |         variables.append(len(list(cbna)))
146 |         auc_list.append(roc_auc)
147 | 
148 | 
149 | df2['Varibles']=cbna_list
150 | df2['No. of variables used'] = variables
151 | df2['auc value'] = auc_list
152 | '''
#%%
#Continue Question 2
#Choose the combination that achieves the highest auc
pool=['gender',
 'housing',
 'income',
 'std_age',
 'past_bad_credit',
 'married',
 'jo_0',
 'jo_1',
 'edu_0',
 'edu_1',
 'edu_2',
 'edu_3']

# Use SMOTE to balance the data sample. random_state is fixed (the same seed,
# 1, that the train/test split in Question 4 uses) so the resampled data, and
# therefore the printed summary and AUC, are reproducible between runs.
smote = SMOTE(random_state=1)
X = df[pool]
y=df['default_label']
x_smote, y_smote = smote.fit_resample(X, y)

# LR is the L1-penalised LogisticRegression configured at the start of
# Question 2. The statsmodels fit is only used for the summary table.
lr_model = LR.fit(x_smote,y_smote)
lr_model1 = sm.Logit(y_smote,sm.add_constant(x_smote)).fit()

# Second predict_proba column is used as the default probability; the ROC/AUC
# below are computed on the resampled rows the model was fitted on.
predicted_prob = lr_model.predict_proba(x_smote)
predicted_default_prob= predicted_prob[:,1]
fpr, tpr, _ = roc_curve(y_smote, predicted_default_prob)
roc_auc = auc(fpr, tpr)
print(lr_model1.summary())
print('the best combination: ', list(X.columns))
print('used variables: ' , len(X.columns))
print('the auc value: ' , roc_auc)

draw_curve(fpr,tpr,roc_auc,'2.2 Full Logistic Model')
185 | 
#%%
#Question3 // 2.3 SVM
from sklearn.svm import SVC

# RBF-kernel support vector classifier (the variable is called `regressor`,
# but SVC is a classifier). probability=True is what makes predict_proba
# available further down.
regressor = SVC(
    kernel='rbf',
    gamma='auto',
    degree=3,
    coef0=0.0,
    C=1.0,
    class_weight=None,
    decision_function_shape='ovr',
    probability=True,
    shrinking=True,
    tol=0.001,
    max_iter=-1,
    cache_size=200,
    random_state=None,
    verbose=False,
)

# Train and score on the SMOTE-resampled sample produced in Question 2.
X, y = x_smote, y_smote
regressor.fit(X, y)

# Second predict_proba column is used as the default probability.
predicted_prob = regressor.predict_proba(X)
predicted_default_prob= predicted_prob[:,1]

fpr, tpr, _ = roc_curve(y, predicted_default_prob)
roc_auc = auc(fpr, tpr)
draw_curve(fpr,tpr,roc_auc,'2.3 SVM')
202 | 
#%%
#Question4 // 2.4 Out-of-Sample Test
from sklearn.model_selection import train_test_split

# X and y are still the SMOTE-resampled data assigned in Question 3.
# NOTE(review): because the split happens after oversampling, synthetic rows
# interpolated from held-out observations can land in the training part —
# confirm whether SMOTE should instead be applied to the training split only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=10000, random_state=1)

# Plain LogisticRegression with default settings, fitted on the training part
# only.
LR = LogisticRegression()
lr_model = LR.fit(X_train,y_train)

# Score the held-out part; the second predict_proba column is used as the
# default probability.
predicted_prob = lr_model.predict_proba(X_test)
predicted_default_prob= predicted_prob[:,1]

fpr, tpr, _ = roc_curve(y_test, predicted_default_prob)
roc_auc = auc(fpr, tpr)
draw_curve(fpr,tpr,roc_auc,'2.4 Out-of-Sample Test')
219 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-learning-based-credit-risk-assessment-model
2 | 基于机器学习的信用风险评估模型，主要使用了 scikit-learn 库，通过逻辑回归、支持向量机等模型，根据借款人的个人身份信息评估是否应当发放贷款。
3 | 


--------------------------------------------------------------------------------