├── .gitignore ├── Logit.py ├── MissingValue.py ├── Outlier.py ├── README.md ├── SignalVariable.py └── cs-training.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /Logit.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt #导入图像库 3 | import matplotlib 4 | import seaborn as sns 5 | import statsmodels.api as sm 6 | from sklearn.metrics import roc_curve, auc 7 | plt.rcParams["font.sans-serif"]='SimHei' 8 | plt.rcParams['axes.unicode_minus'] = False 9 | if __name__ == '__main__': 10 | matplotlib.rcParams['axes.unicode_minus'] = False 11 | data = pd.read_csv('WoeData.csv') 12 | Y=data['SeriousDlqin2yrs'] 13 | X=data.drop(['SeriousDlqin2yrs','DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1) 14 | X1=sm.add_constant(X) 15 | logit=sm.Logit(Y,X1) 16 | result=logit.fit() 17 | print(result.params) 18 | test = pd.read_csv('TestWoeData.csv') 19 | Y_test = test['SeriousDlqin2yrs'] 20 | X_test = test.drop(['SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines', 'NumberOfDependents'], axis=1) 21 | X3 = sm.add_constant(X_test) 22 | resu = result.predict(X3) 23 | fpr, tpr, threshold = roc_curve(Y_test, resu) 24 | rocauc = auc(fpr, tpr) 25 | plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc) 26 | plt.legend(loc='lower right') 27 | plt.plot([0, 1], [0, 1], 'r--') 28 | plt.xlim([0, 1]) 29 | plt.ylim([0, 1]) 30 | plt.ylabel('真正率') 31 | plt.xlabel('假正率') 32 | plt.show() -------------------------------------------------------------------------------- /MissingValue.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt #导入图像库 3 | from sklearn.ensemble import RandomForestRegressor 4 | 5 | # 用随机森林对缺失值预测填充函数 6 | def set_missing(df): 7 | # 把已有的数值型特征取出来 8 | process_df = df.ix[:,[5,0,1,2,3,4,6,7,8,9]] 9 | # 分成已知该特征和未知该特征两部分 10 | known = process_df[process_df.MonthlyIncome.notnull()].as_matrix() 11 | unknown = process_df[process_df.MonthlyIncome.isnull()].as_matrix() 12 | # X为特征属性值 13 | X = known[:, 1:] 14 | # y为结果标签值 15 | y = known[:, 0] 16 | # fit到RandomForestRegressor之中 17 | rfr = RandomForestRegressor(random_state=0, n_estimators=200,max_depth=3,n_jobs=-1) 18 | rfr.fit(X,y) 19 | # 用得到的模型进行未知特征值预测 20 | predicted = rfr.predict(unknown[:, 1:]).round(0) 21 | print(predicted) 22 | # 用得到的预测结果填补原缺失数据 23 | df.loc[(df.MonthlyIncome.isnull()), 'MonthlyIncome'] = predicted 24 | return df 25 | 26 | if __name__ == '__main__': 27 | #载入数据 28 | data = pd.read_csv('G:\WEO\cs-training.csv') 29 | #数据集确实和分布情况 30 | data.describe().to_csv('DataDescribe.csv')#了解数据集的分布情况 31 | data=set_missing(data)#用随机森林填补比较多的缺失值 32 | data=data.dropna()#删除比较少的缺失值 33 | data = data.drop_duplicates()#删除重复项 34 | data.to_csv('MissingData.csv',index=False) 35 | data.describe().to_csv('MissingDataDescribe.csv') 36 | """ 37 | #异常值处理 38 | #年龄等于0的异常值进行剔除 39 | data=data[data['age']>0] 40 | # 箱形图 41 | data379=data[['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTimes90DaysLate','NumberOfTime60-89DaysPastDueNotWorse']] 42 | data379.boxplot() 43 | data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90] 44 | data379 = data[['NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate', 'NumberOfTime60-89DaysPastDueNotWorse']] 45 | #data379.boxplot() 46 | plt.show() 47 | #data.to_csv('PretreatmentData.csv') 48 | """ -------------------------------------------------------------------------------- /Outlier.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt #导入图像库 3 | from sklearn.cross_validation import train_test_split 4 | 5 | def outlier_processing(df,col): 6 | s=df[col] 7 | oneQuoter=s.quantile(0.25) 8 | threeQuote=s.quantile(0.75) 9 | irq=threeQuote-oneQuoter 10 | min=oneQuoter-1.5*irq 11 | max=threeQuote+1.5*irq 12 | df=df[df[col]<=max] 13 | df=df[df[col]>=min] 14 | return df 15 | 16 | if __name__ == '__main__': 17 | data = pd.read_csv('MissingData.csv') 18 | # 年龄等于0的异常值进行剔除 19 | data = data[data['age'] > 0] 20 | data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90]#剔除异常值 21 | data['SeriousDlqin2yrs']=1-data['SeriousDlqin2yrs'] 22 | Y = data['SeriousDlqin2yrs'] 23 | X = data.ix[:, 1:] 24 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0) 25 | # print(Y_train) 26 | train = pd.concat([Y_train, X_train], axis=1) 27 | test = pd.concat([Y_test, X_test], axis=1) 28 | clasTest = test.groupby('SeriousDlqin2yrs')['SeriousDlqin2yrs'].count() 29 | train.to_csv('TrainData.csv',index=False) 30 | test.to_csv('TestData.csv',index=False) 31 | print(train.shape) 32 | print(test.shape) 33 | 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WEO 2 | 信用风险评分卡为信用风险管理提供了一种有效的、经验性的解决方法,是消费信贷管理中广泛应用的技术手段。评分卡是信用风险评估领域常见的建模方法。评分卡并不加单对应于某一种机器学习算法,而是一种通用的建模框架,讲原始数据通过分箱后进行特征工程变换,继而应用于线性模型进行建模的一种方法。 3 | -------------------------------------------------------------------------------- /SignalVariable.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from pandas import Series,DataFrame 4 | import scipy.stats.stats as stats 5 | import matplotlib.pyplot as plt 6 | import statsmodels.api as sm 7 | import math 8 | # 定义自动分箱函数 9 | def mono_bin(Y, X, n = 20): 10 | r = 0 11 | good=Y.sum() 12 | bad=Y.count()-good 13 | while np.abs(r) < 1: 14 | d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)}) 15 | d2 = d1.groupby('Bucket', as_index = True) 16 | r, p = stats.spearmanr(d2.mean().X, d2.mean().Y) 17 | n = n - 1 18 | d3 = pd.DataFrame(d2.X.min(), columns = ['min']) 19 | d3['min']=d2.min().X 20 | d3['max'] = d2.max().X 21 | d3['sum'] = d2.sum().Y 22 | d3['total'] = d2.count().Y 23 | d3['rate'] = d2.mean().Y 24 | d3['woe']=np.log((d3['rate']/(1-d3['rate']))/(good/bad)) 25 | d3['goodattribute']=d3['sum']/good 26 | d3['badattribute']=(d3['total']-d3['sum'])/bad 27 | iv=((d3['goodattribute']-d3['badattribute'])*d3['woe']).sum() 28 | d4 = (d3.sort_index(by = 'min')) 29 | print("=" * 60) 30 | print(d4) 31 | cut=[] 32 | cut.append(float('-inf')) 33 | for i in range(1,n+1): 34 | qua=X.quantile(i/(n+1)) 35 | cut.append(round(qua,4)) 36 | cut.append(float('inf')) 37 | woe=list(d4['woe'].round(3)) 38 | return d4,iv,cut,woe 39 | #自定义分箱函数 40 | def self_bin(Y,X,cat): 41 | good=Y.sum() 42 | bad=Y.count()-good 43 | d1=pd.DataFrame({'X':X,'Y':Y,'Bucket':pd.cut(X,cat)}) 44 | d2=d1.groupby('Bucket', as_index = True) 45 | d3 = pd.DataFrame(d2.X.min(), columns=['min']) 46 | d3['min'] = d2.min().X 47 | d3['max'] = d2.max().X 48 | d3['sum'] = d2.sum().Y 49 | d3['total'] = d2.count().Y 50 | d3['rate'] = d2.mean().Y 51 | d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad)) 52 | d3['goodattribute'] = d3['sum'] / good 53 | d3['badattribute'] = (d3['total'] - d3['sum']) / bad 54 | iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum() 55 | d4 = (d3.sort_index(by='min')) 56 | print("=" * 60) 57 | print(d4) 58 | woe = list(d4['woe'].round(3)) 59 | return d4, iv,woe 60 | #用woe代替 61 | def replace_woe(series,cut,woe): 62 | list=[] 63 | i=0 64 | while i=0: 69 | if value>=cut[j]: 70 | j=-1 71 | else: 72 | j -=1 73 | m -= 1 74 | list.append(woe[m]) 75 | i += 1 76 | return list 77 | #计算分数函数 78 | def get_score(coe,woe,factor): 79 | scores=[] 80 | for w in woe: 81 | score=round(coe*w*factor,0) 82 | scores.append(score) 83 | return scores 84 | #根据变量计算分数 85 | def compute_score(series,cut,score): 86 | list = [] 87 | i = 0 88 | while i < len(series): 89 | value = series[i] 90 | j = len(cut) - 2 91 | m = len(cut) - 2 92 | while j >= 0: 93 | if value >= cut[j]: 94 | j = -1 95 | else: 96 | j -= 1 97 | m -= 1 98 | list.append(score[m]) 99 | i += 1 100 | return list 101 | if __name__ == '__main__': 102 | data = pd.read_csv('TrainData.csv') 103 | pinf = float('inf')#正无穷大 104 | ninf = float('-inf')#负无穷大 105 | dfx1, ivx1,cutx1,woex1=mono_bin(data.SeriousDlqin2yrs,data.RevolvingUtilizationOfUnsecuredLines,n=10) 106 | dfx2, ivx2,cutx2,woex2=mono_bin(data.SeriousDlqin2yrs, data.age, n=10) 107 | dfx4, ivx4,cutx4,woex4 =mono_bin(data.SeriousDlqin2yrs, data.DebtRatio, n=20) 108 | dfx5, ivx5,cutx5,woex5 =mono_bin(data.SeriousDlqin2yrs, data.MonthlyIncome, n=10) 109 | # 连续变量离散化 110 | cutx3 = [ninf, 0, 1, 3, 5, pinf] 111 | cutx6 = [ninf, 1, 2, 3, 5, pinf] 112 | cutx7 = [ninf, 0, 1, 3, 5, pinf] 113 | cutx8 = [ninf, 0,1,2, 3, pinf] 114 | cutx9 = [ninf, 0, 1, 3, pinf] 115 | cutx10 = [ninf, 0, 1, 2, 3, 5, pinf] 116 | dfx3, ivx3,woex3 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3) 117 | dfx6, ivx6 ,woex6= self_bin(data.SeriousDlqin2yrs, data['NumberOfOpenCreditLinesAndLoans'], cutx6) 118 | dfx7, ivx7,woex7 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTimes90DaysLate'], cutx7) 119 | dfx8, ivx8,woex8 = self_bin(data.SeriousDlqin2yrs, data['NumberRealEstateLoansOrLines'], cutx8) 120 | dfx9, ivx9,woex9 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9) 121 | dfx10, ivx10,woex10 = self_bin(data.SeriousDlqin2yrs, data['NumberOfDependents'], cutx10) 122 | ivlist=[ivx1,ivx2,ivx3,ivx4,ivx5,ivx6,ivx7,ivx8,ivx9,ivx10] 123 | index=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10'] 124 | fig1 = plt.figure(1) 125 | ax1 = fig1.add_subplot(1, 1, 1) 126 | x = np.arange(len(index))+1 127 | ax1.bar(x, ivlist, width=0.4) 128 | ax1.set_xticks(x) 129 | ax1.set_xticklabels(index, rotation=0, fontsize=12) 130 | ax1.set_ylabel('IV(Information Value)', fontsize=14) 131 | for a, b in zip(x, ivlist): 132 | plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=10) 133 | plt.show() 134 | ''' 135 | # 替换成woe 136 | data['RevolvingUtilizationOfUnsecuredLines'] = Series(replace_woe(data['RevolvingUtilizationOfUnsecuredLines'], cutx1, woex1)) 137 | data['age'] = Series(replace_woe(data['age'], cutx2, woex2)) 138 | data['NumberOfTime30-59DaysPastDueNotWorse'] = Series(replace_woe(data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, woex3)) 139 | data['DebtRatio'] = Series(replace_woe(data['DebtRatio'], cutx4, woex4)) 140 | data['MonthlyIncome'] = Series(replace_woe(data['MonthlyIncome'], cutx5, woex5)) 141 | data['NumberOfOpenCreditLinesAndLoans'] = Series(replace_woe(data['NumberOfOpenCreditLinesAndLoans'], cutx6, woex6)) 142 | data['NumberOfTimes90DaysLate'] = Series(replace_woe(data['NumberOfTimes90DaysLate'], cutx7, woex7)) 143 | data['NumberRealEstateLoansOrLines'] = Series(replace_woe(data['NumberRealEstateLoansOrLines'], cutx8, woex8)) 144 | data['NumberOfTime60-89DaysPastDueNotWorse'] = Series(replace_woe(data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, woex9)) 145 | data['NumberOfDependents'] = Series(replace_woe(data['NumberOfDependents'], cutx10, woex10)) 146 | data.to_csv('WoeData.csv', index=False) 147 | test= pd.read_csv('TestData.csv') 148 | # 替换成woe 149 | test['RevolvingUtilizationOfUnsecuredLines'] = Series(replace_woe(test['RevolvingUtilizationOfUnsecuredLines'], cutx1, woex1)) 150 | test['age'] = Series(replace_woe(test['age'], cutx2, woex2)) 151 | test['NumberOfTime30-59DaysPastDueNotWorse'] = Series(replace_woe(test['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, woex3)) 152 | test['DebtRatio'] = Series(replace_woe(test['DebtRatio'], cutx4, woex4)) 153 | test['MonthlyIncome'] = Series(replace_woe(test['MonthlyIncome'], cutx5, woex5)) 154 | test['NumberOfOpenCreditLinesAndLoans'] = Series(replace_woe(test['NumberOfOpenCreditLinesAndLoans'], cutx6, woex6)) 155 | test['NumberOfTimes90DaysLate'] = Series(replace_woe(test['NumberOfTimes90DaysLate'], cutx7, woex7)) 156 | test['NumberRealEstateLoansOrLines'] = Series(replace_woe(test['NumberRealEstateLoansOrLines'], cutx8, woex8)) 157 | test['NumberOfTime60-89DaysPastDueNotWorse'] = Series(replace_woe(test['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, woex9)) 158 | test['NumberOfDependents'] = Series(replace_woe(test['NumberOfDependents'], cutx10, woex10)) 159 | test.to_csv('TestWoeData.csv', index=False) 160 | #计算分数 161 | #coe为逻辑回归模型的系数 162 | coe=[9.738849,0.638002,0.505995,1.032246,1.790041,1.131956] 163 | # 我们取600分为基础分值,PDO为20(每高20分好坏比翻一倍),好坏比取20。 164 | p = 20 / math.log(2) 165 | q = 600 - 20 * math.log(20) / math.log(2) 166 | baseScore = round(q + p * coe[0], 0) 167 | # 各项部分分数 168 | x1 = get_score(coe[1], woex1, p) 169 | x2 = get_score(coe[2], woex2, p) 170 | x3 = get_score(coe[3], woex3, p) 171 | x7 = get_score(coe[4], woex7, p) 172 | x9 = get_score(coe[5], woex9, p) 173 | print(x1,x2, x3, x7, x9) 174 | test1 = pd.read_csv('TestData.csv') 175 | test1['BaseScore']=Series(np.zeros(len(test1)))+baseScore 176 | test1['x1'] = Series(compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1)) 177 | test1['x2'] = Series(compute_score(test1['age'], cutx2, x2)) 178 | test1['x3'] = Series(compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3)) 179 | test1['x7'] = Series(compute_score(test1['NumberOfTimes90DaysLate'], cutx7, x7)) 180 | test1['x9'] = Series(compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9)) 181 | test1['Score'] = test1['x1'] + test1['x2'] + test1['x3'] + test1['x7'] +test1['x9'] + baseScore 182 | test1.to_csv('ScoreData.csv', index=False) 183 | ''' 184 | --------------------------------------------------------------------------------