├── .gitignore
├── Logit.py
├── MissingValue.py
├── Outlier.py
├── README.md
├── SignalVariable.py
└── cs-training.csv


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/Logit.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import matplotlib.pyplot as plt #导入图像库
 3 | import matplotlib
 4 | import seaborn as sns
 5 | import statsmodels.api as sm
 6 | from sklearn.metrics import roc_curve, auc
 7 | plt.rcParams["font.sans-serif"]='SimHei'
 8 | plt.rcParams['axes.unicode_minus'] = False
 9 | if __name__ == '__main__':
10 |     matplotlib.rcParams['axes.unicode_minus'] = False
11 |     data = pd.read_csv('WoeData.csv')
12 |     Y=data['SeriousDlqin2yrs']
13 |     X=data.drop(['SeriousDlqin2yrs','DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1)
14 |     X1=sm.add_constant(X)
15 |     logit=sm.Logit(Y,X1)
16 |     result=logit.fit()
17 |     print(result.params)
18 |     test = pd.read_csv('TestWoeData.csv')
19 |     Y_test = test['SeriousDlqin2yrs']
20 |     X_test = test.drop(['SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines', 'NumberOfDependents'], axis=1)
21 |     X3 = sm.add_constant(X_test)
22 |     resu = result.predict(X3)
23 |     fpr, tpr, threshold = roc_curve(Y_test, resu)
24 |     rocauc = auc(fpr, tpr)
25 |     plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
26 |     plt.legend(loc='lower right')
27 |     plt.plot([0, 1], [0, 1], 'r--')
28 |     plt.xlim([0, 1])
29 |     plt.ylim([0, 1])
30 |     plt.ylabel('真正率')
31 |     plt.xlabel('假正率')
32 |     plt.show()


--------------------------------------------------------------------------------
/MissingValue.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import matplotlib.pyplot as plt #导入图像库
 3 | from sklearn.ensemble import RandomForestRegressor
 4 | 
 5 | # 用随机森林对缺失值预测填充函数
 6 | def set_missing(df):
 7 |     # 把已有的数值型特征取出来
 8 |     process_df = df.ix[:,[5,0,1,2,3,4,6,7,8,9]]
 9 |     # 分成已知该特征和未知该特征两部分
10 |     known = process_df[process_df.MonthlyIncome.notnull()].as_matrix()
11 |     unknown = process_df[process_df.MonthlyIncome.isnull()].as_matrix()
12 |     # X为特征属性值
13 |     X = known[:, 1:]
14 |     # y为结果标签值
15 |     y = known[:, 0]
16 |     # fit到RandomForestRegressor之中
17 |     rfr = RandomForestRegressor(random_state=0, n_estimators=200,max_depth=3,n_jobs=-1)
18 |     rfr.fit(X,y)
19 |     # 用得到的模型进行未知特征值预测
20 |     predicted = rfr.predict(unknown[:, 1:]).round(0)
21 |     print(predicted)
22 |     # 用得到的预测结果填补原缺失数据
23 |     df.loc[(df.MonthlyIncome.isnull()), 'MonthlyIncome'] = predicted
24 |     return df
25 | 
if __name__ == '__main__':
    # Load the raw data. A raw string is used because '\W' and '\c' are not
    # valid escape sequences; the resulting path is unchanged.
    data = pd.read_csv(r'G:\WEO\cs-training.csv')
    # Record the distribution of the data set before any cleaning.
    data.describe().to_csv('DataDescribe.csv')
    # Impute the MonthlyIncome gaps with the random-forest model.
    data = set_missing(data)
    # Drop the rows that still contain missing values.
    data = data.dropna()
    # Drop duplicated rows.
    data = data.drop_duplicates()
    data.to_csv('MissingData.csv', index=False)
    data.describe().to_csv('MissingDataDescribe.csv')
    # Disabled outlier-handling steps, kept for reference:
    #   # remove records whose age is 0
    #   data = data[data['age'] > 0]
    #   # box plot of the three past-due counters
    #   data379 = data[['NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate', 'NumberOfTime60-89DaysPastDueNotWorse']]
    #   data379.boxplot()
    #   data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90]
    #   data379 = data[['NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate', 'NumberOfTime60-89DaysPastDueNotWorse']]
    #   # data379.boxplot()
    #   plt.show()
    #   # data.to_csv('PretreatmentData.csv')


--------------------------------------------------------------------------------
/Outlier.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import matplotlib.pyplot as plt #导入图像库
 3 | from sklearn.cross_validation import train_test_split
 4 | 
 5 | def outlier_processing(df,col):
 6 |     s=df[col]
 7 |     oneQuoter=s.quantile(0.25)
 8 |     threeQuote=s.quantile(0.75)
 9 |     irq=threeQuote-oneQuoter
10 |     min=oneQuoter-1.5*irq
11 |     max=threeQuote+1.5*irq
12 |     df=df[df[col]<=max]
13 |     df=df[df[col]>=min]
14 |     return df
15 | 
if __name__ == '__main__':
    # NOTE(review): the module-level import uses `sklearn.cross_validation`,
    # which was removed in scikit-learn 0.20; current versions expose
    # `train_test_split` from `sklearn.model_selection`.
    data = pd.read_csv('MissingData.csv')
    # Remove the records whose age is 0.
    data = data[data['age'] > 0]
    # Remove the outlying past-due counts; copy so that the assignment below
    # works on an independent frame instead of a filtered view.
    data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90].copy()
    # Invert the 0/1 target.
    data['SeriousDlqin2yrs'] = 1 - data['SeriousDlqin2yrs']
    Y = data['SeriousDlqin2yrs']
    # Every column after the first one is a feature (`.ix` no longer exists
    # in pandas >= 1.0; `.iloc` is the positional equivalent).
    X = data.iloc[:, 1:]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    # Re-attach the target as the first column before saving each split.
    train = pd.concat([Y_train, X_train], axis=1)
    test = pd.concat([Y_test, X_test], axis=1)
    train.to_csv('TrainData.csv', index=False)
    test.to_csv('TestData.csv', index=False)
    print(train.shape)
    print(test.shape)
33 | 
34 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WEO
2 | 信用风险评分卡为信用风险管理提供了一种有效的、经验性的解决方法，是消费信贷管理中广泛应用的技术手段。评分卡是信用风险评估领域常见的建模方法。评分卡并不简单对应于某一种机器学习算法，而是一种通用的建模框架：将原始数据分箱后进行特征工程变换，继而应用于线性模型进行建模。
3 | 


--------------------------------------------------------------------------------
/SignalVariable.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | from pandas import Series,DataFrame
  4 | import scipy.stats.stats as stats
  5 | import matplotlib.pyplot as plt
  6 | import statsmodels.api as sm
  7 | import math
  8 | # 定义自动分箱函数
  9 | def mono_bin(Y, X, n = 20):
 10 |     r = 0
 11 |     good=Y.sum()
 12 |     bad=Y.count()-good
 13 |     while np.abs(r) < 1:
 14 |         d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
 15 |         d2 = d1.groupby('Bucket', as_index = True)
 16 |         r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
 17 |         n = n - 1
 18 |     d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
 19 |     d3['min']=d2.min().X
 20 |     d3['max'] = d2.max().X
 21 |     d3['sum'] = d2.sum().Y
 22 |     d3['total'] = d2.count().Y
 23 |     d3['rate'] = d2.mean().Y
 24 |     d3['woe']=np.log((d3['rate']/(1-d3['rate']))/(good/bad))
 25 |     d3['goodattribute']=d3['sum']/good
 26 |     d3['badattribute']=(d3['total']-d3['sum'])/bad
 27 |     iv=((d3['goodattribute']-d3['badattribute'])*d3['woe']).sum()
 28 |     d4 = (d3.sort_index(by = 'min'))
 29 |     print("=" * 60)
 30 |     print(d4)
 31 |     cut=[]
 32 |     cut.append(float('-inf'))
 33 |     for i in range(1,n+1):
 34 |         qua=X.quantile(i/(n+1))
 35 |         cut.append(round(qua,4))
 36 |     cut.append(float('inf'))
 37 |     woe=list(d4['woe'].round(3))
 38 |     return d4,iv,cut,woe
 39 | #自定义分箱函数
 40 | def self_bin(Y,X,cat):
 41 |     good=Y.sum()
 42 |     bad=Y.count()-good
 43 |     d1=pd.DataFrame({'X':X,'Y':Y,'Bucket':pd.cut(X,cat)})
 44 |     d2=d1.groupby('Bucket', as_index = True)
 45 |     d3 = pd.DataFrame(d2.X.min(), columns=['min'])
 46 |     d3['min'] = d2.min().X
 47 |     d3['max'] = d2.max().X
 48 |     d3['sum'] = d2.sum().Y
 49 |     d3['total'] = d2.count().Y
 50 |     d3['rate'] = d2.mean().Y
 51 |     d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
 52 |     d3['goodattribute'] = d3['sum'] / good
 53 |     d3['badattribute'] = (d3['total'] - d3['sum']) / bad
 54 |     iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
 55 |     d4 = (d3.sort_index(by='min'))
 56 |     print("=" * 60)
 57 |     print(d4)
 58 |     woe = list(d4['woe'].round(3))
 59 |     return d4, iv,woe
 60 | #用woe代替
 61 | def replace_woe(series,cut,woe):
 62 |     list=[]
 63 |     i=0
 64 |     while i<len(series):
 65 |         value=series[i]
 66 |         j=len(cut)-2
 67 |         m=len(cut)-2
 68 |         while j>=0:
 69 |             if value>=cut[j]:
 70 |                 j=-1
 71 |             else:
 72 |                 j -=1
 73 |                 m -= 1
 74 |         list.append(woe[m])
 75 |         i += 1
 76 |     return list
 77 | #计算分数函数
 78 | def get_score(coe,woe,factor):
 79 |     scores=[]
 80 |     for w in woe:
 81 |         score=round(coe*w*factor,0)
 82 |         scores.append(score)
 83 |     return scores
 84 | #根据变量计算分数
 85 | def compute_score(series,cut,score):
 86 |     list = []
 87 |     i = 0
 88 |     while i < len(series):
 89 |         value = series[i]
 90 |         j = len(cut) - 2
 91 |         m = len(cut) - 2
 92 |         while j >= 0:
 93 |             if value >= cut[j]:
 94 |                 j = -1
 95 |             else:
 96 |                 j -= 1
 97 |                 m -= 1
 98 |         list.append(score[m])
 99 |         i += 1
100 |     return list
# Script entry point: bins every variable of the training set, computes the
# information value (IV) of each one and shows the IVs as a bar chart.
if __name__ == '__main__':
    data = pd.read_csv('TrainData.csv')
    pinf = float('inf')# positive infinity
    ninf = float('-inf')# negative infinity
    # Automatic (quantile-based, monotonic) binning for x1, x2, x4 and x5.
    dfx1, ivx1,cutx1,woex1=mono_bin(data.SeriousDlqin2yrs,data.RevolvingUtilizationOfUnsecuredLines,n=10)
    dfx2, ivx2,cutx2,woex2=mono_bin(data.SeriousDlqin2yrs, data.age, n=10)
    dfx4, ivx4,cutx4,woex4 =mono_bin(data.SeriousDlqin2yrs, data.DebtRatio, n=20)
    dfx5, ivx5,cutx5,woex5 =mono_bin(data.SeriousDlqin2yrs, data.MonthlyIncome, n=10)
    # Discretize the remaining variables with hand-picked bin edges.
    cutx3 = [ninf, 0, 1, 3, 5, pinf]
    cutx6 = [ninf, 1, 2, 3, 5, pinf]
    cutx7 = [ninf, 0, 1, 3, 5, pinf]
    cutx8 = [ninf, 0,1,2, 3, pinf]
    cutx9 = [ninf, 0, 1, 3, pinf]
    cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]
    dfx3, ivx3,woex3 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3)
    dfx6, ivx6 ,woex6= self_bin(data.SeriousDlqin2yrs, data['NumberOfOpenCreditLinesAndLoans'], cutx6)
    dfx7, ivx7,woex7 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTimes90DaysLate'], cutx7)
    dfx8, ivx8,woex8 = self_bin(data.SeriousDlqin2yrs, data['NumberRealEstateLoansOrLines'], cutx8)
    dfx9, ivx9,woex9 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9)
    dfx10, ivx10,woex10 = self_bin(data.SeriousDlqin2yrs, data['NumberOfDependents'], cutx10)
    # Bar chart of the IV of x1..x10, with the value printed above each bar.
    ivlist=[ivx1,ivx2,ivx3,ivx4,ivx5,ivx6,ivx7,ivx8,ivx9,ivx10]
    index=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']
    fig1 = plt.figure(1)
    ax1 = fig1.add_subplot(1, 1, 1)
    x = np.arange(len(index))+1
    ax1.bar(x, ivlist, width=0.4)
    ax1.set_xticks(x)
    ax1.set_xticklabels(index, rotation=0, fontsize=12)
    ax1.set_ylabel('IV(Information Value)', fontsize=14)
    for a, b in zip(x, ivlist):
        plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=10)
    plt.show()
    # The triple-quoted string below is never executed; it is a bare string
    # literal holding disabled code. That code replaces every variable of
    # the training and test sets with its WOE value (written to WoeData.csv
    # and TestWoeData.csv) and then computes scorecard points for the test
    # set (written to ScoreData.csv).
    '''
    # 替换成woe
    data['RevolvingUtilizationOfUnsecuredLines'] = Series(replace_woe(data['RevolvingUtilizationOfUnsecuredLines'], cutx1, woex1))
    data['age'] = Series(replace_woe(data['age'], cutx2, woex2))
    data['NumberOfTime30-59DaysPastDueNotWorse'] = Series(replace_woe(data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, woex3))
    data['DebtRatio'] = Series(replace_woe(data['DebtRatio'], cutx4, woex4))
    data['MonthlyIncome'] = Series(replace_woe(data['MonthlyIncome'], cutx5, woex5))
    data['NumberOfOpenCreditLinesAndLoans'] = Series(replace_woe(data['NumberOfOpenCreditLinesAndLoans'], cutx6, woex6))
    data['NumberOfTimes90DaysLate'] = Series(replace_woe(data['NumberOfTimes90DaysLate'], cutx7, woex7))
    data['NumberRealEstateLoansOrLines'] = Series(replace_woe(data['NumberRealEstateLoansOrLines'], cutx8, woex8))
    data['NumberOfTime60-89DaysPastDueNotWorse'] = Series(replace_woe(data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, woex9))
    data['NumberOfDependents'] = Series(replace_woe(data['NumberOfDependents'], cutx10, woex10))
    data.to_csv('WoeData.csv', index=False)
    test= pd.read_csv('TestData.csv')
    # 替换成woe
    test['RevolvingUtilizationOfUnsecuredLines'] = Series(replace_woe(test['RevolvingUtilizationOfUnsecuredLines'], cutx1, woex1))
    test['age'] = Series(replace_woe(test['age'], cutx2, woex2))
    test['NumberOfTime30-59DaysPastDueNotWorse'] = Series(replace_woe(test['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, woex3))
    test['DebtRatio'] = Series(replace_woe(test['DebtRatio'], cutx4, woex4))
    test['MonthlyIncome'] = Series(replace_woe(test['MonthlyIncome'], cutx5, woex5))
    test['NumberOfOpenCreditLinesAndLoans'] = Series(replace_woe(test['NumberOfOpenCreditLinesAndLoans'], cutx6, woex6))
    test['NumberOfTimes90DaysLate'] = Series(replace_woe(test['NumberOfTimes90DaysLate'], cutx7, woex7))
    test['NumberRealEstateLoansOrLines'] = Series(replace_woe(test['NumberRealEstateLoansOrLines'], cutx8, woex8))
    test['NumberOfTime60-89DaysPastDueNotWorse'] = Series(replace_woe(test['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, woex9))
    test['NumberOfDependents'] = Series(replace_woe(test['NumberOfDependents'], cutx10, woex10))
    test.to_csv('TestWoeData.csv', index=False)
    #计算分数
    #coe为逻辑回归模型的系数
    coe=[9.738849,0.638002,0.505995,1.032246,1.790041,1.131956]
    # 我们取600分为基础分值，PDO为20（每高20分好坏比翻一倍），好坏比取20。
    p = 20 / math.log(2)
    q = 600 - 20 * math.log(20) / math.log(2)
    baseScore = round(q + p * coe[0], 0)
    # 各项部分分数
    x1 = get_score(coe[1], woex1, p)
    x2 = get_score(coe[2], woex2, p)
    x3 = get_score(coe[3], woex3, p)
    x7 = get_score(coe[4], woex7, p)
    x9 = get_score(coe[5], woex9, p)
    print(x1,x2, x3, x7, x9)
    test1 = pd.read_csv('TestData.csv')
    test1['BaseScore']=Series(np.zeros(len(test1)))+baseScore
    test1['x1'] = Series(compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1))
    test1['x2'] = Series(compute_score(test1['age'], cutx2, x2))
    test1['x3'] = Series(compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3))
    test1['x7'] = Series(compute_score(test1['NumberOfTimes90DaysLate'], cutx7, x7))
    test1['x9'] = Series(compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9))
    test1['Score'] = test1['x1'] + test1['x2'] + test1['x3'] + test1['x7'] +test1['x9']  + baseScore
    test1.to_csv('ScoreData.csv', index=False)
    '''
184 | 


--------------------------------------------------------------------------------