├── README.md
├── 智能风控(代码)
│   ├── 第1章
│   │   └── 1.3.py
│   ├── 第2章
│   │   ├── 2.2.py
│   │   ├── 2.3.py
│   │   ├── 2.4.py
│   │   ├── 2.5.py
│   │   ├── 2.6.py
│   │   └── 2.7.py
│   ├── 第3章
│   │   ├── 3.3.py
│   │   ├── 3.4.py
│   │   ├── 3.5.py
│   │   └── 3.6.py
│   ├── 第4章
│   │   ├── 4.2.py
│   │   ├── 4.4.py
│   │   └── 4.5.py
│   ├── 第5章
│   │   ├── 5.3.py
│   │   └── 5.4.py
│   ├── 第6章
│   │   ├── 6.3.py
│   │   └── 6.4.py
│   ├── 第7章
│   │   ├── 7.1.py
│   │   ├── 7.2.py
│   │   ├── 7.3.py
│   │   ├── 7.4.py
│   │   └── 7.5.py
│   └── 第8章
│       ├── 8.2.py
│       ├── 8.3.py
│       ├── 8.4.py
│       ├── 8.5.py
│       └── 8.6.py
└── 智能风控(数据集)
    ├── Acard.txt
    ├── data_for_tree.xlsx
    ├── stack_network_links.csv
    └── tra_sample.xlsx

/README.md:
--------------------------------------------------------------------------------
1 | # IntelligentRiskControl
2 | 《智能风控——原理、算法与工程实践》
3 | 作者:梅子行
--------------------------------------------------------------------------------
/智能风控(代码)/第1章/1.3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 24 14:31:54 2019
4 | 
5 | @author: zixing.mei
6 | """
7 | 
8 | import pandas as pd
9 | import numpy as np
10 | import os
11 | #为画图指定路径
12 | os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
13 | #读取数据
14 | data = pd.read_excel('./data/data_for_tree.xlsx')
15 | data.head()
16 | 
17 | org_lst = ['uid','create_dt','oil_actv_dt','class_new','bad_ind']
18 | agg_lst = ['oil_amount','discount_amount','sale_amount','amount','pay_amount','coupon_amount','payment_coupon_amount']
19 | dstc_lst = ['channel_code','oil_code','scene','source_app','call_source']
20 | 
21 | df = data[org_lst].copy()
22 | df[agg_lst] = data[agg_lst].copy()
23 | df[dstc_lst] = data[dstc_lst].copy()
24 | 
25 | base = df[org_lst].copy()
26 | base = base.drop_duplicates(['uid'],keep = 'first')
27 | 
28 | gn = pd.DataFrame()
29 | for i in agg_lst:
30 |     #计算个数
31 |     tp = pd.DataFrame(df.groupby('uid').apply(
32 |                 lambda df:len(df[i])).reset_index())
33 |     tp.columns = ['uid',i + '_cnt']
34 |     if gn.empty:
35 |         gn = tp
36 |     else:
37 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
38 |     #求历史特征值大于零的个数
39 |     tp = pd.DataFrame(df.groupby('uid').apply(
40 |                 lambda df:np.where(df[i]>0,1,0).sum()).reset_index())
41 |     tp.columns = ['uid',i + '_num']
42 |     if gn.empty:
43 |         gn = tp
44 |     else:
45 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
46 |     #对历史数据求和
47 |     tp = pd.DataFrame(df.groupby('uid').apply(
48 |                 lambda df:np.nansum(df[i])).reset_index())
49 |     tp.columns = ['uid',i + '_tot']
50 |     if gn.empty:
51 |         gn = tp
52 |     else:
53 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
54 |     #对历史数据求均值
55 |     tp = pd.DataFrame(df.groupby('uid').apply(
56 |                 lambda df:np.nanmean(df[i])).reset_index())
57 |     tp.columns = ['uid',i + '_avg']
58 |     if gn.empty:
59 |         gn = tp
60 |     else:
61 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
62 |     #对历史数据求最大值
63 |     tp = pd.DataFrame(df.groupby('uid').apply(
64 |                 lambda df:np.nanmax(df[i])).reset_index())
65 |     tp.columns = ['uid',i + '_max']
66 |     if gn.empty:
67 |         gn = tp
68 |     else:
69 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
70 |     #对历史数据求最小值
71 |     tp = pd.DataFrame(df.groupby('uid').apply(
72 |                 lambda df:np.nanmin(df[i])).reset_index())
73 |     tp.columns = ['uid',i + '_min']
74 |     if gn.empty:
75 |         gn = tp
76 |     else:
77 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
78 |     #对历史数据求方差
79 |     tp = pd.DataFrame(df.groupby('uid').apply(
80 |                 lambda df:np.nanvar(df[i])).reset_index())
81 |     tp.columns = ['uid',i + '_var']
82 |     if gn.empty:
83 |         gn = tp
84 |     else:
85 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
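    # ———— 编者补注(示例,非原书代码)————
    # 上面的循环对每个统计量都单独做一次 groupby().apply,数据会被反复扫描。
    # pandas 支持用一次 groupby().agg 同时得到其中大部分统计量,大致等价的写法如下
    # (仅作示意;'var' 为样本方差 ddof=1,与 np.nanvar 的 ddof=0 略有差异):
    #   stats = df.groupby('uid')[i].agg(['count','sum','mean','max','min','var'])
    #   stats.columns = [i + s for s in ['_cnt','_tot','_avg','_max','_min','_var']]
    #   gn = stats.reset_index() if gn.empty else gn.merge(stats.reset_index(), on='uid', how='left')
    # 下文仍沿用原书逐项计算的写法。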
86 |     #对历史数据求极差
87 |     tp = pd.DataFrame(df.groupby('uid').apply(
88 |                 lambda df:np.nanmax(df[i])-np.nanmin(df[i])).reset_index())
89 |     tp.columns = ['uid',i + '_ran']
90 |     if gn.empty:
91 |         gn = tp
92 |     else:
93 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
94 |     #对历史数据求均值与方差之比,为防止除数为0,利用0.01进行平滑
95 |     tp = pd.DataFrame(df.groupby('uid').apply(lambda df:np.nanmean(df[i])/(np.nanvar(df[i])+0.01))).reset_index()
96 |     tp.columns = ['uid',i + '_cva']
97 |     if gn.empty:
98 |         gn = tp
99 |     else:
100 |         gn = pd.merge(gn,tp,on = 'uid',how = 'left')
101 | 
102 | gc = pd.DataFrame()
103 | for i in dstc_lst:
104 |     tp = pd.DataFrame(df.groupby('uid').apply(
105 |                 lambda df: len(set(df[i]))).reset_index())
106 |     tp.columns = ['uid',i + '_dstc']
107 |     if gc.empty:
108 |         gc = tp
109 |     else:
110 |         gc = pd.merge(gc,tp,on = 'uid',how = 'left')
111 | 
112 | fn = base.merge(gn,on='uid')
113 | fn = pd.merge(fn,gc,on= 'uid')
114 | fn.shape
115 | 
116 | x = fn.drop(['uid','oil_actv_dt','create_dt','bad_ind','class_new'],axis = 1)
117 | y = fn.bad_ind.copy()
118 | 
119 | from sklearn import tree
120 | dtree = tree.DecisionTreeRegressor(max_depth = 2,min_samples_leaf = 500,min_samples_split = 5000)
121 | dtree = dtree.fit(x,y)
122 | 
123 | import pydotplus
124 | from IPython.display import Image
125 | from io import StringIO   #新版sklearn已移除sklearn.externals.six,改用标准库io
126 | import os
127 | os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
128 | 
129 | dot_data = StringIO()
130 | tree.export_graphviz(dtree, out_file=dot_data,
131 |                      feature_names=x.columns,
132 |                      class_names=['bad_ind'],
133 |                      filled=True, rounded=True,
134 |                      special_characters=True)
135 | graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
136 | Image(graph.create_png())
137 | 
138 | dff1 = fn.loc[(fn.amount_tot>9614.5)&(fn.coupon_amount_cnt>6)].copy()
139 | dff1['level'] = 'past_A'
140 | dff2 = fn.loc[(fn.amount_tot>9614.5)&(fn.coupon_amount_cnt<=6)].copy()
141 | dff2['level'] = 'past_B'
142 | dff3 = fn.loc[fn.amount_tot<=9614.5].copy()
143 | dff3['level'] = 'past_C'
144 | 
--------------------------------------------------------------------------------
/智能风控(代码)/第2章/2.2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 24 14:50:38 2019
4 | 
5 | @author: zixing.mei
6 | """
7 | 
8 | def Num(feature,mth):
9 |     df=data.loc[:,feature +'1': feature +str(mth)]
10 |     auto_value=np.where(df>0,1,0).sum(axis=1)
11 |     return feature +'_num'+str(mth),auto_value
12 | 
13 | def Avg(feature, mth):
14 |     df=data.loc[:,feature +'1': feature +str(mth)]
15 |     auto_value=np.nanmean(df,axis = 1 )
16 |     return feature +'_avg'+str(mth),auto_value
17 | 
18 | def Msg(feature, mth):
19 |     df=data.loc[:,feature +'1': feature +str(mth)]
20 |     df_value=np.where(df>0,1,0)
21 |     auto_value=[]
22 |     for i in range(len(df_value)):
23 |         row_value=df_value[i,:]
24 |         if row_value.max()<=0:
25 |             indexs=0   #与下面的整型计数保持同一类型
26 |             auto_value.append(indexs)
27 |         else:
28 |             indexs=1
29 |             for j in row_value:
30 |                 if j>0:
31 |                     break
32 |                 indexs+=1
33 |             auto_value.append(indexs)
34 |     return feature +'_msg'+str(mth),auto_value
35 | 
36 | def Cav(feature, mth):
37 |     df=data.loc[:,feature +'1': feature +str(mth)]
38 |     auto_value = df[feature +'1']/np.nanmean(df,axis = 1 )
39 |     return feature +'_cav'+str(mth),auto_value
40 | 
41 | def Mai(feature, mth):
42 |     arr=np.array(data.loc[:,feature +'1': feature +str(mth)])
43 |     auto_value = []
44 |     for i in range(len(arr)):
45 |         df_value = arr[i,:]
46 |         value_lst = []
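        # 编者补注:下面对相邻两列做一阶差分 df_value[k]-df_value[k+1],再取差分的最大值。
        # 若列按 feature1(最近一期)到 feature{mth}(最早一期)排列(与 Msg 的口径一致),
        # 该值即"最近 mth 期中最大的单期增量"。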
47 | for k in range(len(df_value)-1): 48 | minus = df_value[k] - df_value[k+1] 49 | value_lst.append(minus) 50 | auto_value.append(np.nanmax(value_lst)) 51 | return feature +'_mai'+str(mth),auto_value 52 | 53 | def Ran(feature, mth): 54 | df=data.loc[:,feature +'1': feature +str(mth)] 55 | auto_value = np.nanmax(df,axis = 1 ) - np.nanmin(df,axis = 1 ) 56 | return feature +'_ran'+str(mth),auto_value 57 | 58 | -------------------------------------------------------------------------------- /智能风控(代码)/第2章/2.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 14:52:55 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | from sklearn.preprocessing import OneHotEncoder 9 | enc = OneHotEncoder() 10 | enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) 11 | enc.transform([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]).toarray() 12 | 13 | import math 14 | #离散型变量 WOE编码 15 | class charWoe(object): 16 | def __init__(self, datasets, dep, weight, vars): 17 | #数据集字典,{'dev':训练集,'val':测试集,'off':跨时间验证集} 18 | self.datasets = datasets 19 | self.devf = datasets.get("dev", "") #训练集 20 | self.valf = datasets.get("val", "") #测试集 21 | self.offf = datasets.get("off", "") #跨时间验证集 22 | self.dep = dep #标签 23 | self.weight = weight #样本权重 24 | self.vars = vars #参与建模的特征名 25 | self.nrows, self.ncols = self.devf.shape #样本数,特征数 26 | 27 | def char_woe(self): 28 | #得到每一类样本的个数,且加入平滑项使得bad和good都不为0 29 | dic = dict(self.devf.groupby([self.dep]).size()) 30 | good = dic.get(0, 0) + 1e-10 31 | bad = dic.get(1, 0) + 1e-10 32 | #对每一个特征进行遍历。 33 | for col in self.vars: 34 | #得到每一个特征值对应的样本数。 35 | data = dict(self.devf[[col, self.dep]].groupby( 36 | [col, self.dep]).size()) 37 | ''' 38 | 当前特征取值超过100个的时候,跳过当前取值。 39 | 因为取值过多时,WOE分箱的效率较低,建议对特征进行截断。 40 | 出现频率过低的特征值统一赋值,放入同一箱内。 41 | ''' 42 | if len(data) > 100: 43 | print(col, "contains too many different values...") 44 | continue 45 | #打印取值个数 46 | print(col, len(data)) 47 | dic = dict() 48 | #k是特征名和特征取值的组合,v是样本数 49 | for (k, v) in data.items(): 50 | #value为特征名,dp为特征取值 51 | value, dp = k 52 | #如果找不到key设置为一个空字典 53 | dic.setdefault(value, {}) 54 | #字典中嵌套字典 55 | dic[value][int(dp)] = v 56 | for (k, v) in dic.items(): 57 | dic[k] = {str(int(k1)): v1 for (k1, v1) in v.items()} 58 | dic[k]["cnt"] = sum(v.values()) 59 | bad_rate = round(dic[k].get("1", 0)/ dic[k]["cnt"], 5) 60 | dic[k]["bad_rate"] = bad_rate 61 | #利用定义的函数进行合并。 62 | dic = self.combine_box_char(dic) 63 | #对每个特征计算WOE值和IV值 64 | for (k, v) in dic.items(): 65 | a = v.get("0", 1) / good + 1e-10 66 | b = v.get("1", 1) / bad + 1e-10 67 | dic[k]["Good"] = v.get("0", 0) 68 | dic[k]["Bad"] = v.get("1", 0) 69 | dic[k]["woe"] = round(math.log(a / b), 5) 70 | ''' 71 | 按照分箱后的点进行分割, 72 | 计算得到每一个特征值的WOE值, 73 | 将原始特征名加上'_woe'后缀,并赋予WOE值。 74 | ''' 75 | for (klis, v) in dic.items(): 76 | for k in klis.split(","): 77 | #训练集进行替换 78 | self.devf.loc[self.devf[col]==k, 79 | "%s_woe" % col] = v["woe"] 80 | #测试集进行替换 81 | if not isinstance(self.valf, str): 82 | self.valf.loc[self.valf[col]==k, 83 | "%s_woe" % col] = v["woe"] 84 | #跨时间验证集进行替换 85 | if not isinstance(self.offf, str): 86 | self.offf.loc[self.offf[col]==k, 87 | "%s_woe" % col] = v["woe"] 88 | #返回新的字典,其中包含三个数据集。 89 | return {"dev": self.devf, "val": self.valf, "off": self.offf} 90 | 91 | def combine_box_char(self, dic): 92 | ''' 93 | 实施两种分箱策略。 94 | 1.不同箱之间负样本占比差异最大化。 95 | 2.每一箱的样本量不能过少。 96 | ''' 97 | #首先合并至10箱以内。按照每一箱负样本占比差异最大化原则进行分箱。 98 | while len(dic) >= 10: 99 | #k是特征值,v["bad_rate"]是特征值对应的负样本占比 100 | 
bad_rate_dic = {k: v["bad_rate"] 101 | for (k, v) in dic.items()} 102 | #按照负样本占比排序。因为离散型变量 是无序的, 103 | #可以直接写成负样本占比递增的形式。 104 | bad_rate_sorted = sorted(bad_rate_dic.items(), 105 | key=lambda x: x[1]) 106 | #计算每两箱之间的负样本占比差值。 107 | #准备将差值最小的两箱进行合并。 108 | bad_rate = [bad_rate_sorted[i+1][1]- 109 | bad_rate_sorted[i][1] for i in 110 | range(len(bad_rate_sorted)-1)] 111 | min_rate_index = bad_rate.index(min(bad_rate)) 112 | #k1和k2是差值最小的两箱的key. 113 | k1, k2 = bad_rate_sorted[min_rate_index][0],\ 114 | bad_rate_sorted[min_rate_index+1][0] 115 | #得到重新划分后的字典,箱的个数比之前少一。 116 | dic["%s,%s" % (k1, k2)] = dict() 117 | dic["%s,%s" % (k1, k2)]["0"] = dic[k1].get("0", 0)\ 118 | + dic[k2].get("0", 0) 119 | dic["%s,%s" % (k1, k2)]["1"] = dic[k1].get("1", 0) \ 120 | + dic[k2].get("1", 0) 121 | dic["%s,%s" % (k1, k2)]["cnt"] = dic[k1]["cnt"]\ 122 | + dic[k2]["cnt"] 123 | dic["%s,%s" % (k1, k2)]["bad_rate"] = round( 124 | dic["%s,%s" % (k1, k2)]["1"] / 125 | dic["%s,%s" % (k1, k2)]["cnt"],5) 126 | #删除旧的key。 127 | del dic[k1], dic[k2] 128 | ''' 129 | 结束循环后,箱的个数应该少于10。 130 | 下面实施第二种分箱策略。 131 | 将样本数量少的箱合并至其他箱中,以保证每一箱的样本数量不要太少。 132 | ''' 133 | #记录当前样本最少的箱的个数。 134 | min_cnt = min([v["cnt"] for v in dic.values()]) 135 | #当样本数量小于总样本的5%或者总箱的个数大于5的时候,对箱进行合并 136 | while min_cnt < self.nrows * 0.05 and len(dic) > 5: 137 | min_key = [k for (k, v) in dic.items() 138 | if v["cnt"] == min_cnt][0] 139 | bad_rate_dic = {k: v["bad_rate"] 140 | for (k, v) in dic.items()} 141 | bad_rate_sorted = sorted(bad_rate_dic.items(), 142 | key=lambda x: x[1]) 143 | keys = [k[0] for k in bad_rate_sorted] 144 | min_index = keys.index(min_key) 145 | ''''' 146 | 同样想保持合并后箱之间的负样本占比差异最大化。 147 | 由于箱的位置不同,按照三种不同情况进行分类讨论。 148 | ''' 149 | #如果是第一箱,和第二项合并 150 | if min_index == 0: 151 | k1, k2 = keys[:2] 152 | #如果是最后一箱,和倒数第二箱合并 153 | elif min_index == len(dic) - 1: 154 | k1, k2 = keys[-2:] 155 | #如果是中间箱,和bad_rate值相差最小的箱合并 156 | else: 157 | bef_bad_rate = dic[min_key]["bad_rate"]\ 158 | -dic[keys[min_index - 1]]["bad_rate"] 159 | aft_bad_rate = dic[keys[min_index+1]]["bad_rate"] - dic[min_key]["bad_rate"] 160 | if bef_bad_rate < aft_bad_rate: 161 | k1, k2 = keys[min_index - 1], min_key 162 | else: 163 | k1, k2 = min_key, keys[min_index + 1] 164 | #得到重新划分后的字典,箱的个数比之前少一。 165 | dic["%s,%s" % (k1, k2)] = dict() 166 | dic["%s,%s" % (k1, k2)]["0"] = dic[k1].get("0", 0) \ 167 | + dic[k2].get("0", 0) 168 | dic["%s,%s" % (k1, k2)]["1"] = dic[k1].get("1", 0)\ 169 | + dic[k2].get("1", 0) 170 | dic["%s,%s" % (k1, k2)]["cnt"] = dic[k1]["cnt"]\ 171 | +dic[k2]["cnt"] 172 | dic["%s,%s" % (k1, k2)]["bad_rate"] = round( 173 | dic["%s,%s" % (k1, k2)]["1"] / 174 | dic["%s,%s" % (k1, k2)]["cnt"],5) 175 | #删除旧的key。 176 | del dic[k1], dic[k2] 177 | #当前最小的箱的样本个数 178 | min_cnt = min([v["cnt"] for v in dic.values()]) 179 | return dic 180 | 181 | -------------------------------------------------------------------------------- /智能风控(代码)/第2章/2.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:08:42 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | import math 8 | 9 | def sloveKS(self, model, X, Y, Weight): 10 | Y_predict = [s[1] for s in model.predict_proba(X)] 11 | nrows = X.shape[0] 12 | #还原权重 13 | lis = [(Y_predict[i], Y.values[i], Weight[i]) for i in range(nrows)] 14 | #按照预测概率倒序排列 15 | ks_lis = sorted(lis, key=lambda x: x[0], reverse=True) 16 | KS = list() 17 | bad = sum([w for (p, y, w) in ks_lis if y > 0.5]) 18 | good = sum([w for (p, y, w) in ks_lis if y <= 0.5]) 19 | bad_cnt, 
good_cnt = 0, 0 20 | for (p, y, w) in ks_lis: 21 | if y > 0.5: 22 | #1*w 即加权样本个数 23 | bad_cnt += w 24 | else: 25 | #1*w 即加权样本个数 26 | good_cnt += w 27 | ks = math.fabs((bad_cnt/bad)-(good_cnt/good)) 28 | KS.append(ks) 29 | return max(KS) 30 | 31 | def slovePSI(self, model, dev_x, val_x): 32 | dev_predict_y = [s[1] for s in model.predict_proba(dev_x)] 33 | dev_nrows = dev_x.shape[0] 34 | dev_predict_y.sort() 35 | #等频分箱成10份 36 | cutpoint = [-100] + [dev_predict_y[int(dev_nrows/10*i)] 37 | for i in range(1, 10)] + [100] 38 | cutpoint = list(set(cutpoint)) 39 | cutpoint.sort() 40 | val_predict_y = [s[1] for s in list(model.predict_proba(val_x))] 41 | val_nrows = val_x.shape[0] 42 | PSI = 0 43 | #每一箱之间分别计算PSI 44 | for i in range(len(cutpoint)-1): 45 | start_point, end_point = cutpoint[i], cutpoint[i+1] 46 | dev_cnt = [p for p in dev_predict_y 47 | if start_point <= p < end_point] 48 | dev_ratio = len(dev_cnt) / dev_nrows + 1e-10 49 | val_cnt = [p for p in val_predict_y 50 | if start_point <= p < end_point] 51 | val_ratio = len(val_cnt) / val_nrows + 1e-10 52 | psi = (dev_ratio - val_ratio) * math.log(dev_ratio/val_ratio) 53 | PSI += psi 54 | return PSI 55 | 56 | import xgboost as xgb 57 | from xgboost import plot_importance 58 | 59 | class xgBoost(object): 60 | def __init__(self, datasets, uid, dep, weight, 61 | var_names, params, max_del_var_nums=0): 62 | self.datasets = datasets 63 | #样本唯一标识,不参与建模 64 | self.uid = uid 65 | #二分类标签 66 | self.dep = dep 67 | #样本权重 68 | self.weight = weight 69 | #特征列表 70 | self.var_names = var_names 71 | #参数字典,未指定字段使用默认值 72 | self.params = params 73 | #单次迭代最多删除特征的个数 74 | self.max_del_var_nums = max_del_var_nums 75 | self.row_num = 0 76 | self.col_num = 0 77 | 78 | def training(self, min_score=0.0001, modelfile="", output_scores=list()): 79 | lis = self.var_names[:] 80 | dev_data = self.datasets.get("dev", "") #训练集 81 | val_data = self.datasets.get("val", "") #测试集 82 | off_data = self.datasets.get("off", "") #跨时间验证集 83 | #从字典中查找参数值,没有则使用第二项作为默认值 84 | model = xgb.XGBClassifier( 85 | learning_rate=self.params.get("learning_rate", 0.1), 86 | n_estimators=self.params.get("n_estimators", 100), 87 | max_depth=self.params.get("max_depth", 3), 88 | min_child_weight=self.params.get("min_child_weight", 1),subsample=self.params.get("subsample", 1), 89 | objective=self.params.get("objective", 90 | "binary:logistic"), 91 | nthread=self.params.get("nthread", 10), 92 | scale_pos_weight=self.params.get("scale_pos_weight", 1), 93 | random_state=0, 94 | n_jobs=self.params.get("n_jobs", 10), 95 | reg_lambda=self.params.get("reg_lambda", 1), 96 | missing=self.params.get("missing", None) ) 97 | while len(lis) > 0: 98 | #模型训练 99 | model.fit(X=dev_data[self.var_names], y=dev_data[self.dep]) 100 | #得到特征重要性 101 | scores = model.feature_importances_ 102 | #清空字典 103 | lis.clear() 104 | ''' 105 | 当特征重要性小于预设值时, 106 | 将特征放入待删除列表。 107 | 当列表长度超过预设最大值时,跳出循环。 108 | 即一次只删除限定个数的特征。 109 | ''' 110 | for (idx, var_name) in enumerate(self.var_names): 111 | #小于特征重要性预设值则放入列表 112 | if scores[idx] < min_score: 113 | lis.append(var_name) 114 | #达到预设单次最大特征删除个数则停止本次循环 115 | if len(lis) >= self.max_del_var_nums: 116 | break 117 | #训练集KS 118 | devks = self.sloveKS(model, dev_data[self.var_names], 119 | dev_data[self.dep], dev_data[self.weight]) 120 | #初始化ks值和PSI 121 | valks, offks, valpsi, offpsi = 0.0, 0.0, 0.0, 0.0 122 | #测试集KS和PSI 123 | if not isinstance(val_data, str): 124 | valks = self.sloveKS(model, 125 | val_data[self.var_names], 126 | val_data[self.dep], 127 | val_data[self.weight]) 128 | valpsi = 
self.slovePSI(model, 129 | dev_data[self.var_names], 130 | val_data[self.var_names]) 131 | #跨时间验证集KS和PSI 132 | if not isinstance(off_data, str): 133 | offks = self.sloveKS(model, 134 | off_data[self.var_names], 135 | off_data[self.dep], 136 | off_data[self.weight]) 137 | offpsi = self.slovePSI(model, 138 | dev_data[self.var_names], 139 | off_data[self.var_names]) 140 | #将三个数据集的KS和PSI放入字典 141 | dic = {"devks": float(devks), 142 | "valks": float(valks), 143 | "offks": offks, 144 | "valpsi": float(valpsi), 145 | "offpsi": offpsi} 146 | print("del var: ", len(self.var_names), 147 | "-->", len(self.var_names) - len(lis), 148 | "ks: ", dic, ",".join(lis)) 149 | self.var_names = [var_name for var_name in self.var_names if var_name not in lis] 150 | plot_importance(model) 151 | #重新训练,准备进入下一循环 152 | model = xgb.XGBClassifier( 153 | learning_rate=self.params.get("learning_rate", 0.1), 154 | n_estimators=self.params.get("n_estimators", 100), 155 | max_depth=self.params.get("max_depth", 3), 156 | min_child_weight=self.params.get("min_child_weight",1), 157 | subsample=self.params.get("subsample", 1), 158 | objective=self.params.get("objective", 159 | "binary:logistic"), 160 | nthread=self.params.get("nthread", 10), 161 | scale_pos_weight=self.params.get("scale_pos_weight",1), 162 | random_state=0, 163 | n_jobs=self.params.get("n_jobs", 10), 164 | reg_lambda=self.params.get("reg_lambda", 1), 165 | missing=self.params.get("missing", None)) 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /智能风控(代码)/第2章/2.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:11:59 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | def target_value(self,old_devks,old_offks,target,devks,offks,w=0.2): 9 | ''' 10 | 如果参数设置为"best",使用最优调参策略, 11 | 否则使用跨时间测试集KS最大策略。 12 | ''' 13 | if target == "best": 14 | return offks-abs(devks-offks)*w 15 | else: 16 | return offks 17 | 18 | def check_params(self, dev_data, off_data, params, param, train_number, step, target, 19 | targetks, old_devks, old_offks): 20 | ''' 21 | 当前向搜索对调参策略有提升时, 22 | 继续前向搜索。 23 | 否则进行后向搜索 24 | ''' 25 | while True: 26 | try: 27 | if params[param] + step > 0: 28 | params[param] += step 29 | model = xgb.XGBClassifier( 30 | max_depth=params.get("max_depth", 3), 31 | learning_rate=params.get("learning_rate", 0.05), 32 | n_estimators=params.get("n_estimators", 100), 33 | min_child_weight=params.get( 34 | "min_child_weight", 1), 35 | subsample=params.get("subsample", 1), 36 | scale_pos_weight=params.get( 37 | "scale_pos_weight", 1), 38 | nthread=10,n_jobs=10, random_state=0) 39 | model.fit(dev_data[self.var_names], 40 | dev_data[self.dep], 41 | dev_data[self.weight]) 42 | devks = self.sloveKS(model, 43 | dev_data[self.var_names], 44 | dev_data[self.dep], 45 | dev_data[self.weight]) 46 | offks = self.sloveKS(model, 47 | off_data[self.var_names], 48 | off_data[self.dep], 49 | off_data[self.weight]) 50 | train_number += 1 51 | targetks_n = self.target_value( 52 | old_devks=old_devks, 53 | old_offks=old_offks, 54 | target=target, 55 | devks=devks, 56 | offks=offks) 57 | if targetks < targetks_n: 58 | targetks = targetks_n 59 | old_devks = devks 60 | old_offks = offks 61 | else: 62 | break 63 | else: 64 | break 65 | except: 66 | break 67 | params[param] -= step 68 | return params, targetks, train_number 69 | 70 | def auto_choose_params(self, target="offks"): 71 | """ 72 | "mzh1": offks + (offks - 
devks) * 0.2 最大化 73 | "mzh2": (offks + (offks - devks) * 0.2)**2 最大化 74 | 其余取值均使用跨时间测试集offks 最大化 75 | 当业务稳定性较差时,应将0.2改为更大的值 76 | """ 77 | dev_data = self.datasets.get("dev", "") 78 | off_data = self.datasets.get("off", "") 79 | #设置参数初始位置 80 | params = { 81 | "max_depth": 5, 82 | "learning_rate": 0.09, 83 | "n_estimators": 120, 84 | "min_child_weight": 50, 85 | "subsample": 1, 86 | "scale_pos_weight": 1, 87 | "reg_lambda": 21 88 | } 89 | model = xgb.XGBClassifier(max_depth=params.get("max_depth", 3), 90 | learning_rate=params.get("learning_rate", 0.05), 91 | n_estimators=params.get("n_estimators", 100), 92 | min_child_weight=params.get("min_child_weight",1), 93 | subsample=params.get("subsample", 1), 94 | scale_pos_weight=params.get("scale_pos_weight",1), 95 | reg_lambda=params.get("reg_lambda", 1), 96 | nthread=8, n_jobs=8, random_state=7) 97 | model.fit(dev_data[self.var_names], 98 | dev_data[self.dep], 99 | dev_data[self.weight]) 100 | devks = self.sloveKS(model, 101 | dev_data[self.var_names], 102 | dev_data[self.dep], 103 | dev_data[self.weight]) 104 | offks = self.sloveKS(model, 105 | off_data[self.var_names], 106 | off_data[self.dep], 107 | off_data[self.weight]) 108 | train_number = 0 109 | #设置调参步长 110 | dic = { 111 | "learning_rate": [0.05, -0.05], 112 | "max_depth": [1, -1], 113 | "n_estimators": [20, 5, -5, -20], 114 | "min_child_weight": [20, 5, -5, -20], 115 | "subsample": [0.05, -0.05], 116 | "scale_pos_weight": [20, 5, -5, -20], 117 | "reg_lambda": [10, -10] 118 | } 119 | #启用调参策略 120 | targetks = self.target_value(old_devks=devks, 121 | old_offks=offks, target=target, 122 | devks=devks, offks=offks) 123 | old_devks = devks 124 | old_offks = offks 125 | #按照参数字典,双向搜索最优参数 126 | while True: 127 | targetks_lis = [] 128 | for (key, values) in dic.items(): 129 | for v in values: 130 | if v + params[key] > 0: 131 | params, targetks, train_number = \ 132 | self.check_params(dev_data, 133 | off_data, params, 134 | key, train_number, 135 | v, target, targetks, 136 | old_devks, old_offks) 137 | targetks_n = self.target_value( 138 | old_devks=old_devks, 139 | old_offks=old_offks, 140 | target=target, 141 | devks=devks, offks=offks) 142 | if targetks < targetks_n: 143 | old_devks = devks 144 | old_offks = offks 145 | targetks_lis.append(targetks) 146 | if not targetks_lis: 147 | break 148 | print("Best params: ", params) 149 | model = xgb.XGBClassifier(max_depth=params.get("max_depth", 3), 150 | learning_rate=params.get("learning_rate", 0.05), 151 | n_estimators=params.get("n_estimators", 100), 152 | min_child_weight=params.get("min_child_weight",1), 153 | subsample=params.get("subsample", 1), 154 | scale_pos_weight=params.get("scale_pos_weight",1), 155 | reg_lambda=params.get("reg_lambda", 1), 156 | nthread=10, n_jobs=10, random_state=0) 157 | model.fit(dev_data[self.var_names], 158 | dev_data[self.dep], dev_data[self.weight]) 159 | 160 | -------------------------------------------------------------------------------- /智能风控(代码)/第2章/2.6.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:13:32 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | def auto_delete_vars(self): 9 | dev_data = self.datasets.get("dev", "") 10 | off_data = self.datasets.get("off", "") 11 | params = self.params 12 | model = xgb.XGBClassifier(max_depth=params.get("max_depth", 3), 13 | learning_rate=params.get("learning_rate", 0.05), 14 | n_estimators=params.get("n_estimators", 100), 15 | 
min_child_weight=params.get("min_child_weight",1),
16 |                 subsample=params.get("subsample", 1),
17 |                 scale_pos_weight=params.get("scale_pos_weight",1),
18 |                 reg_lambda=params.get("reg_lambda", 1),
19 |                 nthread=8, n_jobs=8, random_state=7)
20 |     model.fit(dev_data[self.var_names],
21 |               dev_data[self.dep], dev_data[self.weight])
22 |     offks = self.sloveKS(model, off_data[self.var_names],
23 |                          off_data[self.dep], off_data[self.weight])
24 |     train_number = 0
25 |     print("train_number: %s, offks: %s" % (train_number, offks))
26 |     del_list = list()
27 |     oldks = offks
28 |     while True:
29 |         bad_ind = True
30 |         for var_name in self.var_names:
31 |             #遍历每一个特征
32 |             model=xgb.XGBClassifier(
33 |                 max_depth=params.get("max_depth", 3),
34 |                 learning_rate=params.get("learning_rate",0.05),
35 |                 n_estimators=params.get("n_estimators", 100),
36 |                 min_child_weight=params.get("min_child_weight",1),
37 |                 subsample=params.get("subsample", 1),
38 |                 scale_pos_weight=params.get("scale_pos_weight",1),
39 |                 reg_lambda=params.get("reg_lambda", 1),
40 |                 nthread=10,n_jobs=10,random_state=7)
41 |             #将当前特征从模型中去掉
42 |             names = [var for var in self.var_names
43 |                      if var_name != var]
44 |             model.fit(dev_data[names], dev_data[self.dep],
45 |                       dev_data[self.weight])
46 |             train_number += 1
47 |             offks = self.sloveKS(model, off_data[names],
48 |                                  off_data[self.dep], off_data[self.weight])
49 |             '''
50 |             比较KS是否有提升,
51 |             如果有提升或者无明显变化,
52 |             则可以将特征去掉
53 |             '''
54 |             if offks >= oldks:
55 |                 oldks = offks
56 |                 bad_ind = False
57 |                 del_list.append(var_name)
58 |                 self.var_names = names
59 |             else:
60 |                 continue
61 |         if bad_ind:
62 |             break
63 |     print("(End) train_n: %s, offks: %s del_list_vars: %s"
64 |           % (train_number, offks, del_list))
65 | 
66 | 
--------------------------------------------------------------------------------
/智能风控(代码)/第2章/2.7.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.metrics import roc_auc_score,roc_curve,auc
3 | import math   #下文KS分组计算会用到
4 | from sklearn.linear_model import LogisticRegression
5 | import numpy as np
6 | data = pd.read_csv('Acard.txt')
7 | data.head()
8 | data.obs_mth.unique()
9 | train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
10 | val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
11 | 
12 | feature_lst = ['person_info','finance_info','credit_info','act_info']
13 | x = train[feature_lst]
14 | y = train['bad_ind']
15 | 
16 | val_x = val[feature_lst]
17 | val_y = val['bad_ind']
18 | 
19 | lr_model = LogisticRegression(C=0.1,class_weight='balanced')
20 | lr_model.fit(x,y)
21 | 
22 | y_pred = lr_model.predict_proba(x)[:,1]
23 | fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred)
24 | train_ks = abs(fpr_lr_train - tpr_lr_train).max()
25 | print('train_ks : ',train_ks)
26 | 
27 | y_pred = lr_model.predict_proba(val_x)[:,1]
28 | fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)
29 | val_ks = abs(fpr_lr - tpr_lr).max()
30 | print('val_ks : ',val_ks)
31 | 
32 | from matplotlib import pyplot as plt
33 | plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')
34 | plt.plot(fpr_lr,tpr_lr,label = 'evl LR')
35 | plt.plot([0,1],[0,1],'k--')
36 | plt.xlabel('False positive rate')
37 | plt.ylabel('True positive rate')
38 | plt.title('ROC Curve')
39 | plt.legend(loc = 'best')
40 | plt.show()
41 | model = lr_model
42 | row_num, col_num = 0, 0
43 | bins = 20
44 | Y_predict = [s[1] for s in model.predict_proba(val_x)]
45 | Y = val_y
46 | nrows = Y.shape[0]
47 | lis = [(Y_predict[i], Y[i]) for i in range(nrows)]
48 | ks_lis = sorted(lis,
key=lambda x: x[0], reverse=True) 49 | bin_num = int(nrows/bins+1) 50 | bad = sum([1 for (p, y) in ks_lis if y > 0.5]) 51 | good = sum([1 for (p, y) in ks_lis if y <= 0.5]) 52 | bad_cnt, good_cnt = 0, 0 53 | KS = [] 54 | BAD = [] 55 | GOOD = [] 56 | BAD_CNT = [] 57 | GOOD_CNT = [] 58 | BAD_PCTG = [] 59 | BADRATE = [] 60 | dct_report = {} 61 | for j in range(bins): 62 | ds = ks_lis[j*bin_num: min((j+1)*bin_num, nrows)] 63 | bad1 = sum([1 for (p, y) in ds if y > 0.5]) 64 | good1 = sum([1 for (p, y) in ds if y <= 0.5]) 65 | bad_cnt += bad1 66 | good_cnt += good1 67 | bad_pctg = round(bad_cnt/sum(val_y),3) 68 | badrate = round(bad1/(bad1+good1),3) 69 | ks = round(math.fabs((bad_cnt / bad) - (good_cnt / good)),3) 70 | KS.append(ks) 71 | BAD.append(bad1) 72 | GOOD.append(good1) 73 | BAD_CNT.append(bad_cnt) 74 | GOOD_CNT.append(good_cnt) 75 | BAD_PCTG.append(bad_pctg) 76 | BADRATE.append(badrate) 77 | dct_report['KS'] = KS 78 | dct_report['负样本个数'] = BAD 79 | dct_report['正样本个数'] = GOOD 80 | dct_report['负样本累计个数'] = BAD_CNT 81 | dct_report['正样本累计个数'] = GOOD_CNT 82 | dct_report['捕获率'] = BAD_PCTG 83 | dct_report['负样本占比'] = BADRATE 84 | val_repot = pd.DataFrame(dct_report) 85 | print(val_repot) 86 | 87 | from pyecharts.charts import * 88 | from pyecharts import options as opts 89 | from pylab import * 90 | mpl.rcParams['font.sans-serif'] = ['SimHei'] 91 | np.set_printoptions(suppress=True) 92 | pd.set_option('display.unicode.ambiguous_as_wide', True) 93 | pd.set_option('display.unicode.east_asian_width', True) 94 | line = ( 95 | 96 | Line() 97 | .add_xaxis(list(val_repot.index)) 98 | .add_yaxis( 99 | "分组坏人占比", 100 | list(val_repot.BADRATE), 101 | yaxis_index=0, 102 | color="red", 103 | ) 104 | .set_global_opts( 105 | title_opts=opts.TitleOpts(title="行为评分卡模型表现"), 106 | ) 107 | .extend_axis( 108 | yaxis=opts.AxisOpts( 109 | name="累计坏人占比", 110 | type_="value", 111 | min_=0, 112 | max_=0.5, 113 | position="right", 114 | axisline_opts=opts.AxisLineOpts( 115 | linestyle_opts=opts.LineStyleOpts(color="red") 116 | ), 117 | axislabel_opts=opts.LabelOpts(formatter="{value}"), 118 | ) 119 | 120 | ) 121 | .add_xaxis(list(val_repot.index)) 122 | .add_yaxis( 123 | "KS", 124 | list(val_repot['KS']), 125 | yaxis_index=1, 126 | color="blue", 127 | label_opts=opts.LabelOpts(is_show=False), 128 | ) 129 | ) 130 | line.render_notebook() 131 | 132 | print('变量名单:',feature_lst) 133 | print('系数:',lr_model.coef_) 134 | print('截距:',lr_model.intercept_) 135 | 136 | import math 137 | #算分数onekey 138 | def score(person_info,finance_info,credit_info,act_info): 139 | xbeta = person_info * ( 3.49460978) \ 140 | + finance_info * ( 11.40051582 ) \ 141 | + credit_info * (2.45541981) \ 142 | + act_info * ( -1.68676079) \ 143 | -0.34484897 144 | score = 650-34* (xbeta)/math.log(2) 145 | return score 146 | val['score'] = val.apply(lambda x : 147 | score(x.person_info,x.finance_info,x. 
148 | credit_info,x.act_info) ,axis=1)
149 | fpr_lr,tpr_lr,_ = roc_curve(val_y,val['score'])
150 | val_ks = abs(fpr_lr - tpr_lr).max()
151 | print('val_ks : ',val_ks)
152 | 
153 | #对应评级区间
154 | def level(score):
155 |     level = 0
156 |     if score <= 600:
157 |         level = "D"
158 |     elif score <= 640 and score > 600 :
159 |         level = "C"
160 |     elif score <= 680 and score > 640:
161 |         level = "B"
162 |     elif score > 680 :
163 |         level = "A"
164 |     return level
165 | val['level'] = val.score.map(lambda x : level(x) )
166 | print(val.level.groupby(val.level).count()/len(val))
167 | 
168 | import xgboost as xgb
169 | data = pd.read_csv('Acard.txt')
170 | df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
171 | val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
172 | lst = ['person_info','finance_info','credit_info','act_info']
173 | 
174 | train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
175 | evl = data[data.obs_mth == '2018-11-30'].reset_index().copy()
176 | 
177 | x = train[lst]
178 | y = train['bad_ind']
179 | 
180 | evl_x = evl[lst]
181 | evl_y = evl['bad_ind']
182 | 
183 | #定义XGB函数
184 | def XGB_test(train_x,train_y,test_x,test_y):
185 |     from multiprocessing import cpu_count
186 |     clf = xgb.XGBClassifier(
187 |         max_depth=2, n_estimators=800,   #迭代次数由n_estimators控制
188 |         reg_alpha=0.0, reg_lambda=1,
189 |         objective='binary:logistic',
190 |         subsample=0.7,
191 |         colsample_bytree=0.7,
192 |         learning_rate=0.05, min_child_weight=50,
193 |         random_state=None,n_jobs=cpu_count()-1
194 |     )
195 | 
196 |     clf.fit(train_x, train_y,eval_set=[(train_x, train_y),(test_x,test_y)],
197 |             eval_metric='auc',early_stopping_rounds=100)
198 |     print(train_x.shape[1])   #入模特征个数
199 |     return clf,clf.best_score   #早停时验证集上的最优AUC
200 | 
201 | #模型训练
202 | model,auc = XGB_test(x,y,evl_x,evl_y)
203 | #训练集预测
204 | y_pred = model.predict_proba(x)[:,1]
205 | fpr_xgb_train,tpr_xgb_train,_ = roc_curve(y,y_pred)
206 | train_ks = abs(fpr_xgb_train - tpr_xgb_train).max()
207 | print('train_ks : ',train_ks)
208 | #跨时间验证集预测
209 | y_pred = model.predict_proba(evl_x)[:,1]
210 | fpr_xgb,tpr_xgb,_ = roc_curve(evl_y,y_pred)
211 | evl_ks = abs(fpr_xgb - tpr_xgb).max()
212 | print('evl_ks : ',evl_ks)
213 | #画出ROC曲线并计算KS值
214 | from matplotlib import pyplot as plt
215 | plt.plot(fpr_xgb_train,tpr_xgb_train,label = 'train XGB')
216 | plt.plot(fpr_xgb,tpr_xgb,label = 'evl XGB')
217 | plt.plot([0,1],[0,1],'k--')
218 | plt.xlabel('False positive rate')
219 | plt.ylabel('True positive rate')
220 | plt.title('ROC Curve')
221 | plt.legend(loc = 'best')
222 | plt.show()
223 | 
224 | row_num, col_num = 0, 0
225 | bins = 20
226 | Y_predict = evl['score']   #score列由下文第269行起的打分代码生成,应先运行该段
227 | Y = evl_y
228 | nrows = Y.shape[0]
229 | lis = [(Y_predict[i], Y[i]) for i in range(nrows)]
230 | ks_lis = sorted(lis, key=lambda x: x[0], reverse=True)
231 | bin_num = int(nrows/bins+1)
232 | bad = sum([1 for (p, y) in ks_lis if y > 0.5])
233 | good = sum([1 for (p, y) in ks_lis if y <= 0.5])
234 | bad_cnt, good_cnt = 0, 0
235 | KS = []
236 | BAD = []
237 | GOOD = []
238 | BAD_CNT = []
239 | GOOD_CNT = []
240 | BAD_PCTG = []
241 | BADRATE = []
242 | dct_report = {}
243 | for j in range(bins):
244 |     ds = ks_lis[j*bin_num: min((j+1)*bin_num, nrows)]
245 |     bad1 = sum([1 for (p, y) in ds if y > 0.5])
246 |     good1 = sum([1 for (p, y) in ds if y <= 0.5])
247 |     bad_cnt += bad1
248 |     good_cnt += good1
249 |     bad_pctg = round(bad_cnt/sum(evl_y),3)
250 |     badrate = round(bad1/(bad1+good1),3)
251 |     ks = round(math.fabs((bad_cnt /
bad) - (good_cnt / good)),3) 252 | KS.append(ks) 253 | BAD.append(bad1) 254 | GOOD.append(good1) 255 | BAD_CNT.append(bad_cnt) 256 | GOOD_CNT.append(good_cnt) 257 | BAD_PCTG.append(bad_pctg) 258 | BADRATE.append(badrate) 259 | dct_report['KS'] = KS 260 | dct_report['BAD'] = BAD 261 | dct_report['GOOD'] = GOOD 262 | dct_report['BAD_CNT'] = BAD_CNT 263 | dct_report['GOOD_CNT'] = GOOD_CNT 264 | dct_report['BAD_PCTG'] = BAD_PCTG 265 | dct_report['BADRATE'] = BADRATE 266 | val_repot = pd.DataFrame(dct_report) 267 | print(val_repot) 268 | 269 | def score(pred): 270 | score = 600+50*(math.log2((1- pred)/ pred)) 271 | return score 272 | evl['xbeta'] = model.predict_proba(evl_x)[:,1] 273 | evl['score'] = evl.apply(lambda x : score(x.xbeta) ,axis=1) 274 | fpr_lr,tpr_lr,_ = roc_curve(evl_y,evl['score']) 275 | evl_ks = abs(fpr_lr - tpr_lr).max() 276 | print('val_ks : ',evl_ks) 277 | 278 | # 自定义损失函数,需要提供损失函数的一阶导和二阶导 279 | def loglikelood(preds, dtrain): 280 | labels = dtrain.get_label() 281 | preds = 1.0 / (1.0 + np.exp(-preds)) 282 | grad = preds - labels 283 | hess = preds * (1.0-preds) 284 | return grad, hess 285 | 286 | # 自定义前20%正样本占比最大化评价函数 287 | def binary_error(preds, train_data): 288 | labels = train_data.get_label() 289 | dct = pd.DataFrame({'pred':preds,'percent':preds,'labels':labels}) 290 | #取百分位点对应的阈值 291 | key = dct['percent'].quantile(0.2) 292 | #按照阈值处理成二分类任务 293 | dct['percent']= dct['percent'].map(lambda x :1 if x <= key else 0) 294 | #计算评价函数,权重默认0.5,可以根据情况调整 295 | result = np.mean(dct[dct.percent== 1]['labels'] == 1)*0.5 \ 296 | + np.mean((dct.labels - dct.pred)**2)*0.5 297 | return 'error',result 298 | 299 | watchlist = [(dtest,'eval'), (dtrain,'train')] 300 | param = {'max_depth':3, 'eta':0.1, 'silent':1} 301 | num_round = 100 302 | # 自定义损失函数训练 303 | bst = xgb.train(param, dtrain, num_round, watchlist, loglikelood, binary_error) 304 | -------------------------------------------------------------------------------- /智能风控(代码)/第3章/3.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:25:58 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import pandas as pd 9 | from sklearn.metrics import roc_auc_score,roc_curve,auc 10 | from sklearn.model_selection import train_test_split 11 | from sklearn import metrics 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.svm import LinearSVC 14 | import numpy as np 15 | import random 16 | import math 17 | from sklearn.calibration import CalibratedClassifierCV 18 | data = pd.read_excel('./data/tra_sample.xlsx') 19 | data.head() 20 | feature_lst = ['zx_score','msg_cnt','phone_num_cnt','register_days'] 21 | train = data[data.type == 'target'].reset_index().copy() 22 | diff = data[data.type == 'origin'].reset_index().copy() 23 | val = data[data.type == 'offtime'].reset_index().copy() 24 | 25 | ''' 26 | TrainS 目标域样本 27 | TrainA 源域样本 28 | LabelS 目标域标签 29 | LabelA 源域标签 30 | ''' 31 | 32 | train = train.loc[:1200] 33 | 34 | trans_S = train[feature_lst].copy() 35 | label_S = train['bad_ind'].copy() 36 | 37 | trans_A = diff[feature_lst].copy() 38 | label_A = diff['bad_ind'].copy() 39 | 40 | val_x = val[feature_lst].copy() 41 | val_y = val['bad_ind'].copy() 42 | 43 | test = val_x.copy() 44 | lr_model = LogisticRegression(C=0.1,class_weight = 'balanced',solver = 'liblinear') 45 | lr_model.fit(trans_S,label_S) 46 | 47 | y_pred = lr_model.predict_proba(trans_S)[:,1] 48 | fpr_lr_train,tpr_lr_train,_ = roc_curve(label_S,y_pred) 49 | train_ks = 
abs(fpr_lr_train - tpr_lr_train).max() 50 | print('train_ks : ',train_ks) 51 | 52 | y_pred = lr_model.predict_proba(test)[:,1] 53 | fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred) 54 | val_ks = abs(fpr_lr - tpr_lr).max() 55 | print('val_ks : ',val_ks) 56 | 57 | from matplotlib import pyplot as plt 58 | plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR') 59 | plt.plot(fpr_lr,tpr_lr,label = 'evl LR') 60 | plt.plot([0,1],[0,1],'k--') 61 | plt.xlabel('False positive rate') 62 | plt.ylabel('True positive rate') 63 | plt.title('ROC Curve') 64 | plt.legend(loc = 'best') 65 | plt.show() 66 | trans_data = np.concatenate((trans_A, trans_S), axis=0) 67 | trans_label = np.concatenate((label_A, label_S), axis=0) 68 | 69 | lr_model = LogisticRegression(C=0.3,class_weight = 'balanced',solver = 'liblinear') 70 | lr_model.fit(trans_A,label_A) 71 | 72 | y_pred = lr_model.predict_proba(trans_data)[:,1] 73 | fpr_lr_train,tpr_lr_train,_ = roc_curve(trans_label,y_pred) 74 | train_ks = abs(fpr_lr_train - tpr_lr_train).max() 75 | print('train_ks : ',train_ks) 76 | 77 | y_pred = lr_model.predict_proba(test)[:,1] 78 | fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred) 79 | val_ks = abs(fpr_lr - tpr_lr).max() 80 | print('val_ks : ',val_ks) 81 | 82 | from matplotlib import pyplot as plt 83 | plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR') 84 | plt.plot(fpr_lr,tpr_lr,label = 'evl LR') 85 | plt.plot([0,1],[0,1],'k--') 86 | plt.xlabel('False positive rate') 87 | plt.ylabel('True positive rate') 88 | plt.title('ROC Curve') 89 | plt.legend(loc = 'best') 90 | plt.show() 91 | import numpy as np 92 | import pandas as pd 93 | from sklearn.linear_model import LogisticRegression 94 | from sklearn.metrics import roc_curve 95 | 96 | def Tr_lr_boost(trans_A,trans_S,label_A,label_S,test,label_test, 97 | N=500,early_stopping_rounds =100): 98 | """ 99 | 逻辑回归的学习率、权重的大小,影响整体收敛的快慢 100 | H 测试样本分类结果 101 | TrainS 目标域样本 102 | TrainA 源域样本 103 | LabelS 目标域标签 104 | LabelA 源域标签 105 | Test 测试样本 106 | N 迭代次数 107 | early_stopping_rounds 提前停止轮次 108 | """ 109 | #计算weight 110 | def calculate_P(weights, label): 111 | total = np.sum(weights) 112 | return np.asarray(weights / total, order='C') 113 | 114 | #用逻辑回归作为基分类器,输出概率 115 | def train_classify(trans_data, trans_label, test_data, P): 116 | clf = LogisticRegression(C=0.3,class_weight = 'balanced',solver='liblinear') 117 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 118 | return clf.predict_proba(test_data)[:,1],clf 119 | 120 | #计算在目标域上面的错误率 121 | def calculate_error_rate(label_R, label_H, weight): 122 | total = np.sum(weight) 123 | return np.sum(weight[:, 0] / total * np.abs(label_R - label_H)) 124 | 125 | #根据逻辑回归输出的score的得到标签,注意这里不能用predict直接输出标签 126 | def put_label(score_H,thred): 127 | new_label_H = [] 128 | for i in score_H: 129 | if i <= thred: 130 | new_label_H.append(0) 131 | else: 132 | new_label_H.append(1) 133 | return new_label_H 134 | 135 | #指定迭代次数,相当于集成模型中基模型的数量 136 | 137 | 138 | #拼接数据集 139 | trans_data = np.concatenate((trans_A, trans_S), axis=0) 140 | trans_label = np.concatenate((label_A, label_S), axis=0) 141 | 142 | #三个数据集样本数 143 | row_A = trans_A.shape[0] 144 | row_S = trans_S.shape[0] 145 | row_T = test.shape[0] 146 | 147 | #三个数据集合并为打分数据集 148 | test_data = np.concatenate((trans_data, test), axis=0) 149 | 150 | # 初始化权重 151 | weights_A = np.ones([row_A, 1])/row_A 152 | weights_S = np.ones([row_S, 1])/row_S*2 153 | weights = np.concatenate((weights_A, weights_S), axis=0) 154 | 155 | #按照公式初始化beta值 156 | bata = 1 / (1 + np.sqrt(2 * np.log(row_A / N))) 157 | 158 | 159 | # 
存每一次迭代的bata值=error_rate / (1 - error_rate) 160 | bata_T = np.zeros([1, N]) 161 | # 存储每次迭代的标签 162 | result_label = np.ones([row_A + row_S + row_T, N]) 163 | 164 | trans_data = np.asarray(trans_data, order='C') 165 | trans_label = np.asarray(trans_label, order='C') 166 | test_data = np.asarray(test_data, order='C') 167 | 168 | #最优KS 169 | best_ks = -1 170 | #最优基模型数量 171 | best_round = -1 172 | #最优模型 173 | best_model = -1 174 | 175 | """ 176 | 初始化结束 177 | 正式开始训练 178 | """ 179 | 180 | for i in range(N): 181 | P = calculate_P(weights, trans_label) 182 | 183 | result_label[:, i],model = train_classify(trans_data, trans_label, test_data, P) 184 | score_H = result_label[row_A:row_A + row_S, i] 185 | pctg = np.sum(trans_label)/len(trans_label) 186 | thred = pd.DataFrame(score_H).quantile(1-pctg)[0] 187 | 188 | label_H = put_label(score_H,thred) 189 | 190 | #计算在目标域上的错误率 191 | error_rate = calculate_error_rate(label_S, label_H, 192 | weights[row_A:row_A + row_S, :]) 193 | # 防止过拟合 194 | if error_rate > 0.5: 195 | error_rate = 0.5 196 | if error_rate == 0: 197 | N = i 198 | break 199 | 200 | bata_T[0, i] = error_rate / (1 - error_rate) 201 | 202 | # 调整目标域样本权重 203 | for j in range(row_S): 204 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], \ 205 | (-np.abs(result_label[row_A + j, i] - label_S[j]))) 206 | 207 | # 调整源域样本权重 208 | for j in range(row_A): 209 | weights[j] = weights[j] * np.power(bata, 210 | np.abs(result_label[j, i] - label_A[j])) 211 | y_pred = result_label[(row_A + row_S):,i] 212 | fpr_lr_train,tpr_lr_train,_ = roc_curve(label_test,y_pred) 213 | train_ks = abs(fpr_lr_train - tpr_lr_train).max() 214 | print('test_ks : ',train_ks,'当前第',i+1,'轮') 215 | 216 | # 不再使用后一半学习器投票,而是只保留效果最好的逻辑回归模型 217 | if train_ks > best_ks : 218 | best_ks = train_ks 219 | best_round = i 220 | best_model = model 221 | # 当超过eadrly_stopping_rounds轮KS不再提升后,停止训练 222 | if best_round < i - early_stopping_rounds: 223 | break 224 | return best_ks,best_round,best_model 225 | 226 | # 训练并得到最优模型best_model 227 | best_ks,best_round,best_model = Tr_lr_boost(trans_A,trans_S,label_A,label_S, 228 | test,label_test=val_y,N=300, 229 | early_stopping_rounds=20) 230 | 231 | y_pred = best_model.predict_proba(trans_S)[:,1] 232 | fpr_lr_train,tpr_lr_train,_ = roc_curve(label_S,y_pred) 233 | train_ks = abs(fpr_lr_train - tpr_lr_train).max() 234 | print('train_ks : ',train_ks) 235 | 236 | y_pred = best_model.predict_proba(test)[:,1] 237 | fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred) 238 | val_ks = abs(fpr_lr - tpr_lr).max() 239 | print('val_ks : ',val_ks) 240 | 241 | from matplotlib import pyplot as plt 242 | plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR') 243 | plt.plot(fpr_lr,tpr_lr,label = 'evl LR') 244 | plt.plot([0,1],[0,1],'k--') 245 | plt.xlabel('False positive rate') 246 | plt.ylabel('True positive rate') 247 | plt.title('ROC Curve') 248 | plt.legend(loc = 'best') 249 | plt.show() 250 | 251 | -------------------------------------------------------------------------------- /智能风控(代码)/第3章/3.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:30:41 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import numpy as np 9 | from scipy.linalg.misc import norm 10 | from scipy.sparse.linalg import eigs 11 | 12 | def JDA(Xs,Xt,Ys,Yt,k=100,lamda=0.1,ker='primal',gamma=1.0,data='default'): 13 | X = np.hstack((Xs , Xt)) 14 | X = np.diag(1/np.sqrt(np.sum(X**2))) 15 | (m,n) = X.shape 16 | #源域样本量 17 | ns = Xs.shape[1] 18 | #目标域样本量 19 
| nt = Xt.shape[1] 20 | #分类个数 21 | C = len(np.unique(Ys)) 22 | # 生成MMD矩阵 23 | e1 = 1/ns*np.ones((ns,1)) 24 | e2 = 1/nt*np.ones((nt,1)) 25 | e = np.vstack((e1,e2)) 26 | M = np.dot(e,e.T)*C 27 | 28 | #除了0,空,False以外都可以运行 29 | if any(Yt) and len(Yt)==nt: 30 | for c in np.reshape(np.unique(Ys) ,-1 ,1): 31 | e1 = np.zeros((ns,1)) 32 | e1[Ys == c] = 1/len(Ys[Ys == c]) 33 | e2 = np.zeros((nt,1)) 34 | e2[Yt ==c] = -1/len(Yt[Yt ==c]) 35 | e = np.hstack((e1 ,e2)) 36 | e = e[np.isinf(e) == 0] 37 | M = M+np.dot(e,e.T) 38 | 39 | #矩阵迹求平方根 40 | M = M/norm(M ,ord = 'fro' ) 41 | 42 | # 计算中心矩阵 43 | H = np.eye(n) - 1/(n)*np.ones((n,n)) 44 | 45 | # Joint Distribution Adaptation: JDA 46 | if ker == 'primal': 47 | #特征值特征向量 48 | A = eigs(np.dot(np.dot(X,M),X.T)+lamda*np.eye(m), 49 | k=k, M=np.dot(np.dot(X,H),X.T), which='SM') 50 | Z = np.dot(A.T,X) 51 | else: 52 | pass 53 | return A,Z 54 | 55 | -------------------------------------------------------------------------------- /智能风控(代码)/第3章/3.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:31:33 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import numpy as np 9 | from scipy import sparse as sp 10 | def DAELM(Train_s,Train_t,Test_t,NL,Type="CLASSIFIER" , Num_hid=100 ,Active_Function="sig"): 11 | ''' 12 | Train_s:源域训练集 13 | Train_t:目标域训练集 14 | Test_t:目标域测试集 15 | Type:模型类型(分类:"CLASSIFIER",回归:"REGRESSION") 16 | Num_hid:隐层神经元个数,默认100个 17 | Active_Function:映射函数(" sigmoid ":sigmoid函数, "sin":正弦函数) 18 | NL:模型选择 19 | ''' 20 | 21 | Cs = 0.01 22 | Ct = 0.01 23 | 24 | #回归或分类 25 | REGRESSION=0 26 | CLASSIFIER=1 27 | 28 | #训练数据 29 | train_data = Train_s 30 | T = train_data[:,0].T 31 | P = train_data[:,1:train_data.shape[1]].T 32 | del train_data 33 | 34 | #目标域数据 35 | train_target_dt = Train_t 36 | Tt = train_target_dt[:,0].T 37 | Pt = train_target_dt[:,1:train_target_dt.shape[1]].T 38 | 39 | #测试集数据 40 | test_data = Test_t 41 | TVT = test_data[:,0].T 42 | TE0 = test_data[:,0].T 43 | TVP = test_data[:,2:test_data.shape[1]].T 44 | del test_data 45 | 46 | Num_train = P.shape[1] 47 | Num_train_Target = Pt.shape[1] 48 | Num_test = TVP.shape[1] 49 | Num_input= P.shape[0] 50 | 51 | if Type is not "REGRESSION": 52 | sorted_target = np.sort(np.hstack((T , TVT))) 53 | label = np.zeros((1,1)) 54 | label[0,0] = sorted_target[0,0] 55 | j = 0 56 | for i in range(2,(Num_train+Num_test+1)): 57 | if sorted_target[0,i-1] != label[0,j-1]: 58 | j=j+1 59 | label[0,j-1] = sorted_target[0,i-1] 60 | 61 | number_class = j+1 62 | Num_output = number_class 63 | 64 | 65 | temp_T = np.zeros(Num_output , Num_train) 66 | for i in range(1,Num_train+1): 67 | for j in range(1,number_class+1): 68 | if label(0,j-1) == T(0,i-1): 69 | break 70 | temp_T[j-1 , i-1] = 1 71 | T = temp_T*2-1 72 | 73 | Tt_m = np.zeros(Num_output , Num_train_Target) 74 | for i in range(1,Num_train_Target+1): 75 | for j in range(1 , number_class+1): 76 | if label[0,j-1] == Tt[0,i-1]: 77 | break 78 | Tt_m[j-1 , i-1] = 1 79 | Tt = Tt_m*2-1 80 | 81 | 82 | temp_TV_T = np.zeros(Num_output,Num_test) 83 | for i in range(1,Num_test): 84 | for j in range(1,number_class+1): 85 | if label(0,j-1) == TVT(0,i-1): 86 | break 87 | temp_TV_T[j-1 , i-1] = 1 88 | TVT = temp_TV_T*2-1 89 | 90 | InputWeight = np.random.rand(Num_hid,Num_input)*2-1 91 | Bis_hid = np.random.rand(Num_hid ,1) 92 | H_m = InputWeight*P 93 | Ht_m = InputWeight*Pt 94 | del P 95 | del Pt 96 | 97 | ind = np.ones(1,Num_train) 98 | indt = np.ones(1,Num_train_Target) 99 | BiasMatrix = 
Bis_hid[:,ind-1] 100 | BiasMatrixT = Bis_hid[:,indt-1] 101 | H_m = H_m + BiasMatrix 102 | Ht_m=Ht_m+BiasMatrixT 103 | 104 | if Active_Function == "sigmoid": 105 | H = 1/(1+np.exp(-H_m)) 106 | Ht = 1/(1+np.exp(-Ht_m)) 107 | if Active_Function == "sin": 108 | H = np.sin(H_m) 109 | Ht = np.sin(Ht_m) 110 | if Active_Function != " sigmoid " and Active_Function!="sin": 111 | pass 112 | 113 | del H_m 114 | del Ht_m 115 | 116 | n = Num_hid 117 | 118 | #DAELM模型 119 | H=H.T 120 | Ht=Ht.T 121 | T=T.T 122 | Tt=Tt.T 123 | 124 | if NL == 0: 125 | A = Ht*H.T 126 | B = Ht*Ht.T+np.eye(Num_train_Target)/Ct 127 | C=H*Ht.T 128 | D=H*H.T+np.eye(Num_train)/Cs 129 | ApT=np.linalg.inv(B)*Tt-np.linalg.inv(B)*A* \ 130 | np.linalg.inv(C*np.linalg.inv(B)*A-D)*(C*np.linalg.inv(B)*Tt-T) 131 | ApS=inv(C*np.linalg.inv(B)*A-D)*(C*np.linalg.inv(B)*Tt-T) 132 | OutputWeight=H.T*ApS+Ht.T*ApT 133 | else: 134 | OutputWeight=np.linalg.inv(np.eye(n)+Cs*H.t*H+Ct*Ht.T*Ht)*(Cs*H.T*T+Ct*Ht.T*Tt) 135 | 136 | #计算准确率 137 | 138 | Y=(H * OutputWeight).T 139 | 140 | H_m_test=InputWeight*TVP 141 | ind = np.ones(1,Num_hid) 142 | BiasMatrix=Bis_hid[:,ind-1] 143 | H_m_test = H_m_test+BiasMatrix 144 | if Active_Function == "sig": 145 | H_test = 1/(1+np.exp(-H_m_test)) 146 | if Active_Function == "sin": 147 | H_test = np.sin(H_m_test) 148 | 149 | TY = (H_test.T*OutputWeight).T 150 | 151 | #返回测试集结果 152 | if Type =="CLASSIFIER": 153 | return TY 154 | else: 155 | pass 156 | 157 | -------------------------------------------------------------------------------- /智能风控(代码)/第3章/3.6.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:32:43 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | from sklearn.metrics import roc_auc_score as AUC 9 | import pandas as pd 10 | import numpy as np 11 | 12 | class Tra_learn3ft (object): 13 | """ 14 | 一种多模型融合的Tradaboost变体 15 | 使用三个模型同时进行样本筛选,目的是减小variance 16 | clfA 模型A 17 | clfB 模型B 18 | clfC 模型C 19 | step 预计去掉的样本比例 20 | max_turns最大迭代次数 21 | """ 22 | def __init__(self,clfA,clfB,clfC,step,max_turns=5): 23 | self.clfA = clfA 24 | self.clfB = clfB 25 | self.clfC = clfC 26 | self.step = step 27 | self.max_turns = max_turns 28 | self.scoreA = 0 29 | self.scoreB = 0 30 | self.scoreC = 0 31 | 32 | def tlearn(self,dev,test,val,bad_ind,featureA,featureB,featureC,drop_rate): 33 | """ 34 | dev 训练集 源域 35 | test 测试集 辅助域 36 | val 验证集 37 | bad_ind 标签 38 | featureA 特征组A 39 | featureB 特征组B 40 | featureC 特征组C 41 | """ 42 | print(len(featureA),len(featureB),len(featureC)) 43 | result = pd.DataFrame() 44 | temp_test = test 45 | features = list(set(featureA+featureB+featureC)) 46 | turn = 1 47 | while( turn <= self.max_turns): 48 | new = pd.DataFrame() 49 | 50 | """ 51 | 模型A对特征组featureA训练, 52 | 并预测得到dev和test和val的概率 53 | 以及test上的分类结果(分数分布在0.8*(min+max)两侧) 54 | """ 55 | self.clfA.fit(dev[featureA],dev[bad_ind]) 56 | predA= self.clfA.predict_proba(dev[featureA])[:,1] 57 | probA = self.clfA.predict_proba(test[featureA])[:,1] 58 | preA = (probA > (np.max(probA)+np.min(probA))*0.8) 59 | valid_a = self.clfA.predict_proba(val[featureA])[:,1] 60 | """ 61 | 模型B对特征组featureB训练, 62 | 并预测得到dev和test和val的概率 63 | 以及test上的分类结果(分数分布在0.8*(min+max)两侧) 64 | """ 65 | self.clfB.fit(dev[featureB],dev[bad_ind]) 66 | predB = self.clfB.predict_proba(dev[featureB])[:,1] 67 | probB = self.clfB.predict_proba(test[featureB])[:,1] 68 | preB = (probA > (np.max(probB)+np.min(probB))*0.8) 69 | valid_b = self.clfB.predict_proba(val[featureB])[:,1] 70 | """ 71 | 模型C对特征组featureC训练, 
72 | 并预测得到dev和test和val的概率 73 | 以及test上的分类结果(分数分布在0.8*(min+max)两侧) 74 | """ 75 | self.clfC.fit(dev[featureC],dev[bad_ind]) 76 | predC= self.clfC.predict_proba(dev[featureC])[:,1] 77 | probC = self.clfC.predict_proba(test[featureC])[:,1] 78 | preC = (probC > (np.max(probC)+np.min(probC))*0.8) 79 | valid_c = self.clfC.predict_proba(val[featureC])[:,1] 80 | """ 81 | 分别计算三个模型在val上的AUC 82 | 模型加权融合的策略:以单模型的AUC作为权重 83 | """ 84 | valid_scoreA = AUC(val[bad_ind],valid_a) 85 | valid_scoreB = AUC(val[bad_ind],valid_b) 86 | valid_scoreC = AUC(val[bad_ind],valid_c) 87 | valid_score = AUC(val[bad_ind], valid_a*valid_scoreA 88 | +valid_b*valid_scoreB + valid_c*valid_scoreC) 89 | 90 | """ 91 | index1 三个模型在test上的预测概率相同的样本 92 | sum_va 三个模型AUC之和为分母做归一化 93 | prob 测试集分类结果融合, 94 | index1(分类结果)*AUC(权重)/sum_va(归一化分母) 95 | index2 分类结果升序排列,取出两端的test样本 96 | new 筛选后样本集 97 | """ 98 | index1 = (preA==preB) & (preA==preC) 99 | sum_va = valid_scoreA+valid_scoreB+valid_scoreC 100 | prob = (probC[index1]*valid_scoreC+probA[index1]*valid_scoreA 101 | +probB[index1]*valid_scoreB)/sum_va 102 | Ap_low = np.sort(prob)[int(len(prob)*turn/2.0/self.max_turns)]-0.01 103 | Ap_high= np.sort(prob)[int(len(prob)* 104 | (1-turn/2.0/self.max_turns))]+0.01 105 | index2 = ((prob>Ap_high) | (prob0): 136 | true_y = dev.iloc[0:self.step][bad_ind] 137 | temp = predA[0:self.step]*valid_scoreA \ 138 | + predB[0:self.step]*valid_scoreB \ 139 | + predC[0:self.step]*valid_scoreC 140 | temp = (temp+0.1)/(max(temp)+0.2)#归一化 141 | temp = (true_y-1)*np.log(1-temp)-true_y*np.log(temp)#样本权重 142 | loc = int(min(self.step,len(rightSamples)*drop+2) 143 | *np.random.rand())#去除样本的比例 144 | loss_bias = np.sort(temp)[-loc] 145 | temp = np.append(temp,np.zeros(len(dev)-self.step)-99) 146 | remain_index = (temp <= loss_bias) 147 | self.step = self.step-sum(1-remain_index) 148 | else: 149 | remain_index = [] 150 | 151 | """ 152 | 得到新的test 153 | """ 154 | dev = dev[remain_index].append(rightSamples[features+[bad_ind,'no']]) 155 | test = test[~test.index.isin(rightSamples.index)] 156 | turn += 1 157 | """ 158 | 计算原始test上的AUC 159 | """ 160 | probA = self.clfA.predict_proba(test[featureA])[:,1] 161 | pA = self.clfA.predict_proba(temp_test[featureA])[:,1] 162 | valid_a = self.clfA.predict_proba(val[featureA])[:,1] 163 | 164 | probB = self.clfB.predict_proba(test[featureB])[:,1] 165 | valid_b = self.clfB.predict_proba(val[featureB])[:,1] 166 | pB = self.clfB.predict_proba(temp_test[featureB])[:,1] 167 | 168 | probC = self.clfC.predict_proba(test[features])[:,1] 169 | valid_c = self.clfC.predict_proba(val[features])[:,1] 170 | pC = self.clfC.predict_proba(temp_test[features])[:,1] 171 | 172 | self.scoreA = AUC(val[bad_ind],valid_a) 173 | self.scoreB = AUC(val[bad_ind],valid_b) 174 | self.scoreC = AUC(val[bad_ind],valid_c) 175 | 176 | return pA,pB,pC 177 | 178 | -------------------------------------------------------------------------------- /智能风控(代码)/第4章/4.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:50:39 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import xgboost as xgb 9 | from sklearn.datasets import load_digits # 训练数据 10 | xgb_params_01 = {} 11 | digits_2class = load_digits(2) 12 | X_2class = digits_2class['data'] 13 | y_2class = digits_2class['target'] 14 | dtrain_2class = xgb.DMatrix(X_2class, label=y_2class) 15 | # 训练三棵树的模型 16 | gbdt_03 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=3) 17 | # 以前面三棵树的模型为基础,从第四棵树开始训练 18 | gbdt_03a = 
xgb.train(xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model=gbdt_03) 19 | 20 | 21 | -------------------------------------------------------------------------------- /智能风控(代码)/第4章/4.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:35:39 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns; sns.set() 10 | import numpy as np 11 | #产生实验数据 12 | from sklearn.datasets.samples_generator import make_blobs 13 | X, y_true = make_blobs(n_samples=700, centers=4, 14 | cluster_std=0.5, random_state=2019) 15 | X = X[:, ::-1] #方便画图 16 | 17 | from sklearn.mixture import GaussianMixture as GMM 18 | gmm = GMM(n_components=4).fit(X) #指定聚类中心个数为4 19 | labels = gmm.predict(X) 20 | plt.scatter(X[:, 0], X[:, 1], c=labels, s=5, cmap='viridis') 21 | probs = gmm.predict_proba(X) 22 | print(probs[:10].round(2)) 23 | size = probs.max(1) 24 | plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=size) 25 | 26 | from matplotlib.patches import Ellipse 27 | #给定的位置和协方差画一个椭圆 28 | def draw_ellipse(position, covariance, ax=None, **kwargs): 29 | ax = ax or plt.gca() 30 | #将协方差转换为主轴 31 | if covariance.shape == (2, 2): 32 | U, s, Vt = np.linalg.svd(covariance) 33 | angle = np.degrees(np.arctan2(U[1, 0], U[0, 0])) 34 | width, height = 2 * np.sqrt(s) 35 | else: 36 | angle = 0 37 | width, height = 2 * np.sqrt(covariance) 38 | 39 | #画出椭圆 40 | for nsig in range(1, 4): 41 | ax.add_patch(Ellipse(position, nsig * width, nsig * height, 42 | angle, **kwargs)) 43 | #画图 44 | def plot_gmm(gmm, X, label=True, ax=None): 45 | ax = ax or plt.gca() 46 | labels = gmm.fit(X).predict(X) 47 | if label: 48 | ax.scatter(X[:, 0], X[:, 1], c=labels, s=4, cmap='viridis', zorder=2) 49 | else: 50 | ax.scatter(X[:, 0], X[:, 1], s=4, zorder=2) 51 | ax.axis('equal') 52 | w_factor = 0.2 / gmm.weights_.max() 53 | for pos, covar, w in zip(gmm.means_, gmm.covariances_ , gmm.weights_): 54 | draw_ellipse(pos, covar, alpha=w * w_factor) 55 | 56 | from sklearn.datasets import make_moons 57 | Xmoon, ymoon = make_moons(100, noise=.04, random_state=0) 58 | plt.scatter(Xmoon[:, 0], Xmoon[:, 1]) 59 | gmm2 = GMM(n_components=2, covariance_type='full', random_state=0) 60 | plot_gmm(gmm2, Xmoon) 61 | gmm10 = GMM(n_components=10, covariance_type='full', random_state=0) 62 | plot_gmm(gmm10, Xmoon, label=False) 63 | Xnew = gmm10.sample(200)[0] 64 | plt.scatter(Xnew[:, 0], Xnew[:, 1]) 65 | 66 | -------------------------------------------------------------------------------- /智能风控(代码)/第4章/4.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:41:08 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | n_components = np.arange(1, 21) 9 | models = [GMM(n, covariance_type='full', 10 | random_state=0).fit(Xmoon) for n in n_components] 11 | plt.plot(n_components, [m.bic(Xmoon) for m in models], label='BIC') 12 | plt.plot(n_components, [m.aic(Xmoon) for m in models], label='AIC') 13 | plt.legend(loc='best') 14 | plt.xlabel('n_components') 15 | -------------------------------------------------------------------------------- /智能风控(代码)/第5章/5.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:42:49 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | class imbalanceData(): 9 | 10 | """ 11 | 处理不均衡数据 12 | train训练集 13 | test测试集 14 | 
mmin低分段错分比例
15 | mmax高分段错分比例
16 | bad_ind样本标签
17 | lis不参与建模变量列表
18 | """
19 | def __init__(self, train,test,mmin,mmax, bad_ind,lis=[]):
20 | self.bad_ind = bad_ind
21 | self.train_x = train.drop([bad_ind]+lis,axis=1)
22 | self.train_y = train[bad_ind]
23 | self.test_x = test.drop([bad_ind]+lis,axis=1)
24 | self.test_y = test[bad_ind]
25 | self.columns = list(self.train_x.columns)
26 | self.keep = self.columns + [self.bad_ind]
27 | self.mmin = mmin  # was hard-coded to 0.1, silently ignoring the constructor argument
28 | self.mmax = mmax  # was hard-coded to 0.7, same problem
29 | 
30 | '''
31 | Weighting scheme:
32 | samples that are mispredicted at the head and tail of the ranking are down-weighted.
33 | 0.1 is the noise weight: such samples are excluded from oversampling.
34 | 1 is the normal-sample weight: such samples take part in oversampling.
35 | '''
36 | def weight(self,x,y):
37 | if x == 0 and y < self.mmin:
38 | return 0.1
39 | elif x == 1 and y > self.mmax:
40 | return 0.1
41 | else:
42 | return 1
43 | '''
44 | Use a LightGBM model together with weight() for sample selection:
45 | only the accurately predicted part enters the subsequent SMOTE oversampling
46 | '''
47 | def data_cleaning(self):
48 | lgb_model,lgb_auc = self.lgb_test()
49 | sample = self.train_x.copy()
50 | sample[self.bad_ind] = self.train_y
51 | sample['pred'] = lgb_model.predict_proba(self.train_x)[:,1]
52 | sample = sample.sort_values(by=['pred'],ascending=False).reset_index()
53 | sample['rank'] = np.array(sample.index)/len(sample)
54 | sample['weight'] = sample.apply(lambda x:self.weight(x.bad_ind,x['rank']),
55 | axis = 1)
56 | osvp_sample = sample[sample.weight == 1][self.keep]
57 | osnu_sample = sample[sample.weight < 1][self.keep]
58 | train_x_osvp = osvp_sample[self.columns]
59 | train_y_osvp = osvp_sample[self.bad_ind]
60 | return train_x_osvp,train_y_osvp,osnu_sample
61 | 
62 | '''
63 | Run SMOTE oversampling
64 | '''
65 | def apply_smote(self):
66 | '''
67 | Only part of the samples are oversampled:
68 | train_x_osvp,train_y_osvp take part in SMOTE;
69 | osnu_sample is the part that is kept out of the oversampling.
70 | '''
71 | train_x_osvp,train_y_osvp,osnu_sample = self.data_cleaning()
72 | rex,rey = self.smote(train_x_osvp,train_y_osvp)
73 | print('badpctn:',rey.sum()/len(rey))
74 | df_rex = pd.DataFrame(rex)
75 | df_rex.columns =self.columns
76 | df_rex['weight'] = 1
77 | df_rex[self.bad_ind] = rey
78 | df_aff_ovsp = df_rex.append(osnu_sample)
79 | return df_aff_ovsp
80 | 
81 | '''
82 | LightGBM model used by data_cleaning
83 | '''
84 | def lgb_test(self):
85 | import lightgbm as lgb
86 | clf = lgb.LGBMClassifier(boosting_type = 'gbdt',
87 | objective = 'binary',
88 | metric = 'auc',
89 | learning_rate = 0.1,
90 | n_estimators = 24,
91 | max_depth = 4,
92 | num_leaves = 25,
93 | max_bin = 40,
94 | min_data_in_leaf = 5,
95 | bagging_fraction = 0.6,
96 | bagging_freq = 0,
97 | feature_fraction = 0.8,
98 | )
99 | clf.fit(self.train_x,self.train_y,eval_set=[(self.train_x,self.train_y),
100 | (self.test_x,self.test_y)],
101 | eval_metric = 'auc')
102 | return clf,clf.best_score_['valid_1']['auc']
103 | 
104 | '''
105 | Call SMOTE from imblearn (parameter signature of the 0.4-era imbalanced-learn API)
106 | '''
107 | def smote(self,train_x_osvp,train_y_osvp,m=4,K=15,random_state=0):
108 | from imblearn.over_sampling import SMOTE
109 | smote = SMOTE(k_neighbors=K, kind='borderline1', m_neighbors=m, n_jobs=1,
110 | out_step='deprecated', random_state=random_state, ratio=None,
111 | svm_estimator='deprecated')
112 | rex,rey = smote.fit_resample(train_x_osvp,train_y_osvp)
113 | return rex,rey
# train and evl below are the development and out-of-time samples from the book's context
114 | df_aff_ovsp = imbalanceData(train=train,test=evl,mmin=0.3,mmax=0.7, bad_ind='bad_ind',
115 | lis=['index', 'uid', 'td_score', 'jxl_score', 'mj_score',
116 | 'rh_score', 'zzc_score', 'zcx_score','obs_mth']).apply_smote()
117 | from sklearn.linear_model import LogisticRegression
118 | from sklearn.metrics import roc_curve
119 | 
# Note: x and y are assumed to be the feature matrix and label derived from the
# oversampled df_aff_ovsp (book context); they are not defined in this file.
120 | lr_model = LogisticRegression(C=0.05,class_weight='balanced')
121 | lr_model.fit(x,y)
122 | 
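# KS is max |TPR - FPR| along the ROC curve; it is evaluated on the training
# set below and then on the out-of-time sample (evl) to check for overfitting.
# A minimal sketch (not from the book): the same statistic can also be obtained
# with scipy by comparing the score distributions of bad and good samples:
#   from scipy.stats import ks_2samp
#   ks = ks_2samp(y_pred[y == 1], y_pred[y == 0]).statistic
123 | y_pred = 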
lr_model.predict_proba(x)[:,1] 124 | fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred) 125 | train_ks = abs(fpr_lr_train - tpr_lr_train).max() 126 | print('train_ks : ',train_ks) 127 | 128 | y_pred = lr_model.predict_proba(evl_x)[:,1] 129 | fpr_lr,tpr_lr,_ = roc_curve(evl_y,y_pred) 130 | evl_ks = abs(fpr_lr - tpr_lr).max() 131 | print('evl_ks : ',evl_ks) 132 | 133 | from matplotlib import pyplot as plt 134 | plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR') 135 | plt.plot(fpr_lr,tpr_lr,label = 'evl LR') 136 | plt.plot([0,1],[0,1],'k--') 137 | plt.xlabel('False positive rate') 138 | plt.ylabel('True positive rate') 139 | plt.title('ROC Curve') 140 | plt.legend(loc = 'best') 141 | plt.show() 142 | 143 | -------------------------------------------------------------------------------- /智能风控(代码)/第5章/5.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:46:56 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import numpy as np 9 | from utils import * 10 | import pandas as pd 11 | import sklearn.svm as svm 12 | from collections import Counter 13 | class TSVM(object): 14 | def __init__(self): 15 | # 分别对应有label的样本权重和无label的样本权重 16 | self.Cu = 0.001 17 | self.Cl = 1 18 | def fit(self,train_data): 19 | # 将数据集中的第一个正例,和第一个负例作为真实标记样本,其余视为无标记。 20 | pos_one = train_data[train_data[:,0] == 1][0] 21 | pos_other = train_data[train_data[:,0] == 1][1:] 22 | neg_one = train_data[train_data[:,0] == -1][0] 23 | neg_other = train_data[train_data[:,0] == -1][1:] 24 | train = np.vstack((pos_one,neg_one)) 25 | #S用于对数据进行测试 26 | self.other = np.vstack((pos_other,neg_other)) 27 | # 训练一个初始的分类器,设置不均衡参数 28 | self.clf = svm.SVC(C=1.5, kernel=self.kernel) 29 | self.clf.fit(train[:,1:],train[:,0]) 30 | pred_y = self.clf.predict(self.other[:,1:]) 31 | 32 | X = np.vstack((train,self.other)) 33 | # 将预测结果放到SVM模型中进行训练 34 | y = np.vstack((train[:,0].reshape(-1,1), pred_y.reshape(-1,1)))[:,0] 35 | self.w = np.ones(train_data.shape[0]) 36 | self.w[len(train):] = self.Cu 37 | while self.Cu < self.Cl: 38 | print(X.shape,y.shape,self.w.shape) 39 | self.clf.fit(X[:,1:],y,sample_weight = self.w) 40 | while True: 41 | #返回的是带符号的距离 42 | dist = self.clf.decision_function(X[:,1:]) 43 | xi = 1 - y * dist 44 | #取出预判为正例和负例的id 45 | xi_posi, xi_negi = np.where(y[2:]>0),np.where(y[2:]<0) 46 | xi_pos , xi_neg = xi[xi_posi],xi[xi_negi] 47 | xi_pos_maxi = np.argmax(xi_pos) 48 | xi_neg_maxi = np.argmax(xi_neg) 49 | xi_pos_max = xi_pos[xi_pos_maxi] 50 | xi_neg_max = xi_neg[xi_neg_maxi] 51 | #不断地拿两个距离最大的点进行交换。 52 | #交换策略:两个点中至少有一个误分类。 53 | if xi_pos_max >0 and xi_neg_max > 0 \ 54 | and (xi_pos_max + xi_neg_max) > 2: 55 | # 交换类别 56 | y[xi_pos_maxi],y[xi_neg_maxi] = \ 57 | y[xi_neg_maxi],y[xi_pos_maxi] 58 | self.clf.fit(X[:,1:],y, sample_weight = self.w) 59 | else: 60 | break 61 | self.Cu = min(2 * self.Cu ,self.Cl) 62 | # 交换权重 63 | self.w[len(train):] = self.Cu 64 | def predict(self): 65 | pred_y = self.clf.predict(self.other[:,1:]) 66 | return 1 - np.mean(pred_y == self.other[:,0]) 67 | 68 | import numpy as np 69 | import matplotlib.pyplot as plt 70 | from sklearn.semi_supervised import label_propagation 71 | from sklearn.datasets import make_moons 72 | 73 | # 生成弧形数据 74 | n_samples = 200 75 | X, y = make_moons(n_samples, noise=0.04, random_state=0) 76 | outer, inner = 0, 1 77 | labels = np.full(n_samples, -1.) 
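# Semi-supervised convention: the label -1 marks an unlabeled sample, which
# LabelSpreading treats as unknown. Only the first and the last of the 200
# points receive true labels below; all other labels are inferred by
# propagation over the graph.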
78 | labels[0] = outer 79 | labels[-1] = inner 80 | # 使用LP算法实现标签传递 81 | label_spread = label_propagation.LabelSpreading(kernel='rbf') 82 | label_spread.fit(X, labels) 83 | 84 | # 输出标签 85 | output_labels = label_spread.transduction_ 86 | plt.figure(figsize=(8.5, 4)) 87 | plt.subplot(1, 2, 1) 88 | plt.scatter(X[labels == outer, 0], 89 | X[labels == outer, 1], color='navy', 90 | marker='s', lw=0, label="outer labeled", s=10) 91 | plt.scatter(X[labels == inner, 0], X[labels == inner, 1], 92 | color='c', marker='s', lw=0, label='inner labeled', s=10) 93 | plt.scatter(X[labels == -1, 0], X[labels == -1, 1], 94 | color='darkorange', marker='.', label='unlabeled') 95 | plt.legend(scatterpoints=1, shadow=False, loc='upper right') 96 | plt.title("Raw data (2 classes=outer and inner)") 97 | 98 | plt.subplot(1, 2, 2) 99 | output_label_array = np.asarray(output_labels) 100 | outer_numbers = np.where(output_label_array == outer)[0] 101 | inner_numbers = np.where(output_label_array == inner)[0] 102 | plt.scatter(X[outer_numbers, 0], X[outer_numbers, 1], color='navy', 103 | marker='s', lw=0, s=10, label="outer learned") 104 | plt.scatter(X[inner_numbers, 0], X[inner_numbers, 1], color='c', 105 | marker='s', lw=0, s=10, label="inner learned") 106 | plt.legend(scatterpoints=1, shadow=False, loc='upper right') 107 | plt.title("Labels learned with Label Spreading (KNN)") 108 | 109 | plt.subplots_adjust(left=0.07, bottom=0.07, right=0.9, top=0.92) 110 | plt.show() 111 | -------------------------------------------------------------------------------- /智能风控(代码)/第6章/6.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:51:40 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | from pyod.models.lof import LOF 9 | 10 | #训练异常检测模型,然后输出训练集样本的异常分 11 | clf = LOF(n_neighbors=20, algorithm='auto', leaf_size=30, 12 | metric='minkowski', p=2,metric_params=None, 13 | contamination=0.1, n_jobs=1) 14 | clf.fit(x) 15 | 16 | #异常分 17 | out_pred = clf.predict_proba(x,method ='linear')[:,1] 18 | train['out_pred'] = out_pred 19 | 20 | #异常分在0.9百分位以下的样本删掉 21 | key = train['out_pred'].quantile(0.9) 22 | 23 | x = train[train.out_pred< key][feature_lst] 24 | y = train[train.out_pred < key]['bad_ind'] 25 | 26 | val_x = val[feature_lst] 27 | val_y = val['bad_ind'] 28 | 29 | #重新训练模型 30 | lr_model = LogisticRegression(C=0.1,class_weight='balanced') 31 | lr_model.fit(x,y) 32 | y_pred = lr_model.predict_proba(x)[:,1] 33 | fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred) 34 | train_ks = abs(fpr_lr_train - tpr_lr_train).max() 35 | print('train_ks : ',train_ks) 36 | 37 | y_pred = lr_model.predict_proba(val_x)[:,1] 38 | fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred) 39 | val_ks = abs(fpr_lr - tpr_lr).max() 40 | print('val_ks : ',val_ks) 41 | 42 | from matplotlib import pyplot as plt 43 | plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR') 44 | plt.plot(fpr_lr,tpr_lr,label = 'evl LR') 45 | plt.plot([0,1],[0,1],'k--') 46 | plt.xlabel('False positive rate') 47 | plt.ylabel('True positive rate') 48 | plt.title('ROC Curve') 49 | plt.legend(loc = 'best') 50 | plt.show() 51 | 52 | -------------------------------------------------------------------------------- /智能风控(代码)/第6章/6.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:54:36 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | from pyod.models.iforest import IForest 9 | clf = IForest(behaviour='new', 
bootstrap=False, contamination=0.1, max_features=1.0,
10 | max_samples='auto', n_estimators=500, n_jobs=-1, random_state=None,verbose=0)
11 | clf.fit(x)
12 | out_pred = clf.predict_proba(x,method ='linear')[:,1]
13 | train['out_pred'] = out_pred
14 | train['for_pred'] = np.where(train.out_pred>0.7,'负样本占比','正样本占比')
15 | dic = dict(train.groupby(train.for_pred).bad_ind.agg(np.sum)/ \
16 | train.bad_ind.groupby(train.for_pred).count())
17 | pd.DataFrame(dic,index=[0])
18 | 
19 | clf = IForest(behaviour='new', bootstrap=False, contamination=0.1, max_features=1.0,
20 | max_samples='auto', n_estimators=500, n_jobs=-1, random_state=None,verbose=0)
21 | clf.fit(x)
22 | y_pred = clf.predict_proba(x,method ='linear')[:,1]
23 | fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred)
24 | train_ks = abs(fpr_lr_train - tpr_lr_train).max()
25 | print('train_ks : ',train_ks)
26 | y_pred = clf.predict_proba(val_x,method ='linear')[:,1]
27 | fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)
28 | val_ks = abs(fpr_lr - tpr_lr).max()
29 | print('val_ks : ',val_ks)
30 | from matplotlib import pyplot as plt
31 | plt.plot(fpr_lr_train,tpr_lr_train,label = 'train IForest')
32 | plt.plot(fpr_lr,tpr_lr,label = 'evl IForest')
33 | plt.plot([0,1],[0,1],'k--')
34 | plt.xlabel('False positive rate')
35 | plt.ylabel('True positive rate')
36 | plt.title('ROC Curve')
37 | plt.legend(loc = 'best')
38 | plt.show()
39 | 
40 | 
--------------------------------------------------------------------------------
/智能风控(代码)/第7章/7.1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 24 15:55:52 2019
4 | 
5 | @author: zixing.mei
6 | """
7 | 
8 | from sklearn.metrics import accuracy_score
9 | import lightgbm as lgb
10 | 
11 | # 'regression_l1' is equivalent to the MAE loss
12 | lgb_param_l1 = {
13 | 'learning_rate': 0.01,
14 | 'boosting_type': 'gbdt',
15 | 'objective': 'regression_l1',
16 | 'min_child_samples': 46,
17 | 'min_child_weight': 0.02,
18 | 'feature_fraction': 0.6,
19 | 'bagging_fraction': 0.8,
20 | 'bagging_freq': 2,
21 | 'num_leaves': 31,
22 | 'max_depth': 5,
23 | 'lambda_l2': 1,
24 | 'lambda_l1': 0,
25 | 'n_jobs': -1,
26 | }
27 | 
28 | # 'regression_l2' is equivalent to the MSE loss
29 | lgb_param_l2 = {
30 | 'learning_rate': 0.01,
31 | 'boosting_type': 'gbdt',
32 | 'objective': 'regression_l2',
33 | 'feature_fraction': 0.7,
34 | 'bagging_fraction': 0.7,
35 | 'bagging_freq': 2,
36 | 'num_leaves': 52,
37 | 'max_depth': 5,
38 | 'lambda_l2': 1,
39 | 'lambda_l1': 0,
40 | 'n_jobs': -1,
41 | }
42 | # Prediction with the first parameter set (MAE objective)
43 | clf1=lgb.LGBMRegressor(**lgb_param_l1)
44 | clf1.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
45 | eval_metric='mae',early_stopping_rounds=200)
46 | # labels predicted on the held-out validation split
47 | pred_val1=clf1.predict(X_val,num_iteration=clf1.best_iteration_)
48 | vali_mae1=accuracy_score(y_val,np.round(pred_val1)) # accuracy on rounded predictions, a rough validation check
49 | # labels predicted on the unlabeled test set
50 | pred_test1=clf1.predict(test[feature_name],num_iteration=clf1.best_iteration_)
51 | # Prediction with the second parameter set (MSE objective)
52 | clf2=lgb.LGBMRegressor(**lgb_param_l2)
53 | clf2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
54 | eval_metric='rmse',early_stopping_rounds=200)
55 | # labels predicted on the held-out validation split
56 | pred_val2=clf2.predict(X_val,num_iteration=clf2.best_iteration_)
57 | vali_mae2=accuracy_score(y_val,np.round(pred_val2))
58 | # labels predicted on the unlabeled test set
59 | pred_test2=clf2.predict(test[feature_name],num_iteration=clf2.best_iteration_)
60 | # Result after blending the two models
61 | pred_test=pd.DataFrame()
62 | pred_test['ranks']=list(range(50000))
63 | pred_test['result']=1
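# Only the head (ranks < 400) and the tail (ranks > 46000) of the ranked test
# set receive the weighted blend below: 0.4 * MAE-model score + 0.6 * MSE-model
# score; samples in between keep result = 1. Lines 64-69 are wrapped by the
# book's typesetting: to run them, join each assignment onto a single line or
# parenthesize the right-hand side.
64 | pred_test.loc[pred_test.ranks<400,'result'] =
65 | 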
pred_test1.loc[pred_test1.ranks< 400,'pred_mae'].values *0.4 66 | + pred_test2.loc[pred_test2.ranks< 400,'pred_mse'].values * 0.6 67 | pred_test.loc[pred_test.ranks>46000,'result'] = 68 | pred_test1.loc[pred_test1.ranks> 46000,'pred_mae'].values *0.4 69 | + pred_test2.loc[pred_test2.ranks> 46000,'pred_mse'].values * 0.6 70 | 71 | -------------------------------------------------------------------------------- /智能风控(代码)/第7章/7.2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 15:57:09 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import lightgbm as lgb 9 | import random 10 | import pandas as pd 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.metrics import mean_squared_error 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn import metrics 16 | from sklearn.metrics import roc_curve 17 | from matplotlib import pyplot as plt 18 | import math 19 | 20 | df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy() 21 | df_test = data[data.obs_mth == '2018-11-30'].reset_index().copy() 22 | NUMERIC_COLS = ['person_info','finance_info','credit_info','act_info'] 23 | from sklearn.preprocessing import OneHotEncoder,LabelEncoder 24 | 25 | lgb_train = lgb.Dataset(df_train[NUMERIC_COLS], 26 | df_train['bad_ind'], free_raw_data=False) 27 | params = { 28 | 'num_boost_round': 50, 29 | 'boosting_type': 'gbdt', 30 | 'objective': 'binary', 31 | 'num_leaves': 2, 32 | 'metric': 'auc', 33 | 'max_depth':1, 34 | 'feature_fraction':1, 35 | 'bagging_fraction':1, } 36 | model = lgb.train(params,lgb_train) 37 | leaf = model.predict(df_train[NUMERIC_COLS],pred_leaf=True) 38 | lgb_enc = OneHotEncoder() 39 | #生成交叉特征 40 | lgb_enc.fit(leaf) 41 | #和原始特征进行合并 42 | data_leaf = np.hstack((lgb_enc.transform(leaf).toarray(),df_train[NUMERIC_COLS])) 43 | leaf_test = model.predict(df_test[NUMERIC_COLS],pred_leaf=True) 44 | lgb_enc = OneHotEncoder() 45 | lgb_enc.fit(leaf_test) 46 | data_leaf_test = np.hstack((lgb_enc.transform(leaf_test).toarray(), 47 | df_test[NUMERIC_COLS])) 48 | train = data_leaf.copy() 49 | train_y = df_train['bad_ind'].copy() 50 | val = data_leaf_test.copy() 51 | val_y = df_test['bad_ind'].copy() 52 | lgb_lm = LogisticRegression(penalty='l2',C=0.2, class_weight='balanced',solver='liblinear') 53 | lgb_lm.fit(train, train_y) 54 | y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1] 55 | fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y,y_pred_lgb_lm_train) 56 | y_pred_lgb_lm = lgb_lm.predict_proba(val)[:,1] 57 | fpr_lgb_lm,tpr_lgb_lm,_ = roc_curve(val_y,y_pred_lgb_lm) 58 | plt.figure(1) 59 | plt.plot([0, 1], [0, 1], 'k--') 60 | plt.plot(fpr_lgb_lm_train,tpr_lgb_lm_train,label='LGB + LR train') 61 | plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test') 62 | plt.xlabel('False positive rate') 63 | plt.ylabel('True positive rate') 64 | plt.title('ROC curve') 65 | plt.legend(loc='best') 66 | plt.show() 67 | print('LGB+LR train ks:',abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(), 68 | 'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train)) 69 | print('LGB+LR test ks:',abs(fpr_lgb_lm - tpr_lgb_lm).max(), 70 | 'LGB+LR AUC:', metrics.auc(fpr_lgb_lm, tpr_lgb_lm)) 71 | dff_train = pd.DataFrame(train) 72 | dff_train.columns = [ 'ft' + str(x) for x in range(train.shape[1])] 73 | 74 | dff_val = pd.DataFrame(val) 75 | dff_val.columns = [ 'ft' + str(x) for x in range(val.shape[1])] 76 | #生成可以传入PSI的数据集 77 | def 
make_psi_data(dff_train): 78 | dftot = pd.DataFrame() 79 | for col in dff_train.columns: 80 | zero= sum(dff_train[col] == 0) 81 | one= sum(dff_train[col] == 1) 82 | ftdf = pd.DataFrame(np.array([zero,one])) 83 | ftdf.columns = [col] 84 | if len(dftot) == 0: 85 | dftot = ftdf.copy() 86 | else: 87 | dftot[col] = ftdf[col].copy() 88 | return dftot 89 | psi_data_train = make_psi_data(dff_train) 90 | psi_data_val = make_psi_data(dff_val) 91 | def var_PSI(dev_data, val_data): 92 | dev_cnt, val_cnt = sum(dev_data), sum(val_data) 93 | if dev_cnt * val_cnt == 0: 94 | return 0 95 | PSI = 0 96 | for i in range(len(dev_data)): 97 | dev_ratio = dev_data[i] / dev_cnt 98 | val_ratio = val_data[i] / val_cnt + 1e-10 99 | psi = (dev_ratio - val_ratio) * math.log(dev_ratio/val_ratio) 100 | PSI += psi 101 | return PSI 102 | psi_dct = {} 103 | for col in dff_train.columns: 104 | psi_dct[col] = var_PSI(psi_data_train[col],psi_data_val[col]) 105 | f = zip(psi_dct.keys(),psi_dct.values()) 106 | f = sorted(f,key = lambda x:x[1],reverse = False) 107 | psi_df = pd.DataFrame(f) 108 | psi_df.columns = pd.Series(['变量名','PSI']) 109 | feature_lst = list(psi_df[psi_df['PSI']5].name) 160 | train = dff_train[feature_lst2].copy() 161 | train_y = df_train['bad_ind'].copy() 162 | val = dff_val[feature_lst2].copy() 163 | val_y = df_test['bad_ind'].copy() 164 | lgb_lm = LogisticRegression(C = 0.3,class_weight='balanced',solver='liblinear') 165 | lgb_lm.fit(train, train_y) 166 | 167 | y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1] 168 | fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y, y_pred_lgb_lm_train) 169 | 170 | y_pred_lgb_lm = lgb_lm.predict_proba(val)[:, 1] 171 | fpr_lgb_lm, tpr_lgb_lm, _ = roc_curve(val_y, y_pred_lgb_lm) 172 | 173 | plt.figure(1) 174 | plt.plot([0, 1], [0, 1], 'k--') 175 | plt.plot(fpr_lgb_lm_train, tpr_lgb_lm_train, label='LGB + LR train') 176 | plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test') 177 | plt.xlabel('False positive rate') 178 | plt.ylabel('True positive rate') 179 | plt.title('ROC curve') 180 | plt.legend(loc='best') 181 | plt.show() 182 | print('LGB+LR train ks:',abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(), 183 | 'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train)) 184 | print('LGB+LR test ks:',abs(fpr_lgb_lm - tpr_lgb_lm).max(),'LGB+LR AUC:', 185 | metrics.auc(fpr_lgb_lm, tpr_lgb_lm)) 186 | 187 | -------------------------------------------------------------------------------- /智能风控(代码)/第7章/7.3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 16:00:19 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import torch 9 | import torch.nn as nn 10 | import random 11 | from sklearn.model_selection import train_test_split 12 | import torchvision.transforms as transforms 13 | import torchvision.datasets as dsets 14 | from torch.autograd import Variable 15 | 16 | random_st = random.choice(range(10000)) 17 | train_images, test_images = train_test_split(train_images,test_size=0.15, 18 | random_state=random_st) 19 | 20 | train_data = MyDataset(train_images) 21 | test_data = MyDataset(test_images) 22 | 23 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=50, 24 | shuffle=True, num_workers=0) 25 | test_loader = torch.utils.data.DataLoader(test_data, batch_size=25, 26 | huffle=False, num_workers=0) 27 | #搭建LSTM网络 28 | class Rnn(nn.Module): 29 | def __init__(self, in_dim, hidden_dim, n_layer, n_class): 30 | super(Rnn, self).__init__() 31 | self.n_layer = 
n_layer
32 | self.hidden_dim = hidden_dim
33 | self.LSTM = nn.LSTM(in_dim, hidden_dim,
34 | n_layer,batch_first=True)
35 | self.linear = nn.Linear(hidden_dim,n_class)
36 | self.sigmoid = nn.Sigmoid()
37 | 
38 | def forward(self, x):
39 | x = x.sum(dim = 1)
40 | out, _ = self.LSTM(x)
41 | out = out[:, -1, :]
42 | out = self.linear(out)
43 | out = self.sigmoid(out)
44 | return out
45 | # 28 input features, hidden size 42, 2 LSTM layers, 2 classes (the data covers 42 monthly slices)
46 | model = Rnn(28,42,2,2)
47 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
48 | model = model.to(device)
49 | # binary logistic loss
50 | criterion = nn.SoftMarginLoss(reduction='mean')
51 | opt = torch.optim.Adam(model.parameters())
52 | total_step = len(train_loader)
53 | total_step_test = len(test_loader)
54 | num_epochs = 50
55 | 
56 | for epoch in range(num_epochs):
57 | train_label = []
58 | train_pred = []
59 | model.train()
60 | for i, (images, labels) in enumerate(train_loader):
61 | images = images.to(device)
62 | labels = labels.to(device)
63 | # training step
64 | out = model(images)
65 | loss = criterion(out, labels)
66 | opt.zero_grad()
67 | loss.backward()
68 | opt.step()
69 | # print every 100 batches
70 | if i%100 == 0:
71 | print('train epoch: {}/{}, round: {}/{},loss: {}'.format(
72 | epoch + 1, num_epochs, i + 1, total_step, loss))
73 | # ground-truth labels and predictions
74 | train_label.extend(labels.cpu().numpy().flatten().tolist())
75 | train_pred.extend(out.detach().cpu().numpy().flatten().tolist())
76 | # compute TPR and FPR
77 | fpr_lm_train, tpr_lm_train, _ = roc_curve(np.array(train_label),
78 | np.array(train_pred))
79 | # compute KS and AUC
80 | print('train epoch: {}/{}, KS: {}, ROC: {}'.format(
81 | epoch + 1, num_epochs,abs(fpr_lm_train - tpr_lm_train).max(),
82 | metrics.auc(fpr_lm_train, tpr_lm_train)))
83 | 
84 | test_label = []
85 | test_pred = []
86 | 
87 | model.eval()
88 | # KS and AUC on the test set
89 | for i, (images, labels) in enumerate(test_loader):
90 | 
91 | images = images.to(device)
92 | labels = labels.to(device)
93 | out = model(images)
94 | loss = criterion(out, labels)
95 | 
96 | # compute KS and AUC
97 | if i%100 == 0:
98 | print('test epoch: {}/{}, round: {}/{},loss: {}'.format(
99 | epoch + 1, num_epochs,i + 1, total_step_test, loss))
100 | test_label.extend(labels.cpu().numpy().flatten().tolist())
101 | test_pred.extend(out.detach().cpu().numpy().flatten().tolist())
102 | 
103 | fpr_lm_test, tpr_lm_test, _ = roc_curve(np.array(test_label),
104 | np.array(test_pred))
105 | 
106 | print('test epoch: {}/{}, KS: {}, ROC: {}'.format( epoch + 1,
107 | num_epochs,
108 | abs(fpr_lm_test - tpr_lm_test).max(),
109 | metrics.auc(fpr_lm_test, tpr_lm_test))) # auc takes (fpr, tpr); the original subtracted them
110 | 
--------------------------------------------------------------------------------
/智能风控(代码)/第7章/7.4.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 24 16:01:35 2019
4 | 
5 | @author: zixing.mei
6 | """
7 | 
8 | # load the xlearn package
9 | import xlearn as xl
10 | # create an FM model
11 | fm_model = xl.create_fm()
12 | # training set
13 | fm_model.setTrain("train.txt")
14 | # validation set
15 | fm_model.setValidate("test.txt")
16 | # classification metrics: acc(Accuracy); prec(precision); f1(f1 score); auc(AUC score)
17 | param = {'task':'binary','lr':0.2,'lambda':0.002,'metric':'auc'}
18 | fm_model.fit(param, "model.out")
19 | fm_model.setSigmoid()
20 | fm_model.predict("model.out","output.txt")
21 | fm_model.setTXTModel("model.txt")
22 | 
23 | 
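# Supplementary sketch (not from the book): xLearn's FM reads plain-text files
# in libsvm format, one sample per line, "label index:value index:value ...".
# The file name and the two toy rows below are made up purely for illustration.
with open('demo_train.txt', 'w') as f:
    f.write('1 0:0.5 3:1\n')
    f.write('0 1:0.2 2:1\n')
--------------------------------------------------------------------------------
/智能风控(代码)/第7章/7.5.py:
--------------------------------------------------------------------------------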
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 24 16:02:19 2019
4 | 
5 | @author: zixing.mei
6 | """
7 | 
8 | from heamy.dataset import Dataset
9 | from heamy.estimator import Regressor
10 | from heamy.pipeline import ModelsPipeline
11 | import pandas as pd
12 | import xgboost as xgb
13 | from sklearn.metrics import roc_auc_score
14 | import lightgbm as lgb
15 | from sklearn.linear_model import LinearRegression
16 | from sklearn.ensemble import ExtraTreesClassifier
17 | from sklearn.ensemble import GradientBoostingClassifier
18 | from sklearn.linear_model import LogisticRegression
19 | from sklearn import svm
20 | import numpy as np
21 | 
22 | def xgb_model1(X_train, y_train, X_test, y_test=None):
23 | # xgboost1
24 | params = {'booster': 'gbtree',
25 | 'objective':'rank:pairwise',
26 | 'eval_metric' : 'auc',
27 | 'eta': 0.02,
28 | 'max_depth': 5, # 4 3
29 | 'colsample_bytree': 0.7,#0.8
30 | 'subsample': 0.7,
31 | 'min_child_weight': 1, # 2 3
32 | 'seed': 1111,
33 | 'silent':1
34 | }
35 | dtrain = xgb.DMatrix(X_train, label=y_train)
36 | dvali = xgb.DMatrix(X_test)
37 | model = xgb.train(params, dtrain, num_boost_round=800)
38 | predict = model.predict(dvali) # xgb.train returns a Booster, which only has predict()
39 | minmin = min(predict)
40 | maxmax = max(predict)
41 | vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
42 | return vfunc(predict)
43 | 
44 | def xgb_model2(X_train, y_train, X_test, y_test=None):
45 | # xgboost2
46 | params = {'booster': 'gbtree',
47 | 'objective':'rank:pairwise',
48 | 'eval_metric' : 'auc',
49 | 'eta': 0.015,
50 | 'max_depth': 5, # 4 3
51 | 'colsample_bytree': 0.7,#0.8
52 | 'subsample': 0.7,
53 | 'min_child_weight': 1, # 2 3
54 | 'seed': 11,
55 | 'silent':1
56 | }
57 | dtrain = xgb.DMatrix(X_train, label=y_train)
58 | dvali = xgb.DMatrix(X_test)
59 | model = xgb.train(params, dtrain, num_boost_round=1200)
60 | predict = model.predict(dvali)
61 | minmin = min(predict)
62 | maxmax = max(predict)
63 | vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
64 | return vfunc(predict)
65 | 
66 | def xgb_model3(X_train, y_train, X_test, y_test=None):
67 | # xgboost3
68 | params = {'booster': 'gbtree',
69 | 'objective':'rank:pairwise',
70 | 'eval_metric' : 'auc',
71 | 'eta': 0.01,
72 | 'max_depth': 5, # 4 3
73 | 'colsample_bytree': 0.7,#0.8
74 | 'subsample': 0.7,
75 | 'min_child_weight': 1, # 2 3
76 | 'seed': 1,
77 | 'silent':1
78 | }
79 | dtrain = xgb.DMatrix(X_train, label=y_train)
80 | dvali = xgb.DMatrix(X_test)
81 | model = xgb.train(params, dtrain, num_boost_round=2000)
82 | predict = model.predict(dvali)
83 | minmin = min(predict)
84 | maxmax = max(predict)
85 | vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
86 | return vfunc(predict)
87 | 
88 | def et_model(X_train, y_train, X_test, y_test=None):
89 | # ExtraTrees
90 | model = ExtraTreesClassifier(max_features='log2',n_estimators=1000,n_jobs=1).fit(X_train,y_train)
91 | predict = model.predict_proba(X_test)[:,1]
92 | minmin = min(predict)
93 | maxmax = max(predict)
94 | vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
95 | return vfunc(predict)
96 | 
97 | def gbdt_model(X_train, y_train, X_test, y_test=None):
98 | # GBDT
99 | model = GradientBoostingClassifier(learning_rate=0.02,max_features=0.7,
100 | n_estimators=700,max_depth=5).fit(X_train,y_train)
101 | predict = model.predict_proba(X_test)[:,1]
102 | minmin = min(predict)
103 | maxmax = max(predict)
104 | vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
105 | return vfunc(predict)
106 | 
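# The min-max rescaling above is repeated verbatim in every base model so that
# scores from different learners are comparable before fusion. A reusable
# sketch (this helper does not exist in the book's code):
def minmax_rescale(pred):
    minmin, maxmax = min(pred), max(pred)
    return np.vectorize(lambda x: (x - minmin) / (maxmax - minmin))(pred)
107 | def logistic_model(X_train, y_train, X_test, y_test=None):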
108 | # logistic regression
109 | model = LogisticRegression(penalty = 'l2').fit(X_train,y_train)
110 | predict = model.predict_proba(X_test)[:,1]
111 | minmin = min(predict)
112 | maxmax = max(predict)
113 | vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
114 | return vfunc(predict)
115 | 
116 | 
117 | def lgb_model(X_train, y_train, X_test, y_test=None):
118 | # LightGBM
119 | lgb_train=lgb.Dataset(X_train,y_train,categorical_feature={'sex','merriage','income',
120 | 'qq_bound','degree',
121 | 'wechat_bound',
122 | 'account_grade','industry'})
123 | lgb_test = lgb.Dataset(X_test,categorical_feature={'sex','merriage','income','qq_bound',
124 | 'degree','wechat_bound',
125 | 'account_grade','industry'})
126 | params = {
127 | 'task': 'train',
128 | 'boosting_type': 'gbdt',
129 | 'objective': 'binary',
130 | 'metric':'auc',
131 | 'num_leaves': 25,
132 | 'learning_rate': 0.01,
133 | 'feature_fraction': 0.7,
134 | 'bagging_fraction': 0.7,
135 | 'bagging_freq': 5,
136 | 'min_data_in_leaf':5,
137 | 'max_bin':200,
138 | 'verbose': 0,
139 | }
140 | gbm = lgb.train(params,
141 | lgb_train,
142 | num_boost_round=2000)
143 | predict = gbm.predict(X_test) # lgb.train returns a Booster; predict() already yields probabilities here
144 | minmin = min(predict)
145 | maxmax = max(predict)
146 | vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
147 | return vfunc(predict)
148 | 
149 | def svm_model(X_train, y_train, X_test, y_test=None):
150 | # support vector machine; probability=True is required for predict_proba
151 | model = svm.SVC(C=0.8,kernel='rbf',gamma=20,probability=True,
152 | decision_function_shape='ovr').fit(X_train,y_train)
153 | predict = model.predict_proba(X_test)[:,1]
154 | minmin = min(predict)
155 | maxmax = max(predict)
156 | vfunc = np.vectorize(lambda x:(x-minmin)/(maxmax-minmin))
157 | return vfunc(predict)
158 | 
159 | import pandas as pd
160 | import numpy as np
161 | from minepy import MINE
162 | 
163 | """
164 | Load the prediction scores of the 8 models from csv files
165 | """
166 | xgb1_result = pd.read_csv('xgb1.csv')
167 | xgb2_result = pd.read_csv('xgb2.csv')
168 | xgb3_result = pd.read_csv('xgb3.csv')
169 | et_result = pd.read_csv('et_model.csv')
170 | svm_result = pd.read_csv('svm.csv')
171 | lr_result = pd.read_csv('lr.csv')
172 | lgb_result = pd.read_csv('lgb.csv')
173 | gbdt_result = pd.read_csv('gbdt.csv')
174 | 
175 | res = []
176 | res.append(xgb1_result.score.values)
177 | res.append(xgb2_result.score.values)
178 | res.append(xgb3_result.score.values)
179 | res.append(et_result.score.values)
180 | res.append(svm_result.score.values)
181 | res.append(lr_result.score.values)
182 | res.append(lgb_result.score.values)
183 | res.append(gbdt_result.score.values)
184 | 
185 | """
186 | Compute the pairwise MIC values between the score vectors
187 | """
188 | cm = []
189 | for i in range(8): # 8 models; the original looped over only 7
190 | tmp = []
191 | for j in range(8):
192 | m = MINE()
193 | m.compute_score(res[i], res[j])
194 | tmp.append(m.mic())
195 | cm.append(tmp)
196 | 
197 | """
198 | Plot the MIC matrix
199 | """
200 | 
201 | fs = ['xgb1','xgb2','xgb3','et','svm','lr','lgb','gbdt']
202 | 
203 | import matplotlib.pyplot as plt
204 | 
205 | def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
206 | plt.imshow(cm, interpolation='nearest', cmap=cmap)
207 | plt.title(title)
208 | plt.colorbar()
209 | tick_marks = np.arange(8)
210 | plt.xticks(tick_marks, fs, rotation=45)
211 | plt.yticks(tick_marks, fs)
212 | plt.tight_layout()
213 | 
214 | plot_confusion_matrix(cm, title='mic')
215 | plt.show()
# dataset is assumed to be a heamy Dataset built from the train/test data (book
# context); the estimator must be the function defined above (xgb_model2).
216 | model_xgb2 = Regressor(dataset=dataset, estimator=xgb_model2,name='xgb2',use_cache=False)
217 | model_lr = Regressor(dataset=dataset, estimator=logistic_model,name='lr',use_cache=False)
218 | model_lgb = Regressor(dataset=dataset, estimator=lgb_model,name='lgb',use_cache=False)
219 | model_gbdt = Regressor(dataset=dataset, estimator=gbdt_model,name='gbdt',use_cache=False)
# Stack the four base models above with 5-fold stacking; a linear regression
# without intercept serves as the second-level learner. (The original listing
# passed an undefined model_svm here; model_gbdt is used instead.)
220 | pipeline = ModelsPipeline(model_xgb2, model_lr, model_lgb, model_gbdt)
221 | stack_data = pipeline.stack(k=5, seed=0, add_diff=False, full_test=True)
222 | stacker = Regressor(dataset=stack_data,estimator=LinearRegression,
223 | parameters={'fit_intercept': False})
224 | predict_result = stacker.predict()
225 | val = pd.read_csv('val_list.csv')
226 | val['PROB'] = predict_result
227 | minmin, maxmax = min(val['PROB']),max(val['PROB'])
228 | val['PROB'] = val['PROB'].map(lambda x:(x-minmin)/(maxmax-minmin))
229 | val['PROB'] = val['PROB'].map(lambda x:'%.4f' % x)
230 | 
231 | 
--------------------------------------------------------------------------------
/智能风控(代码)/第8章/8.2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 24 16:05:34 2019
4 | 
5 | @author: zixing.mei
6 | """
7 | 
8 | import networkx as nx
9 | import pandas as pd
10 | import matplotlib.pyplot as plt
11 | 
12 | edge_list=pd.read_csv('./data/stack_network_links.csv')
13 | G=nx.from_pandas_edgelist(edge_list,edge_attr='value')
14 | plt.figure(figsize=(15,10))
15 | nx.draw(G,with_labels=True,
16 | edge_color='grey',
17 | node_color='pink',
18 | node_size = 500,
19 | font_size = 40,
20 | pos=nx.spring_layout(G,k=0.2))
21 | # degree
22 | nx.degree(G)
23 | 
24 | import networkx as nx
25 | nx.eigenvector_centrality(G)
26 | 
27 | import networkx as nx
28 | nx.pagerank(G,alpha=0.9) # the damping parameter of nx.pagerank is named alpha
29 | 
30 | import networkx as nx
31 | nx.betweenness_centrality(G)
32 | 
33 | import networkx as nx
34 | nx.closeness_centrality(G)
35 | 
36 | preds = nx.jaccard_coefficient(G, [('azure','.net')])
37 | for u, v, p in preds:
38 | print('(%s, %s) -> %.8f' % (u, v, p))
39 | 
40 | 
--------------------------------------------------------------------------------
/智能风控(代码)/第8章/8.3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 24 16:07:41 2019
4 | 
5 | @author: zixing.mei
6 | """
7 | 
8 | import networkx as nx
9 | import numpy as np
10 | from sklearn.model_selection import train_test_split
11 | from sklearn.neighbors import KNeighborsClassifier
12 | from sklearn.svm import SVC
13 | # ground-truth labels
14 | G = nx.karate_club_graph()
15 | groundTruth = [0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,0,1,1]
16 | # adjacency matrix: turn the network into an n*n square matrix
17 | def graphmatrix(G):
18 | n = G.number_of_nodes()
19 | temp = np.zeros([n,n])
20 | for edge in G.edges():
21 | temp[int(edge[0])][int(edge[1])] = 1
22 | temp[int(edge[1])][int(edge[0])] = 1
23 | return temp
24 | 
25 | edgeMat = graphmatrix(G)
26 | 
27 | x_train, x_test, y_train, y_test = train_test_split(edgeMat,
28 | groundTruth, test_size=0.6, random_state=0)
29 | # train an SVM classifier with a linear kernel
30 | clf = SVC(kernel="linear")
31 | 
32 | clf.fit(x_train, y_train)
33 | predicted= clf.predict(x_test)
34 | print(predicted)
35 | 
36 | score = clf.score(x_test, y_test)
37 | print(score)
38 | 
39 | import networkx as nx
40 | import numpy as np
41 | from sklearn.model_selection import train_test_split
42 | from sklearn.metrics import accuracy_score
43 | # binarize with a default threshold of 0.5; adjust it to the label distribution of the business
44 | def binary(nodelist, threshold=0.5):
45 | for i in range(len(nodelist)):
46 | if( nodelist[i] > threshold ): nodelist[i] = 1.0
47 | else: nodelist[i] = 0
48 | return nodelist
49 | 
50 | G = nx.karate_club_graph()
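# groundTruth below is the known two-faction split of Zachary's karate club;
# it only seeds the training nodes and scores the label propagation afterwards.
51 | groundTruth = 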
[0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,0,1,1] 52 | max_iter = 2 #迭代次数 53 | nodes = list(G.nodes()) 54 | nodes_list = {nodes[i]: i for i in range(0, len(nodes))} 55 | 56 | vote = np.zeros(len(nodes)) 57 | x_train, x_test, y_train, y_test = train_test_split(nodes, 58 | groundTruth, test_size=0.7, random_state=1) 59 | 60 | vote[x_train] = y_train 61 | vote[x_test] = 0.5 #初始化概率为0.5 62 | 63 | for i in range(max_iter): 64 | #只用前一次迭代的值 65 | last = np.copy(vote) 66 | for u in G.nodes(): 67 | if( u in x_train ): 68 | continue 69 | temp = 0.0 70 | for item in G.neighbors(u): 71 | #对所有邻居求和 72 | temp = temp + last[nodes_list[item]] 73 | vote[nodes_list[u]] = temp/len(list(G.neighbors(u))) 74 | 75 | #二值化得到分类标签 76 | temp = binary(vote) 77 | 78 | pred = temp[x_test] 79 | #计算准确率 80 | print(accuracy_score(y_test, pred)) 81 | import networkx as nx 82 | import numpy as np 83 | from sklearn.model_selection import train_test_split 84 | from sklearn.metrics import accuracy_score 85 | from sklearn import preprocessing 86 | from scipy import sparse 87 | 88 | G = nx.karate_club_graph() 89 | groundTruth = [0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,0,1,1] 90 | 91 | def graphmatrix(G): 92 | #节点抽象成边 93 | n = G.number_of_nodes() 94 | temp = np.zeros([n,n]) 95 | for edge in G.edges(): 96 | temp[int(edge[0])][int(edge[1])] = 1 97 | temp[int(edge[1])][int(edge[0])] = 1 98 | return temp 99 | 100 | def propagation_matrix(G): 101 | #矩阵标准化 102 | degrees = G.sum(axis=0) 103 | degrees[degrees==0] += 1 # 避免除以0 104 | 105 | D2 = np.identity(G.shape[0]) 106 | for i in range(G.shape[0]): 107 | D2[i,i] = np.sqrt(1.0/degrees[i]) 108 | 109 | S = D2.dot(G).dot(D2) 110 | return S 111 | #定义取最大值的函数 112 | def vec2label(Y): 113 | return np.argmax(Y,axis=1) 114 | 115 | edgematrix = graphmatrix(G) 116 | S = propagation_matrix(edgematrix) 117 | 118 | Ap = 0.8 119 | cn = 2 120 | max_iter = 10 121 | 122 | #定义迭代函数 123 | F = np.zeros([G.number_of_nodes(),2]) 124 | X_train, X_test, y_train, y_test = train_test_split(list(G.nodes()), 125 | groundTruth, test_size=0.7, random_state=1) 126 | for (node, label) in zip(X_train, y_train): 127 | F[node][label] = 1 128 | 129 | Y = F.copy() 130 | 131 | for i in range(max_iter): 132 | F_old = np.copy(F) 133 | F = Ap*np.dot(S, F_old) + (1-Ap)*Y 134 | 135 | temp = vec2label(F) 136 | pred = temp[X_test] 137 | print(accuracy_score(y_test, pred)) 138 | 139 | -------------------------------------------------------------------------------- /智能风控(代码)/第8章/8.4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 16:11:33 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import networkx as nx 9 | from networkx.algorithms import community 10 | import itertools 11 | 12 | G = nx.karate_club_graph() 13 | comp = community.girvan_newman(G) 14 | # 令社区个数为4,这样会依次得到K=2,K=3,K=4时候的划分结果 15 | k = 4 16 | limited = itertools.takewhile(lambda c: len(c) <= k, comp) 17 | for communities in limited: 18 | print(tuple(sorted(c) for c in communities)) 19 | 20 | 21 | import networkx as nx 22 | import community 23 | G = nx.karate_club_graph() 24 | part = community.best_partition(G) 25 | print(len(part)) 26 | 27 | 28 | import math 29 | import numpy as np 30 | from sklearn import metrics 31 | def NMI(A,B): 32 | total = len(A) 33 | X = set(A) 34 | Y = set(B) 35 | #计算互信息MI 36 | MI = 0 37 | eps = 1.4e-45 38 | for x in X: 39 | for y in Y: 40 | AOC = np.where(A==x) 41 | BOC = np.where(B==y) 42 | ABOC = 
np.intersect1d(AOC,BOC) 43 | px = 1.0*len(AOC[0])/total 44 | py = 1.0*len(BOC[0])/total 45 | pxy = 1.0*len(ABOC)/total 46 | MI = MI + pxy*math.log(pxy/(px*py)+eps,2) 47 | # 标准化互信息NMI 48 | Hx = 0 49 | for x in X: 50 | AOC = 1.0*len(np.where(A==x)[0]) 51 | Hx = Hx - (AOC/total)*math.log(AOC/total+eps,2) 52 | Hy = 0 53 | for y in Y: 54 | BOC = 1.0*len(np.where(B==y)[0]) 55 | Hy = Hy - (BOC/total)*math.log(BOC/total+eps,2) 56 | NMI = 2.0*MI/(Hx+Hy) 57 | return NMI 58 | #测试 59 | if __name__ == '__main__': 60 | A = np.array([1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3]) 61 | B = np.array([1,2,1,1,1,1,1,2,2,2,2,3,1,1,3,3,3]) 62 | #调用自定义的NMI函数 63 | print(NMI(A,B)) 64 | #调用sklearn封装好的NMI函数 65 | print(metrics.normalized_mutual_info_score(A,B)) 66 | 67 | -------------------------------------------------------------------------------- /智能风控(代码)/第8章/8.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 16:23:41 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import numpy as np 9 | 10 | def get_cost(X, U, V, lamb=0): 11 | ''''' 12 | 计算损失函数 13 | J = |X-UV|+ lamb*(|U|+|V|) 14 | 输入: X [n, d], U [n, m], V [m, d] 15 | ''' 16 | UV = np.dot(U, V) 17 | cost1 = np.sum((X - UV)**2) 18 | cost2 = np.sum(U**2) + np.sum(V**2) 19 | res = cost1 + lamb*cost2 20 | return res 21 | 22 | def Matrix_Factor(X, m, lamb=0.1, learnRate=0.01): 23 | ''''' 24 | 损失函数定义 25 | J = |X-UV| + lamb*(|U|+|V|) 26 | 输入: X [n, d] 27 | 输出: U [n, m], V [m, n] 28 | ''' 29 | maxIter = 100 30 | n, d = X.shape 31 | #随机初始化 32 | U = np.random.random([n, m])/n 33 | V = np.random.random([m, d])/m 34 | # 迭代 35 | iter_num = 1 36 | while iter_num < maxIter: 37 | #计算U的偏导 38 | dU = 2*( -np.dot(X, V.T) + np.linalg.multi_dot([U, V, V.T]) + lamb*U ) 39 | U = U - learnRate * dU 40 | #计算V的偏导 41 | dV = 2*( -np.dot(U.T, X) + np.linalg.multi_dot([U.T, U, V]) + lamb*V ) 42 | V = V - learnRate * dV 43 | iter_num += 1 44 | return U, V 45 | 46 | 47 | import numpy as np 48 | import networkx as nx 49 | from sklearn.model_selection import train_test_split 50 | from sklearn.linear_model import LogisticRegression 51 | from sklearn.metrics import roc_curve,auc 52 | from matplotlib import pyplot as plt 53 | import random 54 | #加载数据 55 | G = nx.karate_club_graph() 56 | groundTruth = [0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1] 57 | 58 | #构造邻接矩阵 59 | def graph2matrix(G): 60 | n = G.number_of_nodes() 61 | res = np.zeros([n,n]) 62 | for edge in G.edges(): 63 | res[int(edge[0])][int(edge[1])] = 1 64 | res[int(edge[1])][int(edge[0])] = 1 65 | return res 66 | 67 | #生成网络 68 | G = nx.karate_club_graph() 69 | G = graph2matrix(G) 70 | 71 | #迭代20次 72 | [U, V] = Matrix_Factor(G, 20) 73 | #划分训练集测试集 74 | X_train, X_test, y_train, y_test = train_test_split(U,groundTruth,test_size=0.7,random_state=1) 75 | #逻辑回归训练 76 | lgb_lm = LogisticRegression(penalty='l2',C=0.2,class_weight='balanced',solver='liblinear') 77 | lgb_lm.fit(X_train, y_train) 78 | 79 | y_pred_lgb_lm_train = lgb_lm.predict_proba(X_train)[:, 1] 80 | fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(y_train,y_pred_lgb_lm_train) 81 | 82 | y_pred_lgb_lm = lgb_lm.predict_proba(X_test)[:,1] 83 | fpr_lgb_lm,tpr_lgb_lm,_ = roc_curve(y_test,y_pred_lgb_lm) 84 | 85 | #计算KS值并绘制ROC曲线 86 | plt.figure(1) 87 | plt.plot([0, 1], [0, 1], 'k--') 88 | plt.plot(fpr_lgb_lm_train,tpr_lgb_lm_train,label='LGB + LR train') 89 | plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test') 90 | plt.xlabel('False positive rate') 91 | plt.ylabel('True 
positive rate') 92 | plt.title('ROC curve') 93 | plt.legend(loc='best') 94 | plt.show() 95 | print('train ks:',abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(), 96 | 'test AUC:',auc(fpr_lgb_lm_train, tpr_lgb_lm_train)) 97 | print('test ks:',abs(fpr_lgb_lm - tpr_lgb_lm).max(), 98 | ' test AUC:', auc(fpr_lgb_lm, tpr_lgb_lm)) 99 | 100 | def rondom_walk (self,length, start_node): 101 | walk = [start_node] 102 | while len(walk) < length: 103 | temp = walk[-1] 104 | temp_nbrs = list(self.G.neighbors(temp)) 105 | if len(temp_nbrs) > 0: 106 | walk.append(random.choice(temp_nbrs)) 107 | else: 108 | break 109 | return walk 110 | 111 | #Node2Vec 112 | import networkx as nx 113 | from node2vec import Node2Vec 114 | 115 | # 自定义图 116 | graph = nx.fast_gnp_random_graph(n=100, p=0.5) 117 | 118 | # 预计算概率并生成行走 119 | node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) 120 | 121 | # 嵌入节点 122 | model = node2vec.fit(window=10, min_count=1, batch_words=4) 123 | 124 | # 寻找最相似节点 125 | model.wv.most_similar('2') 126 | 127 | # 保存节点嵌入结果 128 | model.wv.save_word2vec_format('EMBEDDING_FILENAME') 129 | 130 | # 保存模型 131 | model.save('EMBEDDING_MODEL_FILENAME') 132 | 133 | # 用Hadamard方法嵌入边 134 | from node2vec.edges import HadamardEmbedder 135 | 136 | edges_embs = HadamardEmbedder(keyed_vectors=model.wv) 137 | 138 | # 快速查找嵌入 139 | edges_embs[('1', '2')] 140 | 141 | # 在单独的实例中获取所有边 142 | edges_kv = edges_embs.as_keyed_vectors() 143 | 144 | # 寻找最相似边 145 | edges_kv.most_similar(str(('1', '2'))) 146 | 147 | # 保存边嵌入结果 148 | edges_kv.save_word2vec_format('EDGES_EMBEDDING_FILENAME') 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /智能风控(代码)/第8章/8.6.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 24 16:51:29 2019 4 | 5 | @author: zixing.mei 6 | """ 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import networkx as nx 12 | def normalize(A , symmetric=True): 13 | # A = A+I 14 | A = A + torch.eye(A.size(0)) 15 | # 所有节点的度 16 | d = A.sum(1) 17 | if symmetric: 18 | #D = D^-1/2 19 | D = torch.diag(torch.pow(d , -0.5)) 20 | return D.mm(A).mm(D) 21 | else : 22 | # D=D^-1 23 | D =torch.diag(torch.pow(d,-1)) 24 | return D.mm(A) 25 | class GCN(nn.Module): 26 | ''' 27 | Z = AXW 28 | ''' 29 | def __init__(self , A, dim_in , dim_out): 30 | super(GCN,self).__init__() 31 | self.A = A 32 | self.fc1 = nn.Linear(dim_in ,dim_in,bias=False) 33 | self.fc2 = nn.Linear(dim_in,dim_in//2,bias=False) 34 | self.fc3 = nn.Linear(dim_in//2,dim_out,bias=False) 35 | 36 | def forward(self,X): 37 | ''' 38 | 计算三层GCN 39 | ''' 40 | X = F.relu(self.fc1(self.A.mm(X))) 41 | X = F.relu(self.fc2(self.A.mm(X))) 42 | return self.fc3(self.A.mm(X)) 43 | #获得数据 44 | G = nx.karate_club_graph() 45 | A = nx.adjacency_matrix(G).todense() 46 | #矩阵A需要标准化 47 | A_normed = normalize(torch.FloatTensor(A/1.0),True) 48 | 49 | N = len(A) 50 | X_dim = N 51 | 52 | # 没有节点的特征,简单用一个单位矩阵表示所有节点 53 | X = torch.eye(N,X_dim) 54 | # 正确结果 55 | Y = torch.zeros(N,1).long() 56 | # 计算loss的时候要去掉没有标记的样本 57 | Y_mask = torch.zeros(N,1,dtype=torch.uint8) 58 | # 一个分类给一个样本 59 | Y[0][0]=0 60 | Y[N-1][0]=1 61 | #有样本的地方设置为1 62 | Y_mask[0][0]=1 63 | Y_mask[N-1][0]=1 64 | 65 | #真实的空手道俱乐部的分类数据 66 | Real = torch.zeros(34 , dtype=torch.long) 67 | for i in [1,2,3,4,5,6,7,8,11,12,13,14,17,18,20,22] : 68 | Real[i-1] = 0 69 | for i in [9,10,15,16,19,21,23,24,25,26,27,28,29,30,31,32,33,34] : 70 | Real[i-1] = 1 
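# Note: Real stores the true two-community split and is used only to measure
# accuracy; training itself sees just the two labeled nodes (0 and N-1) flagged
# in Y_mask, the typical semi-supervised node-classification setting.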
71 | 72 | # GCN模型 73 | gcn = GCN(A_normed ,X_dim,2) 74 | #选择adam优化器 75 | gd = torch.optim.Adam(gcn.parameters()) 76 | 77 | for i in range(300): 78 | #转换到概率空间 79 | y_pred =F.softmax(gcn(X),dim=1) 80 | #下面两行计算cross entropy 81 | loss = (-y_pred.log().gather(1,Y.view(-1,1))) 82 | #仅保留有标记的样本 83 | loss = loss.masked_select(Y_mask).mean() 84 | 85 | #梯度下降 86 | #清空前面的导数缓存 87 | gd.zero_grad() 88 | #求导 89 | loss.backward() 90 | #一步更新 91 | gd.step() 92 | 93 | if i%100==0 : 94 | _,mi = y_pred.max(1) 95 | print(mi) 96 | #计算精确度 97 | print((mi == Real).float().mean()) 98 | 99 | -------------------------------------------------------------------------------- /智能风控(数据集)/data_for_tree.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangXinNan/IntelligentRiskControl/e8a5be3bbbb0bf73936c6c6670a451c9ddd51202/智能风控(数据集)/data_for_tree.xlsx -------------------------------------------------------------------------------- /智能风控(数据集)/stack_network_links.csv: -------------------------------------------------------------------------------- 1 | source,target,value 2 | azure,.net,20.933192346640457 3 | sql-server,.net,32.322524219339904 4 | asp.net,.net,48.40702996199019 5 | entity-framework,.net,24.37090250532431 6 | wpf,.net,32.35092522005943 7 | linq,.net,20.501743858149066 8 | wcf,.net,28.074400427611113 9 | c#,.net,62.167895042923824 10 | tdd,agile,37.146589924204555 11 | codeigniter,ajax,23.19190040565183 12 | jquery,ajax,50.56672861589973 13 | mysql,ajax,24.80008942291756 14 | css,ajax,26.613713724688935 15 | php,ajax,28.01067925660409 16 | javascript,ajax,24.39914442262329 17 | json,ajax,32.94744601093195 18 | cloud,amazon-web-services,21.31860679884144 19 | azure,amazon-web-services,21.30994959394633 20 | devops,amazon-web-services,24.98353120101788 21 | docker,amazon-web-services,32.198071014100535 22 | ios,android,39.77803622570551 23 | android-studio,android,33.661083176336234 24 | java,android,50.984730766476574 25 | android,android-studio,33.661083176336234 26 | typescript,angular,31.03648178393663 27 | typescript,angular2,38.87998222920348 28 | angularjs,angular2,26.032697164489267 29 | ionic-framework,angularjs,29.840445766924983 30 | reactjs,angularjs,31.62020230471286 31 | mongodb,angularjs,31.510711170360768 32 | css,angularjs,22.210413043130057 33 | sass,angularjs,20.425878416671157 34 | twitter-bootstrap,angularjs,24.153685861107462 35 | javascript,angularjs,39.37662666227728 36 | express,angularjs,24.43382880528071 37 | node.js,angularjs,47.56352702530362 38 | jquery,angularjs,30.34794743352026 39 | asp.net-web-api,angularjs,20.11309966430225 40 | angular2,angularjs,26.032697164489267 41 | html5,angularjs,23.082664020750425 42 | nginx,apache,48.583173464083416 43 | mysql,apache,29.097834547698525 44 | linux,apache,28.168213013158717 45 | scala,apache-spark,50.79184563415286 46 | hadoop,apache-spark,59.82678558413366 47 | rest,api,22.175589109335288 48 | .net,asp.net,48.40702996199019 49 | sql,asp.net,21.6722646057341 50 | sql-server,asp.net,59.67328948689907 51 | c#,asp.net,80.4485421720991 52 | asp.net-web-api,asp.net,47.38627215537402 53 | jquery,asp.net,28.723178220098205 54 | entity-framework,asp.net,48.1136690894626 55 | mvc,asp.net,22.828412163028812 56 | azure,asp.net,23.764072995058054 57 | wpf,asp.net,28.15902333873328 58 | linq,asp.net,31.581277746170855 59 | wcf,asp.net,40.951421726059984 60 | vb.net,asp.net,23.711346281144564 61 | asp.net,asp.net-web-api,47.38627215537402 62 | 
azure,asp.net-web-api,21.585694763313093 63 | c#,asp.net-web-api,26.748821548289044 64 | angularjs,asp.net-web-api,20.11309966430225 65 | sql-server,asp.net-web-api,25.67647418219353 66 | wcf,asp.net-web-api,28.356535431019818 67 | entity-framework,asp.net-web-api,31.18349509959109 68 | .net,azure,20.933192346640457 69 | c#,azure,22.14448701181891 70 | asp.net-web-api,azure,21.585694763313093 71 | asp.net,azure,23.764072995058054 72 | amazon-web-services,azure,21.30994959394633 73 | linux,bash,38.88811575032741 74 | shell,bash,24.71706557378606 75 | git,bash,27.096957129984922 76 | jquery,bootstrap,22.406154649004446 77 | css,bootstrap,24.71031260188158 78 | c++,c,80.89104614147385 79 | python,c,22.320432253935472 80 | embedded,c,28.403647769064776 81 | java,c,26.04945930410044 82 | linq,c#,25.222931643142886 83 | sql,c#,25.613903617043626 84 | asp.net,c#,80.4485421720991 85 | asp.net-web-api,c#,26.748821548289044 86 | entity-framework,c#,30.728425581793207 87 | vb.net,c#,25.185135956324604 88 | .net,c#,62.167895042923824 89 | sql-server,c#,45.91465123552504 90 | xamarin,c#,24.673147548722167 91 | azure,c#,22.14448701181891 92 | wpf,c#,38.95749217569616 93 | unity3d,c#,25.000233473145794 94 | wcf,c#,29.299880033528954 95 | visual-studio,c#,27.71554342863794 96 | qt,c++,30.144660032562147 97 | c,c++,80.89104614147385 98 | java,c++,23.201095658265963 99 | python,c++,24.301045666206715 100 | amazon-web-services,cloud,21.31860679884144 101 | mysql,codeigniter,22.596885929311604 102 | jquery,codeigniter,26.729771309673538 103 | laravel,codeigniter,31.658709317460826 104 | php,codeigniter,37.40149646160739 105 | ajax,codeigniter,23.19190040565183 106 | wordpress,codeigniter,25.13313076574851 107 | ajax,css,26.613713724688935 108 | mysql,css,27.010622489444646 109 | photoshop,css,20.855721424968042 110 | jquery,css,66.67420569975171 111 | html,css,126.57112712972764 112 | javascript,css,75.53660009612221 113 | html5,css,87.13826986156899 114 | bootstrap,css,24.71031260188158 115 | twitter-bootstrap,css,31.56405510257182 116 | less,css,25.340535324481124 117 | wordpress,css,31.264824668848835 118 | angularjs,css,22.210413043130057 119 | sass,css,40.96336126549837 120 | php,css,51.447604909108925 121 | amazon-web-services,devops,24.98353120101788 122 | docker,devops,24.554202817668248 123 | jenkins,devops,23.602587607535273 124 | python,django,49.905942953683244 125 | flask,django,42.33664313992495 126 | postgresql,django,22.48925264318247 127 | amazon-web-services,docker,32.198071014100535 128 | go,docker,28.375405907346195 129 | devops,docker,24.554202817668248 130 | jenkins,docker,26.528607599809522 131 | wordpress,drupal,24.919296182152596 132 | maven,eclipse,25.314659453818305 133 | redis,elasticsearch,29.349098676116647 134 | mongodb,elasticsearch,20.22800685545149 135 | c,embedded,28.403647769064776 136 | .net,entity-framework,24.37090250532431 137 | wpf,entity-framework,24.228201856682706 138 | asp.net,entity-framework,48.1136690894626 139 | sql-server,entity-framework,32.62377908692168 140 | linq,entity-framework,54.00592956009751 141 | wcf,entity-framework,32.81595165036012 142 | asp.net-web-api,entity-framework,31.18349509959109 143 | c#,entity-framework,30.728425581793207 144 | vba,excel,45.26074988295236 145 | excel-vba,excel,54.50523562629742 146 | vba,excel-vba,49.61621727834464 147 | excel,excel-vba,54.50523562629742 148 | reactjs,express,31.924272588963444 149 | redux,express,21.543458417676824 150 | angularjs,express,24.43382880528071 151 | node.js,express,58.829076622959285 152 | 
mongodb,express,48.76806081178149 153 | python,flask,25.251371861419308 154 | django,flask,42.33664313992495 155 | jenkins,git,23.075440145975143 156 | github,git,56.20246391729147 157 | bash,git,27.096957129984922 158 | linux,git,27.70879277472438 159 | git,github,56.20246391729147 160 | docker,go,28.375405907346195 161 | scala,hadoop,21.62593446217338 162 | apache-spark,hadoop,59.82678558413366 163 | scala,haskell,22.757440009799737 164 | jsp,hibernate,39.406945985510596 165 | java,hibernate,32.52356510452292 166 | spring-boot,hibernate,30.232903706376273 167 | maven,hibernate,34.100052024791246 168 | rest,hibernate,21.646667877763228 169 | web-services,hibernate,21.210956925188103 170 | spring-mvc,hibernate,64.109898213092265 171 | java-ee,hibernate,39.90817378554484 172 | spring,hibernate,103.26828446355263 173 | mysql,html,21.35568889023083 174 | javascript,html,59.75548884052987 175 | php,html,45.66104087971069 176 | css,html,126.57112712972764 177 | sass,html,23.639529235488705 178 | jquery,html,44.23362023021944 179 | php,html5,32.350506919162136 180 | wordpress,html5,22.216440754019796 181 | javascript,html5,47.00636375705097 182 | angularjs,html5,23.082664020750425 183 | less,html5,20.92318766828214 184 | twitter-bootstrap-3,html5,22.161036413455076 185 | jquery,html5,47.49277338891457 186 | css,html5,87.13826986156899 187 | twitter-bootstrap,html5,26.230983374393585 188 | sass,html5,32.070376656092115 189 | angularjs,ionic-framework,29.840445766924983 190 | android,ios,39.77803622570551 191 | swift,ios,87.21964246099864 192 | osx,ios,30.341581071883272 193 | objective-c,ios,78.75928046651394 194 | iphone,ios,57.15857405623158 195 | xcode,ios,46.36509077387072 196 | swift,iphone,36.02337467321895 197 | xcode,iphone,34.71286507120063 198 | ios,iphone,57.15857405623158 199 | objective-c,iphone,47.97788760375011 200 | spring,java,43.954259225314345 201 | c++,java,23.201095658265963 202 | jsp,java,21.619803035260286 203 | java-ee,java,25.116182820462374 204 | android,java,50.984730766476574 205 | hibernate,java,32.52356510452292 206 | c,java,26.04945930410044 207 | spring-mvc,java,25.041214810765112 208 | spring,java-ee,39.812105148516856 209 | hibernate,java-ee,39.90817378554484 210 | spring-mvc,java-ee,27.964591852574834 211 | java,java-ee,25.116182820462374 212 | jquery,javascript,57.84183152642191 213 | twitter-bootstrap,javascript,20.238823043724278 214 | node.js,javascript,42.73172932305638 215 | mysql,javascript,22.91619071652686 216 | angularjs,javascript,39.37662666227728 217 | php,javascript,47.3281575555596 218 | ajax,javascript,24.39914442262329 219 | reactjs,javascript,33.56735910485145 220 | css,javascript,75.53660009612221 221 | html,javascript,59.75548884052987 222 | sass,javascript,23.782469883653217 223 | html5,javascript,47.00636375705097 224 | git,jenkins,23.075440145975143 225 | devops,jenkins,23.602587607535273 226 | maven,jenkins,33.47708077913068 227 | docker,jenkins,26.528607599809522 228 | json,jquery,20.62957734085586 229 | html,jquery,44.23362023021944 230 | wordpress,jquery,28.87017059988903 231 | sass,jquery,24.68172813218768 232 | asp.net,jquery,28.723178220098205 233 | php,jquery,46.619090898232265 234 | css,jquery,66.67420569975171 235 | html5,jquery,47.49277338891457 236 | ajax,jquery,50.56672861589973 237 | twitter-bootstrap,jquery,36.79192549817867 238 | mysql,jquery,35.71297781108102 239 | angularjs,jquery,30.34794743352026 240 | twitter-bootstrap-3,jquery,21.153243249312748 241 | javascript,jquery,57.84183152642191 242 | 
codeigniter,jquery,26.729771309673538 243 | bootstrap,jquery,22.406154649004446 244 | rest,json,25.049498396111403 245 | jquery,json,20.62957734085586 246 | xml,json,42.721668458812765 247 | ajax,json,32.94744601093195 248 | hibernate,jsp,39.406945985510596 249 | java,jsp,21.619803035260286 250 | spring-mvc,jsp,24.06449025192885 251 | spring,jsp,30.61333985121873 252 | vue.js,laravel,28.79989665496036 253 | php,laravel,45.88473321024233 254 | mysql,laravel,20.23026605428928 255 | codeigniter,laravel,31.658709317460826 256 | sass,less,60.545941127987604 257 | css,less,25.340535324481124 258 | html5,less,20.92318766828214 259 | wcf,linq,34.58709265871981 260 | sql-server,linq,20.444792111689296 261 | c#,linq,25.222931643142886 262 | asp.net,linq,31.581277746170855 263 | entity-framework,linq,54.00592956009751 264 | .net,linq,20.501743858149066 265 | wpf,linq,26.468391555977576 266 | bash,linux,38.88811575032741 267 | git,linux,27.70879277472438 268 | unix,linux,25.357412874239948 269 | osx,linux,23.41281493970028 270 | ubuntu,linux,29.894342585346926 271 | shell,linux,21.131792917374074 272 | nginx,linux,21.080478447912995 273 | windows,linux,29.945400773839896 274 | python,linux,21.17036044400364 275 | apache,linux,28.168213013158717 276 | python,machine-learning,30.27077436735024 277 | r,machine-learning,23.21869703965049 278 | r,matlab,20.021932698311588 279 | spring-mvc,maven,24.933802259856545 280 | jenkins,maven,33.47708077913068 281 | eclipse,maven,25.314659453818305 282 | spring,maven,39.53022984092444 283 | hibernate,maven,34.100052024791246 284 | express,mongodb,48.76806081178149 285 | node.js,mongodb,58.65780661217398 286 | reactjs,mongodb,22.3209731892725 287 | postgresql,mongodb,22.855557000277642 288 | mysql,mongodb,26.36926476163211 289 | redis,mongodb,33.554731128787694 290 | elasticsearch,mongodb,20.22800685545149 291 | angularjs,mongodb,31.510711170360768 292 | asp.net,mvc,22.828412163028812 293 | css,mysql,27.010622489444646 294 | jquery,mysql,35.71297781108102 295 | mongodb,mysql,26.36926476163211 296 | laravel,mysql,20.23026605428928 297 | php,mysql,65.07025912410015 298 | postgresql,mysql,25.909942488756776 299 | javascript,mysql,22.91619071652686 300 | html,mysql,21.35568889023083 301 | ajax,mysql,24.80008942291756 302 | apache,mysql,29.097834547698525 303 | codeigniter,mysql,22.596885929311604 304 | linux,nginx,21.080478447912995 305 | redis,nginx,27.473141406934243 306 | apache,nginx,48.583173464083416 307 | javascript,node.js,42.73172932305638 308 | reactjs,node.js,55.192747551978705 309 | express,node.js,58.829076622959285 310 | angularjs,node.js,47.56352702530362 311 | mongodb,node.js,58.65780661217398 312 | react-native,node.js,22.16325031188543 313 | redux,node.js,23.40192325369395 314 | xcode,objective-c,43.41882511710604 315 | iphone,objective-c,47.97788760375011 316 | swift,objective-c,79.08853577916759 317 | ios,objective-c,78.75928046651394 318 | osx,objective-c,24.763189829170084 319 | sql,oracle,21.82760476470736 320 | plsql,oracle,45.06151433906438 321 | windows,osx,20.860246776482484 322 | objective-c,osx,24.763189829170084 323 | ios,osx,30.341581071883272 324 | linux,osx,23.41281493970028 325 | regex,perl,21.364077886249937 326 | css,photoshop,20.855721424968042 327 | css,php,51.447604909108925 328 | laravel,php,45.88473321024233 329 | wordpress,php,41.03704549651282 330 | jquery,php,46.619090898232265 331 | javascript,php,47.3281575555596 332 | ajax,php,28.01067925660409 333 | html,php,45.66104087971069 334 | mysql,php,65.07025912410015 335 | 
html5,php,32.350506919162136 336 | codeigniter,php,37.40149646160739 337 | oracle,plsql,45.06151433906438 338 | sql,plsql,22.717442387504864 339 | mysql,postgresql,25.909942488756776 340 | redis,postgresql,26.28018174137368 341 | ruby-on-rails,postgresql,25.69011032907274 342 | mongodb,postgresql,22.855557000277642 343 | django,postgresql,22.48925264318247 344 | ruby,postgresql,21.79517505760149 345 | windows,powershell,20.24069060325144 346 | flask,python,25.251371861419308 347 | c++,python,24.301045666206715 348 | machine-learning,python,30.27077436735024 349 | django,python,49.905942953683244 350 | c,python,22.320432253935472 351 | r,python,28.53574785327096 352 | linux,python,21.17036044400364 353 | c++,qt,30.144660032562147 354 | python,r,28.53574785327096 355 | matlab,r,20.021932698311588 356 | machine-learning,r,23.21869703965049 357 | reactjs,react-native,61.53102297253567 358 | redux,react-native,25.480581476491025 359 | node.js,react-native,22.16325031188543 360 | node.js,reactjs,55.192747551978705 361 | sass,reactjs,24.9979588147436 362 | react-native,reactjs,61.53102297253567 363 | redux,reactjs,65.12985505970208 364 | angularjs,reactjs,31.62020230471286 365 | mongodb,reactjs,22.3209731892725 366 | express,reactjs,31.924272588963444 367 | javascript,reactjs,33.56735910485145 368 | mongodb,redis,33.554731128787694 369 | nginx,redis,27.473141406934243 370 | elasticsearch,redis,29.349098676116647 371 | postgresql,redis,26.28018174137368 372 | reactjs,redux,65.12985505970208 373 | node.js,redux,23.40192325369395 374 | react-native,redux,25.480581476491025 375 | express,redux,21.543458417676824 376 | perl,regex,21.364077886249937 377 | spring,rest,26.11759862631264 378 | web-services,rest,33.69066907565828 379 | api,rest,22.175589109335288 380 | hibernate,rest,21.646667877763228 381 | json,rest,25.049498396111403 382 | postgresql,ruby,21.79517505760149 383 | ruby-on-rails,ruby,95.36131071220332 384 | postgresql,ruby-on-rails,25.69011032907274 385 | ruby,ruby-on-rails,95.36131071220332 386 | jquery,sass,24.68172813218768 387 | javascript,sass,23.782469883653217 388 | reactjs,sass,24.9979588147436 389 | html,sass,23.639529235488705 390 | html5,sass,32.070376656092115 391 | less,sass,60.545941127987604 392 | angularjs,sass,20.425878416671157 393 | css,sass,40.96336126549837 394 | twitter-bootstrap,sass,20.18548801090922 395 | hadoop,scala,21.62593446217338 396 | haskell,scala,22.757440009799737 397 | apache-spark,scala,50.79184563415286 398 | testing,selenium,33.685943095200074 399 | bash,shell,24.71706557378606 400 | linux,shell,21.131792917374074 401 | web-services,spring,20.16560629687762 402 | spring-mvc,spring,63.330217313152836 403 | rest,spring,26.11759862631264 404 | spring-boot,spring,57.04039265671136 405 | maven,spring,39.53022984092444 406 | java-ee,spring,39.812105148516856 407 | hibernate,spring,103.26828446355263 408 | jsp,spring,30.61333985121873 409 | java,spring,43.954259225314345 410 | spring,spring-boot,57.04039265671136 411 | hibernate,spring-boot,30.232903706376273 412 | spring-mvc,spring-boot,60.61682620955685 413 | maven,spring-mvc,24.933802259856545 414 | java-ee,spring-mvc,27.964591852574834 415 | spring,spring-mvc,63.330217313152836 416 | jsp,spring-mvc,24.06449025192885 417 | spring-boot,spring-mvc,60.61682620955685 418 | hibernate,spring-mvc,64.109898213092265 419 | java,spring-mvc,25.041214810765112 420 | plsql,sql,22.717442387504864 421 | sql-server,sql,24.354772917848095 422 | c#,sql,25.613903617043626 423 | oracle,sql,21.82760476470736 424 | 
asp.net,sql,21.6722646057341 425 | asp.net-web-api,sql-server,25.67647418219353 426 | wcf,sql-server,26.29146633095843 427 | vb.net,sql-server,21.788893442838376 428 | .net,sql-server,32.322524219339904 429 | entity-framework,sql-server,32.62377908692168 430 | asp.net,sql-server,59.67328948689907 431 | linq,sql-server,20.444792111689296 432 | sql,sql-server,24.354772917848095 433 | c#,sql-server,45.91465123552504 434 | xcode,swift,48.62033486702057 435 | iphone,swift,36.02337467321895 436 | objective-c,swift,79.08853577916759 437 | ios,swift,87.21964246099864 438 | agile,tdd,37.146589924204555 439 | selenium,testing,33.685943095200074 440 | angularjs,twitter-bootstrap,24.153685861107462 441 | html5,twitter-bootstrap,26.230983374393585 442 | sass,twitter-bootstrap,20.18548801090922 443 | jquery,twitter-bootstrap,36.79192549817867 444 | javascript,twitter-bootstrap,20.238823043724278 445 | css,twitter-bootstrap,31.56405510257182 446 | jquery,twitter-bootstrap-3,21.153243249312748 447 | html5,twitter-bootstrap-3,22.161036413455076 448 | angular2,typescript,38.87998222920348 449 | angular,typescript,31.03648178393663 450 | linux,ubuntu,29.894342585346926 451 | c#,unity3d,25.000233473145794 452 | linux,unix,25.357412874239948 453 | asp.net,vb.net,23.711346281144564 454 | sql-server,vb.net,21.788893442838376 455 | c#,vb.net,25.185135956324604 456 | excel-vba,vba,49.61621727834464 457 | excel,vba,45.26074988295236 458 | c#,visual-studio,27.71554342863794 459 | laravel,vue.js,28.79989665496036 460 | .net,wcf,28.074400427611113 461 | entity-framework,wcf,32.81595165036012 462 | c#,wcf,29.299880033528954 463 | sql-server,wcf,26.29146633095843 464 | wpf,wcf,53.17990345536856 465 | linq,wcf,34.58709265871981 466 | asp.net,wcf,40.951421726059984 467 | asp.net-web-api,wcf,28.356535431019818 468 | rest,web-services,33.69066907565828 469 | hibernate,web-services,21.210956925188103 470 | spring,web-services,20.16560629687762 471 | powershell,windows,20.24069060325144 472 | linux,windows,29.945400773839896 473 | osx,windows,20.860246776482484 474 | php,wordpress,41.03704549651282 475 | html5,wordpress,22.216440754019796 476 | css,wordpress,31.264824668848835 477 | codeigniter,wordpress,25.13313076574851 478 | drupal,wordpress,24.919296182152596 479 | jquery,wordpress,28.87017059988903 480 | linq,wpf,26.468391555977576 481 | wcf,wpf,53.17990345536856 482 | entity-framework,wpf,24.228201856682706 483 | c#,wpf,38.95749217569616 484 | asp.net,wpf,28.15902333873328 485 | .net,wpf,32.35092522005943 486 | c#,xamarin,24.673147548722167 487 | objective-c,xcode,43.41882511710604 488 | swift,xcode,48.62033486702057 489 | iphone,xcode,34.71286507120063 490 | ios,xcode,46.36509077387072 491 | json,xml,42.721668458812765 492 | -------------------------------------------------------------------------------- /智能风控(数据集)/tra_sample.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangXinNan/IntelligentRiskControl/e8a5be3bbbb0bf73936c6c6670a451c9ddd51202/智能风控(数据集)/tra_sample.xlsx --------------------------------------------------------------------------------
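Note on using this edge list: each undirected link is stored twice (e.g. mongodb,express and express,mongodb carry the same weight), so loading it into an undirected graph collapses the pairs automatically. Below is a minimal loading sketch, assuming the file's header row is source,target,value (as in the public Stack Overflow tag-network dataset this file appears to be), that pandas and networkx are installed, and that the relative path matches your checkout — all three are assumptions, not guarantees from the repo itself.

    import pandas as pd
    import networkx as nx

    # Path is illustrative; adjust to where the dataset lives in your checkout.
    edges = pd.read_csv('./智能风控(数据集)/stack_network_links.csv')

    # Build an undirected weighted graph; the duplicated
    # (source, target) / (target, source) rows collapse into single edges.
    G = nx.from_pandas_edgelist(edges, source='source', target='target',
                                edge_attr='value')

    print(G.number_of_nodes(), G.number_of_edges())

    # Example node-level scores that could serve as graph features.
    degree = nx.degree_centrality(G)
    pagerank = nx.pagerank(G, weight='value')

Scores like these can then be joined back onto a sample table as graph features, which is the typical role of an edge-list dataset in a risk-control feature pipeline.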