# Financial-fraud-prediction
1. Load the data and inspect its basic information
----------
```
# imports used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# load the data file (a raw string avoids invalid escape sequences in the Windows path)
df = pd.read_csv(r'F:\Python\data\LoanStats_2016Q3.csv', skiprows=1, low_memory=False)
df.info()  # column dtypes and non-null counts
```
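The `skiprows=1` is needed because LendingClub's LoanStats exports put a descriptive banner line above the real header row. A quick sanity check right after loading can catch a mis-parsed file early; a minimal sketch, not part of the original notebook:
```
print(df.shape)          # (rows, columns)
print(df.columns[:10])   # first few column names, e.g. 'loan_amnt'
df.head(3)               # eyeball the first records
```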
2. Preprocess the data: drop columns with little predictive value
----------
```
# drop id and member_id
df.drop(columns=['id', 'member_id'], inplace=True)

# drop rows that are entirely NaN
df.dropna(axis=0, how='all', inplace=True)
df.info()

# emp_title has far too many distinct values to be usable as a feature
df.drop(columns=['emp_title'], inplace=True)
```
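The claim that `emp_title` has too many distinct values is easy to verify; run before the drop above, a check like this (not in the original) makes the point:
```
# tens of thousands of distinct free-text job titles -> unusable as a category
print(df['emp_title'].nunique())
print(df['emp_title'].value_counts().head())
```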
3. Inspect the remaining columns and drop those with a high missing rate
----------
```
# drop columns with a high percentage of missing values
df.drop(columns=['desc', 'verification_status_joint'], inplace=True)

# drop object columns that will not be used
df.drop(columns=['term', 'issue_d', 'purpose', 'title', 'zip_code',
                 'addr_state', 'earliest_cr_line', 'revol_util'], inplace=True)

# drop columns that are only known after the loan is issued
df.drop(columns=['out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
                 'total_rec_prncp', 'grade', 'sub_grade'], inplace=True)
df.drop(columns=['total_rec_int', 'total_rec_late_fee', 'recoveries',
                 'collection_recovery_fee'], inplace=True)
df.drop(columns=['last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
                 'last_credit_pull_d'], inplace=True)
df.drop(columns=['policy_code'], inplace=True)
```
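`desc` and `verification_status_joint` are dropped by name here; presumably they were identified by their missing rate. Run before the drops above, a scan like the following would surface them; the 0.5 threshold is an assumption, not taken from the original:
```
# fraction of missing values per column, highest first
missing_pct = df.isnull().sum() / len(df)
print(missing_pct.sort_values(ascending=False).head(10))

# assumed rule of thumb: flag columns that are more than half empty
print(missing_pct[missing_pct > 0.5].index.tolist())
```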
4. Map loan_status to a label, then scan for highly correlated columns and drop the redundant ones
------------
```
# focus on loan_status
df.loan_status.value_counts()

# Current and Fully Paid count as good loans (1), late loans as bad (0);
# everything else becomes NaN
status_map = {'Current': 1,
              'Fully Paid': 1,
              'Late (31-120 days)': 0,
              'Late (16-30 days)': 0,
              'Charged Off': np.nan,
              'In Grace Period': np.nan,
              'Default': np.nan}
df.loan_status = df.loan_status.replace(status_map)
```
```
# find highly correlated column pairs
cor = df.corr(numeric_only=True)
cor.loc[:, :] = np.tril(cor, k=-1)  # keep each pair once, zero the diagonal
cor = cor.stack()
cor[abs(cor) > 0.5]

# drop the redundant columns flagged by the scan (nearly collinear with loan_amnt)
df.drop(columns=['funded_amnt', 'funded_amnt_inv', 'installment'], inplace=True)
```
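The `np.tril(cor, k=-1)` trick zeroes the diagonal and upper triangle of the correlation matrix, so after `stack()` every pair is listed exactly once and self-correlations (always 1.0) never pass the filter. A toy illustration with made-up columns:
```
import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1, 2, 3, 4],
                    'b': [2, 4, 6, 8],   # exactly 2 * 'a'
                    'c': [4, 1, 3, 2]})
cor = toy.corr()
cor.loc[:, :] = np.tril(cor, k=-1)  # keep only the strict lower triangle
pairs = cor.stack()
print(pairs[abs(pairs) > 0.5])      # reports only the ('b', 'a') pair, 1.0
```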
5. Split the processed data for modeling, holding out 20% as the test set
----------
```
from sklearn.model_selection import train_test_split

# rows whose status was mapped to NaN above cannot be used as training labels
df.dropna(subset=['loan_status'], inplace=True)

Y = df.loan_status
df.drop(columns=['loan_status'], inplace=True)
X = df

# train and test split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
```
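One step the snippet does not show: `GradientBoostingRegressor` cannot consume object-dtype columns or NaNs, so any remaining categorical features have to be encoded and gaps filled before fitting. A hedged sketch using `pd.get_dummies` with median imputation; the original notebook may have done this differently:
```
# encode remaining categorical columns and impute missing numeric values
X = pd.get_dummies(X)        # expand object columns into 0/1 indicator columns
X = X.fillna(X.median())     # simple median imputation (an assumed choice)

# re-split after encoding so train and test share the same columns
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
```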
6. Train a GBRT model on the data
---------
```
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn.preprocessing import OneHotEncoder  # imported but unused in the snippet
```
```
param_grid = {'learning_rate': [0.1],
              'max_depth': [2],
              'min_samples_split': [50, 100],
              'n_estimators': [100, 200]
              }

est = GridSearchCV(ensemble.GradientBoostingRegressor(),
                   param_grid, n_jobs=4, refit=True)

est.fit(x_train, y_train)

best_params = est.best_params_
print(best_params)
```
```
%%time
# 'squared_error' is the current name of the least-squares loss (formerly 'ls')
est = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=300,
                                         learning_rate=0.1, max_depth=1, random_state=0,
                                         loss='squared_error').fit(x_train, y_train)
est.score(x_test, y_test)
```
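Note that the `%%time` cell rebinds `est`, discarding the grid-search winner. With `refit=True`, `GridSearchCV` has already refit the best configuration on the full training set, so it can be reused directly; a minimal sketch (the name `gs` is mine, not the notebook's):
```
gs = GridSearchCV(ensemble.GradientBoostingRegressor(),
                  param_grid, n_jobs=4, refit=True)
gs.fit(x_train, y_train)

best_est = gs.best_estimator_          # the refit best model
print(best_est.score(x_test, y_test))  # R^2 on the held-out 20%
```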
7. Evaluate the model and find the ten most important features
-------
```
def compute_ks(data):
    # KS statistic: data carries a 'predict' score column and a 'label'
    # column where 1 = good loan and 0 = bad loan (the mapping from step 4)
    sorted_list = data.sort_values(['predict'], ascending=True)
    total_bad = (sorted_list['label'] == 0).sum()
    total_good = sorted_list.shape[0] - total_bad

    max_ks = 0.0
    good_count = 0.0
    bad_count = 0.0
    for index, row in sorted_list.iterrows():
        if row['label'] == 0:
            bad_count += 1.0
        else:
            good_count += 1.0

        val = bad_count / total_bad - good_count / total_good
        max_ks = max(max_ks, val)

    return max_ks
```

```
# scale importances relative to the strongest feature
feature_importance = est.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# plot the ten most important features
indices = np.argsort(feature_importance)[-10:]
plt.barh(np.arange(10), feature_importance[indices], color='green', alpha=0.4)
plt.yticks(np.arange(10) + 0.25, np.array(X.columns)[indices])
_ = plt.xlabel('Relative importance'), plt.title('Top Ten Important Variables')
```
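`compute_ks` is defined above but never called in the snippet. A hedged usage sketch: score the test set with the fitted model and pair the predictions with the true labels under the column names the function expects:
```
# assemble the ('predict', 'label') frame and report the KS statistic
ks_frame = pd.DataFrame({'predict': est.predict(x_test),
                         'label': y_test.values})
print('KS statistic: %.3f' % compute_ks(ks_frame))
```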
8. Final result
-------
![](https://github.com/fanzhihai/Financial-fraud-prediction/raw/master/result1.PNG)