├── .gitattributes
├── .gitignore
├── README.md
└── mojing.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# =========================
# Operating System Files
# =========================

# OSX
# =========================

.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PPD_mojing_fengkong

PPDai (拍拍贷) "Mojing Cup" (魔镜杯) credit risk-control competition.

Final rank: 21 / 485, team name: 呵呵哒

Leaderboard: https://www.kesci.com/apps/home_log/index.html#!/competition/56cd5f02b89b5bd026cb39c9/leaderboard/1
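
## Approach

`mojing.py` merges the Master / LogInfo / Userupdate tables from both competition rounds,
engineers date features and per-borrower count features, then trains a Keras MLP and an
XGBoost classifier and blends their predicted default probabilities. The blend weight is
picked by a grid search over validation AUC. A minimal sketch of that blending step (the
function name and argument names here are illustrative, not taken from the script, which
does the same search inline with `dtrain_predprob` and `model_predprob`):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def best_blend_weight(p_xgb, p_nn, y_true):
    """Search w in {0.00, 0.01, ..., 1.00} maximising AUC of w*p_xgb + (1-w)*p_nn."""
    weights = np.linspace(0.0, 1.0, 101)
    aucs = [roc_auc_score(y_true, w * p_xgb + (1 - w) * p_nn) for w in weights]
    best = int(np.argmax(aucs))
    return weights[best], aucs[best]
```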
--------------------------------------------------------------------------------
/mojing.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 10 15:17:13 2016
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from xgboost.sklearn import XGBClassifier
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.constraints import maxnorm
from keras.utils import np_utils
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras.callbacks import EarlyStopping

np.random.seed(11)
# Configuration flags (only need_normalise is actually used below).
need_normalise = True
need_validation = True
need_categorical = False
save_categorical_file = False
# nb_epoch = 180


def save2model(submission, file_name, y_pre):
    """Write the predicted scores into the submission frame and save it as CSV."""
    assert len(y_pre) == len(submission)
    submission['score'] = y_pre
    submission.to_csv(file_name, index=False)
    print("saved file %s" % file_name)


def load_data():
    path = 'D:/JDO/PPD-Second-Round-Data/'

    # First-round (chusai) test tables.
    df_testM = pd.read_csv(path + 'chusai_test/Kesci_Master_9w_gbk_2.csv')
    df_testL = pd.read_csv(path + 'chusai_test/LogInfo_9w_2.csv')
    df_testU = pd.read_csv(path + 'chusai_test/Userupdate_Info_9w_2.csv')

    # First-round (chusai) and second-round (fusai) training tables.
    train1_M = pd.read_csv(path + 'chusai_train/PPD_Training_Master_GBK_3_1_Training_Set.csv')
    train1_L = pd.read_csv(path + 'chusai_train/PPD_LogInfo_3_1_Training_Set.csv')
    train1_U = pd.read_csv(path + 'chusai_train/PPD_Userupdate_Info_3_1_Training_Set.csv')
    train2_M = pd.read_csv(path + 'fusai_train/Kesci_Master_9w_gbk_3_2.csv')
    train2_L = pd.read_csv(path + 'fusai_train/LogInfo_9w_3_2.csv')
    train2_U = pd.read_csv(path + 'fusai_train/Userupdate_Info_9w_3_2.csv')

    df_trainM = pd.concat([train1_M, train2_M], ignore_index=True)
    df_trainL = pd.concat([train1_L, train2_L], ignore_index=True)
    df_trainU = pd.concat([train1_U, train2_U], ignore_index=True)

    # Truncate the location-like UserInfo columns to their first 4 characters.
    for col in ['UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_7', 'UserInfo_20', 'UserInfo_19']:
        df_testM[col] = df_testM[col].apply(lambda x: str(x)[:4])
        df_trainM[col] = df_trainM[col].apply(lambda x: str(x)[:4])

    df_trainM['UserInfo_24'] = df_trainM['UserInfo_24'].apply(lambda x: str(x)[:10])
    df_testM['UserInfo_24'] = df_testM['UserInfo_24'].apply(lambda x: str(x)[:10])

    # u'不详' ("unknown") is treated as a missing value.
    df_trainM = df_trainM.replace(u'不详', np.nan)
    df_testM = df_testM.replace(u'不详', np.nan)

    # Date features from ListingInfo: year, month and day of week.
    df_testM['Date'] = pd.to_datetime(pd.Series(df_testM['ListingInfo']))
    df_testM = df_testM.drop('ListingInfo', axis=1)
    df_testM['Year'] = df_testM['Date'].apply(lambda x: int(str(x)[:4]))
    df_testM['Month'] = df_testM['Date'].apply(lambda x: int(str(x)[5:7]))
    df_testM['weekday'] = [df_testM['Date'][i].dayofweek for i in range(len(df_testM['Date']))]

    df_trainM['Date'] = pd.to_datetime(pd.Series(df_trainM['ListingInfo']))
    df_trainM = df_trainM.drop('ListingInfo', axis=1)
    df_trainM['Year'] = df_trainM['Date'].apply(lambda x: int(str(x)[:4]))
    df_trainM['Month'] = df_trainM['Date'].apply(lambda x: int(str(x)[5:7]))
    df_trainM['weekday'] = [df_trainM['Date'][i].dayofweek for i in range(len(df_trainM['Date']))]

    # Pool train and test LogInfo / Userupdate events, one-hot encode the codes
    # and sum them per Idx to get per-borrower count features.
    df_L = pd.concat([df_testL, df_trainL], ignore_index=True)
    df_U = pd.concat([df_testU, df_trainU], ignore_index=True)
    df_U['UserupdateInfo1'] = df_U['UserupdateInfo1'].apply(lambda x: str(x).upper())
    df_Uu = pd.get_dummies(df_U['UserupdateInfo1']).join(df_U['Idx'])
    df_L1 = pd.get_dummies(df_L['LogInfo1']).join(df_L['Idx'])
    df_L1 = df_L1.groupby('Idx', as_index=False).sum()
    df_L2 = pd.get_dummies(df_L['LogInfo2']).join(df_L['Idx'])
    df_L2 = df_L2.groupby('Idx', as_index=False).sum()
    df_L3 = pd.merge(df_L1, df_L2, on='Idx', how='left')
    result_L = df_L3.groupby('Idx', as_index=False).sum()
    result_U = df_Uu.groupby('Idx', as_index=False).sum()

    # Label-encode the remaining categorical (object) columns.
    df_trainM = df_trainM.fillna(-1)
    df_testM = df_testM.fillna(-1)
    for f in df_testM.columns:
        if df_testM[f].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(df_testM[f]) + list(df_trainM[f]))
            df_trainM[f] = lbl.transform(list(df_trainM[f].values))
            df_testM[f] = lbl.transform(list(df_testM[f].values))

    # Join the count features onto the Master tables.
    df_train = pd.merge(df_trainM, result_L, on='Idx')
    train = pd.merge(df_train, result_U, on='Idx')
    df_test = pd.merge(df_testM, result_L, on='Idx', how='left')
    test = pd.merge(df_test, result_U, on='Idx', how='left')

    train = train.fillna(-1)
    test = test.fillna(-1)
    submission = pd.DataFrame()
    submission["Idx"] = test["Idx"]

    drop_feature = ['WeblogInfo_10']
    X = train.drop(['Date', 'target', 'Idx'], axis=1)
    X = X.drop(drop_feature, axis=1)
    # X = train[golden_feature]
    Y = train['target']
    final = test['target']
    TEST = test.drop(['Date', 'Idx', 'target'], axis=1)
    TEST = TEST.drop(drop_feature, axis=1)
    # TEST = TEST[golden_feature]
    return [X, Y, TEST, submission, final]
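
# Illustration of the per-Idx count features built in load_data() above
# (hypothetical toy rows, not competition data):
#
#     Idx  LogInfo1               Idx   10   107
#     8    10            ---->    8     2    1
#     8    107
#     8    10
#
# pd.get_dummies followed by groupby('Idx').sum() collapses the raw event log into
# one row per borrower, with a count column for every LogInfo / Userupdate code.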

def cross_validation():
    datasets = load_data()
    # Hold out 13,000 rows for validation.
    x_train, x_test, y_train, y_test = train_test_split(datasets[0], datasets[1], test_size=13000)

    encoder = LabelEncoder()
    train_y, valid_y = y_train.values, y_test.values
    train_y, valid_y = encoder.fit_transform(train_y).astype(np.int32), encoder.fit_transform(valid_y).astype(np.int32)
    train_y, valid_y = np_utils.to_categorical(train_y), np_utils.to_categorical(valid_y)

    print("processing finished")
    x = np.array(datasets[0])
    x = x.astype(np.float32)
    train, valid = np.array(x_train), np.array(x_test)
    train, valid = train.astype(np.float32), valid.astype(np.float32)
    test = np.array(datasets[2])
    test = test.astype(np.float32)
    if need_normalise:
        scaler = StandardScaler().fit(x)
        train, valid = scaler.transform(train), scaler.transform(valid)
        test = scaler.transform(test)

    # [0] (scaled train X, one-hot y)     [1] (scaled test X, submission frame)
    # [2] (scaled valid X, one-hot y)     [3] (raw train X, y)
    # [4] (raw test X, submission frame)  [5] (raw valid X, y)
    return [(train, train_y), (test, datasets[3]), (valid, valid_y),
            (x_train, y_train), (datasets[2], datasets[3]), (x_test, y_test)]


print('Loading data...')

datasets = cross_validation()

# Scaled splits (for the neural network).
X_train, y_train = datasets[0]
X_test, submission = datasets[1]
X_valid, y_valid = datasets[2]

# Raw splits (for XGBoost).
X_Train, y_Train = datasets[3]
X_Test, submission = datasets[4]
X_Valid, y_Valid = datasets[5]

nb_classes = y_train.shape[1]
print(nb_classes, 'classes')

dims = X_train.shape[1]
print(dims, 'dims')
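
# What follows: train the Keras MLP (with early stopping on the validation split),
# train an XGBoost classifier on the raw features, grid-search the blend weight on
# validation AUC, then score the blend on the first 10,000 test targets and write
# the submission file.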

model = Sequential()

model.add(Dense(1024, input_shape=(dims,), init='glorot_normal', W_constraint=maxnorm(4)))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(360, init='glorot_normal', W_constraint=maxnorm(4)))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))
'''
model.add(Dense(420, init = 'glorot_normal', W_constraint = maxnorm(4)))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))
'''
model.add(Dense(nb_classes))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer="sgd")

# EarlyStopping monitors 'val_loss', so the validation split is passed to fit().
model.fit(X_train, y_train, nb_epoch=100, batch_size=128,
          validation_data=(X_valid, y_valid),
          callbacks=[EarlyStopping(monitor='val_loss', patience=20)])

model_predprob = model.predict_proba(X_valid)[:, 1]
scores = roc_auc_score(y_valid[:, 1], model_predprob)
print("\nNN Model Report")
print("AUC Score (Validation): %f" % scores)

xgb1 = XGBClassifier(
    learning_rate=0.03,
    n_estimators=408,
    max_depth=9,
    min_child_weight=3,
    subsample=0.75,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4)
xgb1.fit(X_Train, y_Train, eval_metric='auc')

dtrain_predictions = xgb1.predict(X_Valid)
dtrain_predprob = xgb1.predict_proba(X_Valid)[:, 1]

print("\nXGB Model Report")
print("AUC Score (Validation): %f" % roc_auc_score(y_Valid, dtrain_predprob))

# Grid-search the ensemble weight w = i/100 that maximises the validation AUC of
# w * xgb + (1 - w) * nn.
AUC = []
for i in range(0, 101, 1):
    w = round(float(i) / 100, 2)
    a = w * dtrain_predprob + (1 - w) * model_predprob
    AUC.append(roc_auc_score(y_Valid, a))
i = AUC.index(max(AUC))
print("\nEnsemble Model Report")
print("best weight: %.2f, AUC Score: %f" % (float(i) / 100, max(AUC)))

# Ground-truth targets for the first 10,000 test rows, used for offline scoring.
test = load_data()[4].values[0:10000]

y_pre1 = model.predict_proba(X_test)[:10000, 1]
y_pre2 = xgb1.predict_proba(X_Test)[:10000, 1]
y_pre = round(float(i) / 100, 2) * y_pre2 + (1 - round(float(i) / 100, 2)) * y_pre1

print("nn AUC Score: %f" % roc_auc_score(test, y_pre1))
print("xgb AUC Score: %f" % roc_auc_score(test, y_pre2))
print("ensemble AUC Score: %f" % roc_auc_score(test, y_pre))
# print(roc_auc_score(y_test, y_pre))

# Note: y_pre only covers the first 10,000 test rows; save2model asserts that its
# length matches the submission frame.
save2model(submission, 'keras_nn_test111.csv', y_pre)
--------------------------------------------------------------------------------