├── .gitattributes
├── .gitignore
├── README.md
└── mojing.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# =========================
# Operating System Files
# =========================

# OSX
# =========================

.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PPD_mojing_fengkong

PPDai (拍拍贷) "Mojing Cup" (魔镜杯) credit risk-control competition.

Final rank: 21 / 485, team name: 呵呵哒

Leaderboard: https://www.kesci.com/apps/home_log/index.html#!/competition/56cd5f02b89b5bd026cb39c9/leaderboard/1
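
## Approach

`mojing.py` merges the Master / LogInfo / Userupdate tables from both competition rounds,
engineers date features and per-borrower count features, then trains a Keras MLP and an
XGBoost classifier and blends their predicted default probabilities. The blend weight is
picked by a grid search over validation AUC. A minimal sketch of that blending step (the
function name and argument names here are illustrative, not taken from the script, which
does the same search inline with `dtrain_predprob` and `model_predprob`):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def best_blend_weight(p_xgb, p_nn, y_true):
    """Search w in {0.00, 0.01, ..., 1.00} maximising AUC of w*p_xgb + (1-w)*p_nn."""
    weights = np.linspace(0.0, 1.0, 101)
    aucs = [roc_auc_score(y_true, w * p_xgb + (1 - w) * p_nn) for w in weights]
    best = int(np.argmax(aucs))
    return weights[best], aucs[best]
```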
--------------------------------------------------------------------------------
/mojing.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 10 15:17:13 2016
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from xgboost.sklearn import XGBClassifier
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.constraints import maxnorm
from keras.utils import np_utils
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras.callbacks import EarlyStopping

np.random.seed(11)
# Configuration flags (only need_normalise is actually used below).
need_normalise = True
need_validation = True
need_categorical = False
save_categorical_file = False
# nb_epoch = 180


def save2model(submission, file_name, y_pre):
    """Write the predicted scores into the submission frame and save it as CSV."""
    assert len(y_pre) == len(submission)
    submission['score'] = y_pre
    submission.to_csv(file_name, index=False)
    print("saved file %s" % file_name)


def load_data():
    path = 'D:/JDO/PPD-Second-Round-Data/'

    # First-round (chusai) test tables.
    df_testM = pd.read_csv(path + 'chusai_test/Kesci_Master_9w_gbk_2.csv')
    df_testL = pd.read_csv(path + 'chusai_test/LogInfo_9w_2.csv')
    df_testU = pd.read_csv(path + 'chusai_test/Userupdate_Info_9w_2.csv')

    # First-round (chusai) and second-round (fusai) training tables.
    train1_M = pd.read_csv(path + 'chusai_train/PPD_Training_Master_GBK_3_1_Training_Set.csv')
    train1_L = pd.read_csv(path + 'chusai_train/PPD_LogInfo_3_1_Training_Set.csv')
    train1_U = pd.read_csv(path + 'chusai_train/PPD_Userupdate_Info_3_1_Training_Set.csv')
    train2_M = pd.read_csv(path + 'fusai_train/Kesci_Master_9w_gbk_3_2.csv')
    train2_L = pd.read_csv(path + 'fusai_train/LogInfo_9w_3_2.csv')
    train2_U = pd.read_csv(path + 'fusai_train/Userupdate_Info_9w_3_2.csv')

    df_trainM = pd.concat([train1_M, train2_M], ignore_index=True)
    df_trainL = pd.concat([train1_L, train2_L], ignore_index=True)
    df_trainU = pd.concat([train1_U, train2_U], ignore_index=True)

    # Truncate the location-like UserInfo columns to their first 4 characters.
    for col in ['UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_7', 'UserInfo_20', 'UserInfo_19']:
        df_testM[col] = df_testM[col].apply(lambda x: str(x)[:4])
        df_trainM[col] = df_trainM[col].apply(lambda x: str(x)[:4])

    df_trainM['UserInfo_24'] = df_trainM['UserInfo_24'].apply(lambda x: str(x)[:10])
    df_testM['UserInfo_24'] = df_testM['UserInfo_24'].apply(lambda x: str(x)[:10])

    # u'不详' ("unknown") is treated as a missing value.
    df_trainM = df_trainM.replace(u'不详', np.nan)
    df_testM = df_testM.replace(u'不详', np.nan)

    # Date features from ListingInfo: year, month and day of week.
    df_testM['Date'] = pd.to_datetime(pd.Series(df_testM['ListingInfo']))
    df_testM = df_testM.drop('ListingInfo', axis=1)
    df_testM['Year'] = df_testM['Date'].apply(lambda x: int(str(x)[:4]))
    df_testM['Month'] = df_testM['Date'].apply(lambda x: int(str(x)[5:7]))
    df_testM['weekday'] = [df_testM['Date'][i].dayofweek for i in range(len(df_testM['Date']))]

    df_trainM['Date'] = pd.to_datetime(pd.Series(df_trainM['ListingInfo']))
    df_trainM = df_trainM.drop('ListingInfo', axis=1)
    df_trainM['Year'] = df_trainM['Date'].apply(lambda x: int(str(x)[:4]))
    df_trainM['Month'] = df_trainM['Date'].apply(lambda x: int(str(x)[5:7]))
    df_trainM['weekday'] = [df_trainM['Date'][i].dayofweek for i in range(len(df_trainM['Date']))]

    # Pool train and test LogInfo / Userupdate events, one-hot encode the codes
    # and sum them per Idx to get per-borrower count features.
    df_L = pd.concat([df_testL, df_trainL], ignore_index=True)
    df_U = pd.concat([df_testU, df_trainU], ignore_index=True)
    df_U['UserupdateInfo1'] = df_U['UserupdateInfo1'].apply(lambda x: str(x).upper())
    df_Uu = pd.get_dummies(df_U['UserupdateInfo1']).join(df_U['Idx'])
    df_L1 = pd.get_dummies(df_L['LogInfo1']).join(df_L['Idx'])
    df_L1 = df_L1.groupby('Idx', as_index=False).sum()
    df_L2 = pd.get_dummies(df_L['LogInfo2']).join(df_L['Idx'])
    df_L2 = df_L2.groupby('Idx', as_index=False).sum()
    df_L3 = pd.merge(df_L1, df_L2, on='Idx', how='left')
    result_L = df_L3.groupby('Idx', as_index=False).sum()
    result_U = df_Uu.groupby('Idx', as_index=False).sum()

    # Label-encode the remaining categorical (object) columns.
    df_trainM = df_trainM.fillna(-1)
    df_testM = df_testM.fillna(-1)
    for f in df_testM.columns:
        if df_testM[f].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(df_testM[f]) + list(df_trainM[f]))
            df_trainM[f] = lbl.transform(list(df_trainM[f].values))
            df_testM[f] = lbl.transform(list(df_testM[f].values))

    # Join the count features onto the Master tables.
    df_train = pd.merge(df_trainM, result_L, on='Idx')
    train = pd.merge(df_train, result_U, on='Idx')
    df_test = pd.merge(df_testM, result_L, on='Idx', how='left')
    test = pd.merge(df_test, result_U, on='Idx', how='left')

    train = train.fillna(-1)
    test = test.fillna(-1)
    submission = pd.DataFrame()
    submission["Idx"] = test["Idx"]

    drop_feature = ['WeblogInfo_10']
    X = train.drop(['Date', 'target', 'Idx'], axis=1)
    X = X.drop(drop_feature, axis=1)
    # X = train[golden_feature]
    Y = train['target']
    final = test['target']
    TEST = test.drop(['Date', 'Idx', 'target'], axis=1)
    TEST = TEST.drop(drop_feature, axis=1)
    # TEST = TEST[golden_feature]
    return [X, Y, TEST, submission, final]
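
# Illustration of the per-Idx count features built in load_data() above
# (hypothetical toy rows, not competition data):
#
#     Idx  LogInfo1               Idx   10   107
#     8    10            ---->    8     2    1
#     8    107
#     8    10
#
# pd.get_dummies followed by groupby('Idx').sum() collapses the raw event log into
# one row per borrower, with a count column for every LogInfo / Userupdate code.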

def cross_validation():
    datasets = load_data()
    # Hold out 13,000 rows for validation.
    x_train, x_test, y_train, y_test = train_test_split(datasets[0], datasets[1], test_size=13000)

    encoder = LabelEncoder()
    train_y, valid_y = y_train.values, y_test.values
    train_y, valid_y = encoder.fit_transform(train_y).astype(np.int32), encoder.fit_transform(valid_y).astype(np.int32)
    train_y, valid_y = np_utils.to_categorical(train_y), np_utils.to_categorical(valid_y)

    print("processing finished")
    x = np.array(datasets[0])
    x = x.astype(np.float32)
    train, valid = np.array(x_train), np.array(x_test)
    train, valid = train.astype(np.float32), valid.astype(np.float32)
    test = np.array(datasets[2])
    test = test.astype(np.float32)
    if need_normalise:
        scaler = StandardScaler().fit(x)
        train, valid = scaler.transform(train), scaler.transform(valid)
        test = scaler.transform(test)

    # [0] (scaled train X, one-hot y)     [1] (scaled test X, submission frame)
    # [2] (scaled valid X, one-hot y)     [3] (raw train X, y)
    # [4] (raw test X, submission frame)  [5] (raw valid X, y)
    return [(train, train_y), (test, datasets[3]), (valid, valid_y),
            (x_train, y_train), (datasets[2], datasets[3]), (x_test, y_test)]


print('Loading data...')

datasets = cross_validation()

# Scaled splits (for the neural network).
X_train, y_train = datasets[0]
X_test, submission = datasets[1]
X_valid, y_valid = datasets[2]

# Raw splits (for XGBoost).
X_Train, y_Train = datasets[3]
X_Test, submission = datasets[4]
X_Valid, y_Valid = datasets[5]

nb_classes = y_train.shape[1]
print(nb_classes, 'classes')

dims = X_train.shape[1]
print(dims, 'dims')
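
# What follows: train the Keras MLP (with early stopping on the validation split),
# train an XGBoost classifier on the raw features, grid-search the blend weight on
# validation AUC, then score the blend on the first 10,000 test targets and write
# the submission file.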

model = Sequential()

model.add(Dense(1024, input_shape=(dims,), init='glorot_normal', W_constraint=maxnorm(4)))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(360, init='glorot_normal', W_constraint=maxnorm(4)))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))
'''
model.add(Dense(420, init = 'glorot_normal', W_constraint = maxnorm(4)))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))
'''
model.add(Dense(nb_classes))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer="sgd")

# EarlyStopping monitors 'val_loss', so the validation split is passed to fit().
model.fit(X_train, y_train, nb_epoch=100, batch_size=128,
          validation_data=(X_valid, y_valid),
          callbacks=[EarlyStopping(monitor='val_loss', patience=20)])

model_predprob = model.predict_proba(X_valid)[:, 1]
scores = roc_auc_score(y_valid[:, 1], model_predprob)
print("\nNN Model Report")
print("AUC Score (Validation): %f" % scores)

xgb1 = XGBClassifier(
    learning_rate=0.03,
    n_estimators=408,
    max_depth=9,
    min_child_weight=3,
    subsample=0.75,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4)
xgb1.fit(X_Train, y_Train, eval_metric='auc')

dtrain_predictions = xgb1.predict(X_Valid)
dtrain_predprob = xgb1.predict_proba(X_Valid)[:, 1]

print("\nXGB Model Report")
print("AUC Score (Validation): %f" % roc_auc_score(y_Valid, dtrain_predprob))

# Grid-search the ensemble weight w = i/100 that maximises the validation AUC of
# w * xgb + (1 - w) * nn.
AUC = []
for i in range(0, 101, 1):
    w = round(float(i) / 100, 2)
    a = w * dtrain_predprob + (1 - w) * model_predprob
    AUC.append(roc_auc_score(y_Valid, a))
i = AUC.index(max(AUC))
print("\nEnsemble Model Report")
print("best weight: %.2f, AUC Score: %f" % (float(i) / 100, max(AUC)))

# Ground-truth targets for the first 10,000 test rows, used for offline scoring.
test = load_data()[4].values[0:10000]

y_pre1 = model.predict_proba(X_test)[:10000, 1]
y_pre2 = xgb1.predict_proba(X_Test)[:10000, 1]
y_pre = round(float(i) / 100, 2) * y_pre2 + (1 - round(float(i) / 100, 2)) * y_pre1

print("nn AUC Score: %f" % roc_auc_score(test, y_pre1))
print("xgb AUC Score: %f" % roc_auc_score(test, y_pre2))
print("ensemble AUC Score: %f" % roc_auc_score(test, y_pre))
# print(roc_auc_score(y_test, y_pre))

# Note: y_pre only covers the first 10,000 test rows; save2model asserts that its
# length matches the submission frame.
save2model(submission, 'keras_nn_test111.csv', y_pre)
--------------------------------------------------------------------------------