├── README.md
└── xgboost_gbdt_lr_utils.py


/README.md:
--------------------------------------------------------------------------------
1 | # gbdt_lr_in_recsys
2 | * First, use GBDT+LR in a recommender system and compare the AUC of LR, GBDT, and GBDT+LR.
3 | * Second, use hyperopt-sklearn to automatically tune the hyperparameters of GBDT.
4 | * The data is private and not available, but it is an ordinary libsvm-format data file of the kind commonly used in recommender systems.
5 | 


--------------------------------------------------------------------------------
/xgboost_gbdt_lr_utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #-*- coding:utf-8 -*-
  3 | '''
  4 | author:zhiqiangxu
  5 | date:2016/8/7
  6 | '''
  7 | import xgboost as xgb
  8 | from sklearn.preprocessing import OneHotEncoder
  9 | from sklearn.cross_validation import train_test_split
 10 | from sklearn.datasets import load_svmlight_file
 11 | from sklearn.linear_model import LogisticRegression
 12 | from sklearn.metrics import roc_curve, auc, roc_auc_score
 13 | from sklearn.externals import joblib
 14 | from sklearn.ensemble import GradientBoostingClassifier
 15 | from sklearn.grid_search import GridSearchCV
 16 | from sklearn.externals import joblib
 17 | import matplotlib.pyplot as plt
 18 | import numpy as np
 19 | import pandas as pd
 20 | from scipy.sparse import csr_matrix
 21 | from scipy.sparse import hstack
 22 | import time, os, random, sys
 23 | import math
 24 | import hyperopt.tpe
 25 | import hpsklearn.components
 26 | import hpsklearn.demo_support
 27 | random.seed(1)
 28 | 
 29 | #choosing some samples and random split into train set and test set
 30 | def datasetSplit(libSvmFile, trainFileName, testFileName, testSetRatio, lines):
 31 |     dataFile = open(libSvmFile, 'r')
 32 |     dataList = dataFile.readlines()
 33 |     totalLines = len(dataList)
 34 |     testFileLength = int(testSetRatio*lines)
 35 |     trainFileLength = lines - testFileLength
 36 |     List = range(totalLines)
 37 |     random.shuffle(List)
 38 |     trainFile = open(trainFileName, 'w')
 39 |     testFile = open(testFileName, 'w')
 40 |     posSampleCnt = 0
 41 |     for i in range(lines):
 42 |         if float(dataList[List[i]].split(' ')[0]) > 0.0:
 43 |             posSampleCnt = posSampleCnt + 1
 44 |         if i < trainFileLength:
 45 |             trainFile.write(dataList[List[i]])
 46 |         else:
 47 |             testFile.write(dataList[List[i]])
 48 |     dataFile.close()
 49 |     trainFile.close()
 50 |     testFile.close()
 51 |     print('Positive Sample Count: %d' % posSampleCnt)
 52 |     return posSampleCnt
 53 | 
 54 | #calculate the positive and negative samples counts
 55 | def calcPosNegCnt(libSvmFile):
 56 |     dataFile = open(libSvmFile, 'r')
 57 |     dataList = dataFile.readlines()
 58 |     posSampleCnt = 0
 59 |     negSampleCnt = 0
 60 |     for i in range(len(dataList)):
 61 |         if float(dataList[i].split(' ')[0]) > 0.0:
 62 |             posSampleCnt = posSampleCnt + 1
 63 |         else:
 64 |             negSampleCnt = negSampleCnt + 1
 65 |     print 'Positive Sample: %d' % posSampleCnt
 66 |     print 'Negative Sample: %d' % negSampleCnt
 67 | 
 68 | #training xgboost and using xgboost to encode test set features
 69 | def xgboost_lr_train_test(libsvmFileNameInitial):
 70 |     posSampleCnt = datasetSplit(libsvmFileNameInitial, 'data_train_th100', 'data_test_th100', 0.2, 1100000)
 71 |     X_train, y_train = load_svmlight_file('data_train_th100')
 72 |     print(X_train.shape)
 73 |     X_test, y_test = load_svmlight_file('data_test_th100')
 74 |     #training xgboost
 75 |     negPosRatio = (1100000-posSampleCnt)/posSampleCnt
 76 |     xgbclf = xgb.XGBClassifier(nthread=4, scale_pos_weight=negPosRatio, learning_rate=0.08,
 77 |                             n_estimators=120, max_depth=5, gamma=0, subsample=0.9, colsample_bytree=0.5)
 78 |     xgbclf.fit(X_train, y_train)
 79 |     y_pred_train = xgbclf.predict_proba(X_train)[:, 1]
 80 |     xgb_train_auc = roc_auc_score(y_train, y_pred_train)
 81 |     print('xgboost train auc: %.5f' % xgb_train_auc)
 82 |     y_pred_test = xgbclf.predict_proba(X_test)[:, 1]
 83 |     xgb_test_auc = roc_auc_score(y_test, y_pred_test)
 84 |     print('xgboost test auc: %.5f' % xgb_test_auc)
 85 |     #using xgboost to encode train set and test set features
 86 |     X_train_leaves = xgbclf.apply(X_train)
 87 |     train_rows = X_train_leaves.shape[0]
 88 |     X_test_leaves = xgbclf.apply(X_test)
 89 |     X_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0)
 90 |     X_leaves = X_leaves.astype(np.int32)
 91 |     (rows, cols) = X_leaves.shape
 92 |     cum_count = np.zeros((1, cols), dtype=np.int32)
 93 |     for j in range(cols):
 94 |         if j == 0:
 95 |             cum_count[0][j] = len(np.unique(X_leaves[:, j]))
 96 |         else:
 97 |             cum_count[0][j] = len(np.unique(X_leaves[:, j])) + cum_count[0][j-1]
 98 |     print('Transform features genenrated by xgboost...')
 99 |     for j in range(cols):
100 |         keyMapDict = {}
101 |         if j == 0:
102 |             initial_index = 1
103 |         else:
104 |             initial_index = cum_count[0][j-1]+1
105 |         for i in range(rows):
106 |             if keyMapDict.has_key(X_leaves[i, j]) == False:
107 |                 keyMapDict[X_leaves[i, j]] = initial_index
108 |                 X_leaves[i, j] = initial_index
109 |                 initial_index = initial_index + 1
110 |             else:
111 |                 X_leaves[i, j] = keyMapDict[X_leaves[i, j]]
112 |     #writing encoded features into file
113 |     print('Write xgboost learned features to file ...')
114 |     xgbFeatureLibsvm = open('xgb_feature_libsvm', 'w')
115 |     for i in range(rows):
116 |         if i < train_rows:
117 |             xgbFeatureLibsvm.write(str(y_train[i]))
118 |         else:
119 |             xgbFeatureLibsvm.write(str(y_test[i-train_rows]))
120 |         for j in range(cols):
121 |             xgbFeatureLibsvm.write(' '+str(X_leaves[i, j])+':1.0')
122 |         xgbFeatureLibsvm.write('\n')
123 |     xgbFeatureLibsvm.close()
124 | 
125 | #using xgboost encoded feature in lr to calculate auc
def xgb_feature_lr_train_test(xgbfeaturefile, origin_libsvm_file):
    """Train LR on xgboost leaf features, alone and stacked on raw features.

    Both input files are split 80/20 (1100000 sampled rows each).  A first
    logistic regression is trained on the leaf features only and saved to
    ``lr.m``; a second one is trained on the horizontal stack of raw and leaf
    features and saved to ``lr_ext.m``.  Train and test AUC are printed for
    both models.

    Args:
        xgbfeaturefile: libsvm file holding the xgboost leaf features.
        origin_libsvm_file: libsvm file holding the raw features.
    """
    # Local import: only load_svmlight_file is imported at module level.
    from sklearn.datasets import load_svmlight_files

    # NOTE(review): the two splits below are shuffled independently, so row i
    # of the raw split and row i of the leaf-feature split are not guaranteed
    # to be the same sample, yet they are stacked row-wise further down and
    # scored against the leaf-feature labels. Confirm this is intended.
    datasetSplit(origin_libsvm_file, 'data_train_th100', 'data_test_th100', 0.2, 1100000)
    datasetSplit(xgbfeaturefile, 'xgb_feature_train_libsvm','xgb_feature_test_libsvm', 0.2, 1100000)
    # Loading train and test together gives both the same number of columns;
    # separate loads infer the width per file and can disagree.
    X_train_origin, _, X_test_origin, _ = load_svmlight_files(
        ['data_train_th100', 'data_test_th100'])
    X_train, y_train, X_test, y_test = load_svmlight_files(
        ['xgb_feature_train_libsvm', 'xgb_feature_test_libsvm'])
    print(X_train.shape)
    print(X_test.shape)

    #fittting lr using just xgboost encoded feature
    lr = LogisticRegression(n_jobs=-1, C=0.1, penalty='l1')
    lr.fit(X_train, y_train)
    joblib.dump(lr, 'lr.m')
    y_pred_train = lr.predict_proba(X_train)[:, 1]
    lr_train_auc = roc_auc_score(y_train, y_pred_train)
    print('LR Train AUC: %.5f' % lr_train_auc)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR Test AUC: %.5f' % lr_test_auc)

    # fitting lr using xgboost encoded feature and original feature
    X_train_ext = hstack([X_train_origin, X_train])
    print(X_train_ext.shape)
    # Drop the separate matrices once stacked to keep peak memory down.
    del X_train
    del X_train_origin
    X_test_ext = hstack([X_test_origin, X_test])
    print(X_test_ext.shape)
    del X_test
    del X_test_origin
    lr = LogisticRegression(n_jobs=-1, C=0.1, penalty='l1')
    lr.fit(X_train_ext, y_train)
    joblib.dump(lr, 'lr_ext.m')
    y_pred_train = lr.predict_proba(X_train_ext)[:, 1]
    lr_train_auc = roc_auc_score(y_train, y_pred_train)
    print('LR Ext Train AUC: %.5f' % lr_train_auc)
    y_pred_test = lr.predict_proba(X_test_ext)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR Ext Test AUC: %.5f' % lr_test_auc)
165 | 
166 | #using gbdt, gbdt+lr to calculate auc
def gbdt_lr_train_test(libsvmFileName):
    """Compare the test AUC of GBDT, GBDT+LR and GBDT+LR with raw features.

    The input file is split 80/20 (600000 sampled rows).  A
    ``GradientBoostingClassifier`` is tuned by 5-fold grid search on ROC AUC;
    the leaves reached in the best model are one-hot encoded and fed to a
    logistic regression, first alone ("gbdt+lr auc 1") and then stacked with
    the raw features ("gbdt+lr auc 2").

    Args:
        libsvmFileName: path of the libsvm file to sample from.
    """
    # Local import: only load_svmlight_file is imported at module level.
    from sklearn.datasets import load_svmlight_files

    # Argument order is (file, train file, test file, test ratio, lines); the
    # ratio used to be passed in the train-file position.
    datasetSplit(libsvmFileName, 'label_feature_data_train', 'label_feature_data_test', 0.2, 600000)
    # One call for both splits so that they share the same number of columns.
    X_train, y_train, X_test, y_test = load_svmlight_files(
        ['label_feature_data_train', 'label_feature_data_test'])
    X_train_dense = X_train.toarray()
    X_test_dense = X_test.toarray()
    gbclf = GradientBoostingClassifier(n_estimators=30, max_depth=4, verbose=0)
    tuned_parameter = [{'n_estimators':[30, 40, 50, 60], 'max_depth':[3, 4, 5, 6], 'max_features':[0.5,0.7,0.9]}]
    gs_clf = GridSearchCV(gbclf, tuned_parameter, cv=5, scoring='roc_auc')
    gs_clf.fit(X_train_dense, y_train)
    print('best parameters set found: ')
    print(gs_clf.best_params_)
    y_pred_gbdt = gs_clf.predict_proba(X_test_dense)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    # The grid search fits clones, so ``gbclf`` itself is never fitted; the
    # refitted best model is the one whose trees must produce the leaves.
    best_gbdt = gs_clf.best_estimator_
    X_train_leaves = best_gbdt.apply(X_train_dense)[:, :, 0]
    train_rows = X_train_leaves.shape[0]
    X_test_leaves = best_gbdt.apply(X_test_dense)[:, :, 0]
    # Encode train and test leaves together so both get the same columns.
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdtlr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('gbdt+lr auc 1: %.5f' % gbdtlr_auc1)
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    lr.fit(X_train_ext, y_train)
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdtlr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('gbdt+lr auc 2: %.5f' % gbdtlr_auc2)
197 | 
198 | #using lr to calculate auc on original data and cross featured data
def lr_train_test(libsvmFileInitial, libsvmFileCross):
    """Print LR train/test AUC on the raw data and on the crossed-feature data.

    Each input file is split 80/20 (1100000 sampled rows) and an L2-penalised
    logistic regression (``C=0.1``) is fitted on the train part.  The shapes
    of the train and test matrices and both AUC values are printed, first for
    the raw file ("origin"), then for the crossed-feature file ("cross").

    Args:
        libsvmFileInitial: libsvm file with the raw features.
        libsvmFileCross: libsvm file with the crossed features.
    """
    # Local import: only load_svmlight_file is imported at module level.
    from sklearn.datasets import load_svmlight_files

    datasetSplit(libsvmFileInitial, 'data_train_th500', 'data_test_th500', 0.2, 1100000)
    datasetSplit(libsvmFileCross, 'data_cross_train_th500', 'data_cross_test_th500', 0.2, 1100000)
    experiments = (
        ('origin', 'data_train_th500', 'data_test_th500'),
        ('cross', 'data_cross_train_th500', 'data_cross_test_th500'),
    )
    for tag, trainFileName, testFileName in experiments:
        # One call for both splits so that they share the same number of
        # columns; separate loads infer the width per file and can disagree.
        X_train, y_train, X_test, y_test = load_svmlight_files(
            [trainFileName, testFileName])
        print(X_train.shape)
        print(X_test.shape)
        lr = LogisticRegression(C=0.1, penalty='l2')
        lr.fit(X_train, y_train)
        y_pred_train = lr.predict_proba(X_train)[:, 1]
        lr_train_auc = roc_auc_score(y_train, y_pred_train)
        print('lr train auc %s: %.5f' % (tag, lr_train_auc))
        y_pred_test = lr.predict_proba(X_test)[:, 1]
        lr_test_auc = roc_auc_score(y_test, y_pred_test)
        print('lr test auc %s: %.5f' % (tag, lr_test_auc))
226 | 
227 | #using hyperopt-sklearn to automatically tune the parameters of gbdt
def hyper_opt(libsvmFile):
    """Search for a classifier with hyperopt-sklearn and print the best model.

    The input file is split 80/20 (100000 sampled rows); only the train part
    is used.  The search is driven step by step through ``fit_iter`` until
    ``max_evals`` trials have been recorded, updating the demo plot after
    every step, and the best model is then retrained on the full train part.

    Args:
        libsvmFile: path of the libsvm file to sample from.
    """
    datasetSplit(libsvmFile, 'data_train_th100', 'data_test_th100', 0.2, 100000)
    X_train, y_train = load_svmlight_file('data_train_th100')
    X_train = X_train.toarray()
    estimator = hpsklearn.HyperoptEstimator(None,
                                            classifier=hpsklearn.components.any_classifier('clf'),
                                            algo=hyperopt.tpe.suggest,
                                            trial_timeout=10.0,
                                            max_evals=10)
    fit_iterator = estimator.fit_iter(X_train, y_train)
    # Advance the iterator once before the first send(); the next() builtin
    # works on Python 2 and 3, unlike the .next() method.
    next(fit_iterator)
    plot_helper = hpsklearn.demo_support.PlotHelper(estimator, mintodate_ylim=(0.0,0.1))
    while len(estimator.trials.trials) < estimator.max_evals:
        fit_iterator.send(1)
        plot_helper.post_iter()
    plot_helper.post_loop()
    estimator.retrain_best_model_on_full_data(X_train, y_train)
    # Single-argument print() behaves the same on Python 2 and 3.
    print('Best classifier: \n%s' % (estimator.best_model(),))
246 | 
if __name__ == '__main__':
    # Runs every experiment in sequence; the input files must exist in the
    # current working directory.
    calcPosNegCnt('label_feature_data_libsvm')
    # datasetSplit takes (file, train file, test file, test ratio, lines); the
    # ratio used to be passed in the train-file position.
    datasetSplit('50018_20160625_cross_sample', 'lr_data_train', 'lr_data_test', 0.2, 600000)
    xgboost_lr_train_test('data_libsvm_th100')
    lr_train_test('data_libsvm_th500', 'data_cross_libsvm_th500')
    xgb_feature_lr_train_test('xgb_feature_libsvm', 'data_cross_libsvm_th100')
    hyper_opt('data_libsvm_th100')
254 | 


--------------------------------------------------------------------------------