├── README.md
├── excute.py
├── function.py
└── model.py

/README.md:
--------------------------------------------------------------------------------

# CCF-StateGrid -- Abnormal Electricity-Usage Detection

Team TNT_000 won second prize in this competition. The overall approach is as follows:

1. Feature design


Basic features over time windows (B: Base):


● Per user, for each 1/2/3/4/5-week window: total electricity used, total meter start readings, total meter end readings, and record count

● Per user, for each 2/3/4/5/6-day window: the same four aggregates




Before/after similarity features over time windows (S: Similarity):


● PCC similarity between each pair of adjacent 4/5-week windows of electricity used, meter start readings, meter end readings, and record counts

● Cosine similarity between the same pairs of adjacent windows (see the sketch below)
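
A minimal sketch of the S features over one series, assuming adjacent non-overlapping windows (the repository's `getTrendFeature` implements the PCC variant; the cosine form is written out here from the description above):

```python
import numpy as np

def window_similarity(series, w):
    """PCC and cosine similarity between each pair of adjacent
    length-w windows of one user's series."""
    feats = []
    for i in range(0, len(series) - 2 * w + 1, w):
        a, b = series[i:i + w], series[i + w:i + 2 * w]
        feats.append(np.corrcoef(a, b)[0, 1])                  # PCC
        feats.append(a.dot(b) /
                     (np.linalg.norm(a) * np.linalg.norm(b)))  # cosine
    return feats

weekly_kwh = np.random.rand(52)                # one user's weekly consumption
s_features = window_similarity(weekly_kwh, 4)  # adjacent 4-week windows
```
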
Pooling features over time (P: Pool):


● Each user's records are arranged into a 28*37 matrix, one dimension for time and one for consumption.

● Max, mean, and standard deviation are taken over 2*2 blocks of the matrix

● Max, mean, and standard deviation are taken over 3*3 blocks of the matrix (see the sketch below)
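
The pooling step is described here but not implemented in the source files below; a minimal sketch, assuming non-overlapping blocks with any remainder dropped:

```python
import numpy as np

def pool_stats(mat, k):
    """Max / mean / std over each non-overlapping k*k block of `mat`."""
    feats = []
    for r in range(0, mat.shape[0] - k + 1, k):
        for c in range(0, mat.shape[1] - k + 1, k):
            block = mat[r:r + k, c:c + k]
            feats.extend([block.max(), block.mean(), block.std()])
    return np.array(feats)

usage = np.random.rand(28, 37)   # one user's records as a 28*37 matrix
p_features = np.concatenate([pool_stats(usage, 2), pool_stats(usage, 3)])
```
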
Statistical features over time windows (D: Describe):


● Per user, for each 1/2/3/4/5-week window: statistics of electricity used, meter start readings, meter end readings, and record count

● Per user, for each 2/3/4/5/6-day window: the same statistics

● The statistics include max, min, mean, variance, outlier count, median, and so on



2. Models:


● Xgboost single model (B+D): 3-fold CV 0.914 offline, 0.92244 online

● Xgboost single model (B+S+D): 3-fold CV 0.915 offline, 0.92288 online




3. File layout:


../stateGrid/
 - feature: feature directory
   - matrixFeature: B features
   - description: D features
   - stack: stacking result files
   - trend: S features
 - data: raw data and preprocessing output
 - importance: xgb feature-importance files
 - src: source code
   - excute.py: configuration and entry script
   - function.py: feature-construction functions
   - model.py: model functions
 - result: result files
 - model: saved model files

--------------------------------------------------------------------------------
/excute.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#-- Author: TNT_000 by Abner yang
from function import *
from model import *

#--- xgboost parameters
params = {
    'scale_pos_weight': 0,   # reset from the label ratio before training
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    #'objective': 'rank:pairwise',

    'eval_metric': 'map',
    'stratified': True,

    'max_depth': 4,
    'min_child_weight': 0.01,
    'gamma': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    #'max_delta_step': 8,
    #'colsample_bylevel': 0.5,
    #'rate_drop': 0.3,

    'lambda': 0.0001,  # 550
    #'alpha': 10,
    #'lambda_bias': 0,

    'eta': 0.02,
    'seed': 1288,

    'nthread': 8,

    'silent': 1
}

#--- the config of TNT_000 (Abner)'s solution in the stateGrid competition
config = {
    'xgbParams': params,
    'xgbRounds': 2300,
    'stackFolds': 5,
    'seed': 12,
    'stackPath': 'lr-l2',
    'rounds': 2300,        #--- xgb boosting rounds
    'folds': 5,            #--- cross-validation folds
    'useMatrix': True,     #--- True: use the matrix (B) features
    'matrixStack': False,  #--- True: use teammate's stack features
    'final': True,
    'myStack': False,      #--- True: use my stack features
    'des2': False,
    'listMatrix': [2],     #--- time-window sizes used to build the features
    'uselistMatrix1': [1, 2, 3],   #--- matrix feature window list
    'uselistMatrix2': [1, 2, 3, 4, 5, 6, 7, 14, 21, 28, 35],   #--- description feature window list
    'uselistMatrix3': [1],         #--- trend feature window list
    'uselistMatrix4': [1],
    'uselistMatrix5': [2, 3, 4, 5, 6, 7, 14, 21, 28, 35],
    'pccList': [27, 28, 29, 30],   #--- window sizes for the trend (PCC) features
    'desList': [7, 14, 21, 28, 35],
    'biasList': [0],
    'name': ['kwhU', 'kwhN', 'kwhE', 'kwhS'],   #--- matrix-feature file prefixes to build features for
    'useId': False,        #--- True: use id features
    'base': [[5, 20], [4, 20], [3, 25], [2, 30]],   #--- id windows used to build id features
    'matrixStackList': ['xgb_prob1'],    #--- teammate's stack result names
    'myStackList': ['xgb-1', 'xgb-2'],   #--- my stack result names
    'Description': True,   #--- True: use the description (D) features
    'Trend': False,        #--- True: use the trend (S) features
    'pcc-dis': [29, 27, 28, 30],    #--- window sizes for the trend features
    'des-dis': [7, 14, 21, 28, 35],
    'filter': True         #--- True: drop constant columns (feature selection)
}


if __name__ == '__main__':

    translateData()                        #--- preprocess the raw data

    getUseMatrix(config, 2016)             #--- build the matrix (B) features
    getDescriptionFeature(config, 2016)    #--- build the description (D) features
    getTrendFeature(config, 2016)          #--- build the trend (S) features
    getDescribeFeature1(config, 2016)      #--- build the windowed-statistics (Des2) features
    getFinalFeature(config, 2016)          #--- build the period-over-period ratio features

    trainFeature, testFeature, trainLabel, testIndex = getFeature(config, 2016)   #--- load the features

    res = xgbCVModel(trainFeature, trainLabel, config['rounds'], config['folds'], params)   #--- xgb cross-validation

    model, predict = xgbPredictModel(trainFeature, trainLabel, testFeature, params, config['rounds'])   #--- xgb test-set prediction

    storeResult(testIndex, predict, model, 'guodian_final_002')   #--- store the result

--------------------------------------------------------------------------------
/function.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#-- Author: TNT_000 by Abner yang
import pandas as pd
import numpy as np
import datetime
import math

#-- days elapsed since the reference date p
def getDate(date, p):
    listTime = []
    print(min(date))
    base = datetime.datetime.strptime(p, "%Y/%m/%d")
    for i, d in enumerate(date):
        time = datetime.datetime.strptime(d, "%Y/%m/%d")
        listTime.append((time - base).days)
        if i % 10000 == 1:
            print(i)
    return listTime

#-- preprocess the raw data: label files plus one sorted usage file per year
def translateData():
    train = pd.read_csv('../data/train.csv', header=None)
    train.columns = ['CONS_NO', 'label']
    train.to_csv('../data/trainInfo.csv', index=False)

    test = pd.read_csv('../data/test.csv', header=None)
    test.columns = ['CONS_NO']
    test.to_csv('../data/testInfo.csv', index=False)

    useData = pd.read_csv('../data/all_user_yongdian_data_2015.csv', header=0)
    useData['Time'] = getDate(useData['DATA_DATE'].values.T, '2015/01/01')
    print(useData.shape)

    useData1 = useData[(useData['Time'] >= 0) & (useData['Time'] < 365)]
    print(useData1.shape)
    useData1 = useData1.sort_values(['CONS_NO', 'Time', 'KWH_READING'], ascending=[True, True, False])
    useData1.to_csv('../data/useDataInfo_2016.csv', index=False)

    useData2 = useData[(useData['Time'] >= -365) & (useData['Time'] < 0)]
    useData2['Time'] = useData2['Time'].values.T + 365
    print(useData2.shape)
    useData2 = useData2.sort_values(['CONS_NO', 'Time', 'KWH_READING'], ascending=[True, True, False])
    useData2.to_csv('../data/useDataInfo_2015.csv', index=False)

    useData3 = useData[(useData['Time'] >= -730) & (useData['Time'] < -365)]
    useData3['Time'] = useData3['Time'].values.T + 730
    print(useData3.shape)
    useData3 = useData3.sort_values(['CONS_NO', 'Time', 'KWH_READING'], ascending=[True, True, False])
    useData3.to_csv('../data/useDataInfo_2014.csv', index=False)

    useData = pd.read_csv('../data/user_dianliang_round3.csv', header=0)
    useData['Time'] = getDate(useData['DATA_DATE'].values.T, '2016/01/01')
    useData = useData.sort_values(['CONS_NO', 'Time', 'KWH_READING'], ascending=[True, True, False])
    useData.to_csv('../data/useDataInfo_finalTest_2016.csv', index=False)
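
#-- For reference, a vectorized sketch equivalent to getDate (assumes the
#-- dates are '%Y/%m/%d' strings, as above); not used by the pipeline:
def getDateVectorized(date, p):
    return (pd.to_datetime(pd.Series(date), format='%Y/%m/%d')
            - pd.Timestamp(p)).dt.days.tolist()
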
#-- build the per-user time matrices (B features)
def getUseMatrix(config, p):
    useData1 = pd.read_csv('../data/useDataInfo_' + str(p) + '.csv', header=0)
    useData1 = useData1.fillna(-1)

    useData2 = pd.read_csv('../data/useDataInfo_finalTest_2016.csv', header=0)
    useData2 = useData2.fillna(-1)

    useData = pd.concat([useData1, useData2], axis=0)
    print(useData1.shape, useData2.shape, useData.shape)
    data = useData[['CONS_NO', 'Time', 'KWH', 'KWH_READING', 'KWH_READING1']].values
    print(data.shape)

    userNum = len(np.unique(useData['CONS_NO'].values))
    timeT = max(useData['Time'].values.T) + 1
    print(min(useData['Time'].values.T), timeT)
    for l in config['listMatrix']:
        print(l)
        timeNum = int(math.ceil(float(timeT) / l))
        print(timeNum)
        matrix1 = np.zeros([userNum, timeNum]) - 1   # KWH sums; -1 = no record
        matrix2 = np.zeros([userNum, timeNum])       # record counts
        matrix3 = np.zeros([userNum, timeNum]) - 1   # KWH_READING sums
        matrix4 = np.zeros([userNum, timeNum]) - 1   # KWH_READING1 sums
        uidIndex = []

        userDict = {}
        num = 0
        for i, line in enumerate(data):
            if i % 100000 == 1:
                print(i)
            if line[0] not in userDict:
                userDict[line[0]] = num
                uidIndex.append(line[0])
                num += 1

            col = line[1] // l

            if matrix1[userDict[line[0]], col] == -1:
                matrix1[userDict[line[0]], col] = line[2]
            else:
                matrix1[userDict[line[0]], col] += line[2]

            if matrix3[userDict[line[0]], col] == -1:
                matrix3[userDict[line[0]], col] = line[3]
            else:
                matrix3[userDict[line[0]], col] += line[3]

            if matrix4[userDict[line[0]], col] == -1:
                matrix4[userDict[line[0]], col] = line[4]
            else:
                matrix4[userDict[line[0]], col] += line[4]

            matrix2[userDict[line[0]], col] += 1

        matrixColName1 = getColName(timeNum, 'useDay' + str(l) + '-')
        matrixColName2 = getColName(timeNum, 'useDayNum' + str(l) + '-')
        matrixColName3 = getColName(timeNum, 'endNum' + str(l) + '-')
        matrixColName4 = getColName(timeNum, 'startNum' + str(l) + '-')

        matrixFeature1 = pd.DataFrame(matrix1, columns=matrixColName1)
        matrixFeature2 = pd.DataFrame(matrix2, columns=matrixColName2)
        matrixFeature3 = pd.DataFrame(matrix3, columns=matrixColName3)
        matrixFeature4 = pd.DataFrame(matrix4, columns=matrixColName4)

        matrixFeature1['CONS_NO'] = uidIndex
        matrixFeature2['CONS_NO'] = uidIndex
        matrixFeature3['CONS_NO'] = uidIndex
        matrixFeature4['CONS_NO'] = uidIndex

        matrixFeature1.to_csv('../feature/matrixFeature' + str(p) + '/kwhU_matrixFeature' + str(l) + '.csv', index=False)
        matrixFeature2.to_csv('../feature/matrixFeature' + str(p) + '/kwhN_matrixFeature' + str(l) + '.csv', index=False)
        matrixFeature3.to_csv('../feature/matrixFeature' + str(p) + '/kwhS_matrixFeature' + str(l) + '.csv', index=False)
        matrixFeature4.to_csv('../feature/matrixFeature' + str(p) + '/kwhE_matrixFeature' + str(l) + '.csv', index=False)

#-- build a list of column names: stri0, stri1, ...
def getColName(colNum, stri):
    print(colNum, stri)
    return [stri + str(i) for i in range(colNum)]

#-- build the description (D) features from the matrix features
def getDescriptionFeature(config, p):
    for l in config['listMatrix']:
        for n in config['name']:
            print(n)
            useMatrix = pd.read_csv('../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv', header=0)
            print(useMatrix.shape)
            uid = useMatrix['CONS_NO'].values.T
            feature = useMatrix.drop(['CONS_NO'], axis=1)

            # 8 columns = count, mean, std, min, 25%, 50%, 75%, max from describe()
            featureMatrix = np.zeros([len(uid), 8])

            feature = feature.values

            num = 0
            naNum = []
            outNum1 = []
            outNum2 = []
            outNum3 = []
            for ii, line in enumerate(feature):
                if ii % 1000 == 1:
                    print(ii)
                k = len(line)
                line = line[line != -1]   # drop the "no record" placeholders
                if len(line) > 0:
                    # counts of values more than 3/2/1 standard deviations above the mean
                    outNum3.append(len(line[line >= np.mean(line) + 3 * np.std(line)]))
                    outNum2.append(len(line[line >= np.mean(line) + 2 * np.std(line)]))
                    outNum1.append(len(line[line >= np.mean(line) + 1 * np.std(line)]))
                else:
                    outNum3.append(-1)
                    outNum2.append(-1)
                    outNum1.append(-1)

                naNum.append(k - len(line) - 21)
                lFrame = pd.DataFrame({'Sta': line})
                des = lFrame.describe()
                info = des.values.reshape(des.shape[0])
                featureMatrix[num, :] = info
                num += 1

            matrixColName = getColName(8, 'Description-' + n + str(l))
            featureMatrix = pd.DataFrame(featureMatrix, columns=matrixColName)

            naName = 'naNum' + n + str(l)
            outName1 = 'outNum1-' + n + str(l)
            outName2 = 'outNum2-' + n + str(l)
            outName3 = 'outNum3-' + n + str(l)

            featureMatrix[naName] = naNum
            featureMatrix[outName1] = outNum1
            featureMatrix[outName2] = outNum2
            featureMatrix[outName3] = outNum3

            featureMatrix['CONS_NO'] = uid

            featureMatrix.to_csv('../feature/describeFeature' + str(p) + '/Description_' + n + str(l) + '.csv', index=False)
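
#-- A minimal usage sketch for the features above (assumes the files written
#-- by getUseMatrix already exist; the path follows the repo's convention):
#--   m = pd.read_csv('../feature/matrixFeature2016/kwhU_matrixFeature2.csv')
#--   row = m.drop('CONS_NO', axis=1).values[0]
#--   pd.Series(row[row != -1]).describe()   # the 8 Description-* statistics
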
#-- build period-over-period ratio features
def getFinalFeature(config, p):
    for l in config['listMatrix']:
        for n in config['name']:
            useMatrix = pd.read_csv('../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv', header=0)
            uid = useMatrix['CONS_NO'].values.T
            feature = useMatrix.drop(['CONS_NO'], axis=1)
            feature = feature.values

            colNum = feature.shape[1] - 1
            featureMatrix = np.zeros([len(uid), colNum - 1])
            for row in range(feature.shape[0]):
                for i in range(colNum - 1):
                    # ratio of each window to the previous one
                    featureMatrix[row, i] = float(feature[row, i + 1]) / feature[row, i]
                if row % 1000 == 1:
                    print(row)
            matrixColName = getColName(colNum - 1, 'Trend-final-' + n + str(l))
            featureMatrix = pd.DataFrame(featureMatrix, columns=matrixColName)

            featureMatrix['CONS_NO'] = uid

            featureMatrix.to_csv('../feature/finalFeature' + str(p) + '/Trend_' + n + str(l) + 'final' + '.csv', index=False)


#-- build the trend (S) features: PCC between adjacent windows
def getTrendFeature(config, p):
    for l in config['listMatrix']:
        for n in config['name']:
            for bias in config['biasList']:
                for pcc in config['pcc-dis']:
                    print(n)
                    useMatrix = pd.read_csv('../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv', header=0)
                    print(useMatrix.shape)
                    uid = useMatrix['CONS_NO'].values.T
                    feature = useMatrix.drop(['CONS_NO'], axis=1)
                    feature = feature.values

                    colNum = feature.shape[1] // pcc
                    featureMatrix = np.zeros([len(uid), colNum - 1])
                    for row in range(feature.shape[0]):
                        for i in range(colNum - 1):
                            # Pearson correlation between two adjacent length-pcc windows
                            featureMatrix[row, i] = np.corrcoef(
                                feature[row, (i * pcc + bias):((i + 1) * pcc + bias)],
                                feature[row, ((i + 1) * pcc + bias):((i + 2) * pcc + bias)])[0, 1]
                        if row % 1000 == 1:
                            print(row)
                    matrixColName = getColName(colNum - 1, 'Trend-PCC-' + n + str(l) + 'pcc' + str(pcc))
                    featureMatrix = pd.DataFrame(featureMatrix, columns=matrixColName)

                    featureMatrix['CONS_NO'] = uid

                    featureMatrix.to_csv('../feature/trendFeature' + str(p) + '/Trend_' + n + str(l) + 'pcc' + str(pcc) + '-bias-' + str(bias) + '.csv', index=False)

#-- build the windowed-statistics (Des2) features
def getDescribeFeature1(config, p):
    for l in config['listMatrix']:
        for n in config['name']:
            for bias in config['biasList']:
                for pcc in config['des-dis']:
                    print(n)
                    useMatrix = pd.read_csv('../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv', header=0)
                    print(useMatrix.shape)
                    uid = useMatrix['CONS_NO'].values.T
                    feature = useMatrix.drop(['CONS_NO'], axis=1)
                    feature = feature.values

                    colNum = (feature.shape[1] - bias) // pcc
                    featureMatrix = np.zeros([len(uid), colNum * 5])
                    for row in range(feature.shape[0]):
                        for i in range(colNum):
                            kk = feature[row, (i * pcc + bias):((i + 1) * pcc + bias)]
                            # five statistics per window
                            ss = [np.mean(kk), np.std(kk), np.median(kk), np.max(kk), np.min(kk)]
                            featureMatrix[row, (i * 5):(i + 1) * 5] = ss
                        if row % 1000 == 1:
                            print(row)
                    matrixColName = getColName(colNum * 5, 'Des2_' + n + str(l) + 'pcc' + str(pcc))
                    featureMatrix = pd.DataFrame(featureMatrix, columns=matrixColName)

                    featureMatrix['CONS_NO'] = uid

                    featureMatrix.to_csv('../feature/des2Feature' + str(p) + '/Des2_' + n + str(l) + 'static' + str(pcc) + '-bias-' + str(bias) + '.csv', index=False)

#-- feature selection: find constant columns to drop
#-- (note: the name shadows Python's built-in filter)
def filter(data):
    delName = []
    for i in data.columns:
        if len(np.unique(data[i].values.T)) == 1:
            delName.append(i)
    return delName

#-- upper-case the raw ids
def getupper(data):
    return [d.upper() for d in data]

#-- load and merge the selected feature files, returning train/test arrays
def getFeature(config, p):
    train = pd.read_csv('../data/trainInfo.csv', header=0)
    test = pd.read_csv('../data/finalTest.csv', header=0)

    print(train.shape, test.shape)

    if config['useMatrix']:
        for l in config['uselistMatrix1']:
            for n in config['name']:
                name = '../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv'
                useMatrix = pd.read_csv(name, header=0)
                train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                print(train.shape, test.shape)
    if config['Description']:
        for l in config['uselistMatrix2']:
            for n in config['name']:
                name = '../feature/describeFeature' + str(p) + '/Description_' + n + str(l) + '.csv'
                useMatrix = pd.read_csv(name, header=0)
                train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                print(train.shape, test.shape)
    if config['final']:
        for l in config['uselistMatrix5']:
            for n in config['name']:
                name = '../feature/finalFeature' + str(p) + '/Trend_' + n + str(l) + 'final' + '.csv'
                useMatrix = pd.read_csv(name, header=0)
                train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                print(train.shape, test.shape)
    if config['Trend']:
        for l in config['uselistMatrix3']:
            for n in config['name']:
                for b in config['biasList']:
                    for pcc in config['pccList']:
                        name = '../feature/trendFeature' + str(p) + '/Trend_' + n + str(l) + 'pcc' + str(pcc) + '-bias-' + str(b) + '.csv'
                        useMatrix = pd.read_csv(name, header=0)
                        train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                        test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                        print(train.shape, test.shape)
    if config['des2']:
        for l in config['uselistMatrix4']:
            for n in config['name']:
                for b in config['biasList']:
                    for pcc in config['desList']:
                        name = '../feature/des2Feature' + str(p) + '/Des2_' + n + str(l) + 'static' + str(pcc) + '-bias-' + str(b) + '.csv'
                        useMatrix = pd.read_csv(name, header=0)
                        train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                        test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                        print(train.shape, test.shape)
    if config['myStack']:
        for l in config['myStackList']:
            data1 = pd.read_csv('../feature/stack/' + l + '_train.csv', header=0)
            data2 = pd.read_csv('../feature/stack/' + l + '_test.csv', header=0)

            train = pd.concat([train, data1], axis=1).fillna(-1)
            test = pd.concat([test, data2], axis=1).fillna(-1)
            print(train.shape, test.shape)

    if config['matrixStack']:
        for l in config['matrixStackList']:
            data = pd.read_csv('../feature/stackFeature/' + l + '.csv', header=0)
            data['CONS_NO'] = np.append(train['CONS_NO'].values.T, test['CONS_NO'].values.T)
            train = pd.merge(train, data, on='CONS_NO', how='left').fillna(-1)
            test = pd.merge(test, data, on='CONS_NO', how='left').fillna(-1)
            print(train.shape, test.shape)

    trainUid = train['CONS_NO'].values.T
    testUid = test['CONS_Index'].values.T

    trainFeature = train.drop(['CONS_NO', 'label'], axis=1)
    testFeature = test.drop(['CONS_NO', 'CONS_Index'], axis=1)

    trainLabel = train['label'].values.T

    print(trainFeature.shape, testFeature.shape, trainLabel.shape)

    if config['filter']:
        print('filter...')
        delName = filter(trainFeature)
        trainFeature = trainFeature.drop(delName, axis=1)
        testFeature = testFeature.drop(delName, axis=1)

    print(trainFeature.shape, testFeature.shape, trainLabel.shape)

    return trainFeature.fillna(-1).values, testFeature.fillna(-1).values, trainLabel, testUid

#-- store the online result
def storeResult(testIndex, predict, model, day):
    result = pd.DataFrame({'CONS_NO': testIndex, 'label': predict})
    rpath = '../result/' + day + '.csv'          # submission: ranked ids only
    rpath1 = '../result/' + day + '_prob.csv'    # ranked ids with probabilities
    mpath = '../model/' + day + '.m'

    result = result.sort_values('label', ascending=False)
    result.to_csv(rpath1, index=False)
    result['CONS_NO'].to_csv(rpath, index=False, header=False)
    if model is not False:
        model.save_model(mpath)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#-- Author: TNT_000 by Abner yang
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import average_precision_score


#-- print the average-precision score of a prediction
def evalerror(predict, true):
    print(average_precision_score(true, predict, average='macro', sample_weight=None))


#-- MAP eval function: mean of the precision at the rank of each positive
def map_eval(true, predict):
    result = pd.DataFrame({'true': true, 'predict': predict})
    result = result.sort_values('predict', ascending=False)
    score = []
    num = 0
    total = 0
    for line in result['true'].values.T:
        total += 1
        if line == 1:
            num += 1
            score.append(float(num) / total)
    mapScore = np.mean(score)
    print(mapScore)
    return mapScore
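
#-- A quick hand check of map_eval on toy values: predictions [0.9, 0.8, 0.7]
#-- with labels [1, 0, 1] rank the labels as 1, 0, 1, so the precisions at
#-- the two positives are 1/1 and 2/3 and MAP = (1 + 2/3) / 2 = 5/6:
#--   map_eval(np.array([1, 0, 1]), np.array([0.9, 0.8, 0.7]))   # ~0.8333
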
#-- xgboost local train/test model frame
def xgbLocalModel(trainFeature, testFeature, trainLabel, testLabel, params, rounds):
    #-- weight the positive class by the negative/positive ratio
    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])

    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)
    dtest = xgb.DMatrix(testFeature, label=testLabel)

    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    print('run local: round: ' + str(rounds))
    model = xgb.train(params, dtrain, rounds, evals=watchlist, verbose_eval=20)  #, feval=evalerror)

    predict = model.predict(dtest)

    return predict

#-- xgboost cross-validation model frame
def xgbCVModel(trainFeature, trainLabel, rounds, folds, params):
    #-- set scale_pos_weight from the class ratio
    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])

    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)

    #-- run cross-validation
    print('run cv: round: ' + str(rounds) + ' folds: ' + str(folds))
    res = xgb.cv(params, dtrain, rounds, nfold=folds, verbose_eval=20)
    return res

#-- xgboost online prediction model frame
def xgbPredictModel(trainFeature, trainLabel, testFeature, params, rounds):
    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)
    dtest = xgb.DMatrix(testFeature, label=np.zeros(testFeature.shape[0]))

    watchlist = [(dtest, 'eval'), (dtrain, 'train')]

    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])

    model = xgb.train(params, dtrain, rounds, evals=watchlist, verbose_eval=100)

    #-- dump the feature importances for inspection
    importance = pd.DataFrame(list(model.get_fscore().items()),
                              columns=['feature', 'importance']).sort_values('importance', ascending=False)
    importance.to_csv('../importance/im.csv', index=False)

    predict = model.predict(dtest)

    return model, predict

--------------------------------------------------------------------------------