├── README.md
├── excute.py
├── function.py
└── model.py

/README.md:
--------------------------------------------------------------------------------

# CCF-StateGrid -- Abnormal Electricity-Usage Detection

Team TNT_000 won second prize in this competition. The overall approach is as follows:

1. Feature design


Basic features over time windows (B: Base):


● Per user, for each 1/2/3/4/5-week window: total electricity used, total meter start readings, total meter end readings, and record count

● Per user, for each 2/3/4/5/6-day window: the same four aggregates




Before/after similarity features over time windows (S: Similarity):


● PCC similarity between each pair of adjacent 4/5-week windows of electricity used, meter start readings, meter end readings, and record counts

● Cosine similarity between the same pairs of adjacent windows (see the sketch below)
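
A minimal sketch of the S features over one series, assuming adjacent non-overlapping windows (the repository's `getTrendFeature` implements the PCC variant; the cosine form is written out here from the description above):

```python
import numpy as np

def window_similarity(series, w):
    """PCC and cosine similarity between each pair of adjacent
    length-w windows of one user's series."""
    feats = []
    for i in range(0, len(series) - 2 * w + 1, w):
        a, b = series[i:i + w], series[i + w:i + 2 * w]
        feats.append(np.corrcoef(a, b)[0, 1])                  # PCC
        feats.append(a.dot(b) /
                     (np.linalg.norm(a) * np.linalg.norm(b)))  # cosine
    return feats

weekly_kwh = np.random.rand(52)                # one user's weekly consumption
s_features = window_similarity(weekly_kwh, 4)  # adjacent 4-week windows
```
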
Pooling features over time (P: Pool):


● Each user's records are arranged into a 28*37 matrix, one dimension for time and one for consumption.

● Max, mean, and standard deviation are taken over 2*2 blocks of the matrix

● Max, mean, and standard deviation are taken over 3*3 blocks of the matrix (see the sketch below)
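
The pooling step is described here but not implemented in the source files below; a minimal sketch, assuming non-overlapping blocks with any remainder dropped:

```python
import numpy as np

def pool_stats(mat, k):
    """Max / mean / std over each non-overlapping k*k block of `mat`."""
    feats = []
    for r in range(0, mat.shape[0] - k + 1, k):
        for c in range(0, mat.shape[1] - k + 1, k):
            block = mat[r:r + k, c:c + k]
            feats.extend([block.max(), block.mean(), block.std()])
    return np.array(feats)

usage = np.random.rand(28, 37)   # one user's records as a 28*37 matrix
p_features = np.concatenate([pool_stats(usage, 2), pool_stats(usage, 3)])
```
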
Statistical features over time windows (D: Describe):


● Per user, for each 1/2/3/4/5-week window: statistics of electricity used, meter start readings, meter end readings, and record count

● Per user, for each 2/3/4/5/6-day window: the same statistics

● The statistics include max, min, mean, variance, outlier count, median, and so on



2. Models:


● Xgboost single model (B+D): 3-fold CV 0.914 offline, 0.92244 online

● Xgboost single model (B+S+D): 3-fold CV 0.915 offline, 0.92288 online




3. File layout:


../stateGrid/
 - feature: feature directory
   - matrixFeature: B features
   - description: D features
   - stack: stacking result files
   - trend: S features
 - data: raw data and preprocessing output
 - importance: xgb feature-importance files
 - src: source code
   - excute.py: configuration and entry script
   - function.py: feature-construction functions
   - model.py: model functions
 - result: result files
 - model: saved model files

--------------------------------------------------------------------------------
/excute.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#-- Author: TNT_000 by Abner yang
from function import *
from model import *

#--- xgboost parameters
params = {
    'scale_pos_weight': 0,   # reset from the label ratio before training
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    #'objective': 'rank:pairwise',

    'eval_metric': 'map',
    'stratified': True,

    'max_depth': 4,
    'min_child_weight': 0.01,
    'gamma': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    #'max_delta_step': 8,
    #'colsample_bylevel': 0.5,
    #'rate_drop': 0.3,

    'lambda': 0.0001,  # 550
    #'alpha': 10,
    #'lambda_bias': 0,

    'eta': 0.02,
    'seed': 1288,

    'nthread': 8,

    'silent': 1
}

#--- the config of TNT_000 (Abner)'s solution in the stateGrid competition
config = {
    'xgbParams': params,
    'xgbRounds': 2300,
    'stackFolds': 5,
    'seed': 12,
    'stackPath': 'lr-l2',
    'rounds': 2300,        #--- xgb boosting rounds
    'folds': 5,            #--- cross-validation folds
    'useMatrix': True,     #--- True: use the matrix (B) features
    'matrixStack': False,  #--- True: use teammate's stack features
    'final': True,
    'myStack': False,      #--- True: use my stack features
    'des2': False,
    'listMatrix': [2],     #--- time-window sizes used to build the features
    'uselistMatrix1': [1, 2, 3],   #--- matrix feature window list
    'uselistMatrix2': [1, 2, 3, 4, 5, 6, 7, 14, 21, 28, 35],   #--- description feature window list
    'uselistMatrix3': [1],         #--- trend feature window list
    'uselistMatrix4': [1],
    'uselistMatrix5': [2, 3, 4, 5, 6, 7, 14, 21, 28, 35],
    'pccList': [27, 28, 29, 30],   #--- window sizes for the trend (PCC) features
    'desList': [7, 14, 21, 28, 35],
    'biasList': [0],
    'name': ['kwhU', 'kwhN', 'kwhE', 'kwhS'],   #--- matrix-feature file prefixes to build features for
    'useId': False,        #--- True: use id features
    'base': [[5, 20], [4, 20], [3, 25], [2, 30]],   #--- id windows used to build id features
    'matrixStackList': ['xgb_prob1'],    #--- teammate's stack result names
    'myStackList': ['xgb-1', 'xgb-2'],   #--- my stack result names
    'Description': True,   #--- True: use the description (D) features
    'Trend': False,        #--- True: use the trend (S) features
    'pcc-dis': [29, 27, 28, 30],    #--- window sizes for the trend features
    'des-dis': [7, 14, 21, 28, 35],
    'filter': True         #--- True: drop constant columns (feature selection)
}


if __name__ == '__main__':

    translateData()                        #--- preprocess the raw data

    getUseMatrix(config, 2016)             #--- build the matrix (B) features
    getDescriptionFeature(config, 2016)    #--- build the description (D) features
    getTrendFeature(config, 2016)          #--- build the trend (S) features
    getDescribeFeature1(config, 2016)      #--- build the windowed-statistics (Des2) features
    getFinalFeature(config, 2016)          #--- build the period-over-period ratio features

    trainFeature, testFeature, trainLabel, testIndex = getFeature(config, 2016)   #--- load the features

    res = xgbCVModel(trainFeature, trainLabel, config['rounds'], config['folds'], params)   #--- xgb cross-validation

    model, predict = xgbPredictModel(trainFeature, trainLabel, testFeature, params, config['rounds'])   #--- xgb test-set prediction

    storeResult(testIndex, predict, model, 'guodian_final_002')   #--- store the result

--------------------------------------------------------------------------------
/function.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#-- Author: TNT_000 by Abner yang
import pandas as pd
import numpy as np
import datetime
import math

#-- days elapsed since the reference date p
def getDate(date, p):
    listTime = []
    print(min(date))
    base = datetime.datetime.strptime(p, "%Y/%m/%d")
    for i, d in enumerate(date):
        time = datetime.datetime.strptime(d, "%Y/%m/%d")
        listTime.append((time - base).days)
        if i % 10000 == 1:
            print(i)
    return listTime

#-- preprocess the raw data: label files plus one sorted usage file per year
def translateData():
    train = pd.read_csv('../data/train.csv', header=None)
    train.columns = ['CONS_NO', 'label']
    train.to_csv('../data/trainInfo.csv', index=False)

    test = pd.read_csv('../data/test.csv', header=None)
    test.columns = ['CONS_NO']
    test.to_csv('../data/testInfo.csv', index=False)

    useData = pd.read_csv('../data/all_user_yongdian_data_2015.csv', header=0)
    useData['Time'] = getDate(useData['DATA_DATE'].values.T, '2015/01/01')
    print(useData.shape)

    useData1 = useData[(useData['Time'] >= 0) & (useData['Time'] < 365)]
    print(useData1.shape)
    useData1 = useData1.sort_values(['CONS_NO', 'Time', 'KWH_READING'], ascending=[True, True, False])
    useData1.to_csv('../data/useDataInfo_2016.csv', index=False)

    useData2 = useData[(useData['Time'] >= -365) & (useData['Time'] < 0)]
    useData2['Time'] = useData2['Time'].values.T + 365
    print(useData2.shape)
    useData2 = useData2.sort_values(['CONS_NO', 'Time', 'KWH_READING'], ascending=[True, True, False])
    useData2.to_csv('../data/useDataInfo_2015.csv', index=False)

    useData3 = useData[(useData['Time'] >= -730) & (useData['Time'] < -365)]
    useData3['Time'] = useData3['Time'].values.T + 730
    print(useData3.shape)
    useData3 = useData3.sort_values(['CONS_NO', 'Time', 'KWH_READING'], ascending=[True, True, False])
    useData3.to_csv('../data/useDataInfo_2014.csv', index=False)

    useData = pd.read_csv('../data/user_dianliang_round3.csv', header=0)
    useData['Time'] = getDate(useData['DATA_DATE'].values.T, '2016/01/01')
    useData = useData.sort_values(['CONS_NO', 'Time', 'KWH_READING'], ascending=[True, True, False])
    useData.to_csv('../data/useDataInfo_finalTest_2016.csv', index=False)
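
#-- For reference, a vectorized sketch equivalent to getDate (assumes the
#-- dates are '%Y/%m/%d' strings, as above); not used by the pipeline:
def getDateVectorized(date, p):
    return (pd.to_datetime(pd.Series(date), format='%Y/%m/%d')
            - pd.Timestamp(p)).dt.days.tolist()
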
#-- build the per-user time matrices (B features)
def getUseMatrix(config, p):
    useData1 = pd.read_csv('../data/useDataInfo_' + str(p) + '.csv', header=0)
    useData1 = useData1.fillna(-1)

    useData2 = pd.read_csv('../data/useDataInfo_finalTest_2016.csv', header=0)
    useData2 = useData2.fillna(-1)

    useData = pd.concat([useData1, useData2], axis=0)
    print(useData1.shape, useData2.shape, useData.shape)
    data = useData[['CONS_NO', 'Time', 'KWH', 'KWH_READING', 'KWH_READING1']].values
    print(data.shape)

    userNum = len(np.unique(useData['CONS_NO'].values))
    timeT = max(useData['Time'].values.T) + 1
    print(min(useData['Time'].values.T), timeT)
    for l in config['listMatrix']:
        print(l)
        timeNum = int(math.ceil(float(timeT) / l))
        print(timeNum)
        matrix1 = np.zeros([userNum, timeNum]) - 1   # KWH sums; -1 = no record
        matrix2 = np.zeros([userNum, timeNum])       # record counts
        matrix3 = np.zeros([userNum, timeNum]) - 1   # KWH_READING sums
        matrix4 = np.zeros([userNum, timeNum]) - 1   # KWH_READING1 sums
        uidIndex = []

        userDict = {}
        num = 0
        for i, line in enumerate(data):
            if i % 100000 == 1:
                print(i)
            if line[0] not in userDict:
                userDict[line[0]] = num
                uidIndex.append(line[0])
                num += 1

            col = line[1] // l

            if matrix1[userDict[line[0]], col] == -1:
                matrix1[userDict[line[0]], col] = line[2]
            else:
                matrix1[userDict[line[0]], col] += line[2]

            if matrix3[userDict[line[0]], col] == -1:
                matrix3[userDict[line[0]], col] = line[3]
            else:
                matrix3[userDict[line[0]], col] += line[3]

            if matrix4[userDict[line[0]], col] == -1:
                matrix4[userDict[line[0]], col] = line[4]
            else:
                matrix4[userDict[line[0]], col] += line[4]

            matrix2[userDict[line[0]], col] += 1

        matrixColName1 = getColName(timeNum, 'useDay' + str(l) + '-')
        matrixColName2 = getColName(timeNum, 'useDayNum' + str(l) + '-')
        matrixColName3 = getColName(timeNum, 'endNum' + str(l) + '-')
        matrixColName4 = getColName(timeNum, 'startNum' + str(l) + '-')

        matrixFeature1 = pd.DataFrame(matrix1, columns=matrixColName1)
        matrixFeature2 = pd.DataFrame(matrix2, columns=matrixColName2)
        matrixFeature3 = pd.DataFrame(matrix3, columns=matrixColName3)
        matrixFeature4 = pd.DataFrame(matrix4, columns=matrixColName4)

        matrixFeature1['CONS_NO'] = uidIndex
        matrixFeature2['CONS_NO'] = uidIndex
        matrixFeature3['CONS_NO'] = uidIndex
        matrixFeature4['CONS_NO'] = uidIndex

        matrixFeature1.to_csv('../feature/matrixFeature' + str(p) + '/kwhU_matrixFeature' + str(l) + '.csv', index=False)
        matrixFeature2.to_csv('../feature/matrixFeature' + str(p) + '/kwhN_matrixFeature' + str(l) + '.csv', index=False)
        matrixFeature3.to_csv('../feature/matrixFeature' + str(p) + '/kwhS_matrixFeature' + str(l) + '.csv', index=False)
        matrixFeature4.to_csv('../feature/matrixFeature' + str(p) + '/kwhE_matrixFeature' + str(l) + '.csv', index=False)

#-- build a list of column names: stri0, stri1, ...
def getColName(colNum, stri):
    print(colNum, stri)
    return [stri + str(i) for i in range(colNum)]

#-- build the description (D) features from the matrix features
def getDescriptionFeature(config, p):
    for l in config['listMatrix']:
        for n in config['name']:
            print(n)
            useMatrix = pd.read_csv('../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv', header=0)
            print(useMatrix.shape)
            uid = useMatrix['CONS_NO'].values.T
            feature = useMatrix.drop(['CONS_NO'], axis=1)

            # 8 columns = count, mean, std, min, 25%, 50%, 75%, max from describe()
            featureMatrix = np.zeros([len(uid), 8])

            feature = feature.values

            num = 0
            naNum = []
            outNum1 = []
            outNum2 = []
            outNum3 = []
            for ii, line in enumerate(feature):
                if ii % 1000 == 1:
                    print(ii)
                k = len(line)
                line = line[line != -1]   # drop the "no record" placeholders
                if len(line) > 0:
                    # counts of values more than 3/2/1 standard deviations above the mean
                    outNum3.append(len(line[line >= np.mean(line) + 3 * np.std(line)]))
                    outNum2.append(len(line[line >= np.mean(line) + 2 * np.std(line)]))
                    outNum1.append(len(line[line >= np.mean(line) + 1 * np.std(line)]))
                else:
                    outNum3.append(-1)
                    outNum2.append(-1)
                    outNum1.append(-1)

                naNum.append(k - len(line) - 21)
                lFrame = pd.DataFrame({'Sta': line})
                des = lFrame.describe()
                info = des.values.reshape(des.shape[0])
                featureMatrix[num, :] = info
                num += 1

            matrixColName = getColName(8, 'Description-' + n + str(l))
            featureMatrix = pd.DataFrame(featureMatrix, columns=matrixColName)

            naName = 'naNum' + n + str(l)
            outName1 = 'outNum1-' + n + str(l)
            outName2 = 'outNum2-' + n + str(l)
            outName3 = 'outNum3-' + n + str(l)

            featureMatrix[naName] = naNum
            featureMatrix[outName1] = outNum1
            featureMatrix[outName2] = outNum2
            featureMatrix[outName3] = outNum3

            featureMatrix['CONS_NO'] = uid

            featureMatrix.to_csv('../feature/describeFeature' + str(p) + '/Description_' + n + str(l) + '.csv', index=False)
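
#-- A minimal usage sketch for the features above (assumes the files written
#-- by getUseMatrix already exist; the path follows the repo's convention):
#--   m = pd.read_csv('../feature/matrixFeature2016/kwhU_matrixFeature2.csv')
#--   row = m.drop('CONS_NO', axis=1).values[0]
#--   pd.Series(row[row != -1]).describe()   # the 8 Description-* statistics
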
#-- build period-over-period ratio features
def getFinalFeature(config, p):
    for l in config['listMatrix']:
        for n in config['name']:
            useMatrix = pd.read_csv('../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv', header=0)
            uid = useMatrix['CONS_NO'].values.T
            feature = useMatrix.drop(['CONS_NO'], axis=1)
            feature = feature.values

            colNum = feature.shape[1] - 1
            featureMatrix = np.zeros([len(uid), colNum - 1])
            for row in range(feature.shape[0]):
                for i in range(colNum - 1):
                    # ratio of each window to the previous one
                    featureMatrix[row, i] = float(feature[row, i + 1]) / feature[row, i]
                if row % 1000 == 1:
                    print(row)
            matrixColName = getColName(colNum - 1, 'Trend-final-' + n + str(l))
            featureMatrix = pd.DataFrame(featureMatrix, columns=matrixColName)

            featureMatrix['CONS_NO'] = uid

            featureMatrix.to_csv('../feature/finalFeature' + str(p) + '/Trend_' + n + str(l) + 'final' + '.csv', index=False)


#-- build the trend (S) features: PCC between adjacent windows
def getTrendFeature(config, p):
    for l in config['listMatrix']:
        for n in config['name']:
            for bias in config['biasList']:
                for pcc in config['pcc-dis']:
                    print(n)
                    useMatrix = pd.read_csv('../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv', header=0)
                    print(useMatrix.shape)
                    uid = useMatrix['CONS_NO'].values.T
                    feature = useMatrix.drop(['CONS_NO'], axis=1)
                    feature = feature.values

                    colNum = feature.shape[1] // pcc
                    featureMatrix = np.zeros([len(uid), colNum - 1])
                    for row in range(feature.shape[0]):
                        for i in range(colNum - 1):
                            # Pearson correlation between two adjacent length-pcc windows
                            featureMatrix[row, i] = np.corrcoef(
                                feature[row, (i * pcc + bias):((i + 1) * pcc + bias)],
                                feature[row, ((i + 1) * pcc + bias):((i + 2) * pcc + bias)])[0, 1]
                        if row % 1000 == 1:
                            print(row)
                    matrixColName = getColName(colNum - 1, 'Trend-PCC-' + n + str(l) + 'pcc' + str(pcc))
                    featureMatrix = pd.DataFrame(featureMatrix, columns=matrixColName)

                    featureMatrix['CONS_NO'] = uid

                    featureMatrix.to_csv('../feature/trendFeature' + str(p) + '/Trend_' + n + str(l) + 'pcc' + str(pcc) + '-bias-' + str(bias) + '.csv', index=False)

#-- build the windowed-statistics (Des2) features
def getDescribeFeature1(config, p):
    for l in config['listMatrix']:
        for n in config['name']:
            for bias in config['biasList']:
                for pcc in config['des-dis']:
                    print(n)
                    useMatrix = pd.read_csv('../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv', header=0)
                    print(useMatrix.shape)
                    uid = useMatrix['CONS_NO'].values.T
                    feature = useMatrix.drop(['CONS_NO'], axis=1)
                    feature = feature.values

                    colNum = (feature.shape[1] - bias) // pcc
                    featureMatrix = np.zeros([len(uid), colNum * 5])
                    for row in range(feature.shape[0]):
                        for i in range(colNum):
                            kk = feature[row, (i * pcc + bias):((i + 1) * pcc + bias)]
                            # five statistics per window
                            ss = [np.mean(kk), np.std(kk), np.median(kk), np.max(kk), np.min(kk)]
                            featureMatrix[row, (i * 5):(i + 1) * 5] = ss
                        if row % 1000 == 1:
                            print(row)
                    matrixColName = getColName(colNum * 5, 'Des2_' + n + str(l) + 'pcc' + str(pcc))
                    featureMatrix = pd.DataFrame(featureMatrix, columns=matrixColName)

                    featureMatrix['CONS_NO'] = uid

                    featureMatrix.to_csv('../feature/des2Feature' + str(p) + '/Des2_' + n + str(l) + 'static' + str(pcc) + '-bias-' + str(bias) + '.csv', index=False)

#-- feature selection: find constant columns to drop
#-- (note: the name shadows Python's built-in filter)
def filter(data):
    delName = []
    for i in data.columns:
        if len(np.unique(data[i].values.T)) == 1:
            delName.append(i)
    return delName

#-- upper-case the raw ids
def getupper(data):
    return [d.upper() for d in data]

#-- load and merge the selected feature files, returning train/test arrays
def getFeature(config, p):
    train = pd.read_csv('../data/trainInfo.csv', header=0)
    test = pd.read_csv('../data/finalTest.csv', header=0)

    print(train.shape, test.shape)

    if config['useMatrix']:
        for l in config['uselistMatrix1']:
            for n in config['name']:
                name = '../feature/matrixFeature' + str(p) + '/' + n + '_matrixFeature' + str(l) + '.csv'
                useMatrix = pd.read_csv(name, header=0)
                train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                print(train.shape, test.shape)
    if config['Description']:
        for l in config['uselistMatrix2']:
            for n in config['name']:
                name = '../feature/describeFeature' + str(p) + '/Description_' + n + str(l) + '.csv'
                useMatrix = pd.read_csv(name, header=0)
                train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                print(train.shape, test.shape)
    if config['final']:
        for l in config['uselistMatrix5']:
            for n in config['name']:
                name = '../feature/finalFeature' + str(p) + '/Trend_' + n + str(l) + 'final' + '.csv'
                useMatrix = pd.read_csv(name, header=0)
                train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                print(train.shape, test.shape)
    if config['Trend']:
        for l in config['uselistMatrix3']:
            for n in config['name']:
                for b in config['biasList']:
                    for pcc in config['pccList']:
                        name = '../feature/trendFeature' + str(p) + '/Trend_' + n + str(l) + 'pcc' + str(pcc) + '-bias-' + str(b) + '.csv'
                        useMatrix = pd.read_csv(name, header=0)
                        train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                        test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                        print(train.shape, test.shape)
    if config['des2']:
        for l in config['uselistMatrix4']:
            for n in config['name']:
                for b in config['biasList']:
                    for pcc in config['desList']:
                        name = '../feature/des2Feature' + str(p) + '/Des2_' + n + str(l) + 'static' + str(pcc) + '-bias-' + str(b) + '.csv'
                        useMatrix = pd.read_csv(name, header=0)
                        train = pd.merge(train, useMatrix, on='CONS_NO', how='left').fillna(-1)
                        test = pd.merge(test, useMatrix, on='CONS_NO', how='left').fillna(-1)
                        print(train.shape, test.shape)
    if config['myStack']:
        for l in config['myStackList']:
            data1 = pd.read_csv('../feature/stack/' + l + '_train.csv', header=0)
            data2 = pd.read_csv('../feature/stack/' + l + '_test.csv', header=0)

            train = pd.concat([train, data1], axis=1).fillna(-1)
            test = pd.concat([test, data2], axis=1).fillna(-1)
            print(train.shape, test.shape)

    if config['matrixStack']:
        for l in config['matrixStackList']:
            data = pd.read_csv('../feature/stackFeature/' + l + '.csv', header=0)
            data['CONS_NO'] = np.append(train['CONS_NO'].values.T, test['CONS_NO'].values.T)
            train = pd.merge(train, data, on='CONS_NO', how='left').fillna(-1)
            test = pd.merge(test, data, on='CONS_NO', how='left').fillna(-1)
            print(train.shape, test.shape)

    trainUid = train['CONS_NO'].values.T
    testUid = test['CONS_Index'].values.T

    trainFeature = train.drop(['CONS_NO', 'label'], axis=1)
    testFeature = test.drop(['CONS_NO', 'CONS_Index'], axis=1)

    trainLabel = train['label'].values.T

    print(trainFeature.shape, testFeature.shape, trainLabel.shape)

    if config['filter']:
        print('filter...')
        delName = filter(trainFeature)
        trainFeature = trainFeature.drop(delName, axis=1)
        testFeature = testFeature.drop(delName, axis=1)

    print(trainFeature.shape, testFeature.shape, trainLabel.shape)

    return trainFeature.fillna(-1).values, testFeature.fillna(-1).values, trainLabel, testUid

#-- store the online result
def storeResult(testIndex, predict, model, day):
    result = pd.DataFrame({'CONS_NO': testIndex, 'label': predict})
    rpath = '../result/' + day + '.csv'          # submission: ranked ids only
    rpath1 = '../result/' + day + '_prob.csv'    # ranked ids with probabilities
    mpath = '../model/' + day + '.m'

    result = result.sort_values('label', ascending=False)
    result.to_csv(rpath1, index=False)
    result['CONS_NO'].to_csv(rpath, index=False, header=False)
    if model is not False:
        model.save_model(mpath)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#-- Author: TNT_000 by Abner yang
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import average_precision_score


#-- print the average-precision score of a prediction
def evalerror(predict, true):
    print(average_precision_score(true, predict, average='macro', sample_weight=None))


#-- MAP eval function: mean of the precision at the rank of each positive
def map_eval(true, predict):
    result = pd.DataFrame({'true': true, 'predict': predict})
    result = result.sort_values('predict', ascending=False)
    score = []
    num = 0
    total = 0
    for line in result['true'].values.T:
        total += 1
        if line == 1:
            num += 1
            score.append(float(num) / total)
    mapScore = np.mean(score)
    print(mapScore)
    return mapScore
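
#-- A quick hand check of map_eval on toy values: predictions [0.9, 0.8, 0.7]
#-- with labels [1, 0, 1] rank the labels as 1, 0, 1, so the precisions at
#-- the two positives are 1/1 and 2/3 and MAP = (1 + 2/3) / 2 = 5/6:
#--   map_eval(np.array([1, 0, 1]), np.array([0.9, 0.8, 0.7]))   # ~0.8333
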
#-- xgboost local train/test model frame
def xgbLocalModel(trainFeature, testFeature, trainLabel, testLabel, params, rounds):
    #-- weight the positive class by the negative/positive ratio
    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])

    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)
    dtest = xgb.DMatrix(testFeature, label=testLabel)

    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    print('run local: round: ' + str(rounds))
    model = xgb.train(params, dtrain, rounds, evals=watchlist, verbose_eval=20)  #, feval=evalerror)

    predict = model.predict(dtest)

    return predict

#-- xgboost cross-validation model frame
def xgbCVModel(trainFeature, trainLabel, rounds, folds, params):
    #-- set scale_pos_weight from the class ratio
    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])

    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)

    #-- run cross-validation
    print('run cv: round: ' + str(rounds) + ' folds: ' + str(folds))
    res = xgb.cv(params, dtrain, rounds, nfold=folds, verbose_eval=20)
    return res

#-- xgboost online prediction model frame
def xgbPredictModel(trainFeature, trainLabel, testFeature, params, rounds):
    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)
    dtest = xgb.DMatrix(testFeature, label=np.zeros(testFeature.shape[0]))

    watchlist = [(dtest, 'eval'), (dtrain, 'train')]

    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])

    model = xgb.train(params, dtrain, rounds, evals=watchlist, verbose_eval=100)

    #-- dump the feature importances for inspection
    importance = pd.DataFrame(list(model.get_fscore().items()),
                              columns=['feature', 'importance']).sort_values('importance', ascending=False)
    importance.to_csv('../importance/im.csv', index=False)

    predict = model.predict(dtest)

    return model, predict

--------------------------------------------------------------------------------