├── README.md
└── main.py

/README.md:
--------------------------------------------------------------------------------
# subway_traffic_forecast-tianchi

Open-sourced by a newcomer -- guidance from more experienced competitors is very welcome.

Tianchi Global Urban Computing AI Challenge: metro passenger flow forecasting.
We ranked 22/2319 on leaderboard A, and this repository contains our leaderboard-A code. If you find it useful, please star the repo -- thanks!

Teammates: buger, taoberica, selina雪. Thanks to 鱼佬 for the baseline; parts of the leaderboard-A code are adapted from his open-source code. Since we did not reach the finals, the knockout-round code will not be released.

Dataset download:

Link: https://pan.baidu.com/s/1iLHomv5NRodB_3jr7FcFow  Access code: arse

Competition link: https://tianchi.aliyun.com/competition/entrance/231708/introduction?spm=5176.12281957.1004.5.38b04c2alLBS7L

Some ideas we did not have time to validate -- feel free to try them (a sketch for idea d follows this list):

a. Switch from 10-minute to 5-minute time slots, which effectively increases the amount of training data.

b. Drop the first three days after shifting, since the shift fills them with many zeros.

c. Besides shifting the three most recent days, try shifting the two most recent days plus the same weekday of the previous week.

d. We also tried a LightGBM model at the start; it scored slightly worse than XGBoost. The two could be combined by blending.

Questions and discussion are welcome. QQ: 1796320597
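A minimal blending sketch for idea (d), assuming `train`, `y_train`, and `test` frames built as in `main.py`. The function name, the parameter dictionaries, and the 0.6 weight are illustrative, not settings we actually used:

```python
import xgboost as xgb
import lightgbm as lgb

def blend_xgb_lgb(train, y_train, test, xgb_params, lgb_params, w=0.6):
    """Weighted average of XGBoost and LightGBM predictions (simple blending)."""
    xgb_model = xgb.train(xgb_params, xgb.DMatrix(train, y_train), num_boost_round=1000)
    lgb_model = lgb.train(lgb_params, lgb.Dataset(train, y_train), num_boost_round=1000)
    # w is the XGBoost weight; tune it on a held-out day (e.g. day 28).
    return w * xgb_model.predict(xgb.DMatrix(test)) + (1 - w) * lgb_model.predict(test)
```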
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 22 15:38:01 2019

@author: dell
"""
###############################################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, RidgeCV
from scipy.stats import norm, rankdata
import warnings
import gc
import os
import time
import sys
import datetime

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

plt.style.use('seaborn')
sns.set(font_scale=1)

path = '/Users/dell/Desktop/tianchi_AI'
test = pd.read_csv(path + '/Metro_testA/testA_submit_2019-01-29.csv')
test_28 = pd.read_csv(path + '/Metro_testA/testA_record_2019-01-28.csv')
station_con = pd.read_csv('Metro_roadMap.csv')

# Count how many stations each station connects to (its degree in the road map).
# Optional feature; it did not improve our score, though perhaps we did not use
# it the right way.
del station_con['Unnamed: 0']
station_con_sum = pd.DataFrame()
station_con_sum['station_con_sum'] = np.sum(station_con, axis=0)
station_con_sum['stationID'] = np.arange(81)

# Despite the name, this computes MAE, which is the competition metric.
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

def get_base_features(df_, test, time_str):
    df = df_.copy()
    # Map record timestamps onto the test day (01-29) and truncate to
    # 10-minute slots so they align with startTime in the test frame.
    df['startTime'] = df['time'].apply(lambda x: x[:15].replace(time_str, '-01-29') + '0:00')

    # status is 1 for an entry and 0 for an exit, so per slot and station:
    # sum = entries, count - sum = exits.
    df = df.groupby(['startTime', 'stationID']).status.agg(['count', 'sum']).reset_index()
    df = test.merge(df, 'left', ['stationID', 'startTime'])

    # Map the timestamps back to the original day.
    df['time'] = df['startTime'].apply(lambda x: x[:15].replace('-01-29', time_str) + '0:00')
    del df['startTime'], df['endTime']

    # base time features
    df['day'] = df['time'].apply(lambda x: int(x[8:10]))
    df['week'] = pd.to_datetime(df['time']).dt.dayofweek + 1
    #df['weekend'] = (pd.to_datetime(df.time).dt.weekday >= 5).astype(int)
    df['hour'] = df['time'].apply(lambda x: int(x[11:13]))
    df['minute'] = df['time'].apply(lambda x: int(x[14:15] + '0'))

    result = df.copy()

    # in/out flows
    result['inNums'] = result['sum']
    result['outNums'] = result['count'] - result['sum']

    result['day_since_first'] = result['day'] - 1

    # The rank feature gave a gain in the semifinal; we could not get it to
    # work in the preliminary, probably because we used it the wrong way.
    #result['rank'] = (result['stationID']+1)*(result['day']*144+result['hour']*6+result['minute'])
    result.fillna(0, inplace=True)
    del result['sum'], result['count']

    return result

time_str = '-01-28'
data1 = get_base_features(test_28, test, time_str)

# For the 29th the time fields are its own, but inNums/outNums are copied
# from the 28th; this lets the 29th be used directly as the test set later.
time_str = '-01-29'
df = pd.read_csv(path + '/Metro_testA/testA_record_2019-01-28.csv')
df['time'] = df['time'].apply(lambda x: x[:15].replace('-01-28', time_str) + '0:00')
df = get_base_features(df, test, time_str)
data1 = pd.concat([data1, df], axis=0, ignore_index=True)

data_list = os.listdir(path + '/Metro_train/')
for i in range(0, len(data_list)):
    if data_list[i].split('.')[-1] == 'csv':
        # e.g. 'record_2019-01-02.csv' -> '-01-02'
        time_str = data_list[i].split('.')[0][11:17]
        print(data_list[i], i)
        df = pd.read_csv(path + '/Metro_train/' + data_list[i])
        df = get_base_features(df, test, time_str)
        data1 = pd.concat([data1, df], axis=0, ignore_index=True)
    else:
        continue
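# Sketch of idea (a) from the README: build 5-minute slots instead of
# 10-minute ones, doubling the number of rows per day. Only the timestamp
# truncation in get_base_features has to change; the helper below is an
# untested illustration (downstream, a day would then be 288 slots, not 144).
'''
def to_5min_slot(t):
    # '2019-01-28 08:47:31' -> '2019-01-28 08:45:00'
    minute = int(t[14:16]) // 5 * 5
    return t[:14] + '{:02d}:00'.format(minute)
'''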
### merge in the per-station connection counts computed above (optional)
#data1 = data1.merge(station_con_sum, on=['stationID'], how='left')

### more features can be added here
def more_feature(result):
    ### per-station in-flow statistics for each (weekday, hour) pair
    tmp = result.groupby(['stationID', 'week', 'hour'], as_index=False)['inNums'].agg(
        inNums_ID_dh_max='max',
        inNums_ID_dh_min='min',
        inNums_ID_dh_mean='mean',
        inNums_ID_dh_sum='sum')
    result = result.merge(tmp, on=['stationID', 'week', 'hour'], how='left')

    ### per-station in-flow statistics for each weekday
    tmp = result.groupby(['stationID', 'week'], as_index=False)['inNums'].agg(
        inNums_ID_d_max='max',
        inNums_ID_d_min='min',    # all zeros
        inNums_ID_d_mean='mean',
        inNums_ID_d_sum='sum')
    result = result.merge(tmp, on=['stationID', 'week'], how='left')

    ### per-station in-flow statistics over all days
    tmp = result.groupby(['stationID'], as_index=False)['inNums'].agg(
        inNums_ID_max='max',
        inNums_ID_min='min',
        inNums_ID_mean='mean',
        inNums_ID_sum='sum')
    result = result.merge(tmp, on=['stationID'], how='left')

    ### per-day in-flow statistics over all stations
    tmp = result.groupby(['day'], as_index=False)['inNums'].agg(
        inNums_d_max='max',
        inNums_d_min='min',    # all zeros
        inNums_d_mean='mean',
        inNums_d_sum='sum')
    result = result.merge(tmp, on=['day'], how='left')

    ### out-flow statistics, analogous to the in-flow ones
    tmp = result.groupby(['stationID', 'week', 'hour'], as_index=False)['outNums'].agg(
        outNums_ID_dh_max='max',
        outNums_ID_dh_min='min',
        outNums_ID_dh_mean='mean',
        outNums_ID_dh_sum='sum')
    result = result.merge(tmp, on=['stationID', 'week', 'hour'], how='left')

    tmp = result.groupby(['stationID', 'week'], as_index=False)['outNums'].agg(
        outNums_ID_d_max='max',
        outNums_ID_d_min='min',    # all zeros
        outNums_ID_d_mean='mean',
        outNums_ID_d_sum='sum')
    result = result.merge(tmp, on=['stationID', 'week'], how='left')

    tmp = result.groupby(['stationID'], as_index=False)['outNums'].agg(
        outNums_ID_max='max',
        outNums_ID_min='min',
        outNums_ID_mean='mean',
        outNums_ID_sum='sum')
    result = result.merge(tmp, on=['stationID'], how='left')

    tmp = result.groupby(['day'], as_index=False)['outNums'].agg(
        outNums_d_max='max',
        outNums_d_min='min',    # all zeros
        outNums_d_mean='mean',
        outNums_d_sum='sum')
    result = result.merge(tmp, on=['day'], how='left')

    return result

data2 = more_feature(data1)

# Drop columns where a single value accounts for more than 90% of the rows.
good_cols = list(data2.columns)
for col in data2.columns:
    rate = data2[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.90:
        good_cols.remove(col)
        print(col, rate)

data2 = data2[good_cols]

'''
### Pearson correlation of the features
fea_train = data2.copy()
del fea_train['time']
fea_y = fea_train['inNums']
del fea_train['inNums']
del fea_train['outNums']
fe = pd.concat([fea_train, fea_y], axis=1, ignore_index=False)
colormap = plt.cm.viridis
plt.figure(figsize=(30, 30))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(fe.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True,
            cmap=colormap, linecolor='white', annot=True)

### feature selection with xgboost importances (not very successful)
data_mod = data2.copy()
del data_mod['time']
train = data_mod[data_mod.day < 25]
valid = data_mod[data_mod.day == 25]

from xgboost import plot_importance
y_train = train['inNums']
y_valid = valid['inNums']
del train['inNums'], valid['inNums']
del train['outNums'], valid['outNums']

xgb_params = {'eta': 0.004, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 0.8,
              'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': 1, 'nthread': 4, 'lambda': 1}
dtrain = xgb.DMatrix(train, y_train)
dtest = xgb.DMatrix(valid, y_valid)
watchlist = [(dtrain, 'train'), (dtest, 'test')]
clf = xgb.train(dtrain=dtrain, num_boost_round=20000, evals=watchlist,
                early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
plt.figure(figsize=(35, 35))
plot_importance(clf)
plt.show()
'''

### time shifts

## Move days 28 and 29 to the end so the frame is ordered by time.
data_28 = data2[data2.day == 28]
data_29 = data2[data2.day == 29]
data2 = data2[(data2.day != 28) & (data2.day != 29)]
data2 = pd.concat([data2, data_28, data_29], axis=0, ignore_index=True)
data = data2.copy()

'''
### moving average -- did not help
data['inNums'] = data['inNums'].rolling(window=2).mean()
data['outNums'] = data['outNums'].rolling(window=2).mean()
data = data.fillna(0)
data['inNums'] = np.round(data['inNums'])
data['outNums'] = np.round(data['outNums'])
'''

# Remove weekends (and day 1, the New Year holiday).
data = data[(data.day != 5) & (data.day != 6) & (data.day != 1)]
data = data[(data.day != 12) & (data.day != 13)]
data = data[(data.day != 19) & (data.day != 20)]
data = data[(data.day != 26) & (data.day != 27)]
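# An alternative to the hard-coded day numbers above: since week holds
# dayofweek+1 (Mon=1 ... Sun=7), weekends can be filtered directly; day 1 (the
# New Year holiday) still has to be dropped by hand. Untested equivalent:
'''
data = data[(data.week < 6) & (data.day != 1)]
'''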
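# Sketch of idea (c) from the README, as a variant of the shift below: lag the
# two most recent days plus the same weekday of the previous week. With
# weekends removed, five remaining days separate a day from the same weekday a
# week earlier, so that lag is 144*5 slots. Untested vectorized form, assuming
# the frame is sorted by time within each station:
'''
for lag in [144, 144 * 2, 144 * 5]:
    data['inNums_lag_{}'.format(lag)] = data.groupby('stationID')['inNums'].shift(lag)
    data['outNums_lag_{}'.format(lag)] = data.groupby('stationID')['outNums'].shift(lag)
'''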
### Shift the features in time. A day is 144 slots (24 hours x 6 ten-minute
### slots per hour); we take the three most recent days and their derived features.
def time_shift(data_in_sta, data_in_shfit_cols, data_out_shfit_cols):
    lag_start = 144
    lag_end = 144 * 3
    data_out_sta = data_in_sta.copy()
    for i in range(lag_start, lag_end + 1, 144):
        for col in data_in_shfit_cols:
            data_in_sta[col + "_lag_{}".format(i)] = data_in_sta[col].shift(i)
            # After the last lag, drop the original column except the targets.
            if (col != 'inNums') & (col != 'outNums') & (i == lag_end):
                del data_in_sta[col]
        for col1 in data_out_shfit_cols:
            data_out_sta[col1 + "_lag_{}".format(i)] = data_out_sta[col1].shift(i)
            if (col1 != 'inNums') & (col1 != 'outNums') & (i == lag_end):
                del data_out_sta[col1]

    return data_in_sta, data_out_sta

### Only inNums/outNums and their derived statistics are shifted,
### so exclude the remaining time/id features first.
data_in_shfit = pd.DataFrame()
data_out_shfit = pd.DataFrame()

data_in_shfit_cols = list(data)
data_in_shfit_cols.remove('stationID')
data_in_shfit_cols.remove('time')
data_in_shfit_cols.remove('day')
data_in_shfit_cols.remove('week')
#data_in_shfit_cols.remove('weekend')
data_in_shfit_cols.remove('hour')
data_in_shfit_cols.remove('minute')
data_in_shfit_cols.remove('day_since_first')

data_out_shfit_cols = list(data)
data_out_shfit_cols.remove('stationID')
data_out_shfit_cols.remove('time')
data_out_shfit_cols.remove('day')
data_out_shfit_cols.remove('week')
#data_out_shfit_cols.remove('weekend')
data_out_shfit_cols.remove('hour')
data_out_shfit_cols.remove('minute')
data_out_shfit_cols.remove('day_since_first')

### Apply the shift station by station.
for i in range(81):
    data_temp = data[data['stationID'] == i].copy()
    data_in_sta, data_out_sta = time_shift(data_temp, data_in_shfit_cols, data_out_shfit_cols)
    data_in_shfit = pd.concat([data_in_shfit, data_in_sta], axis=0, ignore_index=True)
    data_out_shfit = pd.concat([data_out_shfit, data_out_sta], axis=0, ignore_index=True)

###############################################
############################################### inNums

data_in_shfit_temp = data_in_shfit.copy()

del data_in_shfit_temp['time']
data_in_shfit_temp.fillna(0, inplace=True)

### Walk-forward time-series validation: e.g. with day 23 as the validation
### day, everything before the 23rd is the training set and the 24th is the
### test set, and so on. This felt reliable -- offline gains generally carried
### over online. Take special care to avoid data leakage.
test_day = [23, 24, 25, 28]
error_in = []
for i in test_day:

    if (i != 28) & (i != 25):
        test = data_in_shfit_temp[data_in_shfit_temp.day == i + 1]
        y_test = test['inNums']
        del test['inNums']
        del test['outNums']

    # Days 26/27 are weekends, so the test set for day 25 is day 28;
    # for i == 28 the day-28 test frame from the previous iteration is reused.
    if i == 25:
        test = data_in_shfit_temp[data_in_shfit_temp.day == i + 3]
        y_test = test['inNums']
        del test['inNums']
        del test['outNums']

    print('############################### inNums validation day', i)

    train = data_in_shfit_temp[data_in_shfit_temp.day