├── betaencoder.py ├── README.md ├── other.py └── main.py /betaencoder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: zhushuai 3 | # @Date: 2019-04-02 12:02:01 4 | # @Last Modified by: zhushuai 5 | # @Last Modified time: 2019-04-02 12:03:41 6 | # 定义目标编码函数 7 | 8 | class BetaEncoder(object): 9 | 10 | def __init__(self, group): 11 | 12 | self.group = group 13 | self.stats = None 14 | 15 | # get counts from df 16 | def fit(self, df, target_col): 17 | self.prior_mean = np.mean(df[target_col]) 18 | stats = df[[target_col, self.group]].groupby(self.group) 19 | stats = stats.agg(['sum', 'count'])[target_col] 20 | stats.rename(columns={'sum': 'n', 'count': 'N'}, inplace=True) 21 | stats.reset_index(level=0, inplace=True) 22 | self.stats = stats 23 | 24 | # extract posterior statistics 25 | def transform(self, df, stat_type, N_min=1): 26 | 27 | df_stats = pd.merge(df[[self.group]], self.stats, how='left') 28 | n = df_stats['n'].copy() 29 | N = df_stats['N'].copy() 30 | 31 | # fill in missing 32 | nan_indexs = np.isnan(n) 33 | n[nan_indexs] = self.prior_mean 34 | N[nan_indexs] = 1.0 35 | 36 | # prior parameters 37 | N_prior = np.maximum(N_min - N, 0) 38 | alpha_prior = self.prior_mean * N_prior 39 | beta_prior = (1 - self.prior_mean) * N_prior 40 | 41 | # posterior parameters 42 | alpha = alpha_prior + n 43 | beta = beta_prior + N - n 44 | 45 | # calculate statistics 46 | if stat_type == 'mean': 47 | num = alpha 48 | dem = alpha + beta 49 | 50 | elif stat_type == 'mode': 51 | num = alpha - 1 52 | dem = alpha + beta - 2 53 | 54 | elif stat_type == 'median': 55 | num = alpha - 1 / 3 56 | dem = alpha + beta - 2 / 3 57 | 58 | elif stat_type == 'var': 59 | num = alpha * beta 60 | dem = (alpha + beta) ** 2 * (alpha + beta + 1) 61 | 62 | elif stat_type == 'skewness': 63 | num = 2 * (beta - alpha) * np.sqrt(alpha + beta + 1) 64 | dem = (alpha + beta + 2) * np.sqrt(alpha * beta) 65 | 66 | elif stat_type == 'kurtosis': 67 | num = 6 * (alpha - beta) ** 2 * (alpha + beta + 1) - alpha * beta * (alpha + beta + 2) 68 | dem = alpha * beta * (alpha + beta + 2) * (alpha + beta + 3) 69 | 70 | else: 71 | num = self.prior_mean 72 | dem = np.ones_like(N_prior) 73 | 74 | # replace missing 75 | value = num / dem 76 | value[np.isnan(value)] = np.nanmedian(value) 77 | return value -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 地铁乘客流量预测 2 | 3 | 4 | Table of Contents 5 | ================= 6 | 7 | * [地铁乘客流量预测](#title) 8 | * [1. 赛题分析和前期思路](#1) 9 | * [1.1 数据清洗](#1.1) 10 | * [1.2 特征工程](#1.2) 11 | * [1.3 划分训练集和测试集](#1.3) 12 | * [1.4 搭建模型预测](#1.4) 13 | * [2. 其他的一些想法](#2) 14 | * [3. 总结与思考](#3) 15 | 16 | 17 | > **竞赛题目** 18 | 19 | 通过分析地铁站的历史刷卡数据，预测站点未来的客流量变化。开放了20190101至20190125共25天的刷卡记录，共涉及3条线路81个地铁站约7000万条数据作为训练数据`Metro_train.zip`，供选手搭建地铁站点乘客流量预测模型。同时大赛提供了路网地图，即各个地铁站之间的连接关系表，存储在文件`Metro_roadMap.csv`文件中。 20 | 21 | 测试阶段，提供某天所有线路所有站点的刷卡记录数据，预测未来一天00时至24时以10分钟为单位的各时段各站点的进站和出站人次。 22 | 23 | 测试集A集上，提供2019年1月28日的刷卡数据`testA_record_2019-01-28.csv`，选手需对2019年1月29日全天各地铁站以10分钟为单位的人流量进行预测。 24 | 25 | 评估指标采用**平均绝对误差`Mean Absolute Error, MAE`**，分别对入站人数和出站人数预测结果进行评估，然后在对两者取平均，得到最终评分。 26 | 27 | 关于数据的具体描述以及说明[详见天池官网]()。 28 | 29 | 30 | 31 | # 1. 赛题分析和前期思路 32 | 33 | 比赛提供了1号到25号共25天的刷卡记录数据，所以第一步就是对每一天的文件进行处理。原始数据集中包含了`time`, `lineID`, `stationID`, `deviceID`, `status`, `userID`, `payType`这几个列，根据题目要求要预测进站和出站的人流量，所以要先统计出每一天的进站和出站流量。 34 | 35 | ## 1.1 数据清洗 36 | 37 | > **提取基础信息** 38 | 39 | 首先对时间信息进行处理，提取出日、周、时、分、秒的信息，由于是按照10分钟为间隔统计，所以在提取分钟信息的时候只需要取整十。接着总计80个站点(除去缺失数据的54站)，每个站点从0点到24点，以10分钟为一次单位，总计144段时间间隔。根据站点、日、时、分进行分组统计，得到每个时段的进站人数和出站人数。 40 | 41 | 经过第一轮处理之后，得到了每一天的文件包含的`columns`有：`stationID`, `weekday`, `is_holiday`, `day`, `hour`, `minute`, `time_cut`, `inNums`以及`outNums`这几列。 42 | 43 | 增加了一些和刷卡设备相关的特征，包括`nuni_deveiceID_of_stationID`, `nuni_deviceID_of_stationID_hour`, `nuni_deviceID_of_stationID_hour_minute`。 44 | 45 | 46 | 47 | ## 1.2 特征工程 48 | 49 | > **增加同一站点相邻时间段的进出站流量信息** 50 | 51 | 考虑到当前时刻的流量信息与前后时刻的流量信息存在一定的关系，所以将当前时间段的前两个时段以及后两个时段流量信息作为特征。 52 | 53 | 增加的特征包括`inNums_before1`, `inNums_before2`, `inNums_after1`, `inNums_after2`, `outNums_before1`, `outNums_before2`, `outNums_after1`, `outNums_after2`。 54 | 55 | 56 | 57 | > **增加乘车高峰时段相关的特征** 58 | 59 | 根据杭州地铁的运营时段信息，将高峰时段分为四类，0表示非运营时间段，1表示非高峰时间段，2表示高峰时间段，3表示特殊高峰时间段。由于周末和非周末的高峰时间存在一定的差异，所以需要分别计算。 60 | 61 | 增加了特征`peak_type`。 62 | 63 | 64 | 65 | > **增加同周次的进出站流量信息** 66 | 67 | 均值、最大值以及最小值在一定程度上反映了数据的分布信息，所以增加同周次进站流量和出站流量的均值、最大值以及最小值作为特征。 68 | 69 | 增加的特征包括`inNums_whm_max`, `inNums_whm_min`, `inNums_whm_mean`, `outNums_whm_max`, `outNums_whm_min`, `outNums_whm_mean`, `inNums_wh_max`, `inNums_wh_min`, `inNums_wh_mean`, `outNums_wh_mean`。 70 | 71 | 72 | 73 | > **增加线路信息** 74 | 75 | 根据某一天的刷卡记录表统计每条线路和站点的对应信息，并计算各条线路的站点数，用站点数量代表该站的线路信息。 76 | 77 | 增加特征`line`。 78 | 79 | 80 | 81 | > **增加站点的类型信息** 82 | 83 | 不同的站点属于不同的类型，比如起点站、终点站、换乘站、普通站等，而这些站点的类别信息可以通过邻站点的数量表示，所以根据路网图对邻站点的数量进行统计，表示各个站点的类别。 84 | 85 | 增加特征`station_type`。 86 | 87 | 88 | 89 | > **增加特殊站点的标记** 90 | 91 | 对站点流量的分析过程中，发现第15站的流量与其他站点存在明显的区别，全天都处于高峰状态，因此给15站添加特别的标记。 92 | 93 | 增加特征`is_special`。 94 | 95 | 96 | 97 | > **连接训练特征和目标值** 98 | 99 | 本次建模的思想使用前一天的流量特征和时间特征，以及预测当天的时间特征，来预测进站流量和出站流量。所以要对之前处理好的数据集进行拼接。 100 | 101 | 增加新的特征`yesterday_is_holiday`以及`today_is_holiday`，增加目标值列`inNums`和`outNums`。 102 | 103 | 104 | 105 | > **对时间间隔进行目标编码** 106 | 107 | 考虑时间间隔信息与进站、出站流量的相关性，对时间间隔信息针对`inNums`和`outNums`进行目标编码。(这一步并非必须的，一定程度上可能会导致过拟合，所以可以考虑加入和不加入的情况都测试一下) 108 | 109 | 目标编码后得到了`in_time_cut`和`out_time_cut`。 110 | 111 | 112 | 113 | ## 1.3 划分训练集和测试集 114 | 115 | 根据上面数据清洗以及特征工程得到的结果对数据集进行划分。 116 | 117 | 118 | 119 | ## 1.4 搭建模型预测 120 | 121 | 我们使用了LightGBM和CatBoost两个模型预测并取其均值，其实也可以尝试加入XGBoost，然后取3个模型的加权平均，但是我们当时训练时发现XGBoost得到的结果不是很好，所以直接丢掉了。其实，通过加权平均，给XGBoost的结果一个比较好的权重，也有可能会得到比较不错的结果。最后，**对模型结果平均**。 122 | 123 | 124 | 125 | # 2. 其他的一些想法 126 | 127 | (1) 由于官方给了路网图，所以我们尝试将路网图拼接在特征后面，表示各个站点之间的连接关系，但是这样反而降低了模型最终的性能。 128 | 129 |
130 | 131 | (2) 过程中我们一直想充分利用邻站点的信息，想在邻站点上提取尽可能多的特征，包括邻站点相同时刻以及相邻时刻的流量信息，但是这样都会降低模型的性能，也在这上面浪费了不少的时间。 132 | 133 |
134 | 135 | (3) 除了从以后的数据中提取特征之外，我们还对提出出来的特征做了一些特征工程，包括计算一些可能有一定关联的特征**计算加减以及比率信息**，但是这些工作都没有能够提升模型的性能。 136 | 137 |
138 | 139 | (4) 根据官方交流群的讨论，我们在A榜的时候尝试去掉周末的信息，只讲工作日的信息进行提取和拼接，这在一定程度上提升了模型的效果，后来我们在[鱼的代码]()基础上加入了我们之前找到的一些特征。之后，仔细推敲了一下鱼的代码，发现在进行特征`feature`和目标值`target`拼接的时候，和`deviceID`相关的特征使用的是预测当天的，这样一方面会导致leak，另一方面就是最后的测试集这些特征都是`nan`值，也就是说在最终的预测中没有起到作用。于是，我们改变了拼接的方法，再次加入了自己提取的特征，最终在A榜跑出的成绩是`12.99`。注意，**这里使用的数据并不是全部的数据，而只是使用了工作日的数据拼接**。 140 | 141 |
142 | 143 | 在B榜的时候，我们还是希望能够训练一个通用的模型，所以这次把所有的数据放在一起训练，并没有将周末的数据单独提取出来，但是在滑窗的时候使用了`13`和`20`两天的数据作为验证，最终跑出了`12.57`的成绩，说明这种想法是可行的。为了验证一下只提取周末信息的效果，我们也尝试把周末的信息单独提取出来，最后得分一直在`14`以上，可能也是因为我们提的特征不太适用于这种场景。 144 | 145 |
146 | 147 | 最终，我们使用了全部的数据通过Stacking的方法进行了训练，将三个梯度提升树模型进行了堆叠，最后得到的结果也是`14`多一点。 148 | 149 | 150 | 151 | # 3. 总结与思考 152 | 153 | (1) 首先是对数据的EDA做的不够，包括对各个站点的分析，各个时间段综合分析，对特征重要性的分析等等。主要还是因为经验不够，不知道该怎么做，甚至15站点的特殊性也是从交流群里得到的信息。另外，就是调参做的有问题，反而把模型的性能调低了，说明对参数的理解不够。 154 | 155 |
156 | 157 | (2) 第一次团队作战，不知道该怎么协作，分工不是很明确，所以效率不是很高，没能有机会尝试更多的模型。代码写的不规范，导致后面修改的时候浪费了比较多的时间，包括整理也花了不少的时间。 158 | 159 |
160 | 161 | (3) 知道的模型太少，只使用了梯度提升树模型，其实还有很多可能有效的模型可以尝试，包括图神经网络，时空模型以及LSTM，但是都因为不够熟悉而无从入手。 162 | 163 |
164 | 165 | (4) 总结： 166 | 167 | - 了解自己能做的事情，明确分工； 168 | - 做好EDA，做到对数据的充分理解； 169 | - 代码书写规范，每一个功能模块应该定义为一个函数； 170 | - 熟悉不同模型的功能以及试用场景。 171 | 172 | -------------------------------------------------------------------------------- /other.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: zhushuai 3 | # @Date: 2019-04-02 13:08:20 4 | # @Last Modified by: zhushuai 5 | # @Last Modified time: 2019-04-02 13:11:13 6 | 7 | import xgboost as xgb 8 | from sklearn.metrics import mean_absolute_error 9 | from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone 10 | 11 | # 定义XGB模型 12 | 13 | # 设置模型参数 14 | xgb_params = { 15 | 'booster': 'gbtree', 16 | 'objective': 'reg:linear', 17 | 18 | 'eval_metric': 'mae', 19 | 20 | 'learning_rate': 0.0894, 21 | 'max_depth': 9, 22 | 'max_leaves': 20, 23 | 24 | 'lambda': 2, 25 | 'alpha': 1, 26 | 'subsample': 0.8, 27 | 'colsample_bytree': 0.8, 28 | 'silent': 1, 29 | 30 | 'gpu_id': 0, 31 | 'tree_method': 'gpu_hist' 32 | } 33 | 34 | ## 预测进站流量 35 | in_xgb_pred = np.zeros(len(X_test)) 36 | X_data = train_data[features].values 37 | y_data = train_data['inNums'].values 38 | 39 | 40 | for i, date in enumerate(slip): 41 | train = train_data[train_data.day=6 ).astype(int) 50 | train_df['hour'] = train_df['time'].dt.hour 51 | date = str(train_df.month.values[0])+'-'+str(train_df.day.values[0]) 52 | if date in w2h or (date not in h2w and train_df.weekday.values[0] in [6,7]): 53 | train_df['is_holiday'] = 1 54 | else: 55 | train_df['is_holiday'] = 0 56 | 57 | train_final = train_df.groupby(['stationID', 'weekday', 'is_holiday', 'day', 'hour', 'minute']).status.agg(['count', 'sum']).reset_index() 58 | # 这一段参考yu的代码 59 | # 考虑和刷卡设备相关的特征 60 | # 每个站点的刷卡设备数量与该站点的人流量存在一定的关系 61 | # 每个时段的刷卡设备数与流量也存在一定的关系 62 | tmp = train_df.groupby(['stationID'])['deviceID'].nunique().reset_index(name='nuni_deviceID_of_stationID') 63 | train_final = train_final.merge(tmp, on=['stationID'], how='left') 64 | tmp = train_df.groupby(['stationID','hour'])['deviceID'].nunique().reset_index(name='nuni_deviceID_of_stationID_hour') 65 | train_final = train_final.merge(tmp, on=['stationID', 'hour'], how='left') 66 | tmp = train_df.groupby(['stationID','hour','minute'])['deviceID'].nunique().\ 67 | reset_index(name='nuni_deviceID_of_stationID_hour_minute') 68 | train_final = train_final.merge(tmp, on=['stationID','hour','minute'], how='left') 69 | 70 | train_final['time_cut'] = train_final['hour'] * 6 + train_final['minute'] // 10 71 | train_final['inNums'] = train_final['sum'] 72 | train_final['outNums'] = train_final['count'] - train_final['sum'] 73 | del train_final['sum'], train_final['count'] 74 | 75 | return train_final 76 | 77 | # 特征工程 78 | 79 | ## 增加同站点相邻时间段的信息 80 | def add_neighbor_time(df_): 81 | train_df = df_.copy() 82 | # 生成一个用于中间转换的DataFrame 83 | train_now = train_df[['stationID', 'day', 'time_cut', 'inNums', 'outNums']] 84 | 85 | train_df.rename(columns={'inNums': 'inNums_now', 'outNums': 'outNums_now'}, inplace=True) 86 | 87 | # 考虑前多少个时间段，默认考虑前两个时间段 88 | for i in range(2, 0, -1): 89 | train_before = train_now.copy() 90 | train_before['time_cut'] = train_before['time_cut'] + i 91 | train_df = train_df.merge(train_before, how='left', on = ["stationID","day", "time_cut"]) 92 | train_df.rename(columns = {'inNums': f'inNums_before{i}', 'outNums': f'outNums_before{i}'}, inplace=True) 93 | train_df.fillna(0, inplace=True) 94 | 95 | # 考虑后多少个时间段，默认考虑前两个时间段 96 | for j in range(2, 0, -1): 97 | train_after = train_now.copy() 98 | train_after['time_cut'] = train_after['time_cut'] - j 99 | train_df = train_df.merge(train_after, how='left', on = ["stationID", "day", "time_cut"]) 100 | train_df.rename(columns = {'inNums': f'inNums_after{j}', 'outNums': f'outNums_after{j}'}, inplace=True) 101 | train_df.fillna(0, inplace=True) 102 | 103 | return train_df 104 | 105 | ## 增加乘车高峰时段相关的特征 106 | 107 | def add_peak_type(df_): 108 | 109 | train_df = df_.copy() 110 | 111 | ### 7:00-9:00、17:00-19:00为高峰时段，7:30-8:30、17:30-18:30为特殊高峰 112 | w_time = ['6:00', '7:00', '7:30', '8:30', '9:00', '17:00', '17:30', '18:30', '19:00', '0:00'] 113 | 114 | w_time_cut = [10*pd.to_datetime(ti).hour+pd.to_datetime(ti).minute//10 for ti in w_time] 115 | 116 | # 节假日的高峰时间段 117 | h_time = ['6:00','7:00', '9:00', '12:00', '15:00', '18:00', '20:00', '0:00'] 118 | 119 | h_time_cut = [10*pd.to_datetime(ti).hour+pd.to_datetime(ti).minute//10 for ti in h_time] 120 | 121 | # 每个时间点对应的time_cut 122 | temp_array = {'temp': [i*10+j for i in range(24) for j in range(6)], 'time_cut': list(range(0, 144))} 123 | 124 | # 工作日 125 | ## 早高峰的开始和结束时间 126 | ### 运营开始时间 127 | w_start = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[0])] 128 | 129 | mp_start_1 = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[1])] 130 | mp_end_1 = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[2])] 131 | 132 | mp_start_2 = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[3])] 133 | mp_end_2 = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[4])] 134 | 135 | ## 早特殊高峰开始结束时间 136 | msp_start = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[2])] 137 | msp_end = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[3])] 138 | 139 | ## 晚高峰开始和结束时间 140 | ap_start_1 = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[5])] 141 | ap_end_1 = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[6])] 142 | 143 | ap_start_2 = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[7])] 144 | ap_end_2 = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[8])] 145 | 146 | ## 晚特殊高峰开始结束时间 147 | asp_start = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[6])] 148 | asp_end = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[7])] 149 | 150 | w_end = temp_array['time_cut'][temp_array['temp'].index(w_time_cut[9])] 151 | 152 | ## 工作日peak类型映射表 153 | ### 0表示停运期间，1表示运行开始到高峰前和高峰后到运营结束，2表示高峰时间，3表示特殊高峰时间 154 | 155 | w_peak = {0: list(range(w_end, w_start)), 156 | 2: list(range(mp_start_1, mp_end_1)) 157 | + list(range(mp_start_2, mp_end_2)) 158 | + list(range(ap_start_1, ap_end_1)) 159 | + list(range(ap_start_2, ap_end_2)), 160 | 3: list(range(msp_start, msp_end)) 161 | + list(range(asp_start, asp_end))} 162 | 163 | peak_workday = {} 164 | 165 | for key, value in w_peak.items(): 166 | for t in value: 167 | peak_workday[t] = key 168 | 169 | 170 | # 节假日 171 | 172 | ## 早高峰的开始和结束时间 173 | 174 | h_start = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[0])] 175 | h_end = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[7])] 176 | 177 | h_mp_start_1 = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[1])] 178 | h_mp_end_1 = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[2])] 179 | 180 | h_mp_start_2 = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[3])] 181 | h_mp_end_2 = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[4])] 182 | 183 | ## 早特殊高峰开始结束时间 184 | h_msp_start = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[2])] 185 | h_msp_end = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[3])] 186 | 187 | ## 晚高峰开始和结束时间 188 | h_ap_start_1 = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[5])] 189 | h_ap_end_1 = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[6])] 190 | 191 | ## 晚特殊高峰开始结束时间 192 | h_asp_start = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[4])] 193 | h_asp_end = temp_array['time_cut'][temp_array['temp'].index(h_time_cut[5])] 194 | 195 | ## 节假日日peak类型映射表 196 | 197 | h_peak = {0: list(range(h_end, h_start)), 198 | 2: list(range(h_mp_start_1, h_mp_end_1)) 199 | + list(range(h_mp_start_2, h_mp_end_2)) 200 | + list(range(h_ap_start_1, h_ap_end_1)) , 201 | 3: list(range(h_msp_start, h_msp_end)) 202 | + list(range(h_asp_start, h_asp_end))} 203 | 204 | peak_holiday = {} 205 | 206 | for key, value in h_peak.items(): 207 | for t in value: 208 | peak_holiday[t] = key 209 | 210 | # 转换成用于连接的DataFrame 211 | wo_peak = pd.DataFrame({'time_cut':list(peak_workday.keys()) , 'peak_type': list(peak_workday.values()) }) 212 | wo_peak['is_holiday'] = 0 213 | 214 | ho_peak = pd.DataFrame({'time_cut': list(peak_holiday.keys()), 'peak_type': list(peak_holiday.values())}) 215 | ho_peak['is_holiday'] = 1 216 | 217 | to_peak = pd.concat([wo_peak, ho_peak], axis=0, sort=False) 218 | 219 | 220 | 221 | train_df = train_df.merge(to_peak, on=['is_holiday', 'time_cut'], how='left') 222 | train_df['peak_type'].fillna(1, inplace=True) 223 | 224 | return train_df 225 | 226 | ## 增加同周次进出站流量的信息 227 | def add_week_flow(df_): 228 | 229 | train_df = df_.copy() 230 | 231 | tmp = train_df.groupby(['stationID','weekday','hour','minute'], as_index=False)['inNums_now'].agg({ 232 | 'inNums_whm_max' : 'max', 233 | 'inNums_whm_min' : 'min', 234 | 'inNums_whm_mean' : 'mean' 235 | }) 236 | train_df = train_df.merge(tmp, on=['stationID','weekday','hour','minute'], how='left') 237 | 238 | tmp = train_df.groupby(['stationID','weekday','hour','minute'], as_index=False)['outNums_now'].agg({ 239 | 'outNums_whm_max' : 'max', 240 | 'outNums_whm_min' : 'min', 241 | 'outNums_whm_mean' : 'mean' 242 | }) 243 | train_df = train_df.merge(tmp, on=['stationID','weekday','hour','minute'], how='left') 244 | 245 | tmp = train_df.groupby(['stationID','weekday','hour'], as_index=False)['inNums_now'].agg({ 246 | 'inNums_wh_max' : 'max', 247 | 'inNums_wh_min' : 'min', 248 | 'inNums_wh_mean' : 'mean' 249 | }) 250 | train_df = train_df.merge(tmp, on=['stationID','weekday','hour'], how='left') 251 | 252 | tmp = train_df.groupby(['stationID','weekday','hour'], as_index=False)['outNums_now'].agg({ 253 | #'outNums_wh_max' : 'max', 254 | #'outNums_wh_min' : 'min', 255 | 'outNums_wh_mean' : 'mean' 256 | }) 257 | train_df = train_df.merge(tmp, on=['stationID','weekday','hour'], how='left') 258 | 259 | return train_df 260 | 261 | ## 增加线路信息 262 | def add_line(df_): 263 | def station_line(record): 264 | station_line = record[['lineID', 'stationID']] 265 | station_line = station_line.drop_duplicates().reset_index(drop=True) 266 | station_line = station_line.sort_values(by='stationID').reset_index(drop=True) 267 | return station_line 268 | 269 | train_df = df_.copy() 270 | 271 | # 这里的file表示原始文件 272 | train_files = [f for f in sorted(os.listdir(train_path)) if f.endswith("csv")] 273 | record = pd.read_csv(os.path.join(train_path, train_files[0])) 274 | line = station_line(record) 275 | line_pad = pd.DataFrame(Counter(line['lineID']), index=line.lineID.unique()) 276 | line_ID = (line_pad.T)['B'].reset_index().rename(columns={'index': 'lineID', 'B': 'line'}) 277 | line = line.merge(line_ID, on='lineID', how='left') 278 | line.drop('lineID', axis=1, inplace=True) 279 | train_df = train_df.merge(line, on=['stationID'], how='left') 280 | return train_df 281 | 282 | ## 增加车站类型信息 283 | def add_station_type(df_): 284 | 285 | def get_map(roadmap): 286 | roadmap.rename(columns={"Unnamed: 0": 'stationID'}, inplace=True) 287 | tmp = roadmap.drop(['stationID'], axis=1) 288 | roadmap['station_type'] = tmp.sum(axis=1) 289 | return roadmap[['stationID', 'station_type']] 290 | 291 | 292 | roadmap = pd.read_csv(roadfile) 293 | map_pad = get_map(roadmap) 294 | 295 | train_df = df_.copy() 296 | train_df = train_df.merge(map_pad, on=['stationID'], how='left') 297 | return train_df 298 | 299 | ## 增加特征站点标记 300 | 301 | def add_special_station(df_): 302 | train_df = df_.copy() 303 | 304 | train_df['is_special'] = np.nan 305 | train_df.loc[train_df['stationID']==15, 'is_special'] = 1 306 | train_df['is_special'].fillna(0, inplace=True) 307 | return train_df 308 | 309 | 310 | ## 连接训练特征和目标值 311 | # 定义增加周末信息的函数 312 | def add_is_holiday(test): 313 | date = str(test.month.values[0])+'-'+str(test.day.values[0]) 314 | if date in w2h or (date not in h2w and test.weekday.values[0] in [6,7]): 315 | test['is_holiday'] = 1 316 | else: 317 | test['is_holiday'] = 0 318 | return test 319 | 320 | 321 | # 获取最终的测试日期数据 322 | def read_test(test): 323 | test['weekday'] = pd.to_datetime(test['startTime']).dt.dayofweek + 1 324 | #test['weekend'] = (pd.to_datetime(test.startTime).dt.weekday >=5).astype(int) 325 | test['month'] = pd.to_datetime(test['startTime']).dt.month 326 | test['day'] = test['startTime'].apply(lambda x: int(x[8:10])) 327 | test['hour'] = test['startTime'].apply(lambda x: int(x[11:13])) 328 | test['minute'] = test['startTime'].apply(lambda x: int(x[14:15]+'0')) 329 | test = test.drop(['startTime', 'endTime'], axis=1) 330 | 331 | test = add_is_holiday(test) 332 | test['time_cut'] = test['hour'] * 6 + test['minute'] // 10 333 | test.drop(['inNums', 'outNums', 'month'], axis=1, inplace=True) 334 | return test 335 | 336 | # 用于对weekday的信息进行修正 337 | def fix_weekday(w): 338 | if w == 7: 339 | return 1 340 | else: 341 | return w+1 342 | 343 | # 合并得到训练集和测试集的数据 344 | def merge_train_test(df_): 345 | train_df = df_.copy() 346 | 347 | # 读取最后要提交的数据 348 | test = pd.read_csv(os.path.join(test_path, sub_file)) 349 | test_df = read_test(test) 350 | all_data = pd.concat([train_df, test_df], axis=0, sort=False) 351 | 352 | # 将当天对应的节假日信息取出备用 353 | th = all_data[['day', 'is_holiday']].drop_duplicates().sort_values(by=['day']).reset_index(drop=True).rename(columns={'is_holiday': 'today_is_holiday'}) 354 | # 提取用于合并为target的部分 355 | train_target = all_data[['stationID','day', 'hour','minute', 'time_cut', 'inNums_now', 'outNums_now']] 356 | 357 | train_target.rename(columns={'inNums_now': 'inNums', 'outNums_now': 'outNums'}, inplace=True) 358 | # 将所有数据的节假日信息名称改为前一天的节假日信息 359 | all_data.rename(columns={'is_holiday': 'yesterday_is_holiday'}, inplace=True) 360 | # 为了之后合并，将day的特征加1 361 | all_data['day'] += 1 362 | # 对周的信息进行修正 363 | all_data['weekday'] = all_data['weekday'].apply(fix_weekday) 364 | 365 | # 需要将训练集和测试集单独合并 366 | 367 | train_df = all_data[(all_data.day != 29) & (all_data.day != 26) & (all_data.day != 30)] 368 | 369 | test_df = all_data[all_data.day == 29] 370 | 371 | # 首先对生成训练集数据 372 | train_df = train_df.merge(train_target, on=['stationID', 'day', 'hour', 'minute', 'time_cut'], how='left') 373 | ## 对预测目标值的缺失值补0 374 | train_df['inNums'].fillna(0, inplace=True) 375 | train_df['outNums'].fillna(0, inplace=True) 376 | 377 | ## 补充当天是否周末的信息 378 | train_df = train_df.merge(th, on='day', how='left') 379 | 380 | # 生成测试集的数据 381 | test_target = train_target[train_target.day == 29] 382 | # 对测试值进行连接 383 | test_df = test_df.merge(test_target, on=['stationID', 'day', 'hour', 'minute', 'time_cut'], how='outer') 384 | test_df = test_df.sort_values(by=['stationID', 'hour', 'minute']).reset_index(drop=True) 385 | use_fe = ['stationID', 'weekday', 'yesterday_is_holiday', 'nuni_deviceID_of_stationID', 'line', 'station_type', 'is_special'] 386 | test_merge = test_df[use_fe].drop_duplicates().dropna() 387 | test_df = test_df.drop(use_fe[1:], axis=1) 388 | test_df = test_df.merge(test_merge, on=['stationID'], how='left') 389 | 390 | test_df = test_df.merge(th, on='day', how='left') 391 | test_df.fillna(0, inplace=True) 392 | all_data = pd.concat([train_df, test_df], axis=0, sort=False) 393 | return all_data, train_df 394 | 395 | # 考虑进出站流量和时间段密切相关 396 | # 针对time_cut进行目标编码 397 | def target_encoding(all_data, train_df): 398 | # 设置参数 399 | N_min = 300 400 | fe = 'time_cut' 401 | 402 | # 针对进站流量的目标编码 403 | te_in = BetaEncoder(fe) 404 | te_in.fit(train_df, 'inNums') 405 | all_data['in_time_cut'] = te_in.transform(all_data, 'mean', N_min=N_min) 406 | 407 | # 针对出站流量的目标编码 408 | te_out = BetaEncoder(fe) 409 | te_out.fit(train_df, 'outNums') 410 | all_data['out_time_cut'] = te_out.transform(all_data, 'mean', N_min = N_min) 411 | 412 | return all_data 413 | 414 | 415 | def train(all_data): 416 | # 设置需要使用的特征 417 | features = [f for f in all_data.columns if f not in ['inNums', 'outNums', 'time_cut']] 418 | 419 | # 提取训练集和测试集 420 | 421 | # 所有数据 422 | # 由于1号是元旦节，情况相对特殊，所以去掉了该极端值 423 | # 也可以使用一下看看效果 424 | train_data = all_data[(all_data.day != 29) & (all_data.day != 2)] 425 | 426 | # 用于训练的数据 427 | test = all_data[all_data.day == 29] 428 | X_test = test[features].values 429 | 430 | # 设置滑动窗口 431 | # 这里A榜要预测的是29号的信息，所以设置同是周二的日期作为滑动窗口的末尾 432 | # 而B榜要预测的是27号的信息，所以设置周末的日期作为滑窗末尾，即13，20 433 | slip = [15, 22] 434 | 435 | n = len(slip) 436 | 437 | 438 | ## 搭建LGB模型 439 | # 设置模型参数 440 | lgb_params = { 441 | 'boosting_type': 'gbdt', 442 | 'objective': 'regression_l1', 443 | 'metric': 'mae', 444 | 'num_leaves': 63, 445 | 'learning_rate': 0.01, 446 | 'feature_fraction': 0.9, 447 | 'bagging_fraction': 0.9, 448 | 'bagging_seed':0, 449 | 'bagging_freq': 1, 450 | 'verbose': 1, 451 | 'reg_alpha':1, 452 | 'reg_lambda':2, 453 | 454 | # 设置GPU 455 | 'device' : 'gpu', 456 | 'gpu_platform_id':1, 457 | 'gpu_device_id':0 458 | } 459 | 460 | ## 预测进站流量 461 | in_lgb_pred = np.zeros(len(X_test)) 462 | X_data = train_data[features].values 463 | y_data = train_data['inNums'].values 464 | for i, date in enumerate(slip): 465 | train = train_data[train_data.day