├── .gitignore
├── kdd2017
│   ├── __init__.py
│   ├── remove_outliers.py
│   ├── models.py
│   └── utils.py
├── README.md
├── config
│   ├── config_vol.json
│   └── config.json
└── bin
    ├── compute_kdd2017_vol
    └── compute_kdd2017

/.gitignore:
--------------------------------------------------------------------------------
*pyc

--------------------------------------------------------------------------------
/kdd2017/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/kdd2017/remove_outliers.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# NOTE: early, commented-out draft of the outlier filter; the maintained
# version is remove_outliers_by_classifier() in kdd2017/models.py.
#from kdd2017.models import *
#
#
#def remove_outliers_by_classifier(X, y, dates, ):
#    xgboost = XGBoost(max_depth=2)
#    xgboost.fit(X, y)
#    y_pred = xgboost.predict(X)
#    diff_values = np.abs(y_pred - y)
#    abs_diff_vals = np.abs(diff_values)
#    sorted_indexes = sorted(range(len(abs_diff_vals)), key=lambda x: abs_diff_vals[x])
#    sorted_indexes_lead = sorted_indexes[:int(len(abs_diff_vals)*0.9)]
#    return X[sorted_indexes_lead], y[sorted_indexes_lead], dates[sorted_indexes_lead]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
KDD travel time competition: rank 28/3574 solution
===================================================

Sharing my code here. The main approach is a grid search with 5-fold CV plus one
extra fold kept as an evaluation set: towards the end of the competition the
parameters picked by the CV grid search started to over-fit, so the 6th fold never
takes part in the parameter search and is used only for evaluation. Final rank:
28/3574, written by one person; I hope it is useful to you as a reference.
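The search in this repository is driven by `GridSearchCVDatesWithVal` (see
`kdd2017/utils.py`). Purely as an illustration of the idea, and with a placeholder
estimator, parameter grid and MAPE scorer rather than the ones used in the
competition, a minimal sketch looks like this:

```
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split


def mape(y_true, y_pred):
    # mean absolute percentage error, the competition metric
    return np.mean(np.abs(y_pred - y_true) / y_true)


# toy data standing in for the real feature matrix and travel times
X, y = np.random.rand(600, 10), np.random.rand(600) + 1.0

# keep a "6th fold" aside; it never takes part in the grid search
X_search, X_eval, y_search, y_eval = train_test_split(
    X, y, test_size=1.0 / 6, random_state=0)

grid = GridSearchCV(
    GradientBoostingRegressor(),
    param_grid={"n_estimators": [100, 200], "learning_rate": [0.05, 0.1]},
    scoring=make_scorer(mape, greater_is_better=False),
    cv=5,
)
grid.fit(X_search, y_search)

print("best params:", grid.best_params_)
print("cv mape:", -grid.best_score_)
# a much worse eval score than the cv score means the search has over-fit
print("eval mape:", mape(y_eval, grid.best_estimator_.predict(X_eval)))
```

In the real scripts the folds are cut along dates (`dates_train`, `days_to_test`)
instead of random rows, which is the safer choice for time-series data.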
How to start:

```
cd kdd2017
./bin/compute_kdd2017 config/config.json
```

--------------------------------------------------------------------------------
/config/config_vol.json:
--------------------------------------------------------------------------------
{
    "path_volumes_train": ["/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/volume(table 6)_training.csv", "/home/jinpengli/work_data/jinpeng/dataSet_phase2/volume(table 6)_training2.csv"],
    "path_volumes_val": "/home/jinpengli/work_data/jinpeng/dataSet_phase2/volume(table 6)_test2.csv",
    "weather_infos": ["/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/weather (table 7)_training_update.csv", "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/testing_phase1/weather (table 7)_test1.csv", "/home/jinpengli/work_data/jinpeng/dataSet_phase2/weather (table 7)_2.csv"],
    "path_links": "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/links (table 3).csv",
    "path_routes": "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/routes (table 4).csv",
    "working_dir": "/home/jinpengli/work_data/jinpeng"
}

--------------------------------------------------------------------------------
/config/config.json:
--------------------------------------------------------------------------------
{
    "path_trajectories_train": ["/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/trajectories(table 5)_training.csv", "/home/jinpengli/work_data/jinpeng/dataSet_phase2/trajectories(table_5)_training2.csv"],
    "path_trajectories_val": "/home/jinpengli/work_data/jinpeng/dataSet_phase2/trajectories(table 5)_test2.csv",
    "path_links": "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/links (table 3).csv",
    "path_routes": "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/routes (table 4).csv",
    "working_dir": "/home/jinpengli/work_data/jinpeng",
    "weather_infos": ["/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/weather (table 7)_training_update.csv", "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/testing_phase1/weather (table 7)_test1.csv", "/home/jinpengli/work_data/jinpeng/dataSet_phase2/weather (table 7)_2.csv"]
}

--------------------------------------------------------------------------------
/bin/compute_kdd2017_vol:
--------------------------------------------------------------------------------
1 | #!
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from sklearn import linear_model 5 | import numpy as np 6 | 7 | from argparse import RawTextHelpFormatter 8 | import argparse 9 | import json 10 | from datetime import timedelta 11 | from datetime import datetime 12 | import dateutil 13 | 14 | import os 15 | import matplotlib.pyplot as plt 16 | import matplotlib 17 | import sys, traceback 18 | 19 | from sklearn.ensemble import GradientBoostingRegressor 20 | from sklearn.ensemble import BaggingRegressor 21 | from sklearn.ensemble import AdaBoostRegressor 22 | from sklearn.ensemble import ExtraTreesRegressor 23 | from sklearn.ensemble import RandomForestRegressor 24 | 25 | from sklearn.neighbors import KNeighborsRegressor 26 | from sklearn.neighbors import RadiusNeighborsRegressor 27 | 28 | from sklearn.linear_model import Lasso 29 | from sklearn.linear_model import LinearRegression 30 | from sklearn.linear_model import ARDRegression 31 | from sklearn.linear_model import HuberRegressor 32 | from sklearn.linear_model import LinearRegression 33 | from sklearn.linear_model import LogisticRegression 34 | from sklearn.linear_model import LogisticRegressionCV 35 | from sklearn.linear_model import PassiveAggressiveRegressor 36 | from sklearn.linear_model import RandomizedLogisticRegression 37 | from sklearn.linear_model import RANSACRegressor 38 | from sklearn.linear_model import SGDRegressor 39 | from sklearn.linear_model import TheilSenRegressor 40 | from sklearn.linear_model import logistic_regression_path 41 | 42 | from sklearn.neural_network import MLPRegressor 43 | from sklearn.cross_decomposition import PLSRegression 44 | from sklearn.svm import SVR 45 | from sklearn.svm import LinearSVR 46 | from sklearn.svm import NuSVR 47 | 48 | from sklearn.tree import DecisionTreeRegressor 49 | from sklearn.tree import ExtraTreeRegressor 50 | 51 | 52 | from sklearn.pipeline import FeatureUnion 53 | from sklearn.decomposition import PCA 54 | from sklearn.feature_selection import SelectKBest 55 | 56 | 57 | from kdd2017.utils import load_volumes_info 58 | from kdd2017.utils import convert_volumes_into_X_y 59 | from kdd2017.utils import generate_final_volumes 60 | from kdd2017.utils import GridSearchCVDatesWithVal 61 | from kdd2017.utils import load_routes 62 | from kdd2017.utils import load_links 63 | 64 | from kdd2017.utils import mape_loss 65 | from kdd2017.utils import inv_mape_loss 66 | from kdd2017.utils import GridSearchCVDates 67 | from kdd2017.utils import invboxcox 68 | from kdd2017.utils import plot_travel_times_fix_date 69 | from kdd2017.utils import plot_travel_times_fix_hour 70 | 71 | from kdd2017.utils import remove_outliers 72 | from kdd2017.utils import load_weather_info 73 | 74 | from scipy.stats import boxcox 75 | from sklearn.ensemble import GradientBoostingRegressor 76 | from sklearn.ensemble import BaggingRegressor 77 | from sklearn.ensemble import AdaBoostRegressor 78 | from sklearn.ensemble import ExtraTreesRegressor 79 | from sklearn.ensemble import RandomForestRegressor 80 | from sklearn.neighbors import KNeighborsRegressor 81 | 82 | from sklearn.linear_model import LinearRegression 83 | from sklearn.linear_model import LogisticRegression 84 | from sklearn.model_selection import GridSearchCV 85 | from sklearn.model_selection import train_test_split 86 | from sklearn.svm import SVR 87 | from sklearn.metrics import r2_score 88 | from sklearn.metrics import make_scorer 89 | from sklearn.metrics import mean_absolute_error 90 | from kdd2017.models import * 91 | 92 | import 
matplotlib.pyplot as plt 93 | import matplotlib 94 | import collections 95 | from kdd2017.utils import findsubsets 96 | from kdd2017.utils import findsubsets2 97 | import random 98 | 99 | if __name__ == "__main__": 100 | description=''' 101 | compute_kdd2017_vol /media/jl237561/usb_ext/workspace/kdd2017/kdd2017/config/config_vol.json 102 | ''' 103 | parser = argparse.ArgumentParser( 104 | description=description, 105 | formatter_class=RawTextHelpFormatter) 106 | 107 | parser.add_argument('config', type=unicode, nargs=1, 108 | help='...') 109 | 110 | options = parser.parse_args() 111 | 112 | if not options.config: 113 | raise ValueError("Please set all the parameters.") 114 | 115 | config_path = options.config[0] 116 | config = json.load(open(config_path, "r")) 117 | path_trajectories_train = config["path_volumes_train"] 118 | path_trajectories_val = config["path_volumes_val"] 119 | path_working_dir = config["working_dir"] 120 | path_weather_infos = config["weather_infos"] 121 | path_links = config["path_links"] 122 | path_routes = config["path_routes"] 123 | is_boxcox = False 124 | is_y_log = False 125 | is_include_future_training = False 126 | remove_future_training_test = False 127 | boxcox_lambda = 1.0 128 | estimate_val_w = 1.0 129 | skip_cvs = [] 130 | skip_date_ranges = [ 131 | #(datetime(2016, 9, 14), datetime(2016, 9, 19)), 132 | #(datetime(2016, 9, 29), datetime(2016, 10, 9)), 133 | ] 134 | 135 | 136 | #path_combine_model_cache = os.path.join(path_working_dir, "cache_combine_model_vol_%d.json" % random.randint(0, 1000)) 137 | #if os.path.isfile(path_combine_model_cache): 138 | # raise ValueError("cache exist.. please remove %s" % path_combine_model_cache) 139 | 140 | if is_y_log: 141 | print("config is_y_log=", is_y_log) 142 | 143 | datetime_weather = load_weather_info(path_weather_infos) 144 | routes_data = load_routes(path_routes) 145 | link_data = load_links(path_links) 146 | if not os.path.isdir(path_working_dir): 147 | os.makedirs(path_working_dir) 148 | 149 | volumes_train = load_volumes_info(path_trajectories_train) 150 | volumes_val = load_volumes_info(path_trajectories_val) 151 | volumes_final = generate_final_volumes(volumes_train) 152 | 153 | X_train, y_train, dates_train, les_train = convert_volumes_into_X_y( 154 | volumes_train, datetime_weather, link_data, routes_data) 155 | X_val, y_val, dates_val, _ = convert_volumes_into_X_y( 156 | volumes_val, datetime_weather, link_data, routes_data, les_train) 157 | X_final, y_final, dates_final, _, raw_info = convert_volumes_into_X_y( 158 | volumes_final, datetime_weather, link_data, routes_data, les_train, True, verbose=False) 159 | 160 | print(X_train.shape) 161 | print(y_train.shape) 162 | print(X_val.shape) 163 | print(y_val.shape) 164 | 165 | Configurations = [ 166 | #{ 167 | # "model": BoxcoxModel, 168 | # "tuned_parameters": [ 169 | # { 170 | # "model": [GradientBoostingRegressor], 171 | # "loss": ["lad"], 172 | # "learning_rate": [0.1,], 173 | # "n_estimators": [200,], 174 | # "is_boxcox": [True, ], 175 | # "boxcox_lambda": [-1.0, -0.6, -0.3, 0, 0.1, 0.5, 1.0, 2.0], 176 | # }, 177 | # { 178 | # "model": [GradientBoostingRegressor,], 179 | # "loss": ["lad"], 180 | # "learning_rate": [0.1,], 181 | # "n_estimators": [200,], 182 | # "is_boxcox": [False, ], 183 | # "boxcox_lambda": [1,], 184 | # }, 185 | # ], 186 | #}, 187 | ##{ 188 | # "model": XGBoost, 189 | # "tuned_parameters": [], 190 | #}, 191 | #{ 192 | # "model": MedianModel, 193 | # "tuned_parameters":[], 194 | #}, 195 | # ##### make model ensemble.. 
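# The commented-out block below wires up a model ensemble: CombineModes
# (kdd2017/models.py) re-fits every listed DaterangeModel on a (sub)sample of
# the training data ('subsample': 1.0 keeps all rows) and blends their
# predictions. combine_method=2 multiplies each model's output by the
# normalised 'weights' and sums them; combine_method=1 is a plain average and
# combine_method=0 a harmonic mean.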
196 | # { 197 | # "model": CombineModes, 198 | # "tuned_parameters": [ 199 | # { 200 | # 'models': [ 201 | # [ 202 | # ## 1 203 | # DaterangeModel( 204 | # model=XGBoost, 205 | # rm_n_head_days=4, 206 | # rm_n_head_days_hours=[(0, 6), (12, 14), (20, 24)], 207 | # ft_th=[(7, 0.3), (8, 0.8), (9, 0.3)], 208 | # is_y_log=True, 209 | # y_log_e=np.e, 210 | # norm_y=False, 211 | # use_mspe=False, 212 | # num_round=1100, 213 | # early_stopping_rounds=10, 214 | # eta=0.02, 215 | # max_depth=7, 216 | # subsample=0.6, 217 | # colsample_bytree=0.9, 218 | # objective='reg:linear', 219 | # booster='gbtree', 220 | # eval_metric='rmse', 221 | # ft_select=[0, 1, 3, 4, 5, 6, 12, 15, 16, 17], 222 | # train_days=9+7, 223 | # is_rm_outliers=True, 224 | # rm_outliers_m=2, 225 | # rm_outliers_key=[0, 1, 2, 5], 226 | # is_avg_or_median=2, 227 | # ), 228 | # ## 2 229 | # DaterangeModel( 230 | # model=KNeighborsRegressor, 231 | # #rm_n_head_days=9, 232 | # #rm_n_head_days_hours=[(0, 6), (12, 14), (20, 24)], 233 | # ft_th=[(7, 0.3), (8, 0.8), (9, 0.3)], 234 | # is_y_log=True, 235 | # y_log_e=np.e, 236 | # norm_y=False, 237 | # ft_select=[0, 1, 5, 6], 238 | # train_days=10, 239 | # is_rm_outliers=False, 240 | # ), 241 | # ## 3 242 | # DaterangeModel( 243 | # model=KNeighborsRegressor, 244 | # #rm_n_head_days=9, 245 | # #rm_n_head_days_hours=[(0, 6), (12, 14), (20, 24)], 246 | # ft_th=[(7, 0.3), (8, 0.8), (9, 0.3)], 247 | # is_y_log=True, 248 | # y_log_e=np.e, 249 | # norm_y=False, 250 | # ft_select=[0, 1, 2, 3, 5, 6], 251 | # train_days=9, 252 | # is_rm_outliers=False, 253 | # rm_outliers_m=2, 254 | # rm_outliers_key=[0, 1, 2, 5], 255 | # is_avg_or_median=2, 256 | # ), 257 | # # DaterangeModel( 258 | # # model=model, 259 | # # random_state=0, 260 | # # #rm_n_head_days=9, 261 | # # #rm_n_head_days_hours=[(0, 6), (12, 14), (20, 24)], 262 | # # ft_th=[(7, 0.3), (8, 0.8), (9, 0.3)], 263 | # # is_y_log=True, 264 | # # y_log_e=np.e, 265 | # # norm_y=False, 266 | # # ft_select=ft_select, 267 | # # train_days=model_day, 268 | # # is_rm_outliers=False, 269 | # # rm_outliers_m=2, 270 | # # rm_outliers_key=[0, 1, 2, 5], 271 | # # is_avg_or_median=2, 272 | # # ), 273 | # ] 274 | # #for model in [ #PassiveAggressiveRegressor, 275 | # # #KNeighborsRegressor, 276 | # # MLPRegressor, 277 | # # #NuSVR, 278 | # # #HuberRegressor, PassiveAggressiveRegressor, RANSACRegressor, SGDRegressor, TheilSenRegressor, 279 | # # #RadiusNeighborsRegressor, PLSRegression, LinearSVR, 280 | # # ] 281 | # #for model_day in range(7, 20, 3) 282 | # ##for model_day in [10,] 283 | # #for ft_select in list( [[0, 1, ] + list(one) for one in findsubsets2(range(2, 7))]) 284 | # ##for ft_select in [[0, 1, 3, 4, 5, 6, 12, 15, 16, 17]] 285 | # ], 286 | # 'subsample': [1.0, ], 287 | # 'combine_method': [2 ], 288 | # 'weights': [ 289 | # [1.0, 0.13, 0.0, ] 290 | # #for x1 in range(0, 101, 5) 291 | # #for x2 in range(0, 101, 5) 292 | # #for x3 in range(0, 101, 5) 293 | # #[1.0, 0.0] 294 | # #[1.0, 0.1, 0.1, 0.1, 0.0] 295 | # ], 296 | # }, 297 | # ], 298 | # }, 299 | { 300 | "model": DaterangeModel, 301 | "tuned_parameters": [ 302 | # { 303 | # 'n_estimators': [50,], 304 | # 'loss': ["ls", "lad", "huber", "quantile"], 305 | # #'loss': ["ls", ], 306 | # 'is_y_log' : [True,], 307 | # 'learning_rate': [0.1,], 308 | # 'train_days': [3,7,14,21,28,35,], 309 | # 'is_rm_outliers': [True,], 310 | # 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,], 311 | # }, 312 | # { 313 | # 'model':[GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor, BaggingRegressor, 
AdaBoostRegressor, KNeighborsRegressor], 314 | # #'n_estimators': [200,], 315 | # #'kernel':["rbf", ], 316 | # 'rm_n_head_days': [9, ], 317 | # 'rm_n_head_days_hours': [ 318 | # [(0, 6), (12, 14), (20, 24)], 319 | # ], 320 | # 'ft_th': [[ 321 | # (7, 0.3), 322 | # (8, 0.8), 323 | # (9, 0.3), 324 | # #(42, i/10.0), 325 | # ] #for i in range(0, 11, 1) 326 | # ], 327 | # 'is_y_log': [True, ], 328 | # 'y_log_e': [np.e,], 329 | # 'norm_y': [False, ], 330 | # 'ft_select': [[0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24),], 331 | # 'train_days': [9+7], 332 | # 'is_rm_outliers': [True,], 333 | # 'rm_outliers_m': [0.5,], 334 | # 'rm_outliers_key': [[0, 1, 3, 4], ], 335 | # 'is_avg_or_median': [2,], 336 | # }, 337 | # { 338 | # 'model':[LGBM,], 339 | # 'rm_n_head_days': [9, ], 340 | # 'rm_n_head_days_hours': [ 341 | # [(0, 6), (12, 14), (20, 24)], 342 | # ], 343 | # 'ft_th': [[ 344 | # (7, 0.3), 345 | # (8, 0.8), 346 | # (9, 0.3), 347 | # #(42, i/10.0), 348 | # ] #for i in range(0, 11, 1) 349 | # ], 350 | # #'objective':["regression", "regression_l2", "regression_l1", "huber", "fair", "poisson"], 351 | # 'objective': ["regression", ], 352 | # 'boosting_type': ['dart', ], 353 | # #'boosting_type': ['gbdt', 'dart'], 354 | # #'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.6, 0.8, 1.0,], 355 | # 'learning_rate': [0.6], 356 | # #'num_leaves': range(7, 1000, 1), 357 | # 'num_leaves': [185, ], 358 | # #'subsample': [i/10.0 for i in range(5, 11)], 359 | # #'colsample_bytree': [i/10.0 for i in range(5, 11)], 360 | # 'subsample': [0.9, ], 361 | # 'colsample_bytree': [0.9,], 362 | # 'is_y_log': [True, ], 363 | # 'y_log_e': [np.e,], 364 | # 'norm_y': [False, ], 365 | # #'ft_select': [[0, 1, 3, 4, 6, ], ], 366 | # #'ft_select': list([[0, 1, 3, 4, 6,] + list(one) for one in findsubsets2(set(range(11, 18)))]), 367 | # 'ft_select': [[0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24),], 368 | # 'train_days': [9+7], 369 | # #'train_days': [10, ], 370 | # 'is_rm_outliers': [True,], 371 | # #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)], 372 | # 'rm_outliers_m': [0.5,], 373 | # #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4]))), 374 | # 'rm_outliers_key': [[0, 1, 3, 4], ], 375 | # 'is_avg_or_median': [2,], 376 | # }, 377 | { 378 | 'model':[XGBoost, ], 379 | 'is_ignore_skip_date_count': [True, ], 380 | #'remove_outliers_by_classifier': [ 381 | # { 382 | # "model": GradientBoostingRegressor( 383 | # max_depth=x, 384 | # ), 385 | # "m": v/100., 386 | # } 387 | # for v in range(80, 101, 1) for x in range(2,5) 388 | #], 389 | 'rm_n_head_days': [4, ], 390 | #'n_estimators': range(100, 2000, 100), 391 | #'rm_n_head_days': range(17), 392 | 'rm_n_head_days_hours': [ 393 | #[], 394 | #[(0, 6), (12, 15), (21, 22)], 395 | #[(0, 7), (11, 16), (20, 22)], 396 | #[(0, 8), (10, 17), (19, 22)], 397 | [(0, 6), (12, 14), (20, 24)], 398 | #[one, two, three] 399 | #for one in [(0, 6), (0, 5), (0, 7), (0, 8),] 400 | #for two in [(10, 17), (11, 16), (12, 14), ] 401 | #for three in [(19, 24), (20, 24), (21, 24), (22, 24)] 402 | ], 403 | 'ft_th': [[ 404 | (7, 0.3), 405 | (8, 0.8), 406 | (9, 0.3), 407 | #(42, i/10.0), 408 | ] #for i in range(0, 11, 1) 409 | ], 410 | 'is_y_log': [True, ], 411 | 'use_mspe': [False, ], 412 | #'y_log_e': [2.0, np.e, 3.0, 4.0, 5.0, 6.0, 7.0,], 413 | #'y_log_e': [np.e, ] + [pow(10, i) for i in range(1, 10)], 414 | #'y_log_e': [1000000,], 415 | 'y_log_e': [np.e, ], 416 | 'norm_y': [False, ], 417 | 'num_round': range(1000, 3000, 100), 418 | 'eta': [0.02], 419 | 'verbose_eval': [100, 
], 420 | 'early_stopping_rounds': [10, ], 421 | #'gamma': [0.0, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0,], 422 | #'eta': [v/100.0 for v in range(1, 101, 1)], 423 | #'eta': [0.15, ], 424 | #'eta': [0.01, 0.05, 0.10, 0.005], 425 | #'max_depth': [7,], 426 | 'max_depth': range(2,15), 427 | #'min_child_weight': [1, 3, 5, 7,], 428 | 'colsample_bytree': [0.9, ], 429 | #'colsample_bytree': [i / 10.0 for i in range(6, 11)], 430 | 'eval_metric': ['rmse', ], 431 | #'eval_metric': ['mape', ], 432 | #'objective': ["reg:gamma", "reg:linear"], 433 | 'objective': ["reg:linear", ], 434 | #'eval_metric': ['logloss', ], 435 | 'subsample': [0.6, ], 436 | #'subsample': [v/10.0 for v in range(6, 11)], 437 | 'booster': ['gbtree', ], 438 | #'ft_select': list([list(one) for one in findsubsets2(range(2, 7))]),, 439 | 'ft_select': [ 440 | #[0, 1, 3, 4, 5, 6, 12, 15, 16, 17] + range(18, 18+24), 441 | #[0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24+1), 442 | [0, 1, 3, 4, 5, 6, 12, 15, 16, 17], 443 | ], 444 | #'ft_select': [[0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + list(one) for one in findsubsets2( range(18, 18+24) )], 445 | #'ft_select': list( [[0, 1, ] + list(one) for one in findsubsets2(range(2, 7))]), 446 | #'ft_select': list([[0, 1, 3, 4, 5, 6] + list(one) for one in findsubsets2(set(range(11, 18)))]), 447 | #'ft_select': list([[0, 1, 3, 4, 5, 6, 12, 15, 16, 17] + list(one) for one in findsubsets2(set(range(18, 18+24)))]), 448 | #'train_days': range(3, 30, 1), 449 | #'train_days': [9, 9+7, 50], 450 | 'train_days': [9+7,], 451 | 'is_rm_outliers': [True, ], 452 | #'remove_non_predict_hour_range': [True, False], 453 | #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)], 454 | #'rm_outliers_m': [0.5,], 455 | #'rm_outliers_m': [4.0, ], 456 | 'rm_outliers_m': [2.0,], 457 | #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4,5,6,7,8,9]))), 458 | #'rm_outliers_key': [[0, 1, ] + list(one) for one in list(findsubsets2(set([2,3,4,5]))) ], 459 | 'rm_outliers_key': [[0, 1, 2, 5], ], 460 | 'is_avg_or_median': [2,], 461 | #'is_avg_or_median': [2,], 462 | #'rm_outliers_m': [i/100.0 for i in range(10, 60, 10)], 463 | #'rm_outliers_key': [[0, 1, ] + list(one) for one in list(findsubsets2(set([2,3,4,5,6,7,8,9]))) ], 464 | #'is_avg_or_median': [2, ], 465 | #'is_sample_weight': [6, ], 466 | #'is_one_hot_encode': [True, False], 467 | #'remove_non_predict_hour_range': [True], 468 | #'predict_hour_range':[ 469 | # [ 470 | # [[8, 0], [10, 0]], 471 | # [[17, 0], [19, 0]], 472 | # ], 473 | # #[ 474 | # # [[7, 0], [11, 0]], 475 | # # [[16, 0], [20, 0]], 476 | # #], 477 | #], 478 | 'skip_date_ranges': [ 479 | #[ 480 | # (datetime(2016, 9, 30), datetime(2016, 10, 10)), 481 | #], 482 | #[ 483 | # (datetime(2016, 9, 15), datetime(2016, 9, 16)), 484 | #], 485 | #[ 486 | # (datetime(2016, 9, 14), datetime(2016, 9, 16)), 487 | #], 488 | #[ 489 | # (datetime(2016, 9, 17), datetime(2016, 9, 18)), 490 | #], 491 | #[ 492 | # (datetime(2016, 10, 7), datetime(2016, 10, 8)), 493 | #], 494 | #[ 495 | # (datetime(2016, 10, 7), datetime(2016, 10, 8)), 496 | #], 497 | #[ 498 | # (datetime(2016, 9, 30), datetime(2016, 10, 1)), 499 | #], 500 | #[ 501 | # (datetime(2016, 9, 30), datetime(2016, 10, 2)), 502 | #], 503 | #[ 504 | # (datetime(2016, 10, 1), datetime(2016, 10, 2)), 505 | #], 506 | #[ 507 | # (datetime(2016, 10, 2), datetime(2016, 10, 3)), 508 | #], 509 | #[ 510 | # (datetime(2016, 10, 3), datetime(2016, 10, 4)), 511 | #], 512 | ##[ 513 | ## (datetime(2016, 9, 15), datetime(2016, 9, 16)), 514 | ## (datetime(2016, 10, 7), datetime(2016, 10, 
8)), 515 | ##], 516 | #[ 517 | # (datetime(2016, 9, 14), datetime(2016, 9, 19)), 518 | # (datetime(2016, 9, 30), datetime(2016, 10, 10)), 519 | #], 520 | [], 521 | ], 522 | }, 523 | # { 524 | # 'model': [GradientBoostingRegressor, ], 525 | # 'is_ignore_skip_date_count': [True,], 526 | # 'rm_n_head_days': [9, ], 527 | # 'rm_n_head_days_hours': [ 528 | # [(0, 6), (12, 14), (20, 24)], 529 | # ], 530 | # 'ft_th': [[ 531 | # (7, 0.3), 532 | # (8, 0.8), 533 | # (9, 0.3), 534 | # #(42, i/10.0), 535 | # ] #for i in range(0, 11, 1) 536 | # ], 537 | # 'n_estimators': [200,], 538 | # 'is_y_log' : [True,], 539 | # 'norm_y': [False,], 540 | # 'is_one_hot_encode': [False], 541 | # 'ft_select': [ [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24),], 542 | # 'train_days': [9+7, 50 ], 543 | # 'is_rm_outliers': [True,], 544 | # #'rm_outliers_key': [[0, 1, ] + list(one) for one in list(findsubsets2(set([2,3,4,]))) ], 545 | # 'rm_outliers_key': [ [0, 1, 3, ], ], 546 | # #'is_avg_or_median': [1, 2, ], 547 | # 'is_avg_or_median': [2, ], 548 | # 'rm_outliers_m': [0.4, ], 549 | # #'rm_outliers_m': [i/100.0 for i in range(10, 60, 10)], 550 | # 'is_sample_weight': range(16), 551 | # 'skip_date_ranges': [ 552 | # [ 553 | # (datetime(2016, 9, 30), datetime(2016, 10, 10)), 554 | # ], 555 | # ], 556 | # 557 | # }, 558 | # { 559 | # 'model':[MedianModel,], 560 | # 'is_y_log': [True,], 561 | # 'ft_pos':[[0, 1, 2, 4], ], 562 | # 'ft_select': [ 563 | # [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24), 564 | # ], 565 | # #'train_days': range(14, 90, 1), 566 | # 'train_days': [9+7, ], 567 | # 'is_rm_outliers': [False,], 568 | # 'rm_outliers_m': [0.5], 569 | # #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)], 570 | # 'rm_outliers_key': [[0, 1, ]], 571 | # #'rm_outliers_key': [[0, 1, 2] + list(one) for one in list(findsubsets2(set([2,3,5,6]))) ], 572 | # #'is_avg_or_median': [1, 2], 573 | # 'is_avg_or_median': [1, ], 574 | # }, 575 | # { 576 | # 'model': [NonparametricKNN,], 577 | # 'ft_th': [[ 578 | # (7, 0.3), 579 | # (8, 0.8), 580 | # (9, 0.3), 581 | # #(42, i/10.0), 582 | # ] #for i in range(0, 11, 1) 583 | # ], 584 | # "n_neighbors": [1,], 585 | # 'ft_select': [[0, 1, 3, 4, 6, 7, ], ], 586 | # #'ft_select': [[0, 1, 3, 4, 5, 6, 8] + list(one) for one in list(findsubsets2(set(range(7, 10)))) ], 587 | # "loss": ["SMAPE", ], 588 | # 'is_y_log' : [False,], 589 | # 'train_days': [7,], 590 | # 'is_rm_outliers': [True,], 591 | # #'rm_outliers_key': [[0, 1, ] + list(one) for one in list(findsubsets2(set([2,3,4,5]))) ], 592 | # 'rm_outliers_key': [[0, 1, 2, 4, 5], ], 593 | # 'rm_outliers_m': [1.5, ], 594 | # #'rm_outliers_m': [i/10.0 for i in range(5, 30, 5)], 595 | # 'is_avg_or_median': [2,], 596 | # }, 597 | # { 598 | # 'model': [RandomForestRegressor,ExtraTreesRegressor,], 599 | # 'train_days': [3,7,14,21,28,35,42,49,56,150], 600 | # 'is_rm_outliers': [True,], 601 | # 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,], 602 | # }, 603 | # { 604 | # 'model': [LogisticRegression,], 605 | # 'penalty': ['l1', 'l2'], 606 | # 'C': [1.0, 10.0], 607 | # 'train_days': [3,7,14,21,28,35,42,49,56,150], 608 | # 'is_rm_outliers': [True,], 609 | # 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,], 610 | # }, 611 | # { 612 | # "model": [BaggingRegressor,], 613 | # "base_estimator": [GradientBoostingRegressor(loss="ls", n_estimators=50), 614 | # GradientBoostingRegressor(loss="lad", n_estimators=50), 615 | # GradientBoostingRegressor(loss="huber", n_estimators=50), 616 | # GradientBoostingRegressor(loss="quantile", n_estimators=50), 617 
| # ExtraTreesRegressor(n_estimators=50), 618 | # RandomForestRegressor(n_estimators=50)], 619 | # 'train_days': [3, 7,14,21,28,35,], 620 | # 'is_rm_outliers': [True,], 621 | # 'dates_train': [dates_train,], 622 | # 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,], 623 | # }, 624 | # { 625 | # "model": [BaggingRegressor,], 626 | # "base_estimator": [None, GradientBoostingRegressor(n_estimators=200), 627 | # GradientBoostingRegressor(loss="lad", n_estimators=200), 628 | # ExtraTreesRegressor(n_estimators=200), RandomForestRegressor(n_estimators=200)], 629 | # 'train_days': [3, 7,14,21,28,35,], 630 | # 'is_rm_outliers': [False,], 631 | # 'rm_outliers_m': [0.5, ], 632 | # }, 633 | # 634 | # 635 | ], 636 | }, 637 | # { 638 | # "model": BaggingRegressor, 639 | # "tuned_parameters":[ 640 | # { 641 | # "base_estimator": [None, 642 | # GradientBoostingRegressor(n_estimators=200, loss='lad', learning_rate=0.1), 643 | # ExtraTreesRegressor() RandomForestRegressor()], 644 | # }, 645 | # ], 646 | # }, 647 | # { 648 | # "model": GradientBoostingRegressor, 649 | # "tuned_parameters": [ 650 | # { 651 | # 'n_estimators': [200,], 652 | # 'loss': ["ls", "lad", "huber", "quantile"], 653 | # 'learning_rate': [0.1,], 654 | # } 655 | # ], 656 | # }, 657 | # { 658 | # "model": AdaBoostRegressor, 659 | # "tuned_parameters": [], 660 | # }, 661 | # { 662 | # "model": ExtraTreesRegressor, 663 | # "tuned_parameters": [ 664 | # { 665 | # #"criterion": ["mse", "mae"], 666 | # "n_estimators": [200,] 667 | # }, 668 | # ] 669 | # }, 670 | # { 671 | # "model": RandomForestRegressor, 672 | # "tuned_parameters": [ 673 | # {"n_estimators": [200,]}, 674 | # ], 675 | # }, 676 | # { 677 | # "model": SVR, 678 | # "tuned_parameters": [ 679 | # {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10,]}, 680 | # #{'kernel': ['linear'], 'C': [1, ]}] 681 | # {'kernel': ['linear'], 'C': [1, 10, ]} 682 | # ] 683 | # } 684 | ] 685 | 686 | 687 | #skip_cvs = [] 688 | predict_y_final = GridSearchCVDatesWithVal( 689 | Configurations, 690 | X_train, y_train, dates_train, 691 | X_val, y_val, dates_val, 692 | X_final, 693 | is_y_log=is_y_log, is_boxcox=is_boxcox, boxcox_lambda=boxcox_lambda, 694 | is_include_val_loss_for_eval=False, cv=3, days_to_test=7, skip_cvs=skip_cvs, 695 | estimate_val_w=estimate_val_w, is_include_future_training=is_include_future_training, 696 | remove_future_training_test=remove_future_training_test, 697 | ) 698 | 699 | ##output final output 700 | path_final_res = os.path.join( 701 | path_working_dir, 702 | "volumes_predict.csv") 703 | 704 | print("*" * 60) 705 | print("writing out final results...") 706 | file_final_res = open(path_final_res, "w+") 707 | file_final_res.writelines(','.join(['"tollgate_id"', 708 | '"time_window"', '"direction"', '"volume"']) + '\n') 709 | for iy, rinfo in enumerate(raw_info): 710 | tollgate_id, direction, start_datetime = rinfo 711 | end_datetime = start_datetime + timedelta(minutes=20) 712 | timewindow = "["+str(start_datetime) + "," + str(end_datetime)+ ")" 713 | words = [tollgate_id, timewindow, direction, str(predict_y_final[iy])] 714 | line = '","'.join(words) 715 | line = '"' + line + '"\n' 716 | file_final_res.write(line) 717 | file_final_res.close() 718 | -------------------------------------------------------------------------------- /kdd2017/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.pipeline import FeatureUnion 3 | from sklearn.ensemble import ExtraTreesRegressor 4 | from sklearn.ensemble 
import GradientBoostingRegressor 5 | from scipy import stats 6 | import inspect 7 | import numpy as np 8 | from sklearn.neighbors import NearestNeighbors 9 | from datetime import timedelta 10 | from datetime import datetime 11 | import numpy as np 12 | from kdd2017.utils import invboxcox 13 | from kdd2017.utils import mape_loss 14 | from kdd2017.utils import remove_outliers 15 | from kdd2017.utils import remove_outliers2 16 | from kdd2017.utils import remove_outliers3 17 | from kdd2017.utils import invboxcox 18 | from kdd2017.utils import compute_harmonic_mean 19 | from sklearn.feature_selection import SelectKBest, f_regression 20 | from sklearn.pipeline import make_pipeline 21 | from sklearn import preprocessing 22 | import collections 23 | import xgboost as xgb 24 | from scipy.stats import boxcox 25 | from sklearn.decomposition import PCA 26 | import copy 27 | from random import shuffle 28 | import lightgbm as lgb 29 | import lightgbm as lgb 30 | import pandas as pd 31 | import numpy as np 32 | from sklearn.metrics import mean_squared_error 33 | from sklearn import preprocessing 34 | import os 35 | import json 36 | from time import sleep 37 | import random 38 | from sklearn.model_selection import train_test_split 39 | 40 | global_variables = {} 41 | 42 | 43 | random_state = 1 44 | 45 | 46 | def mspe(y, dtrain): 47 | yhat = dtrain.get_label() 48 | grad = 2.0/yhat * (y * 1.0 / yhat - 1) 49 | hess = 2.0/(yhat**2) 50 | return grad, hess 51 | 52 | def L2(pred,true): 53 | loss = np.square(pred-true) 54 | return loss.mean() 55 | 56 | def L1(pred,true): 57 | loss = np.abs(pred-true) 58 | return loss.mean() 59 | 60 | def SMAPE(pred,true): 61 | loss = np.abs((pred-true)/(pred+true)) 62 | return loss.mean() 63 | 64 | #This function chooses the best point estimate for a numpy array, according to a particular loss. 65 | #The loss function should take two numpy arrays as arguments, and return a scalar. One example is SMAPE, see above. 66 | def solver(x,loss): 67 | mean = x.mean() 68 | best = loss(mean,x) 69 | result = mean 70 | for i in x: 71 | score = loss(i,x) 72 | if score < best: 73 | best = score 74 | result = i 75 | return result 76 | 77 | class NonparametricKNN(object): 78 | def __init__(self,n_neighbors=5,loss='L2'): 79 | if loss in ['L1','L2','SMAPE']: 80 | loss = {'L1':L1,'L2':L2,'SMAPE':SMAPE}[loss] 81 | self.loss = loss 82 | self.n_neighbors = n_neighbors 83 | self.model = NearestNeighbors(n_neighbors,algorithm='auto',n_jobs=-1) 84 | self.solver = lambda x:solver(x,loss) 85 | def __repr__(self, ): 86 | return "NonparametricKNN: loss:" + repr(self.loss) + ", n_neighbors=" + repr(self.n_neighbors) 87 | def __str__(self,): 88 | return repr(self) 89 | 90 | def fit(self,train,target):#All inputs should be numpy arrays. 91 | self.model.fit(train) 92 | self.f=np.vectorize(lambda x:target[x]) 93 | return self 94 | 95 | def predict(self,test):#Return predictions as a numpy array. 
96 | neighbors = self.model.kneighbors(test,return_distance=False) 97 | neighbors = self.f(neighbors) 98 | result = np.apply_along_axis(self.solver,1,neighbors) 99 | return result 100 | 101 | 102 | def evalerror(preds, dtrain): 103 | labels = dtrain.get_label() 104 | # return a pair metric_name, result 105 | # since preds are margin(before logistic transformation, cutoff at 0) 106 | return 'error', mape_loss(preds, labels) 107 | 108 | def xgboostobj(preds, dtrain): 109 | labels = dtrain.get_label() 110 | preds = 1.0 / (1.0 + np.exp(-preds)) 111 | grad = preds - labels 112 | hess = preds * (1.0-preds) 113 | return grad, hess 114 | 115 | class Pipeline(object): 116 | def __init__(self, **kwargs): 117 | self.models = kwargs["models"] 118 | kwargs.pop("models") 119 | 120 | def fit(self, X, y): 121 | last_y = np.copy(y) 122 | for i, model in enumerate(self.models): 123 | self.models[i].fit(X, last_y) 124 | last_y = np.copy(self.models[i].predict(X)) 125 | 126 | def predict(self, X, **kwargs): 127 | return self.models[-1].predict(X) 128 | 129 | class MedianModel(object): 130 | def __init__(self, **kwargs): 131 | self.ft_pos = kwargs.get("ft_pos", np.asarray([0,1])) 132 | if "ft_pos" in kwargs: 133 | kwargs.pop("ft_pos") 134 | def __str__(self,): 135 | return "MedianModel:\n " + repr(self.ft_pos) 136 | def __repr__(self,): 137 | return str(self) 138 | 139 | def fit(self, X, y, **kwargs): 140 | X = X[:,self.ft_pos] 141 | self.values = {} 142 | for i, x in enumerate(X): 143 | key = tuple([j for j in x]) 144 | if key not in self.values: 145 | self.values[key] = [] 146 | self.values[key].append(y[i]) 147 | 148 | def predict(self, X, **kwargs): 149 | X = X[:,self.ft_pos] 150 | y = [] 151 | for i, x in enumerate(X): 152 | key = tuple([j for j in x]) 153 | y.append(np.median(self.values[key])) 154 | return np.asarray(y) 155 | 156 | 157 | class XGBoost(object): 158 | def __init__(self, **kwargs): 159 | #self.eval_metric = kwargs.get("eval_metric", "logloss") 160 | #self.eta = kwargs.get("eta", 0.02) 161 | #self.max_depth = kwargs.get("max_depth", 3) 162 | #self.objective = kwargs.get("objective", "reg:gamma") 163 | #self.booster = kwargs.get("booster", "gbtree") 164 | self.use_mspe = kwargs.get("use_mspe", False) 165 | self.num_round = kwargs.get("num_round", 1500) 166 | self.early_stopping_rounds = kwargs.get("early_stopping_rounds", 10) 167 | self.verbose_eval = kwargs.get("verbose_eval", 500) 168 | self.eval_metric = kwargs.get("eval_metric", None) 169 | self.feval = None 170 | if self.eval_metric == "mape": 171 | self.eval_metric = None 172 | self.feval = evalerror 173 | if "use_mspe" in kwargs: 174 | kwargs.pop("use_mspe") 175 | if "early_stopping_rounds" in kwargs: 176 | kwargs.pop("early_stopping_rounds") 177 | if "num_round" in kwargs: 178 | kwargs.pop("num_round") 179 | if "verbose_eval" in kwargs: 180 | kwargs.pop("verbose_eval") 181 | if "eval_metric" in kwargs: 182 | kwargs.pop("eval_metric") 183 | self.param = kwargs 184 | 185 | def __str__(self, ): 186 | members = [attr for attr in dir(self) if not callable(getattr(self, attr)) and not attr.startswith("__")] 187 | ret_val = "" 188 | for attr in members: 189 | value = getattr(self, attr) 190 | ret_val += " (%s:%s)\n" % (attr, repr(value)) 191 | return "XGBoost: " + ret_val 192 | 193 | def __repr__(self, ): 194 | return str(self) 195 | 196 | def fit(self, X, y, sample_weight=None, **kwargs): 197 | if sample_weight is not None: 198 | print("use sample_weight") 199 | dtrain = xgb.DMatrix(X, label=y, weight=sample_weight, silent=True) 200 | 
else: 201 | dtrain = xgb.DMatrix(X, label=y, silent=True) 202 | evallist = [(dtrain, 'train')] 203 | param = { 204 | 'n_estimators':200, 205 | 'booster': 'gbtree', 206 | 'nthread': -1, 207 | 'max_depth': 3, 208 | 'eta': 0.02, 209 | 'silent': 1, 210 | 'objective': 'reg:gamma', 211 | 'colsample_bytree': 0.7, 212 | 'eval_metric': 'logloss', 213 | 'subsample': 0.5} 214 | if self.eval_metric is not None: 215 | param["eval_metric"] = self.eval_metric 216 | param.update(self.param) 217 | if not self.use_mspe: 218 | if self.early_stopping_rounds > 0: 219 | self.bst = xgb.train(param, dtrain, self.num_round, 220 | evallist, feval=self.feval, early_stopping_rounds=self.early_stopping_rounds, 221 | verbose_eval=self.verbose_eval) 222 | else: 223 | self.bst = xgb.train(param, dtrain, self.num_round, 224 | evallist, feval=self.feval, verbose_eval=self.verbose_eval) 225 | else: 226 | param.pop("objective") 227 | #param.pop("eval_metric") 228 | if self.early_stopping_rounds > 0: 229 | self.bst = xgb.train(param, dtrain, self.num_round, 230 | evallist, 231 | mspe, feval=self.feval, early_stopping_rounds=self.early_stopping_rounds, 232 | verbose_eval=self.verbose_eval) 233 | else: 234 | self.bst = xgb.train(param, dtrain, self.num_round, 235 | evallist, 236 | mspe, feval=self.feval, 237 | verbose_eval=self.verbose_eval) 238 | 239 | def predict(self, X, **kwargs): 240 | dtest = xgb.DMatrix(X) 241 | return self.bst.predict(dtest) 242 | 243 | 244 | def evalerror_lgbm(preds, dtrain): 245 | labels = dtrain.get_label() 246 | # return a pair metric_name, result 247 | # since preds are margin(before logistic transformation, cutoff at 0) 248 | return 'error', mape_loss(preds, labels), False 249 | 250 | 251 | class LGBM(object): 252 | def __init__(self, **kwargs): 253 | self.kwargs = kwargs 254 | #self.kwargs["random_state"] = random_state 255 | self.use_mspe = kwargs.get("use_mspe", False) 256 | if "use_mspe" in self.kwargs: 257 | self.kwargs.pop("use_mspe") 258 | self.gbm = lgb.LGBMRegressor(**self.kwargs) 259 | 260 | def __repr__(self,): 261 | return "LGBM:" + repr(self.kwargs) 262 | 263 | def __str__(self, ): 264 | return repr(self) 265 | 266 | def fit(self, X, y): 267 | if self.use_mspe: 268 | lgb_train = lgb.Dataset(X, y, 269 | weight=np.ones(X.shape[0]), 270 | free_raw_data=False) 271 | lgb_test = lgb.Dataset(X, y, reference=lgb_train, 272 | weight=np.ones(X.shape[0]), 273 | free_raw_data=False) 274 | self.gbm = lgb.train( 275 | self.kwargs, 276 | lgb_train, 277 | num_boost_round=10, 278 | fobj=mspe, 279 | feval=evalerror_lgbm, 280 | valid_sets=lgb_test) 281 | else: 282 | X_train, X_test, y_train, y_test = train_test_split( 283 | X, y, test_size=0.3) 284 | #lgb_test = lgb.Dataset(X, y, reference=lgb_train, 285 | # weight=np.ones(X.shape[0]), 286 | # free_raw_data=False) 287 | self.gbm.fit(X, y, early_stopping_rounds=10, eval_set=[(X, y)], verbose=False) 288 | #print "gbm best_iteration=", self.gbm.best_iteration 289 | 290 | def predict(self, X): 291 | if self.use_mspe: 292 | return self.gbm.predict(X) 293 | else: 294 | return self.gbm.predict(X, num_iteration=self.gbm.best_iteration) 295 | 296 | def remove_outliers_by_classifier(X, y, dates, model, m=0.9): 297 | #xgboost = XGBoost(max_depth=2, num_round=6000) 298 | if np.isnan(X).any(): 299 | print("X contains NaN") 300 | if np.isinf(X).any(): 301 | print("X contains inf") 302 | if np.isnan(np.log(y)).any(): 303 | print("y contains nan") 304 | if np.isinf(np.log(y)).any(): 305 | print("y contains inf") 306 | print("X=", X.shape) 307 | print("y=", y.shape) 308 
| model.fit(X, y) 309 | y_pred = model.predict(X) 310 | diff_values = np.abs(y_pred - y) 311 | abs_diff_vals = np.abs(diff_values) 312 | sorted_indexes = sorted(range(len(abs_diff_vals)), key = lambda x: abs_diff_vals[x]) 313 | sorted_indexes_lead = sorted_indexes[:int(len(abs_diff_vals)*m)] 314 | return X[sorted_indexes_lead], y[sorted_indexes_lead], dates[sorted_indexes_lead] 315 | 316 | 317 | class BoxcoxModel(object): 318 | def __init__(self, **kwargs): 319 | #print("kwargs=", kwargs) 320 | self.is_boxcox = kwargs.get("is_boxcox", False) 321 | self.boxcox_lambda = kwargs.get("boxcox_lambda", 0.0) 322 | self.Model = kwargs.get("model", GradientBoostingRegressor) 323 | if "is_boxcox" in kwargs: 324 | kwargs.pop("is_boxcox") 325 | if "boxcox_lambda" in kwargs: 326 | kwargs.pop("boxcox_lambda") 327 | if "model" in kwargs: 328 | kwargs.pop("model") 329 | self.clf = self.Model(**kwargs) 330 | def fit(self, X, y): 331 | if self.is_boxcox: 332 | self.clf.fit(X, stats.boxcox(y, self.boxcox_lambda)) 333 | else: 334 | self.clf.fit(X, y) 335 | def predict(self, X): 336 | if self.is_boxcox: 337 | return invboxcox(self.clf.predict(X), self.boxcox_lambda) 338 | else: 339 | return self.clf.predict(X) 340 | 341 | class CombineModes(object): 342 | def __init__(self, **kwargs): 343 | self.models = copy.deepcopy(kwargs.get("models", None)) 344 | self.dates_train = copy.deepcopy(kwargs.get("dates_train", None)) 345 | self.weights = copy.deepcopy(kwargs.get("weights", None)) 346 | if self.weights is not None: 347 | self.weights = np.asarray(self.weights) 348 | self.weights = self.weights / np.sum(self.weights) 349 | self.harmonic_mean = kwargs.get("harmonic_mean", True) 350 | self.subsample = kwargs.get("subsample", 0.8) 351 | self.combine_method = kwargs.get("combine_method", 0) 352 | self.sample_weight = kwargs.get("sample_weight", None) 353 | self.cache_file = kwargs.get("cache_file", "not useful any more") 354 | self.model_hash_input_fit_key = [] 355 | self.cache_data = {} 356 | self.is_save_cache_to_disk = True ### should be true otherwise all the model will be re-initialize each time 357 | if self.models is not None: 358 | self.model_hash_input_fit_key = [None] * len(self.models) 359 | 360 | def load_cache_data(self, ): 361 | global global_variables 362 | if "cache_combine_model" not in global_variables: 363 | global_variables["cache_combine_model"] = {} 364 | self.cache_data.update(global_variables["cache_combine_model"]) 365 | #print("self.cache_data=", self.cache_data) 366 | 367 | def save_cache_data(self, ): 368 | global global_variables 369 | if "cache_combine_model" not in global_variables: 370 | global_variables["cache_combine_model"] = {} 371 | global_variables["cache_combine_model"].update(self.cache_data) 372 | #print("self.cache_data=", self.cache_data) 373 | 374 | def fit(self, X, y): 375 | self.X_train = np.copy(X) 376 | self.y_train = np.copy(y) 377 | 378 | def _fit(self, X, y, model_i): 379 | mX = np.copy(X) 380 | my = np.copy(y) 381 | sub_index = range(len(my)) 382 | shuffle(sub_index) 383 | if self.subsample < 1.0: 384 | sub_index = sub_index[:int(len(my)*self.subsample)] 385 | sub_index = np.asarray(sub_index) 386 | sub_index.sort() 387 | mX = mX[sub_index, :] 388 | my = my[sub_index] 389 | dates_train = copy.deepcopy(self.dates_train[sub_index]) 390 | if hasattr(self.models[model_i], "dates_train"): 391 | self.models[model_i].dates_train = copy.deepcopy(dates_train) 392 | self.models[model_i].fit(mX, my) 393 | 394 | def _fit_predict(self, X, model_i): 395 | if self.cache_file is not 
None: 396 | self.X_train.flags.writeable = False 397 | self.y_train.flags.writeable = False 398 | self.dates_train.flags.writeable = False 399 | self.model_hash_input_fit_key[model_i] = str(hash(repr(self.models[model_i]))) + \ 400 | str(hash(self.X_train.data)) + \ 401 | str(hash(self.y_train.data)) + \ 402 | str(hash(self.dates_train.data)) 403 | self.X_train.flags.writeable = True 404 | self.y_train.flags.writeable = True 405 | self.dates_train.flags.writeable = True 406 | 407 | X.flags.writeable = False 408 | model_hash_predict_key = str(hash(X.data)) 409 | X.flags.writeable = True 410 | total_key = self.model_hash_input_fit_key[model_i] + model_hash_predict_key 411 | if total_key in self.cache_data: 412 | #print("using cache ", total_key) 413 | return np.asarray(self.cache_data[total_key]) 414 | self._fit(self.X_train, self.y_train, model_i) 415 | ret_val = self.models[model_i].predict(X) 416 | if self.cache_file is not None: 417 | self.cache_data[total_key] = ret_val.tolist() 418 | self.save_cache_data() 419 | return ret_val 420 | 421 | def predict(self, X): 422 | self.load_cache_data() 423 | ret_val = None 424 | if self.combine_method == 0: 425 | ys = [] 426 | for i in range(len(self.models)): 427 | y_i = self._fit_predict(X, i) 428 | ys.append(y_i) 429 | ret_val = compute_harmonic_mean(ys) 430 | elif self.combine_method == 1: 431 | ys = [] 432 | for i in range(len(self.models)): 433 | y_i = self._fit_predict(X, i) 434 | ys.append(y_i) 435 | ys = np.asarray(ys) 436 | ret_val = np.average(ys, axis=0) 437 | elif self.combine_method == 2: 438 | if self.weights is not None: 439 | sum_w = np.sum(self.weights) 440 | self.weights = self.weights / sum_w 441 | y = [] 442 | for i in range(len(self.models)): 443 | y_i = self._fit_predict(X, i) 444 | y_i = np.asarray(y_i, np.float32) 445 | y_i *= np.asarray(self.weights[i], np.float32) 446 | y.append(y_i.reshape(-1)) 447 | y = np.asarray(y) 448 | #print(y.shape) 449 | y = np.sum(y, axis=0) 450 | ret_val = y 451 | return ret_val 452 | 453 | 454 | class DaterangeModel(object): 455 | def is_in_predict_hour_range(self, item_datetime): 456 | for i in self.predict_hour_range: 457 | start_time = datetime(item_datetime.year, item_datetime.month, item_datetime.day, i[0][0], i[0][1]) 458 | end_time = datetime(item_datetime.year, item_datetime.month, item_datetime.day, i[1][0], i[1][1]) 459 | if item_datetime >= start_time and item_datetime < end_time: 460 | return True 461 | return False 462 | 463 | def __str__(self, ): 464 | members = [attr for attr in dir(self) if not callable(getattr(self, attr)) and not attr.startswith("__")] 465 | ret_val = "" 466 | for attr in members: 467 | value = getattr(self, attr) 468 | ret_val += " (%s:%s)\n" % (attr, repr(value)) 469 | return "DaterangeModel: " + repr(self.clf) + " " + ret_val 470 | 471 | def __repr__(self, ): 472 | return str(self) 473 | 474 | def __init__(self, **kwargs): 475 | #print("kwargs=", kwargs) 476 | self.dates_train = copy.deepcopy(kwargs.get("dates_train", None)) 477 | self.skip_date_ranges = copy.deepcopy(kwargs.get("skip_date_ranges", [])) 478 | self.ft_select = copy.deepcopy(kwargs.get("ft_select", None)) 479 | self.train_days = kwargs.get("train_days", None) 480 | self.is_rm_outliers = kwargs.get("is_rm_outliers", None) 481 | self.is_y_log = kwargs.get("is_y_log", False) 482 | self.rm_outliers_m = kwargs.get("rm_outliers_m", 3.0) 483 | self.Model = kwargs.get("model", GradientBoostingRegressor) 484 | self.rm_outliers_key = kwargs.get("rm_outliers_key", [0, ]) 485 | self.is_avg_or_median = 
kwargs.get("is_avg_or_median", True) 486 | self.is_boxcox = kwargs.get("is_boxcox", False) 487 | self.boxcox_lambda = kwargs.get("boxcox_lambda", False) 488 | self.y_log_e = kwargs.get("y_log_e", np.e) 489 | self.random_state = kwargs.get("random_state", None) 490 | self.anova_filter = kwargs.get("anova_filter", 0) 491 | self.norm_y = kwargs.get("norm_y", False) 492 | self.is_one_hot_encode = kwargs.get("is_one_hot_encode", False) 493 | self.is_ft_union = kwargs.get("is_ft_union", None) 494 | self.ft_th = kwargs.get("ft_th", None) 495 | self.ft_weights = kwargs.get("ft_weights", None) 496 | self.is_sample_weight = kwargs.get("is_sample_weight", None) 497 | self.remove_non_predict_hour_range = kwargs.get("remove_non_predict_hour_range", False) 498 | self.remove_test_date_data = kwargs.get("remove_test_date_data", False) 499 | self.is_ignore_skip_date_count = kwargs.get("is_ignore_skip_date_count", False) 500 | self.rm_n_head_days = kwargs.get("rm_n_head_days", 0) 501 | self.rm_n_head_days_hours = kwargs.get("rm_n_head_days_hours", [(0, 6), (10, 15), (20, 22)]) 502 | self.predict_hour_range = kwargs.get("predict_hour_range", 503 | [ 504 | [[8, 0], [10, 0]], 505 | [[17, 0], [19, 0]], 506 | ] 507 | ) 508 | self.remove_outliers_by_classifier = kwargs.get("remove_outliers_by_classifier", None) 509 | 510 | self.ft_norm = kwargs.get("ft_norm", []) 511 | self.ft_norm_clfs = [] 512 | # ft_norm = [0, 1] 513 | #self.predict_hour_range = [ 514 | # [[8, 0], [10, 0]], 515 | # [[17, 0], [19, 0]], 516 | #] 517 | 518 | if self.Model is None: 519 | self.Model = GradientBoostingRegressor 520 | if "rm_n_head_days_hours" in kwargs: 521 | kwargs.pop("rm_n_head_days_hours") 522 | if "remove_outliers_by_classifier" in kwargs: 523 | kwargs.pop("remove_outliers_by_classifier") 524 | if "rm_n_head_days" in kwargs: 525 | kwargs.pop("rm_n_head_days") 526 | if "ft_norm" in kwargs: 527 | kwargs.pop("ft_norm") 528 | if "is_sample_weight" in kwargs: 529 | kwargs.pop("is_sample_weight") 530 | if "ft_select" in kwargs: 531 | kwargs.pop("ft_select") 532 | if "is_rm_outliers" in kwargs: 533 | kwargs.pop("is_rm_outliers") 534 | if "rm_outliers_m" in kwargs: 535 | kwargs.pop("rm_outliers_m") 536 | if "dates_train" in kwargs: 537 | kwargs.pop("dates_train") 538 | if "model" in kwargs: 539 | kwargs.pop("model") 540 | if "train_days" in kwargs: 541 | kwargs.pop("train_days") 542 | if "rm_outliers_key" in kwargs: 543 | kwargs.pop("rm_outliers_key") 544 | if "is_y_log" in kwargs: 545 | kwargs.pop("is_y_log") 546 | if "y_log_e" in kwargs: 547 | kwargs.pop("y_log_e") 548 | if "is_avg_or_median" in kwargs: 549 | kwargs.pop("is_avg_or_median") 550 | if "is_boxcox" in kwargs: 551 | kwargs.pop("is_boxcox") 552 | if "boxcox_lambda" in kwargs: 553 | kwargs.pop("boxcox_lambda") 554 | if "anova_filter" in kwargs: 555 | kwargs.pop("anova_filter") 556 | if "norm_y" in kwargs: 557 | kwargs.pop("norm_y") 558 | if "is_one_hot_encode" in kwargs: 559 | kwargs.pop("is_one_hot_encode") 560 | if "is_ft_union" in kwargs: 561 | kwargs.pop("is_ft_union") 562 | if "ft_th" in kwargs: 563 | kwargs.pop("ft_th") 564 | if "ft_weights" in kwargs: 565 | kwargs.pop("ft_weights") 566 | if "remove_non_predict_hour_range" in kwargs: 567 | kwargs.pop("remove_non_predict_hour_range") 568 | if "predict_hour_range" in kwargs: 569 | kwargs.pop("predict_hour_range") 570 | if "remove_test_date_data" in kwargs: 571 | kwargs.pop("remove_test_date_data") 572 | if "is_ignore_skip_date_count" in kwargs: 573 | kwargs.pop("is_ignore_skip_date_count") 574 | if "skip_date_ranges" in 
kwargs: 575 | kwargs.pop("skip_date_ranges") 576 | if "random_state" in kwargs: 577 | kwargs.pop("random_state") 578 | #print inspect.getargspec(self.Model.__init__) 579 | arguments = inspect.getargspec(self.Model.__init__)[0] 580 | if "random_state" in arguments: 581 | kwargs["random_state"] = self.random_state 582 | self.clf = self.Model(**kwargs) 583 | #if hasattr(self.clf, "random_state"): 584 | # self.clf.random_state = random_state 585 | if self.anova_filter > 0: 586 | anova_filter_clf = SelectKBest(f_regression, k=self.anova_filter) 587 | self.clf = make_pipeline(anova_filter_clf, self.clf) 588 | if self.is_one_hot_encode: 589 | self.enc = preprocessing.OneHotEncoder() 590 | 591 | 592 | def is_need_skip_n_head_days_hours(self, cur_date): 593 | #self.rm_n_head_days_hours = kwargs.get("rm_n_head_days_hours", [(0, 8), (10, 17), (19, 22)]) 594 | for hour_range in self.rm_n_head_days_hours: 595 | if cur_date.hour >= hour_range[0] and cur_date.hour < hour_range[1]: 596 | return True 597 | return False 598 | 599 | def is_need_skip(self, cur_date): 600 | for skip_date_range in self.skip_date_ranges: 601 | if cur_date >= skip_date_range[0] and cur_date < skip_date_range[1]: 602 | return True 603 | return False 604 | 605 | def fit(self, X, y): 606 | X = np.copy(X) 607 | y = np.copy(y) 608 | if self.ft_th is not None: 609 | for item in self.ft_th: 610 | ft_pos = item[0] 611 | th = item[1] 612 | print("ft_pos=", ft_pos) 613 | print("X.shape=", X.shape) 614 | print(X[:5, ft_pos]) 615 | #print(np.sum(X[:, ft_pos])) 616 | positive_items = X[:, ft_pos] >= th 617 | negative_items = np.logical_not(positive_items) 618 | X[positive_items, ft_pos] = 1 619 | X[negative_items, ft_pos] = 0 620 | #print(np.sum(X[:, ft_pos])) 621 | for pos in self.ft_norm: 622 | min_max_scaler = preprocessing.MinMaxScaler() 623 | min_max_scaler.fit(X[:, pos].reshape(-1, 1)) 624 | X[:, pos] = min_max_scaler.transform(X[:, pos].reshape(-1, 1)).reshape(-1) 625 | self.ft_norm_clfs.append(min_max_scaler) 626 | #tmp_dates_train = copy.deepcopy(self.dates_train) 627 | #print(X[:5, :]) 628 | if self.ft_select is not None: 629 | self.ft_select = np.asarray(self.ft_select) 630 | self.ft_select = self.ft_select.reshape((-1, )) 631 | X = X[:, self.ft_select] 632 | if self.is_y_log: 633 | y = np.log(y) / np.log(self.y_log_e) 634 | elif self.is_boxcox: 635 | y = boxcox(y, self.boxcox_lambda) 636 | if self.norm_y: 637 | self.norm_y_max_y = np.max(y) 638 | self.norm_y_min_y = np.min(y) 639 | y = (y-self.norm_y_min_y)/(self.norm_y_max_y-self.norm_y_min_y) 640 | if not all(self.dates_train[i] <= self.dates_train[i+1] 641 | for i in xrange(len(self.dates_train)-1)): 642 | raise ValueError("train dates are not sorted...") 643 | tmp_dates_train = copy.deepcopy(self.dates_train) 644 | if self.is_rm_outliers: 645 | if self.dates_train is None: 646 | raise ValueError("self.dates_train is None") 647 | X, y, tmp_dates_train = remove_outliers3( 648 | X, y, tmp_dates_train, self.rm_outliers_m, 649 | key=self.rm_outliers_key, 650 | is_avg_or_median=self.is_avg_or_median) 651 | if self.ft_weights is not None: 652 | self.ft_weights = np.asarray(self.ft_weights) 653 | X = np.multiply(X, np.tile(self.ft_weights, (X.shape[0], 1))) 654 | if self.remove_outliers_by_classifier is not None: 655 | X, y, tmp_dates_train = remove_outliers_by_classifier(X, y, tmp_dates_train, **self.remove_outliers_by_classifier) 656 | if self.is_one_hot_encode: 657 | self.enc.fit(X) 658 | X = self.enc.transform(X).toarray() 659 | #print("X[:5,:]=", X[:5,:]) 660 | if 
self.is_ft_union is not None: 661 | print("X[:5,:]=", X[:5,:]) 662 | print("X.shape=", X.shape) 663 | self.is_ft_union.fit(X,y) 664 | X_ft_u =self.is_ft_union.transform(X) 665 | X = np.hstack([X, X_ft_u]) 666 | 667 | i_train_days = 0 668 | train_min_date = tmp_dates_train[-1] 669 | while i_train_days < self.train_days and train_min_date >= tmp_dates_train[0]: 670 | if not self.is_need_skip(train_min_date) or (self.is_ignore_skip_date_count): 671 | i_train_days += 1 672 | train_min_date -= timedelta(days=1) 673 | 674 | train_min_date += timedelta(days=1) 675 | print("i_train_days=", i_train_days) 676 | print("real diff days=", (tmp_dates_train[-1] - train_min_date).days) 677 | train_items = tmp_dates_train >= train_min_date 678 | X_train = X[train_items, :] 679 | y_train = y[train_items] 680 | tmp_dates_train_left = tmp_dates_train[train_items] 681 | if self.remove_non_predict_hour_range: 682 | hour_range_items = [] 683 | for item_date in tmp_dates_train_left: 684 | if self.is_in_predict_hour_range(item_date): 685 | hour_range_items.append(True) 686 | else: 687 | hour_range_items.append(False) 688 | hour_range_items = np.asarray(hour_range_items) 689 | X_train = X_train[hour_range_items] 690 | y_train = y_train[hour_range_items] 691 | tmp_dates_train_left = tmp_dates_train_left[hour_range_items] 692 | #print("tmp_dates_train_left=", tmp_dates_train_left[-60:]) 693 | 694 | if self.rm_n_head_days > 0: 695 | max_train_day = tmp_dates_train_left[-1] - timedelta(days=self.rm_n_head_days) 696 | 697 | n_head_days_items = [] 698 | for item_date in tmp_dates_train_left: 699 | if ( (not self.is_need_skip_n_head_days_hours(item_date)) and item_date >= max_train_day) \ 700 | or item_date < max_train_day: 701 | n_head_days_items.append(True) 702 | else: 703 | n_head_days_items.append(False) 704 | n_head_days_items = np.asarray(n_head_days_items) 705 | X_train = X_train[n_head_days_items] 706 | y_train = y_train[n_head_days_items] 707 | tmp_dates_train_left = tmp_dates_train_left[n_head_days_items] 708 | 709 | if len(self.skip_date_ranges) > 0: 710 | left_date_range_items = [] 711 | for item_date in tmp_dates_train_left: 712 | if not self.is_need_skip(item_date): 713 | left_date_range_items.append(True) 714 | else: 715 | left_date_range_items.append(False) 716 | left_date_range_items = np.asarray(left_date_range_items) 717 | X_train = X_train[left_date_range_items] 718 | y_train = y_train[left_date_range_items] 719 | tmp_dates_train_left = tmp_dates_train_left[left_date_range_items] 720 | 721 | print("date range min train dates:", tmp_dates_train_left[0]) 722 | print("date range max train dates:", tmp_dates_train_left[-1]) 723 | print("self.clf.name=", self.clf) 724 | #self.dates_train = self.dates_train[train_items] 725 | arguments = inspect.getargspec(self.clf.fit)[0] 726 | if "sample_weight" in arguments and self.is_sample_weight > 0 and self.is_sample_weight is not None and self.is_sample_weight: 727 | #print("use sample_weight") 728 | sample_weight = [] 729 | for datei, tdate in enumerate(tmp_dates_train_left): 730 | if self.dates_train[-1] >= tdate: 731 | div_factor = int((self.dates_train[-1] - tdate).days/self.is_sample_weight) + 1 732 | else: 733 | div_factor = 1.0 734 | #sample_weight.append(1.0/(np.log(div_factor) + 1.0)) 735 | sample_weight.append(1.0/div_factor) 736 | sample_weight = np.asarray(sample_weight) 737 | #sample_weight = (np.max(sample_weight) - sample_weight) / (np.max(sample_weight) - np.min(sample_weight))*100.0 738 | self.clf.fit(X_train, y_train, sample_weight) 739 | else: 
740 | self.clf.fit(X_train, y_train) 741 | if hasattr(self.clf, "feature_importances_"): 742 | print(self.clf.feature_importances_) 743 | 744 | def predict(self, X): 745 | X = np.copy(X) 746 | if self.ft_th is not None: 747 | for item in self.ft_th: 748 | ft_pos = item[0] 749 | th = item[1] 750 | #print(np.sum(X[:, ft_pos])) 751 | positive_items = X[:, ft_pos] >= th 752 | negative_items = np.logical_not(positive_items) 753 | X[positive_items, ft_pos] = 1 754 | X[negative_items, ft_pos] = 0 755 | #print(np.sum(X[:, ft_pos])) 756 | for i, pos in enumerate(self.ft_norm): 757 | min_max_scaler = self.ft_norm_clfs[i] 758 | X[:, pos] = min_max_scaler.transform(X[:, pos].reshape(-1, 1)).reshape(-1) 759 | 760 | if self.ft_weights is not None: 761 | self.ft_weights = np.asarray(self.ft_weights) 762 | X = np.multiply(X, np.tile(self.ft_weights, (X.shape[0], 1))) 763 | if self.ft_select is not None: 764 | self.ft_select = np.asarray(self.ft_select) 765 | X = X[:, self.ft_select] 766 | if self.is_one_hot_encode: 767 | X = self.enc.transform(X).toarray() 768 | if self.is_ft_union is not None: 769 | X_ft_u = self.is_ft_union.transform(X) 770 | X = np.hstack([X,X_ft_u]) 771 | pre_y = self.clf.predict(X) 772 | if self.norm_y: 773 | pre_y = pre_y * (self.norm_y_max_y - self.norm_y_min_y) + self.norm_y_min_y 774 | if self.is_y_log: 775 | pre_y = np.exp(pre_y * np.log(self.y_log_e)) 776 | elif self.is_boxcox: 777 | pre_y = invboxcox(pre_y, self.boxcox_lambda) 778 | else: 779 | pre_y = pre_y 780 | return pre_y 781 | 782 | 783 | class TestModel(object): 784 | def __init__(self,): 785 | pass 786 | 787 | def fit(self, X, y): 788 | self.clf = ExtraTreesRegressor() 789 | #y = np.log(y) 790 | self.clf.fit(X, y) 791 | 792 | def predict(self, X): 793 | return self.clf.predict(X) 794 | 795 | -------------------------------------------------------------------------------- /kdd2017/utils.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # import necessary modules 5 | import math 6 | from datetime import datetime 7 | from sklearn import preprocessing 8 | import numpy as np 9 | import copy 10 | from datetime import timedelta 11 | import collections 12 | from sklearn.decomposition import IncrementalPCA 13 | from scipy.stats import boxcox 14 | import os 15 | import matplotlib.pyplot as plt 16 | from sklearn.preprocessing import OneHotEncoder 17 | import itertools 18 | import collections 19 | from multiprocessing import Pool, TimeoutError 20 | 21 | 22 | def findsubsets(S,m): 23 | return set(itertools.combinations(S, m)) 24 | 25 | def findsubsets2(S, add_origin=True): 26 | ret = set() 27 | for i in range(1, len(S)): 28 | ret = ret.union(findsubsets(S, i)) 29 | print("S=", S) 30 | if add_origin: 31 | ret.add(tuple(S)) 32 | return ret 33 | 34 | 35 | def generate_final_volumes(volumes): 36 | volumes_final = {} 37 | tollgate_id_dirs = set() 38 | for start_time_window in volumes: 39 | for tollgate_id in volumes[start_time_window]: 40 | for direction in volumes[start_time_window][tollgate_id]: 41 | tollgate_id_dirs.add((tollgate_id, direction)) 42 | 43 | tollgate_id_dirs = list(tollgate_id_dirs) 44 | finnal_predict_times1 = [ 45 | (datetime(2016,10,i,8), datetime(2016,10,i,10)) 46 | for i in range(25, 32) 47 | ] 48 | finnal_predict_times2 = [ 49 | (datetime(2016,10,i,17), datetime(2016,10,i,19)) 50 | for i in range(25, 32) 51 | ] 52 | finnal_predict_times = finnal_predict_times1 + finnal_predict_times2 53 | for tollgate_id, direction in tollgate_id_dirs: 54 | predict_datetimes = [] 55 | for time_range in finnal_predict_times: 56 | start_datetime = time_range[0] 57 | end_datetime = time_range[1] 58 | cur_datetime = start_datetime 59 | while cur_datetime < end_datetime: 60 | predict_datetimes.append(cur_datetime) 61 | cur_datetime = cur_datetime + timedelta(minutes=20) 62 | for predict_datetime in predict_datetimes: 63 | if predict_datetime not in volumes_final: 64 | volumes_final[predict_datetime] = {} 65 | if tollgate_id not in volumes_final[predict_datetime]: 66 | volumes_final[predict_datetime][tollgate_id] = {} 67 | if direction not in volumes_final[predict_datetime][tollgate_id]: 68 | volumes_final[predict_datetime][tollgate_id][direction] = 1 69 | return volumes_final 70 | 71 | 72 | def extract_is_work_day(cur_date): 73 | if cur_date.year == 2015: 74 | # http://news.sina.com.cn/c/2014-12-16/154731291679.shtml 75 | if cur_date.month == 1: 76 | if cur_date.day >= 1 and cur_date.day <= 3: 77 | return 0 78 | if cur_date.day == 4: 79 | return 1 80 | if cur_date.month == 2: 81 | if cur_date.day == 15 or cur_date.day == 28: 82 | return 1 83 | if cur_date.day >= 18 and cur_date.day <= 24: 84 | return 0 85 | if cur_date.month == 4: 86 | if cur_date.day >= 4 and cur_date.day <= 6: 87 | return 0 88 | if cur_date.month == 5: 89 | if cur_date.day >= 1 and cur_date.day <= 3: 90 | return 0 91 | if cur_date.month == 6: 92 | if cur_date.day >= 20 and cur_date.day <= 22: 93 | return 0 94 | if cur_date.month == 9: 95 | if cur_date.day >= 26 and cur_date.day <= 27: 96 | return 0 97 | if cur_date.month == 10: 98 | if cur_date.day >= 1 and cur_date.day <= 7: 99 | return 0 100 | if cur_date.day == 10: 101 | return 1 102 | if cur_date.year == 2016: 103 | # http://news.qq.com/cross/20151211/xK0R05S8.html 104 | if cur_date.month == 1: 105 | if cur_date.day >= 1 and cur_date.day <= 3: 106 | return 0 107 | if cur_date.month == 2: 108 | if cur_date.day >= 7 and cur_date.day <= 13: 109 | return 0 110 | 
if cur_date.day == 6 or cur_date.day == 4: 111 | return 1 112 | if cur_date.month == 4: 113 | if cur_date.day >= 2 and cur_date.day <= 4: 114 | return 0 115 | if cur_date.month == 5: 116 | if cur_date.day >= 1 and cur_date.day <= 2: 117 | return 0 118 | if cur_date.month == 6: 119 | if cur_date.day >= 9 and cur_date.day <= 11: 120 | return 0 121 | if cur_date.day == 12: 122 | return 1 123 | if cur_date.month == 9: 124 | if cur_date.day >= 15 and cur_date.day <= 17: 125 | return 0 126 | if cur_date.day == 18: 127 | return 1 128 | if cur_date.month == 10: 129 | if cur_date.day >= 1 and cur_date.day <= 7: 130 | return 0 131 | if cur_date.day == 8 or cur_date.day == 9: 132 | return 1 133 | if cur_date.weekday() == 6: 134 | return 0 135 | if cur_date.weekday() == 5: 136 | return 0 137 | return 1 138 | #return cur_date.weekday != 6 and cur_date.weekday != 5 139 | 140 | 141 | def load_links(path_links): 142 | lines = open(path_links, "r").readlines() 143 | link_data = {} 144 | for line in lines[1:]: 145 | #line = line.replace('"', '') 146 | line = line.strip() 147 | words = line.split('","') 148 | link_id = int(words[0].replace('"', '')) 149 | length = int(words[1]) 150 | width = int(words[2]) 151 | lanes = int(words[3]) 152 | in_top = words[4].split(",") 153 | out_top = words[5].split(",") 154 | lane_width = int(words[6].replace('"', '')) 155 | link_data[link_id] = {} 156 | link_data[link_id]["length"] = length 157 | link_data[link_id]["width"] = width 158 | link_data[link_id]["lanes"] = lanes 159 | link_data[link_id]["in_top"] = in_top 160 | link_data[link_id]["out_top"] = out_top 161 | link_data[link_id]["lane_width"] = lane_width 162 | return link_data 163 | 164 | 165 | def load_routes(path_routes): 166 | lines = open(path_routes, "r").readlines() 167 | routes_data = {} 168 | for line in lines[1:]: 169 | line = line.strip() 170 | line = line.replace('"', '') 171 | words = line.split(",") 172 | intersection_id = words[0] 173 | tollgate_id = int(words[1]) 174 | link_seq = [int(link_id) for link_id in words[2:]] 175 | routes_data[(intersection_id, tollgate_id)] = link_seq 176 | return routes_data 177 | 178 | def load_weather_info(path_weather_infos): 179 | datetime_weather = {} 180 | for path_weather_info in path_weather_infos: 181 | is_first_line = True 182 | for line in open(path_weather_info, "r"): 183 | if is_first_line: 184 | is_first_line = False 185 | continue 186 | line = line.replace('"', '') 187 | words = line.split(",") 188 | trace_start_time = words[0] 189 | trace_start_time = datetime.strptime(trace_start_time, "%Y-%m-%d") 190 | hour = int(words[1]) 191 | trace_start_time = datetime(trace_start_time.year, 192 | trace_start_time.month, 193 | trace_start_time.day, 194 | hour) 195 | datetime_weather[trace_start_time] = {} 196 | datetime_weather[trace_start_time]["pressure"] = float(words[2]) 197 | datetime_weather[trace_start_time]["sea_pressure"] = float(words[3]) 198 | datetime_weather[trace_start_time]["wind_direction"] = float(words[4]) 199 | datetime_weather[trace_start_time]["wind_speed"] = float(words[5]) 200 | datetime_weather[trace_start_time]["temperature"] = float(words[6]) 201 | datetime_weather[trace_start_time]["rel_humidity"] = float(words[7]) 202 | datetime_weather[trace_start_time]["precipitation"] = float(words[8]) 203 | return datetime_weather 204 | 205 | 206 | def load_volumes_info(in_file_names): 207 | 208 | if not isinstance(in_file_names, list): 209 | in_file_names = [in_file_names] 210 | 211 | volumes = {} 212 | for in_file_name in in_file_names: 213 | # Step 1: 
Load volume data
214 |         fr = open(in_file_name, 'r')
215 |         fr.readline()  # skip the header
216 |         vol_data = fr.readlines()
217 |         fr.close()
218 | 
219 |         # Step 2: Create a dictionary to calculate and store volume per time window
220 |         # volumes = {}  # key: time window, value: dictionary
221 |         for i in range(len(vol_data)):
222 |             each_pass = vol_data[i].replace('"', '').split(',')
223 |             tollgate_id = each_pass[1]
224 |             direction = each_pass[2]
225 | 
226 |             pass_time = each_pass[0]
227 |             pass_time = datetime.strptime(pass_time, "%Y-%m-%d %H:%M:%S")
228 |             time_window_minute = int(math.floor(pass_time.minute / 20) * 20)
229 |             #print pass_time
230 |             start_time_window = datetime(pass_time.year, pass_time.month,
231 |                                          pass_time.day,
232 |                                          pass_time.hour, time_window_minute, 0)
233 | 
234 |             if start_time_window not in volumes:
235 |                 volumes[start_time_window] = {}
236 |             if tollgate_id not in volumes[start_time_window]:
237 |                 volumes[start_time_window][tollgate_id] = {}
238 |             if direction not in volumes[start_time_window][tollgate_id]:
239 |                 volumes[start_time_window][tollgate_id][direction] = 1
240 |             else:
241 |                 volumes[start_time_window][tollgate_id][direction] += 1
242 |     return volumes
243 | 
244 | 
245 | 
246 | 
247 | def load_travel_times_from_trajectories(path_trajectorieses, skip_date_ranges,
248 |                                         load_frequent_info=False,
249 |                                         frequent_threshold=1):
250 | 
251 |     if isinstance(path_trajectorieses, basestring):
252 |         path_trajectorieses = [path_trajectorieses]
253 |     elif isinstance(path_trajectorieses, list):
254 |         path_trajectorieses = path_trajectorieses
255 |     else:
256 |         raise ValueError("unknown format...")
257 | 
258 |     travel_times = {}
259 |     for path_trajectories in path_trajectorieses:
260 |         # Step 1: Load trajectories
261 |         fr = open(path_trajectories, 'r')
262 |         fr.readline()  # skip the header
263 |         traj_data = fr.readlines()
264 |         fr.close()
265 |         # print(traj_data[0])
266 | 
267 |         vehicle_id_f = collections.defaultdict(lambda: 0)
268 |         all_route_ids = set()
269 |         ## count how often each vehicle appears on each route
270 |         for i in range(len(traj_data)):
271 |             each_traj = traj_data[i].replace('"', '').split(',')
272 |             intersection_id = each_traj[0]
273 |             tollgate_id = each_traj[1]
274 |             vehicle_id = each_traj[2]
275 |             route_id = intersection_id + '-' + tollgate_id
276 |             all_route_ids.add(route_id)
277 |             vehicle_id_f[(route_id, vehicle_id)] += 1
278 | 
279 |         frequent_route_ids = set()
280 |         for key in vehicle_id_f:
281 |             if vehicle_id_f[key] > 1:
282 |                 route_id = key[0]
283 |                 frequent_route_ids.add(route_id)
284 |                 #print vehicle_id_f[key]
285 |         if all_route_ids == frequent_route_ids:
286 |             print("every route has repeat vehicles")
287 |         else:
288 |             raise ValueError("some routes have no repeat vehicles...")
289 | 
290 |         # Step 2: Create a dictionary to store travel time for each route per time window
291 |         # travel_times = {}  # key: route_id. Value is also a dictionary of which key is the start time for the time window and value is a list of travel times
292 |         # travel_times_avg = collections.defaultdict(list)
293 |         for i in range(len(traj_data)):
294 |             each_traj = traj_data[i].replace('"', '').split(',')
295 |             intersection_id = each_traj[0]
296 |             tollgate_id = each_traj[1]
297 |             vehicle_id = each_traj[2]  # needed below when load_frequent_info is True
298 | 
299 |             route_id = intersection_id + '-' + tollgate_id
300 | 
301 |             if load_frequent_info:
302 |                 key = (route_id, vehicle_id)
303 |                 if key not in vehicle_id_f or vehicle_id_f[key] <= frequent_threshold:
304 |                     continue
305 | 
306 |             if route_id not in travel_times.keys():
307 |                 travel_times[route_id] = {}
308 | 
309 |             trace_start_time = each_traj[3]
310 |             trace_start_time = datetime.strptime(trace_start_time, "%Y-%m-%d %H:%M:%S")
311 |             time_window_minute = math.floor(trace_start_time.minute / 20) * 20
312 |             start_time_window = datetime(trace_start_time.year,
313 |                                          trace_start_time.month,
314 |                                          trace_start_time.day,
315 |                                          trace_start_time.hour,
316 |                                          int(time_window_minute),
317 |                                          0)
318 |             is_need_skip = False
319 |             for skip_date_range in skip_date_ranges:
320 |                 if start_time_window >= skip_date_range[0] and \
321 |                         start_time_window < skip_date_range[1]:
322 |                     is_need_skip = True
323 |                     break
324 |             if is_need_skip:
325 |                 continue
326 |             tt = float(each_traj[-1])  # travel time
327 |             if start_time_window not in travel_times[route_id].keys():
328 |                 travel_times[route_id][start_time_window] = [tt]
329 |             else:
330 |                 travel_times[route_id][start_time_window].append(tt)
331 |             if trace_start_time.hour >= 8 and trace_start_time.hour <= 10 \
332 |                     or trace_start_time.hour >= 17 and trace_start_time.hour <= 19:
333 |                 key = (route_id, trace_start_time.hour, int(time_window_minute))
334 |                 #travel_times_avg[key].append(tt)
335 | 
336 |     # for key in travel_times_avg:
337 |     #     print(key)
338 |     #     print(np.median(travel_times_avg[key]))
339 |     #
340 |     # exit(0)
341 | 
342 |     return travel_times
343 | 
344 | def search_closest_date_weather_info(date, datetime_weather):
345 |     closest_date = None
346 |     for cur_date in datetime_weather:
347 |         if closest_date is None or (cur_date <= date and closest_date < cur_date):
348 |             closest_date = cur_date
349 |     #print("closest_date=", closest_date)
350 |     return datetime_weather[closest_date]
351 | 
352 | 
353 | 
354 | def convert_date_to_x_volumes(date, datetime_weather, tollgate_id_x):
355 |     weather_info = search_closest_date_weather_info(
356 |         date, datetime_weather)
357 |     x = []
358 |     #x.append(date.year)
359 |     x.append(date.month)  ## 2
360 |     #x.append(date.day)
361 |     x.append(date.hour)  ## 3
362 |     x.append(date.minute)  ## 4
363 |     x.append(date.weekday())  ## 5
364 |     x.append(not extract_is_work_day(date))  # 6
365 |     x.append(float(weather_info["wind_speed"]))  ## 7
366 |     x.append(float(weather_info["temperature"]))  ## 8
367 |     x.append(float(weather_info["rel_humidity"]))  ## 9
368 |     x.append(float(weather_info["precipitation"]))  ## 10
369 |     wod = [0] * 7
370 |     wod[date.weekday()] = 1
371 |     x += wod  ## 11 - 17
372 |     x += list(tollgate_id_x)  # 18 - 41 (include)
373 |     x += [date.weekday() == 5 or date.weekday() == 6]  ## weekend 42
374 |     return x
375 | 
376 | 
377 | def transform_data(x_i, le, with_fit=True):
378 |     if isinstance(le, preprocessing.MinMaxScaler) or isinstance(le, preprocessing.Binarizer):
379 |         x_i = x_i.astype(np.float)
380 |         if with_fit:
381 |             le.fit(x_i.reshape((-1, 1)))
382 |         x_i = le.transform(x_i.reshape((-1, 1)))
383 |     elif isinstance(le, preprocessing.LabelEncoder):
384 |         if with_fit:
385 |             le.fit(x_i)
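# editor's note: in this LabelEncoder branch, fit() above learns the vocabulary of category values;
# the transform() on the next line then maps each category to its integer index.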
386 | x_i = le.transform(x_i.reshape((-1, 1))) 387 | else: 388 | raise ValueError("unknow transform") 389 | return x_i.reshape((-1)), le 390 | 391 | def compute_tollgate_id_to_link_ids_x(routes_data,): 392 | link_ids = [] 393 | 394 | tollgate_id_to_link_ids = collections.defaultdict(list) 395 | tollgate_id = set() 396 | for it in routes_data: 397 | tollgate_id_to_link_ids[it[1]] += routes_data[it] 398 | tollgate_id_to_link_ids[it[1]] = list(set(tollgate_id_to_link_ids[it[1]])) 399 | link_ids += routes_data[it] 400 | 401 | all_link_ids = list(set(link_ids)) 402 | tollgate_id_to_x = {} 403 | for tollgate_id in tollgate_id_to_link_ids: 404 | x = [0] * len(all_link_ids) 405 | x = np.asarray(x) 406 | link_ids = tollgate_id_to_link_ids[tollgate_id] 407 | for link_id in link_ids: 408 | #print("all_link_ids=", all_link_ids) 409 | #print("link_id=", link_id) 410 | pos = all_link_ids.index(link_id) 411 | #print("pos=", pos) 412 | #print("x=", x) 413 | x[pos] = 1 414 | tollgate_id_to_x[tollgate_id] = x 415 | return tollgate_id_to_x 416 | 417 | def convert_volumes_into_X_y(volumes, datetime_weather, link_data, routes_data, 418 | les_train=None, 419 | is_ret_raw_info=False, verbose=False): 420 | 421 | tollgate_id_to_x = compute_tollgate_id_to_link_ids_x(routes_data) 422 | tollgate_id_to_x_len = len(tollgate_id_to_x[tollgate_id_to_x.keys()[0]]) 423 | print("tollgate_id_to_x_len=", tollgate_id_to_x_len) 424 | X = [] 425 | y = [] 426 | les = [ 427 | preprocessing.LabelEncoder(), ## tollgate_id 428 | preprocessing.LabelEncoder(), ## direction 429 | preprocessing.LabelEncoder(), ## month 430 | preprocessing.LabelEncoder(), ## hour 431 | preprocessing.LabelEncoder(), ## minute 432 | preprocessing.LabelEncoder(), ## weekday 433 | preprocessing.LabelEncoder(), ## work day 434 | preprocessing.MinMaxScaler(), ## wind 435 | preprocessing.MinMaxScaler(), ## temperature 436 | preprocessing.MinMaxScaler(), ## rel_humidity 437 | preprocessing.MinMaxScaler(), ## precipitation 438 | ] 439 | wod_les = [preprocessing.LabelEncoder()] * 7 440 | les += wod_les 441 | les += [preprocessing.LabelEncoder()] * tollgate_id_to_x_len 442 | les += [preprocessing.LabelEncoder(),] 443 | les += [preprocessing.MinMaxScaler(), ] ## future hour feature 444 | dates = [] 445 | raw_info = [] 446 | for start_time_window in volumes: 447 | for tollgate_id in volumes[start_time_window]: 448 | #tollgate_id = int(tollgate_id) 449 | for direction in volumes[start_time_window][tollgate_id]: 450 | dates.append(start_time_window) 451 | x = [] 452 | x.append(tollgate_id) 453 | x.append(direction) 454 | #print("tollgate_id=", tollgate_id) 455 | x += convert_date_to_x_volumes(start_time_window, datetime_weather, tollgate_id_to_x[int(tollgate_id)]) 456 | X.append(x) 457 | y.append(volumes[start_time_window][tollgate_id][direction]) 458 | raw_info.append((tollgate_id, direction, start_time_window)) 459 | X = np.asarray(X) 460 | y = np.asarray(y) 461 | 462 | X = add_prev_two_hour_ft(X, y, dates) 463 | print("X[:5,:]=", X[:5,:]) 464 | print("y[:5]=", y[:5]) 465 | for i in range(X.shape[1]): 466 | if les_train is None: 467 | le = les[i] 468 | x_i = X[:, i] 469 | x_i, le = transform_data(x_i, le, True) 470 | X[:, i] = x_i 471 | else: 472 | le = les_train[i] 473 | x_i = X[:, i] 474 | x_i, le = transform_data(x_i, le, False) 475 | X[:, i] = x_i 476 | X = np.asarray(X, np.float) 477 | 478 | 479 | dates = np.asarray(dates) 480 | sorted_indexes = sorted(range(len(dates)), key = lambda x: dates[x]) 481 | X = X[sorted_indexes, :] 482 | y = y[sorted_indexes] 483 | 
dates = dates[sorted_indexes] 484 | if not is_ret_raw_info: 485 | if les_train is None: 486 | return X, y, dates, les 487 | else: 488 | return X, y, dates, les_train 489 | else: 490 | new_raw_info = [] 491 | for i in sorted_indexes: 492 | new_raw_info.append(raw_info[i]) 493 | raw_info = new_raw_info 494 | if les_train is None: 495 | return X, y, dates, les, raw_info 496 | else: 497 | return X, y, dates, les_train, raw_info 498 | 499 | 500 | def convert_date_to_x(date, datetime_weather, link_ids_x): 501 | 502 | #print("len(link_ids_x)=", len(link_ids_x)) 503 | weather_info = search_closest_date_weather_info( 504 | date, datetime_weather) 505 | 506 | x = [] 507 | #x.append(date.year) 508 | x.append(date.month) ## 2 509 | x.append(float(date.day)) ## 3 510 | x.append(date.hour) ## 4 511 | x.append(date.minute) ## 5 512 | x.append(date.weekday()) ## 6 513 | x.append(not extract_is_work_day(date)) ## 7 514 | x.append(float(weather_info["wind_speed"])) ## 8 515 | x.append(float(weather_info["temperature"])) ## 9 516 | x.append(float(weather_info["rel_humidity"])) ## 10 517 | x.append(float(weather_info["precipitation"])) ## 11 518 | wod = [0] * 7 519 | wod[date.weekday()] = 1 520 | x += wod # 12 - 18 521 | x += list(link_ids_x) # 19 - 42 (include) 522 | return x 523 | 524 | 525 | def compute_route_to_link_ids_x(routes_data,): 526 | link_ids = [] 527 | tollgate_id = set() 528 | print("routes_data=", routes_data) 529 | for it in routes_data: 530 | link_ids += routes_data[it] 531 | all_link_ids = list(set(link_ids)) 532 | 533 | route_to_x = {} 534 | for route_id in routes_data: 535 | x = [0] * len(all_link_ids) 536 | x = np.asarray(x) 537 | link_ids = routes_data[route_id] 538 | for link_id in link_ids: 539 | #print("all_link_ids=", all_link_ids) 540 | #print("link_id=", link_id) 541 | pos = all_link_ids.index(link_id) 542 | #print("pos=", pos) 543 | #print("x=", x) 544 | x[pos] = 1 545 | route_to_x[route_id] = x 546 | return route_to_x 547 | 548 | 549 | def search_prev_two_hour_y(cur_date, y, dates): 550 | prev_cur_date = cur_date - timedelta(hours=2) 551 | for i, date in enumerate(dates): 552 | if date == prev_cur_date: 553 | return y[i] 554 | return 0 555 | 556 | def add_prev_two_hour_ft(X, y, dates): 557 | addition_x = [] 558 | for i, date in enumerate(dates): 559 | addition_x.append(search_prev_two_hour_y(date, y, dates)) 560 | addition_x = np.asarray(addition_x).reshape((-1, 1)) 561 | print("X.shape=", X.shape) 562 | print("addition_x.shape=", len(addition_x)) 563 | X = np.hstack([X, np.asarray(addition_x)]) 564 | return X 565 | 566 | def convert_into_X_y(travel_times, datetime_weather, link_data, routes_data, 567 | les_train=None, is_ret_raw_info=False, is_skip_not_trainning_hours=False, verbose=False): 568 | 569 | route_to_link_ids_x = compute_route_to_link_ids_x(routes_data) 570 | route_to_link_ids_x_len = len(route_to_link_ids_x[route_to_link_ids_x.keys()[0]]) 571 | X = [] 572 | y = [] 573 | les = [ 574 | preprocessing.LabelEncoder(), 575 | preprocessing.LabelEncoder(), 576 | preprocessing.LabelEncoder(), 577 | preprocessing.LabelEncoder(), 578 | preprocessing.LabelEncoder(), 579 | preprocessing.LabelEncoder(), 580 | preprocessing.LabelEncoder(), 581 | preprocessing.LabelEncoder(), 582 | preprocessing.MinMaxScaler(), ## wind speed 583 | preprocessing.MinMaxScaler(), ## temperature 584 | preprocessing.MinMaxScaler(), ## rel_humidity 585 | preprocessing.MinMaxScaler(), ## precipitation 586 | ] 587 | wod_les = [preprocessing.LabelEncoder()] * 7 588 | les += wod_les 589 | les += 
[preprocessing.MinMaxScaler(), ] * route_to_link_ids_x_len 590 | les += [preprocessing.MinMaxScaler(), ] 591 | dates = [] 592 | raw_info = [] 593 | for route_id in travel_times: 594 | if verbose: 595 | print("route_id=", route_id) 596 | for start_time_window in travel_times[route_id]: 597 | is_trainning_hour = False 598 | start_year = start_time_window.year 599 | start_month = start_time_window.month 600 | start_day = start_time_window.day 601 | if start_time_window >= datetime(start_year, start_month, start_day, 8) and \ 602 | start_time_window < datetime(start_year, start_month, start_day, 10): 603 | is_trainning_hour = True 604 | if start_time_window >= datetime(start_year, start_month, start_day, 17) and \ 605 | start_time_window < datetime(start_year, start_month, start_day, 19): 606 | is_trainning_hour = True 607 | if (not is_trainning_hour) and is_skip_not_trainning_hours: 608 | continue 609 | if verbose: 610 | print("start_time_window=", start_time_window) 611 | 612 | raw_info.append((route_id, start_time_window)) 613 | A, B = route_id.split('-') 614 | x = [] 615 | #x.append(route_id) 616 | x.append(A) 617 | x.append(B) 618 | route_id_for_link_id = (A, int(B)) 619 | #print(route_to_link_ids_x[route_id_for_link_id]) 620 | x += convert_date_to_x(start_time_window, datetime_weather, route_to_link_ids_x[route_id_for_link_id]) 621 | #print("x=", x) 622 | if len(travel_times[route_id][start_time_window]) >= 1: 623 | X.append(x) 624 | y.append(np.average(travel_times[route_id][start_time_window])) 625 | dates.append(start_time_window) 626 | X = np.asarray(X) 627 | y = np.asarray(y) 628 | X = add_prev_two_hour_ft(X, y, dates) 629 | print("X=", X) 630 | print("X[:5,:]=", X[:5,:]) 631 | print("y[:5]=", y[:5]) 632 | for i in range(X.shape[1]): 633 | if les_train is None: 634 | le = les[i] 635 | x_i = X[:, i] 636 | x_i, le = transform_data(x_i, le, True) 637 | X[:, i] = x_i 638 | else: 639 | le = les_train[i] 640 | x_i = X[:, i] 641 | x_i, le = transform_data(x_i, le, False) 642 | X[:, i] = x_i 643 | X = np.asarray(X, np.float) 644 | dates = np.asarray(dates) 645 | sorted_indexes = sorted(range(len(dates)), key = lambda x: dates[x]) 646 | X = X[sorted_indexes, :] 647 | y = y[sorted_indexes] 648 | dates = dates[sorted_indexes] 649 | 650 | # if les_train is None: 651 | # print("LabelEncoder........info") 652 | # for le in les: 653 | # print(list(le.classes_)) 654 | 655 | if not is_ret_raw_info: 656 | if les_train is None: 657 | return X, y, dates, les 658 | else: 659 | return X, y, dates, les_train 660 | else: 661 | new_raw_info = [] 662 | for i in sorted_indexes: 663 | new_raw_info.append(raw_info[i]) 664 | raw_info = new_raw_info 665 | if les_train is None: 666 | return X, y, dates, les, raw_info 667 | else: 668 | return X, y, dates, les_train, raw_info 669 | 670 | 671 | #Function 672 | def invboxcox(y,ld): 673 | if ld == 0: 674 | return(np.exp(y)) 675 | else: 676 | return(np.exp(np.log(ld*y+1)/ld)) 677 | 678 | 679 | def mape_loss(y, y_predict): 680 | loss = np.sum(np.abs((y_predict - y) / y)) / float(len(y)) 681 | if np.isnan(loss): 682 | return 100.0 683 | else: 684 | return loss 685 | 686 | def inv_mape_loss(estimator, y, y_predict): 687 | return 1.0 - mape_loss(y, y_predict) 688 | 689 | def remove_outliers(X, y, dates, m=3.0): 690 | ret_keep_indexes = np.asarray([], dtype=int) 691 | route_ids = set(X[:, 0]) 692 | keep_indexes = np.asarray(range(len(y))) 693 | for route_id in route_ids: 694 | route_items = X[:, 0] == route_id 695 | y_route = y[route_items] 696 | keep_indexes_route = 
keep_indexes[route_items]
697 |         avg_y_route = np.average(y_route)
698 |         m_diff_y = y_route - avg_y_route
699 |         stable_indexes = keep_indexes_route[
700 |             abs(m_diff_y) < m * np.std(m_diff_y)]
701 |         ret_keep_indexes = np.append(ret_keep_indexes, stable_indexes)
702 |     ret_keep_indexes.sort()
703 |     return X[ret_keep_indexes], y[ret_keep_indexes], dates[ret_keep_indexes]
704 | 
705 | 
706 | def remove_outliers2(X, y, dates, m=3.0):
707 |     ret_keep_indexes = np.asarray([], dtype=int)
708 |     route_ids = set(X[:, 0])
709 |     keep_indexes = np.asarray(range(len(y)))
710 |     time_keys = set()
711 |     for start_time_window in dates:
712 |         key_str = "%02d%02d" % (
713 |             start_time_window.hour,
714 |             start_time_window.minute)
715 |         time_keys.add(key_str)
716 |     for time_key in time_keys:
717 |         for route_id in route_ids:
718 |             route_items = X[:, 0] == route_id
719 |             X_route = X[route_items, :]
720 |             y_route = y[route_items]
721 |             dates_route = dates[route_items]
722 |             keep_indexes_route = keep_indexes[route_items]
723 |             inner_time_items = np.asarray([False] * len(dates_route))
724 |             for i, date in enumerate(dates_route):
725 |                 key_str = "%02d%02d" % (
726 |                     date.hour,
727 |                     date.minute)  # use the row's own timestamp, not the loop variable left over above
728 |                 if key_str == time_key:
729 |                     inner_time_items[i] = True
730 |             X_route_time = X_route[inner_time_items]
731 |             y_route_time = y_route[inner_time_items]
732 |             keep_indexes_route_time = keep_indexes_route[inner_time_items]
733 |             avg_y_route_time = np.average(y_route_time)
734 |             m_diff_y = y_route_time - avg_y_route_time
735 |             stable_indexes = keep_indexes_route_time[
736 |                 abs(m_diff_y) < m * np.std(m_diff_y)]
737 |             ret_keep_indexes = np.append(ret_keep_indexes, stable_indexes)
738 |     ret_keep_indexes.sort()
739 |     return X[ret_keep_indexes], y[ret_keep_indexes], dates[ret_keep_indexes]
740 | 
741 | def remove_outliers3(X, y, dates, m=3.0, key=[0, 1], is_avg_or_median=True):
742 |     if len(key) == 0:
743 |         return X, y, dates
744 | 
745 |     key = np.asarray(key)
746 |     key = key[key < X.shape[1]]
747 | 
748 |     ret_keep_indexes = np.asarray([], dtype=int)
749 |     entities = set()
750 |     for entity in X[:, key]:
751 |         entities.add(tuple(list(entity)))
752 |     keep_indexes = np.asarray(range(len(y)))
753 |     for entity in entities:
754 |         entity_items = np.all(X[:, key] == entity, axis=1)
755 |         y_entities = y[entity_items]
756 |         keep_indexes_route = keep_indexes[entity_items]
757 |         if is_avg_or_median == 1:
758 |             avg_y_entity = np.average(y_entities)
759 |             m_diff_y = y_entities - avg_y_entity
760 |             stable_indexes = keep_indexes_route[
761 |                 abs(m_diff_y) < m * np.std(m_diff_y)]
762 |         elif is_avg_or_median == 0:
763 |             d = np.abs(y_entities - np.median(y_entities))
764 |             mdev = np.median(d)
765 |             if mdev != 0:
766 |                 s = d / mdev
767 |             else:
768 |                 s = [0.0, ] * len(d)
769 |             s = np.asarray(s)
770 |             stable_indexes = keep_indexes_route[s < m]
771 |         ret_keep_indexes = np.append(ret_keep_indexes, stable_indexes)
772 |     ret_keep_indexes.sort()
773 |     return X[ret_keep_indexes], y[ret_keep_indexes], dates[ret_keep_indexes]
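
The two helpers above are easiest to understand in isolation. Below is a minimal, editor-added sketch (not part of the original repository) that exercises `remove_outliers3` in its median/MAD mode and the `mape_loss` metric on toy arrays; the route code in column 0, the choices `m=3.0` and `key=[0]`, and the assumption that the repository root is on `PYTHONPATH` are illustrative, not values taken from the solution.

```
# Editor's sketch: toy data only, not competition data.
import numpy as np
from datetime import datetime, timedelta

from kdd2017.utils import remove_outliers3, mape_loss

# Five observations of one route (column 0 is a route code); 50.0 is an obvious outlier.
X = np.array([[0.0], [0.0], [0.0], [0.0], [0.0]])
y = np.array([10.0, 11.0, 9.0, 10.0, 50.0])
dates = np.array([datetime(2016, 10, 18, 8) + timedelta(minutes=20 * i)
                  for i in range(len(y))])

# Median/MAD mode (is_avg_or_median=0): keep rows with |y - median| / MAD < m.
X_clean, y_clean, dates_clean = remove_outliers3(X, y, dates, m=3.0,
                                                 key=[0], is_avg_or_median=0)
print(y_clean)  # the 50.0 row is dropped

# MAPE, the metric used throughout the solution: mean(|y_pred - y| / y).
print(mape_loss(np.array([10.0, 20.0]), np.array([11.0, 18.0])))  # ~0.1
```

Setting `is_avg_or_median=1` instead applies the mean/standard-deviation rule, mirroring `remove_outliers` above.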