├── .gitignore
├── kdd2017
    ├── __init__.py
    ├── remove_outliers.py
    ├── models.py
    └── utils.py
├── README.md
├── config
    ├── config_vol.json
    └── config.json
└── bin
    ├── compute_kdd2017_vol
    └── compute_kdd2017


/.gitignore:
--------------------------------------------------------------------------------
1 | *pyc
2 | 


--------------------------------------------------------------------------------
/kdd2017/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/kdd2017/remove_outliers.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | #from kdd2017.models import *
 4 | #
 5 | #
 6 | #def remove_outliers_by_classifier(X, y, dates, ):
 7 | #    xgboost = XGBoost(max_depth=2)
 8 | #    xgboost.fit(X, y)
 9 | #    y_pred = xgboost.predict(X)
10 | #    diff_values = np.abs(y_pred - y)
11 | #    abs_diff_vals = np.abs(diff_values)
12 | #    sorted_indexes = sorted(range(len(abs_diff_vals)), key = lambda x: abs_diff_vals[x])
13 | #    sorted_indexes_lead = sorted_indexes[:int(len(abs_diff_vals)*0.9)]
14 | #    return X[sorted_indexes_lead], y[sorted_indexes_lead], dates[sorted_indexes_lead]
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | KDD Travel time 排名28/3574的解决方案
 2 | ================================
 3 | 
 4 | 共享一下自己的代码， 主要用了grid search 5 folds CV + 1 fold eval set，因为到了>后期cv grid search出来的参数也出现overfit了。所以第6 fold的cv没有参与最优的搜索，这是作为eval作用。最终排名28/3574，一个人的写的程序，作为参考。
 5 | 
 6 | 入口程序
 7 | 
 8 | ```
 9 | cd kdd2017
10 | ./bin/compute_kdd2017 config/config.json
11 | ```
12 | 
13 | KDD travel time competition rank 28/3574 solution
14 | ======================================
15 | 
16 | 
17 | Share my code here. Basically I have used 5-fold cv grid search. Finally I got over-fitting , so I added one more fold for evaluation (not for grid search). At the end, I got a rank of 28/3574. Single person work. Hope it would be useful for you. 
18 | 
19 | how to start
20 | 
21 | ```
22 | cd kdd2017
23 | ./bin/compute_kdd2017 config/config.json
24 | ```
25 | 


--------------------------------------------------------------------------------
/config/config_vol.json:
--------------------------------------------------------------------------------
1 | {
2 |    "path_volumes_train": ["/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/volume(table 6)_training.csv", "/home/jinpengli/work_data/jinpeng/dataSet_phase2/volume(table 6)_training2.csv"],
3 |    "path_volumes_val": "/home/jinpengli/work_data/jinpeng/dataSet_phase2/volume(table 6)_test2.csv",
4 |    "weather_infos": ["/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/weather (table 7)_training_update.csv", "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/testing_phase1/weather (table 7)_test1.csv", "/home/jinpengli/work_data/jinpeng/dataSet_phase2/weather (table 7)_2.csv"],
5 |    "path_links": "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/links (table 3).csv",
6 |    "path_routes": "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/routes (table 4).csv",
7 |    "working_dir": "/home/jinpengli/work_data/jinpeng"
8 | }
9 | 


--------------------------------------------------------------------------------
/config/config.json:
--------------------------------------------------------------------------------
1 | {
2 |    "path_trajectories_train": ["/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/trajectories(table 5)_training.csv", "/home/jinpengli/work_data/jinpeng/dataSet_phase2/trajectories(table_5)_training2.csv"],
3 |    "path_trajectories_val": "/home/jinpengli/work_data/jinpeng/dataSet_phase2/trajectories(table 5)_test2.csv",
4 |    "path_links": "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/links (table 3).csv",
5 |    "path_routes": "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/routes (table 4).csv",
6 |    "working_dir": "/home/jinpengli/work_data/jinpeng",
7 |    "weather_infos" : ["/home/jinpengli/work_data/jinpeng/dataSets/dataSets/training/weather (table 7)_training_update.csv", "/home/jinpengli/work_data/jinpeng/dataSets/dataSets/testing_phase1/weather (table 7)_test1.csv", "/home/jinpengli/work_data/jinpeng/dataSet_phase2/weather (table 7)_2.csv"]
8 | }
9 | 


--------------------------------------------------------------------------------
/bin/compute_kdd2017_vol:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | from sklearn import linear_model
  5 | import numpy as np
  6 | 
  7 | from argparse import RawTextHelpFormatter
  8 | import argparse
  9 | import json
 10 | from datetime import timedelta
 11 | from datetime import datetime
 12 | import dateutil
 13 | 
 14 | import os
 15 | import matplotlib.pyplot as plt
 16 | import matplotlib
 17 | import sys, traceback
 18 | 
 19 | from sklearn.ensemble import GradientBoostingRegressor
 20 | from sklearn.ensemble import BaggingRegressor
 21 | from sklearn.ensemble import AdaBoostRegressor
 22 | from sklearn.ensemble import ExtraTreesRegressor
 23 | from sklearn.ensemble import RandomForestRegressor
 24 | 
 25 | from sklearn.neighbors import KNeighborsRegressor
 26 | from sklearn.neighbors import RadiusNeighborsRegressor
 27 | 
 28 | from sklearn.linear_model import Lasso
 29 | from sklearn.linear_model import LinearRegression
 30 | from sklearn.linear_model import ARDRegression
 31 | from sklearn.linear_model import HuberRegressor
 32 | from sklearn.linear_model import LinearRegression
 33 | from sklearn.linear_model import LogisticRegression
 34 | from sklearn.linear_model import LogisticRegressionCV
 35 | from sklearn.linear_model import PassiveAggressiveRegressor
 36 | from sklearn.linear_model import RandomizedLogisticRegression
 37 | from sklearn.linear_model import RANSACRegressor
 38 | from sklearn.linear_model import SGDRegressor
 39 | from sklearn.linear_model import TheilSenRegressor
 40 | from sklearn.linear_model import logistic_regression_path
 41 | 
 42 | from sklearn.neural_network import MLPRegressor
 43 | from sklearn.cross_decomposition import PLSRegression
 44 | from sklearn.svm import SVR
 45 | from sklearn.svm import LinearSVR
 46 | from sklearn.svm import NuSVR
 47 | 
 48 | from sklearn.tree import DecisionTreeRegressor
 49 | from sklearn.tree import ExtraTreeRegressor
 50 | 
 51 | 
 52 | from sklearn.pipeline import FeatureUnion
 53 | from sklearn.decomposition import PCA
 54 | from sklearn.feature_selection import SelectKBest
 55 | 
 56 | 
 57 | from kdd2017.utils import load_volumes_info
 58 | from kdd2017.utils import convert_volumes_into_X_y
 59 | from kdd2017.utils import generate_final_volumes
 60 | from kdd2017.utils import GridSearchCVDatesWithVal
 61 | from kdd2017.utils import load_routes
 62 | from kdd2017.utils import load_links
 63 | 
 64 | from kdd2017.utils import mape_loss
 65 | from kdd2017.utils import inv_mape_loss
 66 | from kdd2017.utils import GridSearchCVDates
 67 | from kdd2017.utils import invboxcox
 68 | from kdd2017.utils import plot_travel_times_fix_date
 69 | from kdd2017.utils import plot_travel_times_fix_hour
 70 | 
 71 | from kdd2017.utils import remove_outliers
 72 | from kdd2017.utils import load_weather_info
 73 | 
 74 | from scipy.stats import boxcox
 75 | from sklearn.ensemble import GradientBoostingRegressor
 76 | from sklearn.ensemble import BaggingRegressor
 77 | from sklearn.ensemble import AdaBoostRegressor
 78 | from sklearn.ensemble import ExtraTreesRegressor
 79 | from sklearn.ensemble import RandomForestRegressor
 80 | from sklearn.neighbors import KNeighborsRegressor
 81 | 
 82 | from sklearn.linear_model import LinearRegression
 83 | from sklearn.linear_model import LogisticRegression
 84 | from sklearn.model_selection import GridSearchCV
 85 | from sklearn.model_selection import train_test_split
 86 | from sklearn.svm import SVR
 87 | from sklearn.metrics import r2_score
 88 | from sklearn.metrics import make_scorer
 89 | from sklearn.metrics import mean_absolute_error
 90 | from kdd2017.models import *
 91 | 
 92 | import matplotlib.pyplot as plt
 93 | import matplotlib
 94 | import collections
 95 | from kdd2017.utils import findsubsets
 96 | from kdd2017.utils import findsubsets2
 97 | import random
 98 | 
 99 | if __name__ == "__main__":
100 |     description='''
101 |         compute_kdd2017_vol /media/jl237561/usb_ext/workspace/kdd2017/kdd2017/config/config_vol.json
102 |     '''
103 |     parser = argparse.ArgumentParser(
104 |         description=description,
105 |         formatter_class=RawTextHelpFormatter)
106 | 
107 |     parser.add_argument('config', type=unicode, nargs=1,
108 |                         help='...')
109 | 
110 |     options = parser.parse_args()
111 | 
112 |     if not options.config:
113 |         raise ValueError("Please set all the parameters.")
114 | 
115 |     config_path = options.config[0]
116 |     config = json.load(open(config_path, "r"))
117 |     path_trajectories_train = config["path_volumes_train"]
118 |     path_trajectories_val = config["path_volumes_val"]
119 |     path_working_dir = config["working_dir"]
120 |     path_weather_infos = config["weather_infos"]
121 |     path_links = config["path_links"]
122 |     path_routes = config["path_routes"]
123 |     is_boxcox = False
124 |     is_y_log = False
125 |     is_include_future_training = False
126 |     remove_future_training_test = False
127 |     boxcox_lambda = 1.0
128 |     estimate_val_w = 1.0
129 |     skip_cvs = []
130 |     skip_date_ranges = [
131 |         #(datetime(2016, 9, 14), datetime(2016, 9, 19)),
132 |         #(datetime(2016, 9, 29), datetime(2016, 10, 9)),
133 |     ]
134 | 
135 | 
136 |     #path_combine_model_cache = os.path.join(path_working_dir, "cache_combine_model_vol_%d.json" % random.randint(0, 1000))
137 |     #if os.path.isfile(path_combine_model_cache):
138 |     #    raise ValueError("cache exist.. please remove %s" % path_combine_model_cache)
139 | 
140 |     if is_y_log:
141 |         print("config is_y_log=", is_y_log)
142 | 
143 |     datetime_weather = load_weather_info(path_weather_infos)
144 |     routes_data = load_routes(path_routes)
145 |     link_data = load_links(path_links)
146 |     if not os.path.isdir(path_working_dir):
147 |         os.makedirs(path_working_dir)
148 | 
149 |     volumes_train = load_volumes_info(path_trajectories_train)
150 |     volumes_val = load_volumes_info(path_trajectories_val)
151 |     volumes_final = generate_final_volumes(volumes_train)
152 | 
153 |     X_train, y_train, dates_train, les_train = convert_volumes_into_X_y(
154 |         volumes_train, datetime_weather, link_data, routes_data)
155 |     X_val, y_val, dates_val, _ = convert_volumes_into_X_y(
156 |         volumes_val, datetime_weather, link_data, routes_data, les_train)
157 |     X_final, y_final, dates_final, _, raw_info = convert_volumes_into_X_y(
158 |         volumes_final, datetime_weather, link_data, routes_data, les_train, True, verbose=False)
159 | 
160 |     print(X_train.shape)
161 |     print(y_train.shape)
162 |     print(X_val.shape)
163 |     print(y_val.shape)
164 | 
165 |     Configurations = [
166 |         #{
167 |         #    "model": BoxcoxModel,
168 |         #    "tuned_parameters": [
169 |         #        {
170 |         #           "model": [GradientBoostingRegressor],
171 |         #           "loss": ["lad"],
172 |         #           "learning_rate": [0.1,],
173 |         #           "n_estimators": [200,],
174 |         #           "is_boxcox": [True, ],
175 |         #           "boxcox_lambda": [-1.0, -0.6, -0.3, 0, 0.1, 0.5, 1.0, 2.0],
176 |         #        },
177 |         #        {
178 |         #           "model": [GradientBoostingRegressor,],
179 |         #           "loss": ["lad"],
180 |         #           "learning_rate": [0.1,],
181 |         #           "n_estimators": [200,],
182 |         #           "is_boxcox": [False, ],
183 |         #           "boxcox_lambda": [1,],
184 |         #        },
185 |         #   ],
186 |         #},
187 |         ##{
188 |         #    "model": XGBoost,
189 |         #    "tuned_parameters": [],
190 |         #},
191 |         #{
192 |         #    "model": MedianModel,
193 |         #    "tuned_parameters":[],
194 |         #},
195 | #        ##### make model ensemble..
196 | #        {
197 | #            "model": CombineModes,
198 | #            "tuned_parameters": [
199 | #                {
200 | #                    'models': [
201 | #                        [
202 | #                            ## 1
203 | #                            DaterangeModel(
204 | #                                model=XGBoost,
205 | #                                rm_n_head_days=4,
206 | #                                rm_n_head_days_hours=[(0, 6), (12, 14), (20, 24)],
207 | #                                ft_th=[(7, 0.3), (8, 0.8), (9, 0.3)],
208 | #                                is_y_log=True,
209 | #                                y_log_e=np.e,
210 | #                                norm_y=False,
211 | #                                use_mspe=False,
212 | #                                num_round=1100,
213 | #                                early_stopping_rounds=10,
214 | #                                eta=0.02,
215 | #                                max_depth=7,
216 | #                                subsample=0.6,
217 | #                                colsample_bytree=0.9,
218 | #                                objective='reg:linear',
219 | #                                booster='gbtree',
220 | #                                eval_metric='rmse',
221 | #                                ft_select=[0, 1, 3, 4, 5, 6, 12, 15, 16, 17],
222 | #                                train_days=9+7,
223 | #                                is_rm_outliers=True,
224 | #                                rm_outliers_m=2,
225 | #                                rm_outliers_key=[0, 1, 2, 5],
226 | #                                is_avg_or_median=2,
227 | #                            ),
228 | #                            ## 2
229 | #                            DaterangeModel(
230 | #                                model=KNeighborsRegressor,
231 | #                                #rm_n_head_days=9,
232 | #                                #rm_n_head_days_hours=[(0, 6), (12, 14), (20, 24)],
233 | #                                ft_th=[(7, 0.3), (8, 0.8), (9, 0.3)],
234 | #                                is_y_log=True,
235 | #                                y_log_e=np.e,
236 | #                                norm_y=False,
237 | #                                ft_select=[0, 1, 5, 6],
238 | #                                train_days=10,
239 | #                                is_rm_outliers=False,
240 | #                            ),
241 | #                            ## 3
242 | #                            DaterangeModel(
243 | #                                model=KNeighborsRegressor,
244 | #                                #rm_n_head_days=9,
245 | #                                #rm_n_head_days_hours=[(0, 6), (12, 14), (20, 24)],
246 | #                                ft_th=[(7, 0.3), (8, 0.8), (9, 0.3)],
247 | #                                is_y_log=True,
248 | #                                y_log_e=np.e,
249 | #                                norm_y=False,
250 | #                                ft_select=[0, 1, 2, 3, 5, 6],
251 | #                                train_days=9,
252 | #                                is_rm_outliers=False,
253 | #                                rm_outliers_m=2,
254 | #                                rm_outliers_key=[0, 1, 2, 5],
255 | #                                is_avg_or_median=2,
256 | #                            ),
257 | #                        #    DaterangeModel(
258 | #                        #        model=model,
259 | #                        #        random_state=0,
260 | #                        #        #rm_n_head_days=9,
261 | #                        #        #rm_n_head_days_hours=[(0, 6), (12, 14), (20, 24)],
262 | #                        #        ft_th=[(7, 0.3), (8, 0.8), (9, 0.3)],
263 | #                        #        is_y_log=True,
264 | #                        #        y_log_e=np.e,
265 | #                        #        norm_y=False,
266 | #                        #        ft_select=ft_select,
267 | #                        #        train_days=model_day,
268 | #                        #        is_rm_outliers=False,
269 | #                        #        rm_outliers_m=2,
270 | #                        #        rm_outliers_key=[0, 1, 2, 5],
271 | #                        #        is_avg_or_median=2,
272 | #                        #    ),
273 | #                        ] 
274 | #                        #for model in [ #PassiveAggressiveRegressor,
275 | #                        #               #KNeighborsRegressor, 
276 | #                        #               MLPRegressor, 
277 | #                        #               #NuSVR,
278 | #                        #               #HuberRegressor, PassiveAggressiveRegressor, RANSACRegressor, SGDRegressor, TheilSenRegressor,
279 | #                        #               #RadiusNeighborsRegressor, PLSRegression, LinearSVR, 
280 | #                        #             ]
281 | #                        #for model_day in range(7, 20, 3)
282 | #                        ##for model_day in [10,]
283 | #                        #for ft_select in list( [[0, 1, ] + list(one) for one in findsubsets2(range(2, 7))])
284 | #                        ##for ft_select in [[0, 1, 3, 4, 5, 6, 12, 15, 16, 17]]
285 | #                     ],
286 | #                     'subsample': [1.0, ],
287 | #                     'combine_method': [2 ],
288 | #                     'weights': [
289 | #                         [1.0, 0.13, 0.0,  ] 
290 | #                         #for x1 in range(0, 101, 5)
291 | #                         #for x2 in range(0, 101, 5)
292 | #                         #for x3 in range(0, 101, 5)
293 | #                         #[1.0, 0.0]
294 | #                         #[1.0, 0.1, 0.1, 0.1, 0.0]
295 | #                     ],
296 | #                },
297 | #            ],
298 | #        },
299 |         {
300 |             "model": DaterangeModel,
301 |             "tuned_parameters": [
302 | #              {
303 | #                'n_estimators': [50,],
304 | #                 'loss': ["ls", "lad", "huber", "quantile"],
305 | #                 #'loss': ["ls", ],
306 | #                 'is_y_log' : [True,],
307 | #                 'learning_rate': [0.1,],
308 | #                 'train_days': [3,7,14,21,28,35,],
309 | #                 'is_rm_outliers': [True,],
310 | #                 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
311 | #              },
312 | #              {
313 | #                 'model':[GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, KNeighborsRegressor],
314 | #                 #'n_estimators': [200,],
315 | #                 #'kernel':["rbf", ],
316 | #                 'rm_n_head_days': [9, ],
317 | #                 'rm_n_head_days_hours': [
318 | #                     [(0, 6), (12, 14), (20, 24)],
319 | #                 ],
320 | #                 'ft_th': [[
321 | #                            (7, 0.3),
322 | #                            (8, 0.8),
323 | #                            (9, 0.3),
324 | #                            #(42, i/10.0),
325 | #                          ] #for i in range(0, 11, 1)
326 | #                          ],
327 | #                 'is_y_log': [True, ],
328 | #                 'y_log_e': [np.e,],
329 | #                 'norm_y': [False, ],
330 | #                 'ft_select': [[0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24),],
331 | #                 'train_days': [9+7],
332 | #                 'is_rm_outliers': [True,],
333 | #                 'rm_outliers_m': [0.5,],
334 | #                 'rm_outliers_key': [[0, 1, 3, 4], ],
335 | #                 'is_avg_or_median': [2,],
336 | #              },
337 | #              {
338 | #                 'model':[LGBM,],
339 | #                 'rm_n_head_days': [9, ],
340 | #                 'rm_n_head_days_hours': [
341 | #                     [(0, 6), (12, 14), (20, 24)],
342 | #                 ],
343 | #                 'ft_th': [[
344 | #                            (7, 0.3),
345 | #                            (8, 0.8),
346 | #                            (9, 0.3),
347 | #                            #(42, i/10.0),
348 | #                          ] #for i in range(0, 11, 1)
349 | #                          ],
350 | #                 #'objective':["regression", "regression_l2", "regression_l1", "huber", "fair", "poisson"],
351 | #                 'objective': ["regression", ],
352 | #                 'boosting_type': ['dart', ],
353 | #                 #'boosting_type': ['gbdt', 'dart'],
354 | #                 #'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.6, 0.8, 1.0,],
355 | #                 'learning_rate': [0.6],
356 | #                 #'num_leaves': range(7, 1000, 1),
357 | #                 'num_leaves': [185, ],
358 | #                 #'subsample': [i/10.0 for i in range(5, 11)],
359 | #                 #'colsample_bytree': [i/10.0 for i in range(5, 11)],
360 | #                 'subsample': [0.9, ],
361 | #                 'colsample_bytree': [0.9,],
362 | #                 'is_y_log': [True, ],
363 | #                 'y_log_e': [np.e,],
364 | #                 'norm_y': [False, ],
365 | #                 #'ft_select': [[0, 1, 3, 4, 6, ], ],
366 | #                 #'ft_select': list([[0, 1, 3, 4, 6,] + list(one) for one in findsubsets2(set(range(11, 18)))]),
367 | #                 'ft_select': [[0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24),],
368 | #                 'train_days': [9+7],
369 | #                 #'train_days': [10, ],
370 | #                 'is_rm_outliers': [True,],
371 | #                 #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
372 | #                 'rm_outliers_m': [0.5,],
373 | #                 #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4]))),
374 | #                 'rm_outliers_key': [[0, 1, 3, 4], ],
375 | #                 'is_avg_or_median': [2,],
376 | #              },
377 |               {
378 |                  'model':[XGBoost, ],
379 |                  'is_ignore_skip_date_count': [True, ],
380 |                  #'remove_outliers_by_classifier': [
381 |                  #    {
382 |                  #        "model": GradientBoostingRegressor(
383 |                  #           max_depth=x,
384 |                  #        ),
385 |                  #        "m": v/100.,
386 |                  #    }
387 |                  #    for v in range(80, 101, 1) for x in range(2,5)
388 |                  #],
389 |                  'rm_n_head_days': [4, ],
390 |                  #'n_estimators': range(100, 2000, 100),
391 |                  #'rm_n_head_days': range(17),
392 |                  'rm_n_head_days_hours': [
393 |                      #[],
394 |                      #[(0, 6), (12, 15), (21, 22)],
395 |                      #[(0, 7), (11, 16), (20, 22)],
396 |                      #[(0, 8), (10, 17), (19, 22)],
397 |                      [(0, 6), (12, 14), (20, 24)],
398 |                      #[one, two, three]
399 |                      #for one in [(0, 6), (0, 5), (0, 7), (0, 8),]
400 |                      #for two in [(10, 17), (11, 16), (12, 14), ]
401 |                      #for three in [(19, 24), (20, 24), (21, 24), (22, 24)]
402 |                  ],
403 |                  'ft_th': [[
404 |                             (7, 0.3),
405 |                             (8, 0.8),
406 |                             (9, 0.3),
407 |                             #(42, i/10.0),
408 |                           ] #for i in range(0, 11, 1)
409 |                           ],
410 |                  'is_y_log': [True, ],
411 |                  'use_mspe': [False, ],
412 |                  #'y_log_e': [2.0, np.e, 3.0, 4.0, 5.0, 6.0, 7.0,],
413 |                  #'y_log_e': [np.e, ] + [pow(10, i) for i in range(1, 10)],
414 |                  #'y_log_e': [1000000,],
415 |                  'y_log_e': [np.e, ],
416 |                  'norm_y': [False, ],
417 |                  'num_round': range(1000, 3000, 100),
418 |                  'eta': [0.02],
419 |                  'verbose_eval': [100, ],
420 |                  'early_stopping_rounds': [10, ],
421 |                  #'gamma': [0.0, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0,],
422 |                  #'eta': [v/100.0 for v in range(1, 101, 1)],
423 |                  #'eta': [0.15, ],
424 |                  #'eta': [0.01, 0.05, 0.10, 0.005],
425 |                  #'max_depth': [7,],
426 |                  'max_depth': range(2,15),
427 |                  #'min_child_weight': [1, 3, 5, 7,],
428 |                  'colsample_bytree': [0.9, ],
429 |                  #'colsample_bytree': [i / 10.0 for i in range(6, 11)],
430 |                  'eval_metric': ['rmse', ],
431 |                  #'eval_metric': ['mape', ],
432 |                  #'objective': ["reg:gamma", "reg:linear"],
433 |                  'objective': ["reg:linear", ],
434 |                  #'eval_metric': ['logloss', ],
435 |                  'subsample': [0.6, ],
436 |                  #'subsample': [v/10.0 for v in range(6, 11)],
437 |                  'booster': ['gbtree', ],
438 |                  #'ft_select': list([list(one) for one in findsubsets2(range(2, 7))]),,
439 |                  'ft_select': [
440 |                      #[0, 1, 3, 4, 5, 6, 12, 15, 16, 17] + range(18, 18+24),
441 |                      #[0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24+1), 
442 |                      [0, 1, 3, 4, 5, 6, 12, 15, 16, 17],
443 |                   ],                 
444 |                  #'ft_select': [[0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + list(one) for one in findsubsets2( range(18, 18+24) )],
445 |                  #'ft_select': list( [[0, 1, ] + list(one) for one in findsubsets2(range(2, 7))]),
446 |                  #'ft_select': list([[0, 1, 3, 4, 5, 6] + list(one) for one in findsubsets2(set(range(11, 18)))]),
447 |                  #'ft_select': list([[0, 1, 3, 4, 5, 6, 12, 15, 16, 17] + list(one) for one in findsubsets2(set(range(18, 18+24)))]),
448 |                  #'train_days': range(3, 30, 1),
449 |                  #'train_days': [9, 9+7, 50],
450 |                  'train_days': [9+7,],
451 |                  'is_rm_outliers': [True, ],
452 |                  #'remove_non_predict_hour_range': [True, False],
453 |                  #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
454 |                  #'rm_outliers_m': [0.5,],
455 |                  #'rm_outliers_m': [4.0, ],
456 |                  'rm_outliers_m': [2.0,],
457 |                  #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4,5,6,7,8,9]))),
458 |                  #'rm_outliers_key': [[0, 1, ] + list(one) for one in list(findsubsets2(set([2,3,4,5]))) ],
459 |                  'rm_outliers_key': [[0, 1, 2, 5], ],
460 |                  'is_avg_or_median': [2,],
461 |                  #'is_avg_or_median': [2,],
462 |                  #'rm_outliers_m': [i/100.0 for i in range(10, 60, 10)],
463 |                  #'rm_outliers_key': [[0, 1, ] + list(one) for one in list(findsubsets2(set([2,3,4,5,6,7,8,9]))) ],
464 |                  #'is_avg_or_median': [2, ],
465 |                  #'is_sample_weight': [6, ],
466 |                  #'is_one_hot_encode': [True, False],
467 |                  #'remove_non_predict_hour_range': [True],
468 |                  #'predict_hour_range':[
469 |                  #     [
470 |                  #          [[8, 0], [10, 0]],
471 |                  #          [[17, 0], [19, 0]],
472 |                  #     ],
473 |                  #     #[
474 |                  #     #     [[7, 0], [11, 0]],
475 |                  #     #     [[16, 0], [20, 0]],    
476 |                  #     #],
477 |                  #], 
478 |                  'skip_date_ranges': [
479 |                      #[
480 |                      #  (datetime(2016, 9, 30), datetime(2016, 10, 10)),
481 |                      #],
482 |                      #[
483 |                      #  (datetime(2016, 9, 15), datetime(2016, 9, 16)),
484 |                      #],
485 |                      #[
486 |                      #  (datetime(2016, 9, 14), datetime(2016, 9, 16)),
487 |                      #],
488 |                      #[
489 |                      #  (datetime(2016, 9, 17), datetime(2016, 9, 18)),
490 |                      #], 
491 |                      #[
492 |                      #  (datetime(2016, 10, 7), datetime(2016, 10, 8)),
493 |                      #],
494 |                      #[
495 |                      #  (datetime(2016, 10, 7), datetime(2016, 10, 8)),
496 |                      #],
497 |                      #[
498 |                      #  (datetime(2016, 9, 30), datetime(2016, 10, 1)),
499 |                      #],
500 |                      #[
501 |                      #  (datetime(2016, 9, 30), datetime(2016, 10, 2)),
502 |                      #],
503 |                      #[
504 |                      #  (datetime(2016, 10, 1), datetime(2016, 10, 2)),
505 |                      #],
506 |                      #[
507 |                      #  (datetime(2016, 10, 2), datetime(2016, 10, 3)),
508 |                      #],
509 |                      #[
510 |                      #  (datetime(2016, 10, 3), datetime(2016, 10, 4)),
511 |                      #],
512 |                      ##[
513 |                      ##  (datetime(2016, 9, 15), datetime(2016, 9, 16)),
514 |                      ##  (datetime(2016, 10, 7), datetime(2016, 10, 8)),
515 |                      ##],
516 |                      #[
517 |                      #  (datetime(2016, 9, 14), datetime(2016, 9, 19)),
518 |                      #  (datetime(2016, 9, 30), datetime(2016, 10, 10)),                       
519 |                      #],
520 |                      [],
521 |                  ],
522 |               },
523 | #              {
524 | #                 'model': [GradientBoostingRegressor, ],
525 | #                 'is_ignore_skip_date_count': [True,],
526 | #                 'rm_n_head_days': [9, ],
527 | #                 'rm_n_head_days_hours': [
528 | #                     [(0, 6), (12, 14), (20, 24)],
529 | #                 ],
530 | #                 'ft_th': [[
531 | #                            (7, 0.3),
532 | #                            (8, 0.8),
533 | #                            (9, 0.3),
534 | #                            #(42, i/10.0),
535 | #                          ] #for i in range(0, 11, 1)
536 | #                          ],
537 | #                 'n_estimators': [200,],
538 | #                 'is_y_log' : [True,], 
539 | #                 'norm_y': [False,], 
540 | #                 'is_one_hot_encode': [False],
541 | #                 'ft_select': [ [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24),],
542 | #                 'train_days': [9+7, 50 ],
543 | #                 'is_rm_outliers': [True,],
544 | #                 #'rm_outliers_key': [[0, 1, ] + list(one) for one in list(findsubsets2(set([2,3,4,]))) ],
545 | #                 'rm_outliers_key': [ [0, 1, 3, ], ],
546 | #                 #'is_avg_or_median': [1, 2, ],
547 | #                 'is_avg_or_median': [2, ],
548 | #                 'rm_outliers_m': [0.4, ],
549 | #                 #'rm_outliers_m': [i/100.0 for i in range(10, 60, 10)],
550 | #                 'is_sample_weight': range(16),
551 | #                 'skip_date_ranges': [
552 | #                     [
553 | #                       (datetime(2016, 9, 30), datetime(2016, 10, 10)),
554 | #                     ],
555 | #                ],
556 | #
557 | #              },
558 | #              {
559 | #                 'model':[MedianModel,],
560 | #                 'is_y_log': [True,],
561 | #                 'ft_pos':[[0, 1, 2, 4], ],
562 | #                 'ft_select': [
563 | #                      [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17] + range(18, 18+24), 
564 | #                  ],
565 | #                 #'train_days': range(14, 90, 1),
566 | #                 'train_days': [9+7, ],
567 | #                 'is_rm_outliers': [False,],
568 | #                 'rm_outliers_m': [0.5],
569 | #                 #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
570 | #                 'rm_outliers_key': [[0, 1, ]],
571 | #                 #'rm_outliers_key': [[0, 1, 2] + list(one) for one in list(findsubsets2(set([2,3,5,6]))) ],
572 | #                 #'is_avg_or_median': [1, 2],
573 | #                 'is_avg_or_median': [1, ],
574 | #              },
575 | #              {
576 | #                 'model': [NonparametricKNN,],
577 | #                 'ft_th': [[
578 | #                            (7, 0.3),
579 | #                            (8, 0.8),
580 | #                            (9, 0.3),
581 | #                            #(42, i/10.0),
582 | #                          ] #for i in range(0, 11, 1)
583 | #                          ],
584 | #                 "n_neighbors": [1,],
585 | #                 'ft_select': [[0, 1, 3, 4, 6, 7, ], ],
586 | #                 #'ft_select': [[0, 1, 3, 4, 5, 6, 8] + list(one) for one in list(findsubsets2(set(range(7, 10)))) ],
587 | #                 "loss": ["SMAPE", ],
588 | #                 'is_y_log' : [False,],
589 | #                 'train_days': [7,],
590 | #                 'is_rm_outliers': [True,],
591 | #                 #'rm_outliers_key': [[0, 1, ] + list(one) for one in list(findsubsets2(set([2,3,4,5]))) ],
592 | #                 'rm_outliers_key': [[0, 1, 2, 4, 5], ],
593 | #                 'rm_outliers_m': [1.5, ],
594 | #                 #'rm_outliers_m': [i/10.0 for i in range(5, 30, 5)],
595 | #                 'is_avg_or_median': [2,],
596 | #              },
597 | #              {
598 | #                 'model': [RandomForestRegressor,ExtraTreesRegressor,],
599 | #                 'train_days': [3,7,14,21,28,35,42,49,56,150],
600 | #                 'is_rm_outliers': [True,],
601 | #                 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
602 | #              },
603 | #              {
604 | #                 'model': [LogisticRegression,],
605 | #                 'penalty': ['l1', 'l2'],
606 | #                 'C': [1.0, 10.0],
607 | #                 'train_days': [3,7,14,21,28,35,42,49,56,150],
608 | #                 'is_rm_outliers': [True,],
609 | #                 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
610 | #              },
611 | #              {
612 | #                  "model": [BaggingRegressor,],
613 | #                  "base_estimator": [GradientBoostingRegressor(loss="ls", n_estimators=50),
614 | #                                     GradientBoostingRegressor(loss="lad", n_estimators=50),
615 | #                                     GradientBoostingRegressor(loss="huber", n_estimators=50),
616 | #                                     GradientBoostingRegressor(loss="quantile", n_estimators=50),
617 | #                                     ExtraTreesRegressor(n_estimators=50),
618 | #                                     RandomForestRegressor(n_estimators=50)],
619 | #                  'train_days': [3, 7,14,21,28,35,],
620 | #                  'is_rm_outliers': [True,],
621 | #                  'dates_train': [dates_train,],
622 | #                  'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
623 | #              },
624 | #              {
625 | #                  "model": [BaggingRegressor,],
626 | #                  "base_estimator": [None, GradientBoostingRegressor(n_estimators=200),
627 | #                                     GradientBoostingRegressor(loss="lad", n_estimators=200),
628 | #                                     ExtraTreesRegressor(n_estimators=200), RandomForestRegressor(n_estimators=200)],
629 | #                  'train_days': [3, 7,14,21,28,35,],
630 | #                  'is_rm_outliers': [False,],
631 | #                  'rm_outliers_m': [0.5, ],
632 | #              },
633 | #
634 | #
635 |             ],
636 |         },
637 | #        {
638 | #            "model": BaggingRegressor,
639 | #            "tuned_parameters":[
640 | #              {
641 | #                 "base_estimator": [None,
642 | #                    GradientBoostingRegressor(n_estimators=200, loss='lad', learning_rate=0.1),
643 | #                    ExtraTreesRegressor() RandomForestRegressor()],
644 | #              },
645 | #            ],
646 | #        },
647 | #        {
648 | #            "model": GradientBoostingRegressor,
649 | #            "tuned_parameters": [
650 | #                {
651 | #                  'n_estimators': [200,],
652 | #                  'loss': ["ls", "lad", "huber", "quantile"],
653 | #                  'learning_rate': [0.1,],
654 | #                }
655 | #            ],
656 | #        },
657 | #        {
658 | #            "model": AdaBoostRegressor,
659 | #            "tuned_parameters": [],
660 | #        },
661 | #        {
662 | #            "model": ExtraTreesRegressor,
663 | #            "tuned_parameters": [
664 | #              {
665 | #                #"criterion": ["mse", "mae"],
666 | #                "n_estimators": [200,]
667 | #              },
668 | #            ]
669 | #        },
670 | #        {
671 | #            "model": RandomForestRegressor,
672 | #            "tuned_parameters": [
673 | #              {"n_estimators": [200,]},
674 | #            ],
675 | #        },
676 | #        {
677 | #            "model": SVR,
678 | #            "tuned_parameters": [
679 | #                {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10,]},
680 | #                #{'kernel': ['linear'], 'C': [1, ]}]
681 | #                {'kernel': ['linear'], 'C': [1, 10, ]}
682 | #            ]
683 | #        }
684 |     ]
685 | 
686 | 
687 |     #skip_cvs = []
688 |     predict_y_final = GridSearchCVDatesWithVal(
689 |         Configurations,
690 |         X_train, y_train, dates_train,
691 |         X_val, y_val, dates_val,
692 |         X_final,
693 |         is_y_log=is_y_log, is_boxcox=is_boxcox, boxcox_lambda=boxcox_lambda,
694 |         is_include_val_loss_for_eval=False, cv=3, days_to_test=7, skip_cvs=skip_cvs,
695 |         estimate_val_w=estimate_val_w, is_include_future_training=is_include_future_training,
696 |         remove_future_training_test=remove_future_training_test,
697 |     )
698 | 
699 |     ##output final output
700 |     path_final_res = os.path.join(
701 |         path_working_dir,
702 |         "volumes_predict.csv")
703 | 
704 |     print("*" * 60)
705 |     print("writing out final results...")
706 |     file_final_res = open(path_final_res, "w+")
707 |     file_final_res.writelines(','.join(['"tollgate_id"',
708 |             '"time_window"', '"direction"', '"volume"']) + '\n')
709 |     for iy, rinfo in enumerate(raw_info):
710 |         tollgate_id, direction, start_datetime = rinfo
711 |         end_datetime = start_datetime + timedelta(minutes=20)
712 |         timewindow = "["+str(start_datetime) + "," + str(end_datetime)+ ")"
713 |         words = [tollgate_id, timewindow, direction, str(predict_y_final[iy])]
714 |         line = '","'.join(words)
715 |         line = '"' + line + '"\n'
716 |         file_final_res.write(line)
717 |     file_final_res.close()
718 | 


--------------------------------------------------------------------------------
/kdd2017/models.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from sklearn.pipeline import FeatureUnion
  3 | from sklearn.ensemble import ExtraTreesRegressor
  4 | from sklearn.ensemble import GradientBoostingRegressor
  5 | from scipy import stats
  6 | import inspect
  7 | import numpy as np
  8 | from sklearn.neighbors import NearestNeighbors
  9 | from datetime import timedelta
 10 | from datetime import datetime
 11 | import numpy as np
 12 | from kdd2017.utils import invboxcox
 13 | from kdd2017.utils import mape_loss
 14 | from kdd2017.utils import remove_outliers
 15 | from kdd2017.utils import remove_outliers2
 16 | from kdd2017.utils import remove_outliers3
 17 | from kdd2017.utils import invboxcox
 18 | from kdd2017.utils import compute_harmonic_mean
 19 | from sklearn.feature_selection import SelectKBest, f_regression
 20 | from sklearn.pipeline import make_pipeline
 21 | from sklearn import preprocessing
 22 | import collections
 23 | import xgboost as xgb
 24 | from scipy.stats import boxcox
 25 | from sklearn.decomposition import PCA
 26 | import copy
 27 | from random import shuffle
 28 | import lightgbm as lgb
 29 | import lightgbm as lgb
 30 | import pandas as pd
 31 | import numpy as np
 32 | from sklearn.metrics import mean_squared_error
 33 | from sklearn import preprocessing
 34 | import os
 35 | import json
 36 | from time import sleep
 37 | import random
 38 | from sklearn.model_selection import train_test_split
 39 | 
 40 | global_variables = {}
 41 | 
 42 | 
 43 | random_state = 1
 44 | 
 45 | 
 46 | def mspe(y, dtrain):
 47 |     yhat = dtrain.get_label()
 48 |     grad = 2.0/yhat * (y * 1.0 / yhat - 1)
 49 |     hess = 2.0/(yhat**2)
 50 |     return grad, hess
 51 | 
 52 | def L2(pred,true):
 53 |     loss = np.square(pred-true)
 54 |     return loss.mean()
 55 | 
 56 | def L1(pred,true):
 57 |     loss = np.abs(pred-true)
 58 |     return loss.mean()
 59 | 
 60 | def SMAPE(pred,true):
 61 |     loss = np.abs((pred-true)/(pred+true))
 62 |     return loss.mean()
 63 | 
 64 | #This function chooses the best point estimate for a numpy array, according to a particular loss.
 65 | #The loss function should take two numpy arrays as arguments, and return a scalar. One example is SMAPE, see above.
 66 | def solver(x,loss):
 67 |     mean = x.mean()
 68 |     best = loss(mean,x)
 69 |     result = mean
 70 |     for i in x:
 71 |         score = loss(i,x)
 72 |         if score < best:
 73 |             best = score
 74 |             result = i
 75 |     return result
 76 | 
 77 | class NonparametricKNN(object):
 78 |     def __init__(self,n_neighbors=5,loss='L2'):
 79 |         if loss in ['L1','L2','SMAPE']:
 80 |             loss = {'L1':L1,'L2':L2,'SMAPE':SMAPE}[loss]
 81 |         self.loss = loss
 82 |         self.n_neighbors = n_neighbors
 83 |         self.model = NearestNeighbors(n_neighbors,algorithm='auto',n_jobs=-1)
 84 |         self.solver = lambda x:solver(x,loss)
 85 |     def __repr__(self, ):
 86 |         return "NonparametricKNN: loss:" + repr(self.loss) + ", n_neighbors=" + repr(self.n_neighbors)
 87 |     def __str__(self,):
 88 |         return repr(self)
 89 | 
 90 |     def fit(self,train,target):#All inputs should be numpy arrays.
 91 |         self.model.fit(train)
 92 |         self.f=np.vectorize(lambda x:target[x])
 93 |         return self
 94 | 
 95 |     def predict(self,test):#Return predictions as a numpy array.
 96 |         neighbors = self.model.kneighbors(test,return_distance=False)
 97 |         neighbors = self.f(neighbors)
 98 |         result = np.apply_along_axis(self.solver,1,neighbors)
 99 |         return result
100 | 
101 | 
102 | def evalerror(preds, dtrain):
103 |     labels = dtrain.get_label()
104 |     # return a pair metric_name, result
105 |     # since preds are margin(before logistic transformation, cutoff at 0)
106 |     return 'error', mape_loss(preds, labels)
107 | 
108 | def xgboostobj(preds, dtrain):
109 |     labels = dtrain.get_label()
110 |     preds = 1.0 / (1.0 + np.exp(-preds))
111 |     grad = preds - labels
112 |     hess = preds * (1.0-preds)
113 |     return grad, hess
114 | 
115 | class Pipeline(object):
116 |     def __init__(self, **kwargs):
117 |         self.models = kwargs["models"]
118 |         kwargs.pop("models")
119 | 
120 |     def fit(self, X, y):
121 |         last_y = np.copy(y)
122 |         for i, model in enumerate(self.models):
123 |             self.models[i].fit(X, last_y)
124 |             last_y = np.copy(self.models[i].predict(X))
125 | 
126 |     def predict(self, X, **kwargs):
127 |         return self.models[-1].predict(X)        
128 | 
129 | class MedianModel(object):
130 |     def __init__(self, **kwargs):
131 |         self.ft_pos = kwargs.get("ft_pos", np.asarray([0,1]))
132 |         if "ft_pos" in kwargs:
133 |             kwargs.pop("ft_pos")
134 |     def __str__(self,):
135 |         return "MedianModel:\n  " + repr(self.ft_pos)
136 |     def __repr__(self,):
137 |         return str(self)
138 | 
139 |     def fit(self, X, y, **kwargs):
140 |         X = X[:,self.ft_pos]
141 |         self.values = {}
142 |         for i, x in enumerate(X):
143 |             key = tuple([j for j in x])
144 |             if key not in self.values:
145 |                 self.values[key] = []
146 |             self.values[key].append(y[i])
147 | 
148 |     def predict(self, X, **kwargs):
149 |         X = X[:,self.ft_pos]
150 |         y = []
151 |         for i, x in enumerate(X):
152 |             key = tuple([j for j in x])
153 |             y.append(np.median(self.values[key]))
154 |         return np.asarray(y)
155 | 
156 | 
157 | class XGBoost(object):
158 |     def __init__(self, **kwargs):
159 |         #self.eval_metric = kwargs.get("eval_metric", "logloss")
160 |         #self.eta = kwargs.get("eta", 0.02)
161 |         #self.max_depth = kwargs.get("max_depth", 3)
162 |         #self.objective = kwargs.get("objective", "reg:gamma")
163 |         #self.booster = kwargs.get("booster", "gbtree")
164 |         self.use_mspe = kwargs.get("use_mspe", False)
165 |         self.num_round = kwargs.get("num_round", 1500)
166 |         self.early_stopping_rounds = kwargs.get("early_stopping_rounds", 10)
167 |         self.verbose_eval = kwargs.get("verbose_eval", 500)
168 |         self.eval_metric = kwargs.get("eval_metric", None)
169 |         self.feval = None
170 |         if self.eval_metric == "mape":
171 |             self.eval_metric = None
172 |             self.feval = evalerror
173 |         if "use_mspe" in kwargs:
174 |             kwargs.pop("use_mspe")
175 |         if "early_stopping_rounds" in kwargs:
176 |             kwargs.pop("early_stopping_rounds")
177 |         if "num_round" in kwargs:
178 |             kwargs.pop("num_round")
179 |         if "verbose_eval" in kwargs:
180 |             kwargs.pop("verbose_eval")
181 |         if "eval_metric" in kwargs:
182 |             kwargs.pop("eval_metric")
183 |         self.param = kwargs
184 | 
185 |     def __str__(self, ):
186 |         members = [attr for attr in dir(self) if not callable(getattr(self, attr)) and not attr.startswith("__")]
187 |         ret_val = ""
188 |         for attr in members:
189 |             value = getattr(self, attr)
190 |             ret_val += "    (%s:%s)\n" % (attr, repr(value))
191 |         return "XGBoost: " + ret_val
192 | 
193 |     def __repr__(self, ):
194 |         return str(self)
195 | 
196 |     def fit(self, X, y, sample_weight=None, **kwargs):
197 |         if sample_weight is not None:
198 |             print("use sample_weight")
199 |             dtrain = xgb.DMatrix(X, label=y, weight=sample_weight, silent=True)
200 |         else:
201 |             dtrain = xgb.DMatrix(X, label=y, silent=True)
202 |         evallist = [(dtrain, 'train')]
203 |         param = {
204 |                  'n_estimators':200,
205 |                  'booster': 'gbtree',
206 |                  'nthread': -1,
207 |                  'max_depth': 3,
208 |                  'eta': 0.02,
209 |                  'silent': 1,
210 |                  'objective': 'reg:gamma',
211 |                  'colsample_bytree': 0.7,
212 |                  'eval_metric': 'logloss',
213 |                  'subsample': 0.5}
214 |         if self.eval_metric is not None:
215 |             param["eval_metric"] = self.eval_metric
216 |         param.update(self.param)
217 |         if not self.use_mspe:
218 |             if self.early_stopping_rounds > 0:
219 |                 self.bst = xgb.train(param, dtrain, self.num_round,
220 |                                      evallist, feval=self.feval, early_stopping_rounds=self.early_stopping_rounds,
221 |                                      verbose_eval=self.verbose_eval)
222 |             else:
223 |                 self.bst = xgb.train(param, dtrain, self.num_round,
224 |                                      evallist, feval=self.feval, verbose_eval=self.verbose_eval)
225 |         else:
226 |             param.pop("objective")
227 |             #param.pop("eval_metric")
228 |             if self.early_stopping_rounds > 0:
229 |                 self.bst = xgb.train(param, dtrain, self.num_round,
230 |                                      evallist,
231 |                                      mspe, feval=self.feval, early_stopping_rounds=self.early_stopping_rounds,
232 |                                      verbose_eval=self.verbose_eval)
233 |             else:
234 |                 self.bst = xgb.train(param, dtrain, self.num_round,
235 |                                      evallist,
236 |                                      mspe, feval=self.feval,
237 |                                      verbose_eval=self.verbose_eval)
238 | 
239 |     def predict(self, X, **kwargs):
240 |         dtest = xgb.DMatrix(X)
241 |         return self.bst.predict(dtest)
242 | 
243 | 
244 | def evalerror_lgbm(preds, dtrain):
245 |     labels = dtrain.get_label()
246 |     # return a pair metric_name, result
247 |     # since preds are margin(before logistic transformation, cutoff at 0)
248 |     return 'error', mape_loss(preds, labels), False
249 | 
250 | 
251 | class LGBM(object):
252 |     def __init__(self, **kwargs):
253 |         self.kwargs = kwargs
254 |         #self.kwargs["random_state"] = random_state
255 |         self.use_mspe = kwargs.get("use_mspe", False)
256 |         if "use_mspe" in self.kwargs:
257 |             self.kwargs.pop("use_mspe")
258 |         self.gbm = lgb.LGBMRegressor(**self.kwargs)
259 | 
260 |     def __repr__(self,):
261 |         return "LGBM:" + repr(self.kwargs)
262 | 
263 |     def __str__(self, ):
264 |         return repr(self)
265 | 
266 |     def fit(self, X, y):
267 |         if self.use_mspe:
268 |             lgb_train = lgb.Dataset(X, y,
269 |                         weight=np.ones(X.shape[0]), 
270 |                         free_raw_data=False)
271 |             lgb_test = lgb.Dataset(X, y, reference=lgb_train,
272 |                         weight=np.ones(X.shape[0]), 
273 |                         free_raw_data=False)
274 |             self.gbm = lgb.train(
275 |                 self.kwargs,
276 |                 lgb_train,
277 |                 num_boost_round=10,
278 |                 fobj=mspe,
279 |                 feval=evalerror_lgbm,
280 |                 valid_sets=lgb_test)
281 |         else:
282 |             X_train, X_test, y_train, y_test = train_test_split(
283 |                 X, y, test_size=0.3)
284 |             #lgb_test = lgb.Dataset(X, y, reference=lgb_train,
285 |             #            weight=np.ones(X.shape[0]), 
286 |             #            free_raw_data=False) 
287 |             self.gbm.fit(X, y, early_stopping_rounds=10, eval_set=[(X, y)], verbose=False)
288 |             #print "gbm best_iteration=", self.gbm.best_iteration
289 | 
290 |     def predict(self, X):
291 |         if self.use_mspe:
292 |             return self.gbm.predict(X)
293 |         else:
294 |             return self.gbm.predict(X, num_iteration=self.gbm.best_iteration)
295 | 
296 | def remove_outliers_by_classifier(X, y, dates, model, m=0.9):
297 |     #xgboost = XGBoost(max_depth=2, num_round=6000)
298 |     if np.isnan(X).any():
299 |         print("X contains NaN")
300 |     if np.isinf(X).any():
301 |         print("X contains inf")
302 |     if np.isnan(np.log(y)).any():
303 |         print("y contains nan")
304 |     if np.isinf(np.log(y)).any():
305 |         print("y contains inf")
306 |     print("X=", X.shape)
307 |     print("y=", y.shape)
308 |     model.fit(X, y)
309 |     y_pred = model.predict(X)
310 |     diff_values = np.abs(y_pred - y)
311 |     abs_diff_vals = np.abs(diff_values)
312 |     sorted_indexes = sorted(range(len(abs_diff_vals)), key = lambda x: abs_diff_vals[x])
313 |     sorted_indexes_lead = sorted_indexes[:int(len(abs_diff_vals)*m)]
314 |     return X[sorted_indexes_lead], y[sorted_indexes_lead], dates[sorted_indexes_lead]
315 | 
316 | 
317 | class BoxcoxModel(object):
318 |     def __init__(self, **kwargs):
319 |         #print("kwargs=", kwargs)
320 |         self.is_boxcox = kwargs.get("is_boxcox", False)
321 |         self.boxcox_lambda = kwargs.get("boxcox_lambda", 0.0)
322 |         self.Model = kwargs.get("model", GradientBoostingRegressor)
323 |         if "is_boxcox" in kwargs:
324 |             kwargs.pop("is_boxcox")
325 |         if "boxcox_lambda" in kwargs:
326 |             kwargs.pop("boxcox_lambda")
327 |         if "model" in kwargs:
328 |             kwargs.pop("model")
329 |         self.clf = self.Model(**kwargs)
330 |     def fit(self, X, y):
331 |         if self.is_boxcox:
332 |             self.clf.fit(X, stats.boxcox(y, self.boxcox_lambda))
333 |         else:
334 |             self.clf.fit(X, y)
335 |     def predict(self, X):
336 |         if self.is_boxcox:
337 |             return invboxcox(self.clf.predict(X), self.boxcox_lambda)
338 |         else:
339 |             return self.clf.predict(X)
340 | 
341 | class CombineModes(object):
342 |     def __init__(self, **kwargs):
343 |         self.models = copy.deepcopy(kwargs.get("models", None))
344 |         self.dates_train = copy.deepcopy(kwargs.get("dates_train", None))
345 |         self.weights = copy.deepcopy(kwargs.get("weights", None))
346 |         if self.weights is not None:
347 |             self.weights = np.asarray(self.weights)
348 |             self.weights = self.weights / np.sum(self.weights)
349 |         self.harmonic_mean = kwargs.get("harmonic_mean", True)
350 |         self.subsample = kwargs.get("subsample", 0.8)
351 |         self.combine_method = kwargs.get("combine_method", 0)
352 |         self.sample_weight = kwargs.get("sample_weight", None)
353 |         self.cache_file = kwargs.get("cache_file", "not useful any more")
354 |         self.model_hash_input_fit_key = []
355 |         self.cache_data = {}
356 |         self.is_save_cache_to_disk = True ### should be true otherwise all the model will be re-initialize each time
357 |         if self.models is not None:
358 |             self.model_hash_input_fit_key = [None] * len(self.models)
359 | 
360 |     def load_cache_data(self, ):
361 |         global global_variables
362 |         if "cache_combine_model" not in global_variables:
363 |             global_variables["cache_combine_model"] = {}
364 |         self.cache_data.update(global_variables["cache_combine_model"])
365 |         #print("self.cache_data=", self.cache_data)
366 | 
367 |     def save_cache_data(self, ):
368 |         global global_variables
369 |         if "cache_combine_model" not in global_variables:
370 |             global_variables["cache_combine_model"] = {}
371 |         global_variables["cache_combine_model"].update(self.cache_data)
372 |         #print("self.cache_data=", self.cache_data)
373 | 
374 |     def fit(self, X, y):
375 |         self.X_train = np.copy(X)
376 |         self.y_train = np.copy(y)
377 | 
378 |     def _fit(self, X, y, model_i):
379 |        mX = np.copy(X)
380 |        my = np.copy(y)
381 |        sub_index = range(len(my))
382 |        shuffle(sub_index)
383 |        if self.subsample < 1.0:
384 |            sub_index = sub_index[:int(len(my)*self.subsample)]
385 |        sub_index = np.asarray(sub_index)
386 |        sub_index.sort()
387 |        mX = mX[sub_index, :]
388 |        my = my[sub_index]
389 |        dates_train = copy.deepcopy(self.dates_train[sub_index])
390 |        if hasattr(self.models[model_i], "dates_train"):
391 |            self.models[model_i].dates_train = copy.deepcopy(dates_train)
392 |        self.models[model_i].fit(mX, my)
393 | 
394 |     def _fit_predict(self, X, model_i):
395 |         if self.cache_file is not None:
396 |             self.X_train.flags.writeable = False
397 |             self.y_train.flags.writeable = False
398 |             self.dates_train.flags.writeable = False
399 |             self.model_hash_input_fit_key[model_i] = str(hash(repr(self.models[model_i]))) + \
400 |                                                      str(hash(self.X_train.data)) + \
401 |                                                      str(hash(self.y_train.data)) + \
402 |                                                      str(hash(self.dates_train.data))
403 |             self.X_train.flags.writeable = True
404 |             self.y_train.flags.writeable = True
405 |             self.dates_train.flags.writeable = True
406 | 
407 |             X.flags.writeable = False
408 |             model_hash_predict_key = str(hash(X.data))
409 |             X.flags.writeable = True
410 |             total_key = self.model_hash_input_fit_key[model_i] + model_hash_predict_key
411 |             if total_key in self.cache_data:
412 |                 #print("using cache ", total_key)
413 |                 return np.asarray(self.cache_data[total_key])
414 |         self._fit(self.X_train, self.y_train, model_i)
415 |         ret_val = self.models[model_i].predict(X)
416 |         if self.cache_file is not None:
417 |             self.cache_data[total_key] = ret_val.tolist()
418 |             self.save_cache_data()
419 |         return ret_val
420 | 
421 |     def predict(self, X):
422 |         self.load_cache_data()
423 |         ret_val = None
424 |         if self.combine_method == 0:
425 |             ys = []
426 |             for i in range(len(self.models)):
427 |                 y_i = self._fit_predict(X, i)
428 |                 ys.append(y_i)
429 |             ret_val = compute_harmonic_mean(ys)
430 |         elif self.combine_method == 1:
431 |             ys = []
432 |             for i in range(len(self.models)):
433 |                 y_i = self._fit_predict(X, i)
434 |                 ys.append(y_i)
435 |             ys = np.asarray(ys)
436 |             ret_val = np.average(ys, axis=0)
437 |         elif self.combine_method == 2:
438 |             if self.weights is not None:
439 |                 sum_w = np.sum(self.weights)
440 |                 self.weights = self.weights / sum_w
441 |                 y = []
442 |                 for i in range(len(self.models)):
443 |                     y_i = self._fit_predict(X, i)
444 |                     y_i = np.asarray(y_i, np.float32)
445 |                     y_i *= np.asarray(self.weights[i], np.float32)
446 |                     y.append(y_i.reshape(-1))
447 |                 y = np.asarray(y)
448 |                 #print(y.shape)
449 |                 y = np.sum(y, axis=0)
450 |                 ret_val = y
451 |         return ret_val
452 |           
453 | 
454 | class DaterangeModel(object):
455 |     def is_in_predict_hour_range(self, item_datetime):
456 |         for i in self.predict_hour_range:
457 |             start_time = datetime(item_datetime.year, item_datetime.month, item_datetime.day, i[0][0], i[0][1])
458 |             end_time = datetime(item_datetime.year, item_datetime.month, item_datetime.day, i[1][0], i[1][1])
459 |             if item_datetime >= start_time and item_datetime < end_time:
460 |                 return True
461 |         return False
462 | 
463 |     def __str__(self, ):
464 |         members = [attr for attr in dir(self) if not callable(getattr(self, attr)) and not attr.startswith("__")]
465 |         ret_val = ""
466 |         for attr in members:
467 |             value = getattr(self, attr)
468 |             ret_val += "    (%s:%s)\n" % (attr, repr(value))
469 |         return "DaterangeModel: " + repr(self.clf) + " " + ret_val
470 | 
471 |     def __repr__(self, ):
472 |         return str(self)
473 | 
474 |     def __init__(self, **kwargs):
475 |         #print("kwargs=", kwargs)
476 |         self.dates_train = copy.deepcopy(kwargs.get("dates_train", None))
477 |         self.skip_date_ranges = copy.deepcopy(kwargs.get("skip_date_ranges", []))
478 |         self.ft_select = copy.deepcopy(kwargs.get("ft_select", None))
479 |         self.train_days = kwargs.get("train_days", None)
480 |         self.is_rm_outliers = kwargs.get("is_rm_outliers", None)
481 |         self.is_y_log = kwargs.get("is_y_log", False)
482 |         self.rm_outliers_m = kwargs.get("rm_outliers_m", 3.0)
483 |         self.Model = kwargs.get("model", GradientBoostingRegressor)
484 |         self.rm_outliers_key = kwargs.get("rm_outliers_key", [0, ])
485 |         self.is_avg_or_median = kwargs.get("is_avg_or_median", True)
486 |         self.is_boxcox = kwargs.get("is_boxcox", False)
487 |         self.boxcox_lambda = kwargs.get("boxcox_lambda", False)
488 |         self.y_log_e = kwargs.get("y_log_e", np.e)
489 |         self.random_state = kwargs.get("random_state", None)
490 |         self.anova_filter = kwargs.get("anova_filter", 0)
491 |         self.norm_y = kwargs.get("norm_y", False)
492 |         self.is_one_hot_encode = kwargs.get("is_one_hot_encode", False)
493 |         self.is_ft_union = kwargs.get("is_ft_union", None)
494 |         self.ft_th = kwargs.get("ft_th", None)
495 |         self.ft_weights = kwargs.get("ft_weights", None)
496 |         self.is_sample_weight = kwargs.get("is_sample_weight", None)
497 |         self.remove_non_predict_hour_range = kwargs.get("remove_non_predict_hour_range", False)
498 |         self.remove_test_date_data = kwargs.get("remove_test_date_data", False)
499 |         self.is_ignore_skip_date_count = kwargs.get("is_ignore_skip_date_count", False)
500 |         self.rm_n_head_days = kwargs.get("rm_n_head_days", 0)
501 |         self.rm_n_head_days_hours = kwargs.get("rm_n_head_days_hours", [(0, 6), (10, 15), (20, 22)])
502 |         self.predict_hour_range = kwargs.get("predict_hour_range",
503 |                                                 [
504 |                                                    [[8, 0], [10, 0]],
505 |                                                    [[17, 0], [19, 0]],
506 |                                                 ]
507 |                                             )
508 |         self.remove_outliers_by_classifier = kwargs.get("remove_outliers_by_classifier", None)
509 | 
510 |         self.ft_norm = kwargs.get("ft_norm", [])
511 |         self.ft_norm_clfs = []
512 |         # ft_norm = [0, 1]
513 |         #self.predict_hour_range = [
514 |         #     [[8, 0], [10, 0]],
515 |         #     [[17, 0], [19, 0]],
516 |         #]
517 | 
518 |         if self.Model is None:
519 |             self.Model = GradientBoostingRegressor
520 |         if "rm_n_head_days_hours" in kwargs:
521 |             kwargs.pop("rm_n_head_days_hours")
522 |         if "remove_outliers_by_classifier" in kwargs:
523 |             kwargs.pop("remove_outliers_by_classifier")
524 |         if "rm_n_head_days" in kwargs:
525 |             kwargs.pop("rm_n_head_days")
526 |         if "ft_norm" in kwargs:
527 |             kwargs.pop("ft_norm")
528 |         if "is_sample_weight" in kwargs:
529 |             kwargs.pop("is_sample_weight")
530 |         if "ft_select" in kwargs:
531 |             kwargs.pop("ft_select")
532 |         if "is_rm_outliers" in kwargs:
533 |             kwargs.pop("is_rm_outliers")
534 |         if "rm_outliers_m" in kwargs:
535 |             kwargs.pop("rm_outliers_m")
536 |         if "dates_train" in kwargs:
537 |             kwargs.pop("dates_train")
538 |         if "model" in kwargs:
539 |             kwargs.pop("model")
540 |         if "train_days" in kwargs:
541 |             kwargs.pop("train_days")
542 |         if "rm_outliers_key" in kwargs:
543 |             kwargs.pop("rm_outliers_key")
544 |         if "is_y_log" in kwargs:
545 |             kwargs.pop("is_y_log")
546 |         if "y_log_e" in kwargs:
547 |             kwargs.pop("y_log_e")
548 |         if "is_avg_or_median" in kwargs:
549 |             kwargs.pop("is_avg_or_median")
550 |         if "is_boxcox" in kwargs:
551 |             kwargs.pop("is_boxcox")
552 |         if "boxcox_lambda" in kwargs:
553 |             kwargs.pop("boxcox_lambda")
554 |         if "anova_filter" in kwargs:
555 |             kwargs.pop("anova_filter")
556 |         if "norm_y" in kwargs:
557 |             kwargs.pop("norm_y")
558 |         if "is_one_hot_encode" in kwargs:
559 |             kwargs.pop("is_one_hot_encode")
560 |         if "is_ft_union" in kwargs:
561 |             kwargs.pop("is_ft_union")
562 |         if "ft_th" in kwargs:
563 |             kwargs.pop("ft_th")
564 |         if "ft_weights" in kwargs:
565 |             kwargs.pop("ft_weights")
566 |         if "remove_non_predict_hour_range" in kwargs:
567 |             kwargs.pop("remove_non_predict_hour_range")
568 |         if "predict_hour_range" in kwargs:
569 |             kwargs.pop("predict_hour_range")
570 |         if "remove_test_date_data" in kwargs:
571 |             kwargs.pop("remove_test_date_data")
572 |         if "is_ignore_skip_date_count" in kwargs:
573 |             kwargs.pop("is_ignore_skip_date_count")
574 |         if "skip_date_ranges" in kwargs:
575 |             kwargs.pop("skip_date_ranges")
576 |         if "random_state" in kwargs:
577 |             kwargs.pop("random_state")
578 |         #print inspect.getargspec(self.Model.__init__)
579 |         arguments = inspect.getargspec(self.Model.__init__)[0]
580 |         if "random_state" in arguments:
581 |             kwargs["random_state"] = self.random_state
582 |         self.clf = self.Model(**kwargs)
583 |         #if hasattr(self.clf, "random_state"):
584 |         #    self.clf.random_state = random_state
585 |         if self.anova_filter > 0:
586 |             anova_filter_clf = SelectKBest(f_regression, k=self.anova_filter)
587 |             self.clf = make_pipeline(anova_filter_clf, self.clf)
588 |         if self.is_one_hot_encode:
589 |             self.enc = preprocessing.OneHotEncoder()
590 | 
591 | 
592 |     def is_need_skip_n_head_days_hours(self, cur_date): 
593 |         #self.rm_n_head_days_hours = kwargs.get("rm_n_head_days_hours", [(0, 8), (10, 17), (19, 22)])
594 |         for hour_range in self.rm_n_head_days_hours:
595 |             if cur_date.hour >= hour_range[0] and cur_date.hour < hour_range[1]:
596 |                 return True
597 |         return False
598 | 
599 |     def is_need_skip(self, cur_date):
600 |         for skip_date_range in self.skip_date_ranges:
601 |             if cur_date >= skip_date_range[0] and cur_date < skip_date_range[1]:
602 |                 return True
603 |         return False
604 | 
605 |     def fit(self, X, y):
606 |         X = np.copy(X)
607 |         y = np.copy(y)
608 |         if self.ft_th is not None:
609 |             for item in self.ft_th:
610 |               ft_pos = item[0]
611 |               th = item[1]
612 |               print("ft_pos=", ft_pos)
613 |               print("X.shape=", X.shape)
614 |               print(X[:5, ft_pos])
615 |               #print(np.sum(X[:, ft_pos]))
616 |               positive_items = X[:, ft_pos] >= th
617 |               negative_items = np.logical_not(positive_items)
618 |               X[positive_items, ft_pos] = 1
619 |               X[negative_items, ft_pos] = 0
620 |               #print(np.sum(X[:, ft_pos]))
621 |         for pos in self.ft_norm:
622 |             min_max_scaler = preprocessing.MinMaxScaler()
623 |             min_max_scaler.fit(X[:, pos].reshape(-1, 1))
624 |             X[:, pos] = min_max_scaler.transform(X[:, pos].reshape(-1, 1)).reshape(-1)
625 |             self.ft_norm_clfs.append(min_max_scaler)
626 |         #tmp_dates_train = copy.deepcopy(self.dates_train)
627 |         #print(X[:5, :])
628 |         if self.ft_select is not None:
629 |             self.ft_select = np.asarray(self.ft_select)
630 |             self.ft_select = self.ft_select.reshape((-1, ))
631 |             X = X[:, self.ft_select]
632 |         if self.is_y_log:
633 |             y = np.log(y) / np.log(self.y_log_e)
634 |         elif self.is_boxcox:
635 |             y = boxcox(y, self.boxcox_lambda)
636 |         if self.norm_y:
637 |             self.norm_y_max_y = np.max(y)
638 |             self.norm_y_min_y = np.min(y)
639 |             y = (y-self.norm_y_min_y)/(self.norm_y_max_y-self.norm_y_min_y)
640 |         if not all(self.dates_train[i] <= self.dates_train[i+1]
641 |                 for i in xrange(len(self.dates_train)-1)):
642 |             raise ValueError("train dates are not sorted...")
643 |         tmp_dates_train = copy.deepcopy(self.dates_train)
644 |         if self.is_rm_outliers:
645 |             if self.dates_train is None:
646 |                 raise ValueError("self.dates_train is None")
647 |             X, y, tmp_dates_train = remove_outliers3(
648 |                 X, y, tmp_dates_train, self.rm_outliers_m,
649 |                 key=self.rm_outliers_key,
650 |                 is_avg_or_median=self.is_avg_or_median)
651 |         if self.ft_weights is not None:
652 |             self.ft_weights = np.asarray(self.ft_weights)
653 |             X = np.multiply(X, np.tile(self.ft_weights, (X.shape[0], 1)))
654 |         if self.remove_outliers_by_classifier is not None:
655 |             X, y, tmp_dates_train = remove_outliers_by_classifier(X, y, tmp_dates_train, **self.remove_outliers_by_classifier)
656 |         if self.is_one_hot_encode:
657 |             self.enc.fit(X)
658 |             X = self.enc.transform(X).toarray()
659 |             #print("X[:5,:]=", X[:5,:])
660 |         if self.is_ft_union is not None:
661 |             print("X[:5,:]=", X[:5,:])
662 |             print("X.shape=", X.shape)
663 |             self.is_ft_union.fit(X,y)
664 |             X_ft_u =self.is_ft_union.transform(X)
665 |             X = np.hstack([X, X_ft_u])
666 | 
667 |         i_train_days = 0
668 |         train_min_date = tmp_dates_train[-1]
669 |         while i_train_days < self.train_days and train_min_date >= tmp_dates_train[0]:
670 |             if not self.is_need_skip(train_min_date) or (self.is_ignore_skip_date_count):
671 |                 i_train_days += 1
672 |             train_min_date -= timedelta(days=1)
673 | 
674 |         train_min_date += timedelta(days=1)
675 |         print("i_train_days=", i_train_days)
676 |         print("real diff days=", (tmp_dates_train[-1] - train_min_date).days)
677 |         train_items = tmp_dates_train >= train_min_date
678 |         X_train = X[train_items, :]
679 |         y_train = y[train_items]
680 |         tmp_dates_train_left = tmp_dates_train[train_items]
681 |         if self.remove_non_predict_hour_range:
682 |             hour_range_items = []
683 |             for item_date in tmp_dates_train_left:
684 |                 if self.is_in_predict_hour_range(item_date):
685 |                     hour_range_items.append(True)
686 |                 else:
687 |                     hour_range_items.append(False)
688 |             hour_range_items = np.asarray(hour_range_items)
689 |             X_train = X_train[hour_range_items]
690 |             y_train = y_train[hour_range_items]
691 |             tmp_dates_train_left = tmp_dates_train_left[hour_range_items]
692 |             #print("tmp_dates_train_left=", tmp_dates_train_left[-60:])
693 | 
694 |         if self.rm_n_head_days > 0:
695 |             max_train_day = tmp_dates_train_left[-1] - timedelta(days=self.rm_n_head_days)
696 | 
697 |             n_head_days_items = []
698 |             for item_date in tmp_dates_train_left:
699 |                 if ( (not self.is_need_skip_n_head_days_hours(item_date)) and item_date >= max_train_day) \
700 |                         or item_date < max_train_day:
701 |                     n_head_days_items.append(True)
702 |                 else:
703 |                     n_head_days_items.append(False)
704 |             n_head_days_items = np.asarray(n_head_days_items)
705 |             X_train = X_train[n_head_days_items]
706 |             y_train = y_train[n_head_days_items]
707 |             tmp_dates_train_left = tmp_dates_train_left[n_head_days_items]
708 | 
709 |         if len(self.skip_date_ranges) > 0:
710 |             left_date_range_items = []
711 |             for item_date in tmp_dates_train_left:
712 |                 if not self.is_need_skip(item_date):
713 |                     left_date_range_items.append(True)
714 |                 else:
715 |                     left_date_range_items.append(False)
716 |             left_date_range_items = np.asarray(left_date_range_items)
717 |             X_train = X_train[left_date_range_items]
718 |             y_train = y_train[left_date_range_items]
719 |             tmp_dates_train_left = tmp_dates_train_left[left_date_range_items]
720 | 
721 |         print("date range min train dates:", tmp_dates_train_left[0])
722 |         print("date range max train dates:", tmp_dates_train_left[-1])
723 |         print("self.clf.name=", self.clf)
724 |         #self.dates_train = self.dates_train[train_items]
725 |         arguments = inspect.getargspec(self.clf.fit)[0]
726 |         if "sample_weight" in arguments and self.is_sample_weight > 0 and self.is_sample_weight is not None and self.is_sample_weight:
727 |             #print("use sample_weight")
728 |             sample_weight = []
729 |             for datei, tdate in enumerate(tmp_dates_train_left):
730 |                 if self.dates_train[-1] >= tdate:
731 |                     div_factor = int((self.dates_train[-1] - tdate).days/self.is_sample_weight) + 1
732 |                 else:
733 |                     div_factor = 1.0
734 |                 #sample_weight.append(1.0/(np.log(div_factor) + 1.0))
735 |                 sample_weight.append(1.0/div_factor)
736 |             sample_weight = np.asarray(sample_weight)
737 |             #sample_weight = (np.max(sample_weight) - sample_weight) / (np.max(sample_weight) - np.min(sample_weight))*100.0
738 |             self.clf.fit(X_train, y_train, sample_weight)
739 |         else:
740 |             self.clf.fit(X_train, y_train)
741 |         if hasattr(self.clf, "feature_importances_"):
742 |             print(self.clf.feature_importances_)
743 | 
744 |     def predict(self, X):
745 |         X = np.copy(X)
746 |         if self.ft_th is not None:
747 |             for item in self.ft_th:
748 |               ft_pos = item[0]
749 |               th = item[1]
750 |               #print(np.sum(X[:, ft_pos]))
751 |               positive_items = X[:, ft_pos] >= th
752 |               negative_items = np.logical_not(positive_items)
753 |               X[positive_items, ft_pos] = 1
754 |               X[negative_items, ft_pos] = 0
755 |               #print(np.sum(X[:, ft_pos]))
756 |         for i, pos in enumerate(self.ft_norm):
757 |             min_max_scaler = self.ft_norm_clfs[i]
758 |             X[:, pos] = min_max_scaler.transform(X[:, pos].reshape(-1, 1)).reshape(-1)
759 |             
760 |         if self.ft_weights is not None:
761 |             self.ft_weights = np.asarray(self.ft_weights)
762 |             X = np.multiply(X, np.tile(self.ft_weights, (X.shape[0], 1)))
763 |         if self.ft_select is not None:
764 |             self.ft_select = np.asarray(self.ft_select)
765 |             X = X[:, self.ft_select]
766 |         if self.is_one_hot_encode:
767 |             X = self.enc.transform(X).toarray()
768 |         if self.is_ft_union is not None:
769 |             X_ft_u = self.is_ft_union.transform(X)
770 |             X = np.hstack([X,X_ft_u])
771 |         pre_y = self.clf.predict(X)
772 |         if self.norm_y:
773 |             pre_y = pre_y * (self.norm_y_max_y - self.norm_y_min_y) + self.norm_y_min_y
774 |         if self.is_y_log:
775 |             pre_y = np.exp(pre_y * np.log(self.y_log_e))
776 |         elif self.is_boxcox:
777 |             pre_y =  invboxcox(pre_y, self.boxcox_lambda)
778 |         else:
779 |             pre_y = pre_y
780 |         return pre_y
781 | 
782 | 
783 | class TestModel(object):
784 |     def __init__(self,):
785 |         pass
786 | 
787 |     def fit(self, X, y):
788 |         self.clf = ExtraTreesRegressor()
789 |         #y = np.log(y)
790 |         self.clf.fit(X, y)
791 | 
792 |     def predict(self, X):
793 |         return self.clf.predict(X)
794 | 
795 | 


--------------------------------------------------------------------------------
/kdd2017/utils.py:
--------------------------------------------------------------------------------
   1 | #! /usr/bin/env python
   2 | # -*- coding: utf-8 -*-
   3 | 
   4 | # import necessary modules
   5 | import math
   6 | from datetime import datetime
   7 | from sklearn import preprocessing
   8 | import numpy as np
   9 | import copy
  10 | from datetime import timedelta
  11 | import collections
  12 | from sklearn.decomposition import IncrementalPCA
  13 | from scipy.stats import boxcox
  14 | import os
  15 | import matplotlib.pyplot as plt
  16 | from sklearn.preprocessing import OneHotEncoder
  17 | import itertools
  18 | import collections
  19 | from multiprocessing import Pool, TimeoutError
  20 | 
  21 | 
  22 | def findsubsets(S,m):
  23 |     return set(itertools.combinations(S, m))
  24 | 
  25 | def findsubsets2(S, add_origin=True):
  26 |     ret = set()
  27 |     for i in range(1, len(S)):
  28 |         ret = ret.union(findsubsets(S, i))
  29 |     print("S=", S)
  30 |     if add_origin:
  31 |         ret.add(tuple(S))
  32 |     return ret
  33 | 
  34 | 
  35 | def generate_final_volumes(volumes):
  36 |     volumes_final = {}
  37 |     tollgate_id_dirs = set()
  38 |     for start_time_window in volumes:
  39 |         for tollgate_id in volumes[start_time_window]:
  40 |             for direction in volumes[start_time_window][tollgate_id]:
  41 |                 tollgate_id_dirs.add((tollgate_id, direction))
  42 | 
  43 |     tollgate_id_dirs = list(tollgate_id_dirs)
  44 |     finnal_predict_times1 = [
  45 |                                 (datetime(2016,10,i,8), datetime(2016,10,i,10))
  46 |                                 for i in range(25, 32)
  47 |                             ]
  48 |     finnal_predict_times2 = [
  49 |                                 (datetime(2016,10,i,17), datetime(2016,10,i,19))
  50 |                                 for i in range(25, 32)
  51 |                             ]
  52 |     finnal_predict_times = finnal_predict_times1 + finnal_predict_times2
  53 |     for tollgate_id, direction in tollgate_id_dirs:
  54 |         predict_datetimes = []
  55 |         for time_range in finnal_predict_times:
  56 |             start_datetime = time_range[0]
  57 |             end_datetime = time_range[1]
  58 |             cur_datetime = start_datetime
  59 |             while cur_datetime < end_datetime:
  60 |                 predict_datetimes.append(cur_datetime)
  61 |                 cur_datetime = cur_datetime + timedelta(minutes=20)
  62 |         for predict_datetime in predict_datetimes:
  63 |             if predict_datetime not in volumes_final:
  64 |                 volumes_final[predict_datetime] = {}
  65 |             if tollgate_id not in volumes_final[predict_datetime]:
  66 |                 volumes_final[predict_datetime][tollgate_id] = {}
  67 |             if direction not in volumes_final[predict_datetime][tollgate_id]:
  68 |                 volumes_final[predict_datetime][tollgate_id][direction] = 1
  69 |     return volumes_final
  70 | 
  71 | 
  72 | def extract_is_work_day(cur_date):
  73 |     if cur_date.year == 2015:
  74 |         # http://news.sina.com.cn/c/2014-12-16/154731291679.shtml
  75 |         if cur_date.month == 1:
  76 |             if cur_date.day >= 1 and cur_date.day <= 3:
  77 |                 return 0
  78 |             if cur_date.day == 4:
  79 |                 return 1
  80 |         if cur_date.month == 2:
  81 |             if cur_date.day == 15 or cur_date.day == 28:
  82 |                 return 1
  83 |             if cur_date.day >= 18 and cur_date.day <= 24:
  84 |                 return 0
  85 |         if cur_date.month == 4:
  86 |             if cur_date.day >= 4 and cur_date.day <= 6:
  87 |                 return 0
  88 |         if cur_date.month == 5:
  89 |             if cur_date.day >= 1 and cur_date.day <= 3:
  90 |                 return 0
  91 |         if cur_date.month == 6:
  92 |             if cur_date.day >= 20 and cur_date.day <= 22:
  93 |                 return 0
  94 |         if cur_date.month == 9:
  95 |             if cur_date.day >= 26 and cur_date.day <= 27:
  96 |                 return 0
  97 |         if cur_date.month == 10:
  98 |             if cur_date.day >= 1 and cur_date.day <= 7:
  99 |                 return 0
 100 |             if cur_date.day == 10:
 101 |                 return 1
 102 |     if cur_date.year == 2016:
 103 |         # http://news.qq.com/cross/20151211/xK0R05S8.html
 104 |         if cur_date.month == 1:
 105 |             if cur_date.day >= 1 and cur_date.day <= 3:
 106 |                 return 0
 107 |         if cur_date.month == 2:
 108 |             if cur_date.day >= 7 and cur_date.day <= 13:
 109 |                 return 0
 110 |             if cur_date.day == 6 or cur_date.day == 4:
 111 |                 return 1
 112 |         if cur_date.month == 4:
 113 |             if cur_date.day >= 2 and cur_date.day <= 4:
 114 |                 return 0
 115 |         if cur_date.month == 5:
 116 |             if cur_date.day >= 1 and cur_date.day <= 2:
 117 |                 return 0
 118 |         if cur_date.month == 6:
 119 |             if cur_date.day >= 9 and cur_date.day <= 11:
 120 |                 return 0
 121 |             if cur_date.day == 12:
 122 |                 return 1
 123 |         if cur_date.month == 9:
 124 |             if cur_date.day >= 15 and cur_date.day <= 17:
 125 |                 return 0
 126 |             if cur_date.day == 18:
 127 |                 return 1
 128 |         if cur_date.month == 10:
 129 |             if cur_date.day >= 1 and cur_date.day <= 7:
 130 |                 return 0
 131 |             if cur_date.day == 8 or cur_date.day == 9:
 132 |                 return 1
 133 |     if cur_date.weekday() == 6:
 134 |         return 0
 135 |     if cur_date.weekday() == 5:
 136 |         return 0
 137 |     return 1
 138 |     #return cur_date.weekday != 6 and cur_date.weekday != 5
 139 | 
 140 | 
 141 | def load_links(path_links):
 142 |     lines = open(path_links, "r").readlines()
 143 |     link_data = {}
 144 |     for line in lines[1:]:
 145 |         #line = line.replace('"', '')
 146 |         line = line.strip()
 147 |         words = line.split('","')
 148 |         link_id = int(words[0].replace('"', ''))
 149 |         length = int(words[1])
 150 |         width = int(words[2])
 151 |         lanes = int(words[3])
 152 |         in_top = words[4].split(",")
 153 |         out_top = words[5].split(",")
 154 |         lane_width = int(words[6].replace('"', ''))
 155 |         link_data[link_id] = {}
 156 |         link_data[link_id]["length"] = length
 157 |         link_data[link_id]["width"] = width
 158 |         link_data[link_id]["lanes"] = lanes
 159 |         link_data[link_id]["in_top"] = in_top
 160 |         link_data[link_id]["out_top"] = out_top
 161 |         link_data[link_id]["lane_width"] = lane_width
 162 |     return link_data
 163 | 
 164 | 
 165 | def load_routes(path_routes):
 166 |     lines = open(path_routes, "r").readlines()
 167 |     routes_data = {}
 168 |     for line in lines[1:]:
 169 |         line = line.strip()
 170 |         line = line.replace('"', '')
 171 |         words = line.split(",")
 172 |         intersection_id = words[0]
 173 |         tollgate_id = int(words[1])
 174 |         link_seq = [int(link_id) for link_id in words[2:]]
 175 |         routes_data[(intersection_id, tollgate_id)] = link_seq
 176 |     return routes_data
 177 | 
 178 | def load_weather_info(path_weather_infos):
 179 |     datetime_weather = {}
 180 |     for path_weather_info in path_weather_infos:
 181 |         is_first_line = True
 182 |         for line in open(path_weather_info, "r"):
 183 |             if is_first_line:
 184 |                 is_first_line = False
 185 |                 continue
 186 |             line = line.replace('"', '')
 187 |             words = line.split(",")
 188 |             trace_start_time = words[0]
 189 |             trace_start_time = datetime.strptime(trace_start_time, "%Y-%m-%d")
 190 |             hour = int(words[1])
 191 |             trace_start_time = datetime(trace_start_time.year,
 192 |                                         trace_start_time.month,
 193 |                                         trace_start_time.day,
 194 |                                         hour)
 195 |             datetime_weather[trace_start_time] = {}
 196 |             datetime_weather[trace_start_time]["pressure"] = float(words[2])
 197 |             datetime_weather[trace_start_time]["sea_pressure"] = float(words[3])
 198 |             datetime_weather[trace_start_time]["wind_direction"] = float(words[4])
 199 |             datetime_weather[trace_start_time]["wind_speed"] = float(words[5])
 200 |             datetime_weather[trace_start_time]["temperature"] = float(words[6])
 201 |             datetime_weather[trace_start_time]["rel_humidity"] = float(words[7])
 202 |             datetime_weather[trace_start_time]["precipitation"] = float(words[8])
 203 |     return datetime_weather
 204 | 
 205 | 
 206 | def load_volumes_info(in_file_names):
 207 | 
 208 |     if not isinstance(in_file_names, list):
 209 |         in_file_names = [in_file_names]
 210 | 
 211 |     volumes = {}
 212 |     for in_file_name in in_file_names:
 213 |         # Step 1: Load volume data
 214 |         fr = open(in_file_name, 'r')
 215 |         fr.readline()  # skip the header
 216 |         vol_data = fr.readlines()
 217 |         fr.close()
 218 | 
 219 |         # Step 2: Create a dictionary to caculate and store volume per time window
 220 |         # volumes = {}  # key: time window value: dictionary
 221 |         for i in range(len(vol_data)):
 222 |             each_pass = vol_data[i].replace('"', '').split(',')
 223 |             tollgate_id = each_pass[1]
 224 |             direction = each_pass[2]
 225 | 
 226 |             pass_time = each_pass[0]
 227 |             pass_time = datetime.strptime(pass_time, "%Y-%m-%d %H:%M:%S")
 228 |             time_window_minute = int(math.floor(pass_time.minute / 20) * 20)
 229 |             #print pass_time
 230 |             start_time_window = datetime(pass_time.year, pass_time.month,
 231 |                                          pass_time.day,
 232 |                                          pass_time.hour, time_window_minute, 0)
 233 | 
 234 |             if start_time_window not in volumes:
 235 |                 volumes[start_time_window] = {}
 236 |             if tollgate_id not in volumes[start_time_window]:
 237 |                 volumes[start_time_window][tollgate_id] = {}
 238 |             if direction not in volumes[start_time_window][tollgate_id]:
 239 |                 volumes[start_time_window][tollgate_id][direction] = 1
 240 |             else:
 241 |                 volumes[start_time_window][tollgate_id][direction] += 1
 242 |     return volumes
 243 | 
 244 | 
 245 | 
 246 | 
 247 | def load_travel_times_from_trajectories(path_trajectorieses, skip_date_ranges,
 248 |                                         load_frequent_info=False,
 249 |                                         frequent_threshold=1):
 250 | 
 251 |     if isinstance(path_trajectorieses, basestring):
 252 |         path_trajectorieses = [path_trajectorieses]
 253 |     elif isinstance(path_trajectorieses, list):
 254 |         path_trajectorieses = path_trajectorieses
 255 |     else:
 256 |         raise ValueError("unknown format...")
 257 | 
 258 |     travel_times = {}
 259 |     for path_trajectories in path_trajectorieses:
 260 |         # Step 1: Load trajectories
 261 |         fr = open(path_trajectories, 'r')
 262 |         fr.readline()  # skip the header
 263 |         traj_data = fr.readlines()
 264 |         fr.close()
 265 |         # print(traj_data[0])
 266 | 
 267 |         vehicle_id_f = collections.defaultdict(lambda : 0)
 268 |         all_route_ids = set()
 269 |         ## compute the vehicule
 270 |         for i in range(len(traj_data)):
 271 |             each_traj = traj_data[i].replace('"', '').split(',')
 272 |             intersection_id = each_traj[0]
 273 |             tollgate_id = each_traj[1]
 274 |             vehicle_id = each_traj[2]
 275 |             route_id = intersection_id + '-' + tollgate_id
 276 |             all_route_ids.add(route_id)
 277 |             vehicle_id_f[(route_id, vehicle_id)] += 1
 278 | 
 279 |         frequent_route_ids = set()
 280 |         for key in vehicle_id_f:
 281 |             if vehicle_id_f[key] > 1:
 282 |                 route_id = key[0]
 283 |                 frequent_route_ids.add(route_id)
 284 |                 #print vehicle_id_f[key]
 285 |         if all_route_ids == frequent_route_ids:
 286 |             print("enough paramters")
 287 |         else:
 288 |             raise ValueError("not info...")
 289 | 
 290 |         # Step 2: Create a dictionary to store travel time for each route per time window
 291 |         # travel_times = {}  # key: route_id. Value is also a dictionary of which key is the start time for the time window and value is a list of travel times
 292 |         # travel_times_avg = collections.defaultdict(list)
 293 |         for i in range(len(traj_data)):
 294 |             each_traj = traj_data[i].replace('"', '').split(',')
 295 |             intersection_id = each_traj[0]
 296 |             tollgate_id = each_traj[1]
 297 | #            vehicle_id = each_traj[2]
 298 | 
 299 |             route_id = intersection_id + '-' + tollgate_id
 300 | 
 301 |             if load_frequent_info:
 302 |                 key = (route_id, vehicle_id)
 303 |                 if key not in vehicle_id_f or vehicle_id_f[key] <= frequent_threshold:
 304 |                     continue
 305 | 
 306 |             if route_id not in travel_times.keys():
 307 |                 travel_times[route_id] = {}
 308 | 
 309 |             trace_start_time = each_traj[3]
 310 |             trace_start_time = datetime.strptime(trace_start_time, "%Y-%m-%d %H:%M:%S")
 311 |             time_window_minute = math.floor(trace_start_time.minute / 20) * 20
 312 |             start_time_window = datetime(trace_start_time.year,
 313 |                                          trace_start_time.month,
 314 |                                          trace_start_time.day,
 315 |                                          trace_start_time.hour,
 316 |                                          int(time_window_minute),
 317 |                                          0)
 318 |             is_need_skip = False
 319 |             for skip_date_range in skip_date_ranges:
 320 |                 if start_time_window >= skip_date_range[0] and \
 321 |                         start_time_window < skip_date_range[1]:
 322 |                     is_need_skip = True
 323 |                     break
 324 |             if is_need_skip:
 325 |                 continue
 326 |             tt = float(each_traj[-1]) # travel time
 327 |             if start_time_window not in travel_times[route_id].keys():
 328 |                 travel_times[route_id][start_time_window] = [tt]
 329 |             else:
 330 |                 travel_times[route_id][start_time_window].append(tt)
 331 |             if trace_start_time.hour >= 8 and trace_start_time.hour <= 10 \
 332 |                  or trace_start_time.hour >= 17 and trace_start_time.hour <= 19:
 333 |                 key = (route_id, trace_start_time.hour, int(time_window_minute))
 334 |                 #travel_times_avg[key].append(tt)
 335 | 
 336 | #    for key in travel_times_avg:
 337 | #        print(key)
 338 | #        print(np.median(travel_times_avg[key]))
 339 | #
 340 | #    exit(0)
 341 | 
 342 |     return travel_times
 343 | 
 344 | def search_closest_date_weather_info(date, datetime_weather):
 345 |     closest_date = None
 346 |     for cur_date in datetime_weather:
 347 |         if closest_date is None or (cur_date <= date and closest_date < cur_date):
 348 |             closest_date = cur_date
 349 |     #print("closest_date=", closest_date)
 350 |     return datetime_weather[closest_date]
 351 | 
 352 | 
 353 | 
 354 | def convert_date_to_x_volumes(date, datetime_weather, tollgate_id_x):
 355 |     weather_info = search_closest_date_weather_info(
 356 |         date, datetime_weather)
 357 |     x = []
 358 |     #x.append(date.year)
 359 |     x.append(date.month) ## 2
 360 |     #x.append(date.day)
 361 |     x.append(date.hour) ## 3
 362 |     x.append(date.minute) ## 4
 363 |     x.append(date.weekday()) ## 5
 364 |     x.append(not extract_is_work_day(date)) # 6
 365 |     x.append(float(weather_info["wind_speed"])) ## 7
 366 |     x.append(float(weather_info["temperature"])) ## 8
 367 |     x.append(float(weather_info["rel_humidity"])) ## 9
 368 |     x.append(float(weather_info["precipitation"])) ## 10
 369 |     wod = [0] * 7
 370 |     wod[date.weekday()] = 1
 371 |     x += wod ## 11 - 17
 372 |     x += list(tollgate_id_x) # 18 - 41 (include)
 373 |     x += [date.weekday()==5 or date.weekday()==6] ## weekend 42
 374 |     return x
 375 | 
 376 | 
 377 | def transform_data(x_i, le, with_fit=True):
 378 |     if isinstance(le, preprocessing.MinMaxScaler) or isinstance(le, preprocessing.Binarizer):
 379 |         x_i = x_i.astype(np.float)
 380 |         if with_fit:
 381 |             le.fit(x_i.reshape((-1, 1)))
 382 |         x_i = le.transform(x_i.reshape((-1, 1)))
 383 |     elif isinstance(le, preprocessing.LabelEncoder):
 384 |         if with_fit:
 385 |             le.fit(x_i)
 386 |         x_i = le.transform(x_i.reshape((-1, 1)))
 387 |     else:
 388 |         raise ValueError("unknow transform")
 389 |     return x_i.reshape((-1)), le
 390 | 
 391 | def compute_tollgate_id_to_link_ids_x(routes_data,):
 392 |     link_ids = []
 393 | 
 394 |     tollgate_id_to_link_ids = collections.defaultdict(list)
 395 |     tollgate_id = set()
 396 |     for it in routes_data:
 397 |         tollgate_id_to_link_ids[it[1]] += routes_data[it]
 398 |         tollgate_id_to_link_ids[it[1]] = list(set(tollgate_id_to_link_ids[it[1]]))
 399 |         link_ids += routes_data[it]
 400 | 
 401 |     all_link_ids = list(set(link_ids))
 402 |     tollgate_id_to_x = {}
 403 |     for tollgate_id in tollgate_id_to_link_ids:
 404 |         x = [0] * len(all_link_ids)
 405 |         x = np.asarray(x)
 406 |         link_ids = tollgate_id_to_link_ids[tollgate_id]
 407 |         for link_id in link_ids:
 408 |             #print("all_link_ids=", all_link_ids)
 409 |             #print("link_id=", link_id)
 410 |             pos = all_link_ids.index(link_id)
 411 |             #print("pos=", pos)
 412 |             #print("x=", x)
 413 |             x[pos] = 1
 414 |         tollgate_id_to_x[tollgate_id] = x
 415 |     return tollgate_id_to_x
 416 | 
 417 | def convert_volumes_into_X_y(volumes, datetime_weather, link_data, routes_data,
 418 |                             les_train=None,
 419 |                             is_ret_raw_info=False, verbose=False):
 420 | 
 421 |     tollgate_id_to_x = compute_tollgate_id_to_link_ids_x(routes_data)
 422 |     tollgate_id_to_x_len = len(tollgate_id_to_x[tollgate_id_to_x.keys()[0]])
 423 |     print("tollgate_id_to_x_len=", tollgate_id_to_x_len)
 424 |     X = []
 425 |     y = []
 426 |     les = [
 427 |            preprocessing.LabelEncoder(), ## tollgate_id
 428 |            preprocessing.LabelEncoder(), ## direction
 429 |            preprocessing.LabelEncoder(), ## month
 430 |            preprocessing.LabelEncoder(), ## hour
 431 |            preprocessing.LabelEncoder(), ## minute
 432 |            preprocessing.LabelEncoder(), ## weekday
 433 |            preprocessing.LabelEncoder(), ## work day
 434 |            preprocessing.MinMaxScaler(), ## wind
 435 |            preprocessing.MinMaxScaler(), ## temperature
 436 |            preprocessing.MinMaxScaler(), ## rel_humidity
 437 |            preprocessing.MinMaxScaler(), ## precipitation
 438 |           ]
 439 |     wod_les = [preprocessing.LabelEncoder()] * 7
 440 |     les += wod_les
 441 |     les += [preprocessing.LabelEncoder()] * tollgate_id_to_x_len
 442 |     les += [preprocessing.LabelEncoder(),]
 443 |     les += [preprocessing.MinMaxScaler(), ] ## future hour feature
 444 |     dates = []
 445 |     raw_info = []
 446 |     for start_time_window in volumes:
 447 |         for tollgate_id in volumes[start_time_window]:
 448 |             #tollgate_id = int(tollgate_id)
 449 |             for direction in volumes[start_time_window][tollgate_id]:
 450 |                 dates.append(start_time_window)
 451 |                 x = []
 452 |                 x.append(tollgate_id)
 453 |                 x.append(direction)
 454 |                 #print("tollgate_id=", tollgate_id)
 455 |                 x += convert_date_to_x_volumes(start_time_window, datetime_weather, tollgate_id_to_x[int(tollgate_id)])
 456 |                 X.append(x)
 457 |                 y.append(volumes[start_time_window][tollgate_id][direction])
 458 |                 raw_info.append((tollgate_id, direction, start_time_window))
 459 |     X = np.asarray(X)
 460 |     y = np.asarray(y)
 461 | 
 462 |     X = add_prev_two_hour_ft(X, y, dates)
 463 |     print("X[:5,:]=", X[:5,:])
 464 |     print("y[:5]=", y[:5])
 465 |     for i in range(X.shape[1]):
 466 |         if les_train is None:
 467 |             le = les[i]
 468 |             x_i = X[:, i]
 469 |             x_i, le = transform_data(x_i, le, True)
 470 |             X[:, i] = x_i
 471 |         else:
 472 |             le = les_train[i]
 473 |             x_i = X[:, i]
 474 |             x_i, le = transform_data(x_i, le, False)
 475 |             X[:, i] = x_i
 476 |     X = np.asarray(X, np.float)
 477 | 
 478 | 
 479 |     dates = np.asarray(dates)
 480 |     sorted_indexes = sorted(range(len(dates)), key = lambda x: dates[x])
 481 |     X = X[sorted_indexes, :]
 482 |     y = y[sorted_indexes]
 483 |     dates = dates[sorted_indexes]
 484 |     if not is_ret_raw_info:
 485 |         if les_train is None:
 486 |             return X, y, dates, les
 487 |         else:
 488 |             return X, y, dates, les_train
 489 |     else:
 490 |         new_raw_info = []
 491 |         for i in sorted_indexes:
 492 |             new_raw_info.append(raw_info[i])
 493 |         raw_info = new_raw_info
 494 |         if les_train is None:
 495 |             return X, y, dates, les, raw_info
 496 |         else:
 497 |             return X, y, dates, les_train, raw_info
 498 | 
 499 | 
 500 | def convert_date_to_x(date, datetime_weather, link_ids_x):
 501 | 
 502 |     #print("len(link_ids_x)=", len(link_ids_x))
 503 |     weather_info = search_closest_date_weather_info(
 504 |         date, datetime_weather)
 505 | 
 506 |     x = []
 507 |     #x.append(date.year)
 508 |     x.append(date.month) ## 2
 509 |     x.append(float(date.day)) ## 3
 510 |     x.append(date.hour) ## 4
 511 |     x.append(date.minute) ## 5
 512 |     x.append(date.weekday()) ## 6
 513 |     x.append(not extract_is_work_day(date)) ## 7
 514 |     x.append(float(weather_info["wind_speed"])) ## 8
 515 |     x.append(float(weather_info["temperature"])) ## 9
 516 |     x.append(float(weather_info["rel_humidity"])) ## 10
 517 |     x.append(float(weather_info["precipitation"])) ## 11
 518 |     wod = [0] * 7
 519 |     wod[date.weekday()] = 1
 520 |     x += wod # 12 - 18
 521 |     x += list(link_ids_x) # 19 - 42 (include)
 522 |     return x
 523 | 
 524 | 
 525 | def compute_route_to_link_ids_x(routes_data,):
 526 |     link_ids = []
 527 |     tollgate_id = set()
 528 |     print("routes_data=", routes_data)
 529 |     for it in routes_data:
 530 |         link_ids += routes_data[it]
 531 |     all_link_ids = list(set(link_ids))
 532 | 
 533 |     route_to_x = {}
 534 |     for route_id in routes_data:
 535 |         x = [0] * len(all_link_ids)
 536 |         x = np.asarray(x)
 537 |         link_ids = routes_data[route_id]
 538 |         for link_id in link_ids:
 539 |             #print("all_link_ids=", all_link_ids)
 540 |             #print("link_id=", link_id)
 541 |             pos = all_link_ids.index(link_id)
 542 |             #print("pos=", pos)
 543 |             #print("x=", x)
 544 |             x[pos] = 1
 545 |         route_to_x[route_id] = x
 546 |     return route_to_x
 547 | 
 548 | 
 549 | def search_prev_two_hour_y(cur_date, y, dates):
 550 |     prev_cur_date = cur_date - timedelta(hours=2)
 551 |     for i, date in enumerate(dates):
 552 |         if date == prev_cur_date:
 553 |             return y[i]
 554 |     return 0
 555 | 
 556 | def add_prev_two_hour_ft(X, y, dates):
 557 |     addition_x = []
 558 |     for i, date in enumerate(dates):
 559 |         addition_x.append(search_prev_two_hour_y(date, y, dates))
 560 |     addition_x = np.asarray(addition_x).reshape((-1, 1))
 561 |     print("X.shape=", X.shape)
 562 |     print("addition_x.shape=", len(addition_x))
 563 |     X = np.hstack([X, np.asarray(addition_x)])
 564 |     return X
 565 | 
 566 | def convert_into_X_y(travel_times, datetime_weather, link_data, routes_data,
 567 |                      les_train=None, is_ret_raw_info=False, is_skip_not_trainning_hours=False, verbose=False):
 568 | 
 569 |     route_to_link_ids_x = compute_route_to_link_ids_x(routes_data)
 570 |     route_to_link_ids_x_len = len(route_to_link_ids_x[route_to_link_ids_x.keys()[0]])
 571 |     X = []
 572 |     y = []
 573 |     les = [
 574 |            preprocessing.LabelEncoder(),
 575 |            preprocessing.LabelEncoder(),
 576 |            preprocessing.LabelEncoder(),
 577 |            preprocessing.LabelEncoder(),
 578 |            preprocessing.LabelEncoder(),
 579 |            preprocessing.LabelEncoder(),
 580 |            preprocessing.LabelEncoder(),
 581 |            preprocessing.LabelEncoder(),
 582 |            preprocessing.MinMaxScaler(), ## wind speed
 583 |            preprocessing.MinMaxScaler(), ## temperature
 584 |            preprocessing.MinMaxScaler(), ## rel_humidity
 585 |            preprocessing.MinMaxScaler(), ## precipitation
 586 |           ]
 587 |     wod_les = [preprocessing.LabelEncoder()] * 7
 588 |     les += wod_les
 589 |     les += [preprocessing.MinMaxScaler(), ] * route_to_link_ids_x_len
 590 |     les += [preprocessing.MinMaxScaler(), ]
 591 |     dates = []
 592 |     raw_info = []
 593 |     for route_id in travel_times:
 594 |         if verbose:
 595 |             print("route_id=", route_id)
 596 |         for start_time_window in travel_times[route_id]:
 597 |             is_trainning_hour = False
 598 |             start_year = start_time_window.year
 599 |             start_month = start_time_window.month
 600 |             start_day = start_time_window.day
 601 |             if start_time_window >= datetime(start_year, start_month, start_day, 8) and \
 602 |                     start_time_window < datetime(start_year, start_month, start_day, 10):
 603 |                 is_trainning_hour = True
 604 |             if start_time_window >= datetime(start_year, start_month, start_day, 17) and \
 605 |                     start_time_window < datetime(start_year, start_month, start_day, 19):
 606 |                 is_trainning_hour = True
 607 |             if (not is_trainning_hour) and is_skip_not_trainning_hours:
 608 |                 continue
 609 |             if verbose:
 610 |                 print("start_time_window=", start_time_window)
 611 |             
 612 |             raw_info.append((route_id, start_time_window))
 613 |             A, B = route_id.split('-')
 614 |             x = []
 615 |             #x.append(route_id)
 616 |             x.append(A)
 617 |             x.append(B)
 618 |             route_id_for_link_id = (A, int(B))
 619 |             #print(route_to_link_ids_x[route_id_for_link_id])
 620 |             x += convert_date_to_x(start_time_window, datetime_weather, route_to_link_ids_x[route_id_for_link_id])
 621 |             #print("x=", x)
 622 |             if len(travel_times[route_id][start_time_window]) >= 1:
 623 |                 X.append(x)
 624 |                 y.append(np.average(travel_times[route_id][start_time_window]))
 625 |                 dates.append(start_time_window)
 626 |     X = np.asarray(X)
 627 |     y = np.asarray(y)
 628 |     X = add_prev_two_hour_ft(X, y, dates)
 629 |     print("X=", X)
 630 |     print("X[:5,:]=", X[:5,:])
 631 |     print("y[:5]=", y[:5])
 632 |     for i in range(X.shape[1]):
 633 |         if les_train is None:
 634 |             le = les[i]
 635 |             x_i = X[:, i]
 636 |             x_i, le = transform_data(x_i, le, True)
 637 |             X[:, i] = x_i
 638 |         else:
 639 |             le = les_train[i]
 640 |             x_i = X[:, i]
 641 |             x_i, le = transform_data(x_i, le, False)
 642 |             X[:, i] = x_i
 643 |     X = np.asarray(X, np.float)
 644 |     dates = np.asarray(dates)
 645 |     sorted_indexes = sorted(range(len(dates)), key = lambda x: dates[x])
 646 |     X = X[sorted_indexes, :]
 647 |     y = y[sorted_indexes]
 648 |     dates = dates[sorted_indexes]
 649 | 
 650 | #    if les_train is None:
 651 | #        print("LabelEncoder........info")
 652 | #        for le in les:
 653 | #            print(list(le.classes_))
 654 | 
 655 |     if not is_ret_raw_info:
 656 |         if les_train is None:
 657 |             return X, y, dates, les
 658 |         else:
 659 |             return X, y, dates, les_train
 660 |     else:
 661 |         new_raw_info = []
 662 |         for i in sorted_indexes:
 663 |             new_raw_info.append(raw_info[i])
 664 |         raw_info = new_raw_info
 665 |         if les_train is None:
 666 |             return X, y, dates, les, raw_info
 667 |         else:
 668 |             return X, y, dates, les_train, raw_info
 669 | 
 670 | 
 671 | #Function
 672 | def invboxcox(y,ld):
 673 |    if ld == 0:
 674 |       return(np.exp(y))
 675 |    else:
 676 |       return(np.exp(np.log(ld*y+1)/ld))
 677 | 
 678 | 
 679 | def mape_loss(y, y_predict):
 680 |     loss = np.sum(np.abs((y_predict - y) / y)) / float(len(y))
 681 |     if np.isnan(loss):
 682 |         return 100.0
 683 |     else:
 684 |         return loss
 685 | 
 686 | def inv_mape_loss(estimator, y, y_predict):
 687 |     return 1.0 - mape_loss(y, y_predict)
 688 | 
 689 | def remove_outliers(X, y, dates, m=3.0):
 690 |     ret_keep_indexes = np.asarray([], dtype=int)
 691 |     route_ids = set(X[:, 0])
 692 |     keep_indexes = np.asarray(range(len(y)))
 693 |     for route_id in route_ids:
 694 |         route_items = X[:, 0] == route_id
 695 |         y_route = y[route_items]
 696 |         keep_indexes_route = keep_indexes[route_items]
 697 |         avg_y_route = np.average(y_route)
 698 |         m_diff_y = y_route - avg_y_route
 699 |         stable_indexes = keep_indexes_route[
 700 |             abs(m_diff_y) < m * np.std(m_diff_y)]
 701 |         ret_keep_indexes = np.append(ret_keep_indexes, stable_indexes)
 702 |     ret_keep_indexes.sort()
 703 |     return X[ret_keep_indexes], y[ret_keep_indexes], dates[ret_keep_indexes]
 704 | 
 705 | 
 706 | def remove_outliers2(X, y, dates, m=3.0):
 707 |     ret_keep_indexes = np.asarray([], dtype=int)
 708 |     route_ids = set(X[:, 0])
 709 |     keep_indexes = np.asarray(range(len(y)))
 710 |     time_keys = set()
 711 |     for start_time_window in dates:
 712 |         key_str = "%02d%02d" % (
 713 |             start_time_window.hour,
 714 |             start_time_window.minute)
 715 |         time_keys.add(key_str)
 716 |     for time_key in time_keys:
 717 |         for route_id in route_ids:
 718 |             route_items = X[:, 0] == route_id
 719 |             X_route = X[route_items,:]
 720 |             y_route = y[route_items]
 721 |             dates_route = dates[route_items]
 722 |             keep_indexes_route = keep_indexes[route_items]
 723 |             inner_time_items = np.asarray([False] * len(dates_route))
 724 |             for i, date in enumerate(dates_route):
 725 |                 key_str = "%02d%02d" % (
 726 |                     start_time_window.hour,
 727 |                     start_time_window.minute)
 728 |                 if key_str == time_key:
 729 |                     inner_time_items[i] = True
 730 |             X_route_time = X_route[inner_time_items]
 731 |             y_route_time = y_route[inner_time_items]
 732 |             keep_indexes_route_time = keep_indexes_route[inner_time_items]
 733 |             avg_y_route_time = np.average(y_route_time)
 734 |             m_diff_y = y_route_time - avg_y_route_time
 735 |             stable_indexes = keep_indexes_route_time[
 736 |                 abs(m_diff_y) < m * np.std(m_diff_y)]
 737 |             ret_keep_indexes = np.append(ret_keep_indexes, stable_indexes)
 738 |     ret_keep_indexes.sort()
 739 | 
 740 | 
 741 | def remove_outliers3(X, y, dates, m=3.0, key=[0,1], is_avg_or_median=True):
 742 |     if len(key) == 0:
 743 |         return X, y, dates
 744 | 
 745 |     key = np.asarray(key)
 746 |     key = key[key < X.shape[1]]
 747 | 
 748 |     ret_keep_indexes = np.asarray([], dtype=int)
 749 |     entities = set()
 750 |     for entity in X[:, key]:
 751 |         entities.add(tuple(list(entity)))
 752 |     keep_indexes = np.asarray(range(len(y)))
 753 |     for entity in entities:
 754 |         entity_items = np.all(X[:, key] == entity, axis=1)
 755 |         y_entities = y[entity_items]
 756 |         keep_indexes_route = keep_indexes[entity_items]
 757 |         if is_avg_or_median == 1:
 758 |             avg_y_entity = np.average(y_entities)
 759 |             m_diff_y = y_entities - avg_y_entity
 760 |             stable_indexes = keep_indexes_route[
 761 |                 abs(m_diff_y) < m * np.std(m_diff_y)]
 762 |         elif is_avg_or_median == 0:
 763 |             d = np.abs(y_entities - np.median(y_entities))
 764 |             mdev = np.median(d)
 765 |             if mdev != 0:
 766 |                 s = d/mdev
 767 |             else:
 768 |                 s = [0.0,] * len(d)
 769 |             s = np.asarray(s)
 770 |             stable_indexes = keep_indexes_route[s<m]
 771 |         elif is_avg_or_median == 2:
 772 |             len_y_entities = len(y_entities)
 773 |             sorted_indexes = sorted(range(len_y_entities), key = lambda x: y_entities[x])
 774 |             edge = int(len_y_entities*m/10.0/2.0)
 775 |             #print("len_y_entities=", len_y_entities)
 776 |             #print("edge=", edge)
 777 |             if edge < len_y_entities / 2 and edge > 0:
 778 |                 sorted_indexes = sorted_indexes[edge:-edge]
 779 |                 stable_indexes = keep_indexes_route[sorted_indexes]
 780 |             elif edge == 0:
 781 |                 stable_indexes = keep_indexes_route[sorted_indexes]
 782 |             else:
 783 |                 stable_indexes = np.asarray([])
 784 |         elif is_avg_or_median == 3:
 785 |             avg_y_entity = np.average(y_entities)
 786 |             m_diff_y = y_entities - avg_y_entity
 787 |             stable_indexes = keep_indexes_route[
 788 |                 m_diff_y < m * np.std(m_diff_y)]
 789 |         else:
 790 |             raise ValueError("is_avg_or_median unknown....")
 791 |         ret_keep_indexes = np.append(ret_keep_indexes, stable_indexes)
 792 |     ret_keep_indexes.sort()
 793 |     return X[ret_keep_indexes], y[ret_keep_indexes], dates[ret_keep_indexes]
 794 | 
 795 | 
 796 | #def remove_outliers_by_classifier(X, y, dates, ):
 797 |     
 798 | def compute_loss(input_compute_loss):
 799 | 
 800 |     Model = input_compute_loss["Model"]
 801 |     config = input_compute_loss["config"]
 802 |     X_train = input_compute_loss["X_train"]
 803 |     y_train = input_compute_loss["y_train"]
 804 |     dates_train = input_compute_loss["dates_train"]
 805 |     X_test = input_compute_loss["X_test"]
 806 |     y_test = input_compute_loss["y_test"]
 807 |     is_y_log = input_compute_loss["is_y_log"]
 808 |     is_boxcox = input_compute_loss["is_boxcox"]
 809 |     loss_func = input_compute_loss["loss_func"]
 810 | 
 811 |     model = Model(**config)
 812 |     if hasattr(model ,"dates_train"):
 813 |         model.dates_train = dates_train
 814 |     if is_y_log:
 815 |         model.fit(X_train, np.log(y_train))
 816 |         predict_y_test = np.exp(model.predict(X_test))
 817 |     elif is_boxcox:
 818 |         model.fit(X_train, boxcox(y_train, boxcox_lambda))
 819 |         predict_y_test = invboxcox(model.predict(X_test), boxcox_lambda)
 820 |     else:
 821 |         model.fit(X_train, y_train)
 822 |         predict_y_test = model.predict(X_test)
 823 |     if loss_func is None:
 824 |         loss = mape_loss(y_test, predict_y_test)
 825 |     else:
 826 |         loss = loss_func(y_test, predict_y_test)
 827 |     return (repr(config), config, loss)
 828 | 
 829 | def compute_losses(input_compute_losses):
 830 |     result = []
 831 |     for input_compute_loss in input_compute_losses:
 832 |         result.append(compute_loss(input_compute_loss))
 833 |     return result
 834 |  
 835 | def GridSearchCVDates(Model, tuned_parameters, X, y, dates,
 836 |                       days_to_test=7, cv=5, loss_func=None,
 837 |                       is_y_log=False, is_boxcox=False, boxcox_lambda=1.0,
 838 |                       X_val=None, y_val=None, dates_val=None, skip_cvs=[],
 839 |                       is_estimate_val=True, estimate_val_w=2.0,
 840 |                       is_include_future_training=False,
 841 |                       test_date_ranges=None, n_cores=2):
 842 |     global_configs = []
 843 |     for tuned_parameter in tuned_parameters:
 844 |         configs = []
 845 |         init_config = {}
 846 |         for key in tuned_parameter:
 847 |             init_config[key] = tuned_parameter[key][0]
 848 |         configs.append(copy.copy(init_config))
 849 |         for key in tuned_parameter:
 850 |             tmp_configs = []
 851 |             for i in range(1, len(tuned_parameter[key])):
 852 |                 for iconfig in configs:
 853 |                     tmp_config = copy.copy(iconfig)
 854 |                     tmp_config[key] = tuned_parameter[key][i]
 855 |                     tmp_configs.append(tmp_config)
 856 |             configs += tmp_configs
 857 |         global_configs += configs
 858 |     if len(global_configs) == 0:
 859 |         global_configs.append({})
 860 |     max_date = dates[-1] + timedelta(days=1)
 861 |     report = collections.defaultdict(list)
 862 |     mem_config = {}
 863 |     nrepeat = 1
 864 | 
 865 |     if test_date_ranges is None:
 866 |         test_date_ranges = []
 867 |         for p, i in enumerate(range(cv) * nrepeat):
 868 |             if i in skip_cvs:
 869 |                 continue
 870 |             max_train_date = max_date - timedelta(days=(i + 1) * days_to_test)
 871 |             max_test_date = max_date - timedelta(days=(i) * days_to_test)
 872 |             test_date_ranges.append((max_train_date, max_test_date))
 873 |             print("range=", max_train_date, max_test_date)
 874 |     else:
 875 |         for max_train_date, max_test_date in test_date_ranges:
 876 |             print("range=", max_train_date, max_test_date)
 877 | 
 878 |     start_time = datetime.now()
 879 |     for p, date_time_range_item in enumerate(test_date_ranges*nrepeat):
 880 |         max_train_date, max_test_date = date_time_range_item
 881 |         train_items = dates < max_train_date
 882 |         test_items = np.asarray([False] * len(dates))
 883 |         inc_one_step = timedelta(days=1)
 884 |         cur_datetime = max_train_date
 885 |         while cur_datetime < max_test_date:
 886 |             ## training
 887 |             if is_include_future_training:
 888 |                 tmp_range_items = np.logical_and(
 889 |                     dates >= datetime(
 890 |                         cur_datetime.year, cur_datetime.month, cur_datetime.day,
 891 |                         hour=6),
 892 |                     dates < datetime(
 893 |                         cur_datetime.year, cur_datetime.month, cur_datetime.day,
 894 |                         hour=8))
 895 |                 train_items = np.logical_or(
 896 |                     train_items,
 897 |                     tmp_range_items
 898 |                     )
 899 |                 tmp_range_items = np.logical_and(
 900 |                     dates >= datetime(
 901 |                         cur_datetime.year, cur_datetime.month, cur_datetime.day,
 902 |                         hour=15),
 903 |                     dates < datetime(
 904 |                         cur_datetime.year, cur_datetime.month, cur_datetime.day,
 905 |                         hour=17))
 906 |                 train_items = np.logical_or(
 907 |                     train_items,
 908 |                     tmp_range_items
 909 |                     )
 910 |             ## testing
 911 |             tmp_range_items = np.logical_and(
 912 |                 dates >= datetime(
 913 |                     cur_datetime.year, cur_datetime.month, cur_datetime.day,
 914 |                     hour=8),
 915 |                 dates < datetime(
 916 |                     cur_datetime.year, cur_datetime.month, cur_datetime.day,
 917 |                     hour=10))
 918 |             test_items = np.logical_or(
 919 |                 test_items,
 920 |                 tmp_range_items
 921 |                 )
 922 |             tmp_range_items = np.logical_and(
 923 |                 dates >= datetime(
 924 |                     cur_datetime.year, cur_datetime.month, cur_datetime.day,
 925 |                     hour=17),
 926 |                 dates < datetime(
 927 |                     cur_datetime.year, cur_datetime.month, cur_datetime.day,
 928 |                     hour=19))
 929 |             test_items = np.logical_or(
 930 |                 test_items,
 931 |                 tmp_range_items
 932 |                 )
 933 |             cur_datetime += inc_one_step
 934 | 
 935 |         X_train = X[train_items]
 936 |         y_train = y[train_items]
 937 |         dates_train = dates[train_items]
 938 | 
 939 |         X_test = X[test_items]
 940 |         y_test = y[test_items]
 941 |         dates_test = dates[test_items]
 942 | 
 943 |         if len(y_test) == 0:
 944 |             continue
 945 | 
 946 |         input_compute_losses = []
 947 |         for ic, config in enumerate(global_configs):
 948 |             input_compute_loss = {}
 949 |             input_compute_loss["Model"] = Model
 950 |             input_compute_loss["config"] = config
 951 |             input_compute_loss["X_train"] = X_train
 952 |             input_compute_loss["y_train"] = y_train
 953 |             input_compute_loss["dates_train"] = dates_train
 954 |             input_compute_loss["X_test"] = X_test
 955 |             input_compute_loss["y_test"] = y_test
 956 |             input_compute_loss["is_y_log"] = is_y_log
 957 |             input_compute_loss["is_boxcox"] = is_boxcox
 958 |             input_compute_loss["loss_func"] = loss_func
 959 |             input_compute_losses.append(input_compute_loss)
 960 |         print("len(input_compute_losses)=", len(input_compute_losses))
 961 |         print("n_cores=", n_cores)
 962 |         ic = 0
 963 |         need_print_last_time = datetime.now()
 964 |         #pool = Pool(processes=n_cores)
 965 |         #for ret_val in pool.imap_unordered(compute_loss, input_compute_losses):
 966 |         for input_compute_loss in input_compute_losses:
 967 |             is_need_print = False
 968 |             diff_time_print = datetime.now() - need_print_last_time
 969 |             if diff_time_print.seconds > 30:
 970 |                 is_need_print = True
 971 |                 need_print_last_time = datetime.now()
 972 |             ret_val = compute_loss(input_compute_loss)
 973 |             if is_need_print:
 974 |                 print("p=", p)
 975 |                 print("ic %d/%d" % (ic, len(global_configs)))
 976 |             diff_time = datetime.now() - start_time
 977 |             cur_count = ic + p * len(global_configs) + 1
 978 |             total_count = len(global_configs) * nrepeat * len(test_date_ranges)
 979 | 
 980 |             config_key = ret_val[0]
 981 |             config = ret_val[1]
 982 |             loss = ret_val[2]
 983 | 
 984 |             report[repr(config)].append(loss)
 985 |             mem_config[repr(config)] = config
 986 |             if is_need_print:
 987 |                 print("progress %d/%d" % (cur_count, total_count))
 988 |                 remaining_seconds = diff_time.seconds / float(cur_count) * float(total_count - cur_count)
 989 |                 print("remaining minutes = %d" % int(remaining_seconds/60))
 990 |                 print("loss=", loss)
 991 |             ic += 1
 992 | 
 993 | 
 994 |     if is_estimate_val and not is_include_future_training:
 995 |         start_time = datetime.now()
 996 |         for nr in range(nrepeat):
 997 |             for ic, config in enumerate(global_configs):
 998 |                 print("val part")
 999 |                 print("ic %d/%d" % (ic, len(global_configs)))
1000 |                 print("nr = ", nr)
1001 |                 print("config=", config)
1002 |                 diff_time = datetime.now() - start_time
1003 |                 total_count = ic + nr * len(global_configs) + 1
1004 |                 remaining_seconds = diff_time.seconds / float(total_count) * float(len(global_configs) * nrepeat - total_count)
1005 |                 print("remaining minutes = %d" % int(remaining_seconds/60))
1006 |                 model = Model(**config)
1007 |                 if hasattr(model ,"dates_train"):
1008 |                     model.dates_train = copy.deepcopy(dates)
1009 |                 if is_y_log:
1010 |                     model.fit(X, np.log(y))
1011 |                     predict_y_val = np.exp(model.predict(X_val))
1012 |                 elif is_boxcox:
1013 |                     model.fit(X, boxcox(y, boxcox_lambda))
1014 |                     predict_y_val = invboxcox(model.predict(X_val), boxcox_lambda)
1015 |                 else:
1016 |                     model.fit(X, y)
1017 |                     predict_y_val = model.predict(X_val)
1018 |                 if loss_func is None:
1019 |                     loss = mape_loss(y_val, predict_y_val)
1020 |                 else:
1021 |                     loss = loss_func(y_val, predict_y_val)
1022 |                 for i in range(int(estimate_val_w)):
1023 |                     report[repr(config)].append(loss)
1024 |                 print("loss=", loss)
1025 | 
1026 |     all_loss_configs = []
1027 |     for config in report:
1028 |         end_pos = len(report[config]) / 6 * 5
1029 |         all_loss_configs.append((
1030 |                                   np.average(report[config][:end_pos]), config, report[config], np.average(report[config]), np.std(report[config])
1031 |                                ))
1032 |     all_loss_configs = sorted(all_loss_configs, key=lambda x: x[0])
1033 |     print("summary:")
1034 |     for loss_config in all_loss_configs[::-1]:
1035 |         #print("loss_config=", loss_config)
1036 |         loss_val = loss_config[0]
1037 |         config_str = str(loss_config[1])
1038 |         avg_val = float(loss_config[3])
1039 |         std_val = float(loss_config[4])
1040 |         print("key loss=", loss_val , "avg loss=", avg_val, " std_val=", std_val, " all losses=", loss_config[2])
1041 |         print(config_str)
1042 |     print("end summary")
1043 |     best_config = all_loss_configs[0][1]
1044 |     best_loss = all_loss_configs[0][0]
1045 |     return mem_config[best_config], best_loss
1046 | 
1047 | 
1048 | 
1049 | def GridSearchCVDatesWithVal(Configurations,
1050 |                              X_train, y_train, dates_train,
1051 |                              X_val, y_val, dates_val,
1052 |                              X_final,
1053 |                              is_y_log=False, is_boxcox=False, boxcox_lambda=1.0,
1054 |                              is_include_val_loss_for_eval=True, cv=5, skip_cvs=[], days_to_test=7,
1055 |                              X_frequent_train=None, y_frequent_train=None, is_estimate_val=True,
1056 |                              estimate_val_w=2.0,
1057 |                              is_include_future_training=False, remove_future_training_test=True,
1058 |                              test_date_ranges=None, n_cores=2):
1059 |     ## search for best meta parameters
1060 |     report = []
1061 |     for config in Configurations:
1062 |         print("="*60)
1063 |         Model = config["model"]
1064 |         print(type(Model()).__name__)
1065 |         tuned_parameters = config["tuned_parameters"]
1066 |         best_config, best_loss = GridSearchCVDates(
1067 |             Model, tuned_parameters,
1068 |             X_train, y_train, copy.deepcopy(dates_train), loss_func=None,
1069 |             is_boxcox=is_boxcox, boxcox_lambda=boxcox_lambda,
1070 |             cv=cv, skip_cvs=skip_cvs,
1071 |             X_val=X_val, y_val=y_val, days_to_test=days_to_test,
1072 |             is_estimate_val=is_estimate_val, estimate_val_w=estimate_val_w,
1073 |             is_include_future_training=is_include_future_training, test_date_ranges=test_date_ranges, n_cores=n_cores)
1074 |         if is_include_val_loss_for_eval and (not is_include_future_training):
1075 |             cur_Model = Model
1076 |             cur_config = copy.deepcopy(best_config)
1077 |             cur_model = cur_Model(**cur_config)
1078 |             if hasattr(cur_model, "dates_train"):
1079 |                 cur_model.dates_train = copy.deepcopy(dates_train)
1080 |             if is_y_log:
1081 |                 cur_model.fit(X_train, np.log(y_train))
1082 |                 predict_y_val = np.exp(cur_model.predict(X_val))
1083 |             elif is_boxcox:
1084 |                 cur_model.fit(X_train, boxcox(y_train, boxcox_lambda))
1085 |                 predict_y_val = invboxcox(cur_model.predict(X_val), boxcox_lambda)
1086 |             else:
1087 |                 cur_model.fit(X_train, y_train)
1088 |                 predict_y_val = cur_model.predict(X_val)
1089 |             val_loss = mape_loss(y_val, predict_y_val)
1090 |             best_loss = (best_loss + val_loss)/2.0
1091 |         report.append((best_loss, Model, best_config))
1092 | 
1093 |     report = sorted(report, key = lambda x: x[0])
1094 |     for item in report[::-1]:
1095 |         print("model=", type(item[1]()).__name__,
1096 |               " loss=", item[0],
1097 |               " best_config=", item[2])
1098 | 
1099 |     best_Model = report[0][1]
1100 |     best_config = report[0][2]
1101 |     best_model = best_Model(**best_config)
1102 | 
1103 |     if (not is_include_future_training) or remove_future_training_test:
1104 |         X_train_all = X_train
1105 |         y_train_all = y_train
1106 |         if hasattr(best_model, "dates_train"):
1107 |             #print("debug best_model has dates_train")
1108 |             best_model.dates_train = copy.deepcopy(dates_train)
1109 |     else:
1110 |         print("include future value for fitting")
1111 |         X_train_all = np.vstack([X_train, X_val])
1112 |         y_train_all = np.hstack([y_train, y_val])
1113 |         if hasattr(best_model, "dates_train"):
1114 |             dates_train_all = np.hstack([dates_train, dates_val])
1115 |             best_model.dates_train = copy.deepcopy(dates_train_all)
1116 |             if not all(dates_train_all[i] <= dates_train_all[i+1]
1117 |                     for i in xrange(len(dates_train_all)-1)):
1118 |                 raise ValueError("train dates are not sorted...")
1119 |         #print("X_train_all=", X_train_all[-10:, :])
1120 |         #print("best_model.dates_train=", best_model.dates_train[-60:])
1121 |         #print("X_train_all.shape=", X_train_all.shape)
1122 |         #print("y_train_all.shape=", y_train_all.shape)
1123 |         #print("best_model.dates_train.shape=", best_model.dates_train.shape)
1124 | 
1125 |     if is_y_log:
1126 |         best_model.fit(X_train_all, np.log(y_train_all))
1127 |         predict_y_val = np.exp(best_model.predict(X_val))
1128 |         predict_y_final = np.exp(best_model.predict(X_final))
1129 |     elif is_boxcox:
1130 |         best_model.fit(X_train_all, boxcox(y_train_all, boxcox_lambda))
1131 |         predict_y_val = invboxcox(best_model.predict(X_val), boxcox_lambda)
1132 |         predict_y_final = invboxcox(best_model.predict(X_final), boxcox_lambda)
1133 |     else:
1134 |         best_model.fit(X_train_all, y_train_all)
1135 |         predict_y_val = best_model.predict(X_val)
1136 |         predict_y_final = best_model.predict(X_final)
1137 | 
1138 |     loss = mape_loss(y_val, predict_y_val)
1139 |     print("X_train_all[0]=", X_train_all[0])
1140 |     print("X_train_all[-1]=", X_train_all[-1])
1141 |     print("best_model=", type(best_model).__name__)
1142 |     print("best_config=", best_config)
1143 |     print("best model loss on val=", loss)
1144 |     return predict_y_final
1145 | 
1146 | #    for config in global_configs:
1147 | #        print config
1148 | 
1149 | def plot_travel_times_fix_hour(working_dir, traval_times):
1150 |     out_dir = "travel_times_fix_hour"
1151 |     abs_out_dir = os.path.join(working_dir, out_dir)
1152 |     if not os.path.isdir(abs_out_dir):
1153 |         os.makedirs(abs_out_dir)
1154 |     time_keys = set()
1155 |     for route_id in traval_times:
1156 |         for start_time_window in traval_times[route_id]:
1157 |             key_str = "%02d%02d" % (start_time_window.hour,
1158 |                                 start_time_window.minute)
1159 |             time_keys.add(key_str)
1160 | 
1161 |     for time_key in time_keys:
1162 |         for route_id in traval_times:
1163 |             x = []
1164 |             y = []
1165 |             for start_time_window in traval_times[route_id]:
1166 |                 key_str = "%02d%02d" % (start_time_window.hour,
1167 |                                     start_time_window.minute)
1168 |                 if time_key == key_str:
1169 |                     x.append(start_time_window)
1170 |                     y.append(np.average(traval_times[route_id][start_time_window]))
1171 | 
1172 |             path_figure = os.path.join(abs_out_dir, "%s_%s.svg" % (route_id, time_key))
1173 |             plt.clf()
1174 |             plt.scatter(x, y, color="black", marker=".")
1175 |             plt.gcf().autofmt_xdate()
1176 |             plt.draw()
1177 |             plt.savefig(path_figure)
1178 | 
1179 |             path_figure = os.path.join(abs_out_dir, "%s_%s_log.svg" % (route_id, time_key))
1180 |             plt.clf()
1181 |             plt.scatter(x, np.log(y), color="black", marker="o")
1182 |             plt.gcf().autofmt_xdate()
1183 |             plt.draw()
1184 |             plt.savefig(path_figure)
1185 | 
1186 | def compute_harmonic_mean(predict_vals):
1187 |     predict_vals = np.vstack(predict_vals)
1188 |     predict_vals = 1.0/predict_vals
1189 |     print("predict_vals.shape=", predict_vals.shape)
1190 |     n_res = predict_vals.shape[0]
1191 |     print("n_res=", n_res)
1192 |     predict_vals = np.sum(predict_vals, axis=0)
1193 |     predict_vals = n_res / predict_vals
1194 |     return predict_vals.flatten()
1195 | 
1196 | def plot_travel_times_fix_date(working_dir, traval_times):
1197 |     out_dir = "travel_times_fix_date"
1198 |     abs_out_dir = os.path.join(working_dir, out_dir)
1199 |     if not os.path.isdir(abs_out_dir):
1200 |         os.makedirs(abs_out_dir)
1201 |     time_keys = set()
1202 |     for route_id in traval_times:
1203 |         for start_time_window in traval_times[route_id]:
1204 |             key_str = "%04d%02d%02d" % (
1205 |                 int(start_time_window.year),
1206 |                 int(start_time_window.month),
1207 |                 int(start_time_window.day), )
1208 |             time_keys.add(key_str)
1209 | 
1210 |     for time_key in time_keys:
1211 |         for route_id in traval_times:
1212 |             x = []
1213 |             y = []
1214 |             for start_time_window in traval_times[route_id]:
1215 |                 key_str = "%04d%02d%02d" % (
1216 |                     start_time_window.year,
1217 |                     start_time_window.month,
1218 |                     start_time_window.day, )
1219 |                 if time_key == key_str:
1220 |                     x.append(int("%02d%02d" % (start_time_window.hour, start_time_window.minute)))
1221 |                     y.append(np.average(traval_times[route_id][start_time_window]))
1222 |             if len(y) == 0:
1223 |                 continue
1224 |             sorted_indexes = sorted(range(len(y)), key=lambda i: x[i])
1225 |             x = np.asarray(x)
1226 |             y = np.asarray(y)
1227 |             x = x[sorted_indexes]
1228 |             y = y[sorted_indexes]
1229 |             #x = range(len(y))
1230 | 
1231 |             path_figure = os.path.join(abs_out_dir, "%s_%s.svg" % (route_id, time_key))
1232 |             plt.clf()
1233 |             plt.scatter(x, y, color="black", marker=".")
1234 |             #plt.gcf().autofmt_xdate()
1235 |             plt.draw()
1236 |             plt.savefig(path_figure)
1237 | 
1238 |             path_figure = os.path.join(abs_out_dir, "%s_%s_log.svg" % (route_id, time_key))
1239 |             plt.clf()
1240 |             plt.scatter(x, np.log(y), color="black", marker="o")
1241 |             #plt.gcf().autofmt_xdate()
1242 |             plt.draw()
1243 |             plt.savefig(path_figure)
1244 | 
1245 | if __name__ == "__main__":
1246 |     from kdd2017.utils import load_weather_info
1247 |     from kdd2017.utils import search_closest_date_weather_info
1248 |     from datetime import datetime
1249 |     weather_infos = ["/ssd/jinpeng/dataSets/dataSets/training/weather (table 7)_training_update.csv", "/ssd/jinpeng/dataSets/dataSets/testing_phase1/weather (table 7)_test1.csv"]
1250 | 
1251 |     datetime_weather = load_weather_info(weather_infos)
1252 | 
1253 |     cur_date = datetime(2016,10,18,12)
1254 |     weather = search_closest_date_weather_info(cur_date, datetime_weather)
1255 | 
1256 | 


--------------------------------------------------------------------------------
/bin/compute_kdd2017:
--------------------------------------------------------------------------------
   1 | #! /usr/bin/env python
   2 | # -*- coding: utf-8 -*-
   3 | 
   4 | from sklearn import linear_model
   5 | import numpy as np
   6 | 
   7 | from argparse import RawTextHelpFormatter
   8 | import argparse
   9 | import json
  10 | from datetime import timedelta
  11 | from datetime import datetime
  12 | import dateutil
  13 | import copy
  14 | import os
  15 | import matplotlib.pyplot as plt
  16 | import matplotlib
  17 | import sys, traceback
  18 | 
  19 | from kdd2017.utils import load_travel_times_from_trajectories
  20 | from kdd2017.utils import convert_into_X_y
  21 | from kdd2017.utils import mape_loss
  22 | from kdd2017.utils import inv_mape_loss
  23 | from kdd2017.utils import GridSearchCVDates
  24 | from kdd2017.utils import invboxcox
  25 | from kdd2017.utils import plot_travel_times_fix_date
  26 | from kdd2017.utils import plot_travel_times_fix_hour
  27 | from sklearn import linear_model
  28 | from kdd2017.utils import GridSearchCVDatesWithVal
  29 | 
  30 | from kdd2017.utils import remove_outliers
  31 | from kdd2017.utils import load_weather_info
  32 | 
  33 | from scipy.stats import boxcox
  34 | from sklearn.ensemble import GradientBoostingRegressor
  35 | from sklearn.ensemble import BaggingRegressor
  36 | from sklearn.ensemble import AdaBoostRegressor
  37 | from sklearn.ensemble import ExtraTreesRegressor
  38 | from sklearn.ensemble import RandomForestRegressor
  39 | 
  40 | from sklearn.neighbors import KNeighborsRegressor
  41 | from sklearn.neighbors import RadiusNeighborsRegressor
  42 | 
  43 | from sklearn.linear_model import Lasso
  44 | 
  45 | from sklearn.linear_model import LinearRegression
  46 | from sklearn.linear_model import ARDRegression
  47 | from sklearn.linear_model import HuberRegressor
  48 | from sklearn.linear_model import LinearRegression
  49 | from sklearn.linear_model import LogisticRegression
  50 | from sklearn.linear_model import LogisticRegressionCV
  51 | from sklearn.linear_model import PassiveAggressiveRegressor
  52 | from sklearn.linear_model import RandomizedLogisticRegression
  53 | from sklearn.linear_model import RANSACRegressor
  54 | from sklearn.linear_model import SGDRegressor
  55 | from sklearn.linear_model import TheilSenRegressor
  56 | from sklearn.linear_model import logistic_regression_path
  57 | from sklearn.model_selection import train_test_split
  58 | 
  59 | 
  60 | from sklearn.neural_network import MLPRegressor
  61 | from sklearn.cross_decomposition import PLSRegression
  62 | from sklearn.svm import SVR
  63 | from sklearn.svm import LinearSVR
  64 | from sklearn.svm import NuSVR
  65 | 
  66 | from sklearn.tree import DecisionTreeRegressor
  67 | from sklearn.tree import ExtraTreeRegressor
  68 | 
  69 | 
  70 | from sklearn.pipeline import FeatureUnion
  71 | from sklearn.decomposition import PCA
  72 | from sklearn.feature_selection import SelectKBest
  73 | 
  74 | from sklearn.model_selection import GridSearchCV
  75 | from sklearn.model_selection import train_test_split
  76 | from sklearn.svm import SVR
  77 | from sklearn.metrics import r2_score
  78 | from sklearn.metrics import make_scorer
  79 | from sklearn.metrics import mean_absolute_error
  80 | from kdd2017.models import *
  81 | import itertools
  82 | 
  83 | import matplotlib.pyplot as plt
  84 | import matplotlib
  85 | import collections
  86 | 
  87 | from sklearn.neighbors import KNeighborsClassifier
  88 | from sklearn.gaussian_process import GaussianProcessRegressor
  89 | from sklearn.isotonic import IsotonicRegression
  90 | 
  91 | from kdd2017.utils import findsubsets
  92 | from kdd2017.utils import findsubsets2
  93 | 
  94 | from kdd2017.utils import load_routes
  95 | from kdd2017.utils import load_links
  96 | #from kdd2017.remove_outliers import remove_outliers_by_classifier
  97 | #global_variables = {}
  98 | 
  99 | if __name__ == "__main__":
 100 |     description='''
 101 |         compute_kdd2017 /media/jl237561/usb_ext/workspace/kdd2017/kdd2017/config/config.json
 102 |     '''
 103 |     parser = argparse.ArgumentParser(
 104 |         description=description,
 105 |         formatter_class=RawTextHelpFormatter)
 106 | 
 107 |     parser.add_argument('config', type=unicode, nargs=1,
 108 |                         help='...')
 109 | 
 110 |     options = parser.parse_args()
 111 | 
 112 |     if not options.config:
 113 |         raise ValueError("Please set all the parameters.")
 114 | 
 115 |     config_path = options.config[0]
 116 |     config = json.load(open(config_path, "r"))
 117 |     path_trajectories_train = config["path_trajectories_train"]
 118 |     path_trajectories_val = config["path_trajectories_val"]
 119 |     path_working_dir = config["working_dir"]
 120 |     path_weather_infos = config["weather_infos"]
 121 |     path_links = config["path_links"]
 122 |     path_routes = config["path_routes"]
 123 | 
 124 |     #path_combine_model_cache = os.path.join(path_working_dir, "cache_combine_model_%d.json" % random.randint(0, 1000))
 125 |     #if os.path.isfile(path_combine_model_cache):
 126 |     #    raise ValueError("cache exist.. please remove %s" % path_combine_model_cache)
 127 |     #global global_variables
 128 |     #global_variables["cache_combine_model"] = {}
 129 | 
 130 |     datetime_weather = load_weather_info(path_weather_infos)
 131 |     routes_data = load_routes(path_routes)
 132 |     link_data = load_links(path_links)
 133 | 
 134 |     if not os.path.isdir(path_working_dir):
 135 |         os.makedirs(path_working_dir)
 136 | 
 137 |     test_date_ranges = None
 138 |     #test_date_ranges.append((datetime(2016, 10, 11, 23, 40), datetime(2016, 10, 18, 23, 40)))
 139 |     #test_date_ranges.append((datetime(2016, 9, 20, 23, 40), datetime(2016, 9, 27, 23, 40)))
 140 |     #test_date_ranges.append((datetime(2016, 9, 1, 23, 40), datetime(2016, 9, 8, 23, 40)))
 141 | 
 142 |     estimate_val_w=1.0
 143 |     n_cores=3
 144 |     is_estimate_val = False
 145 |     skip_cvs = []
 146 |     is_y_log = False
 147 |     is_boxcox = False
 148 |     is_include_val_loss_for_eval = is_estimate_val
 149 |     is_include_future_training = False
 150 |     remove_future_training_test = not is_include_future_training
 151 |     #remove_future_training_test = True
 152 |     skip_date_ranges = [
 153 |         #(datetime(2016, 9, 14), datetime(2016, 9, 16)),
 154 |         #(datetime(2016, 9, 14), datetime(2016, 9, 19)),
 155 |         #(datetime(2016, 10, 13), datetime(2016, 10, 14)),
 156 |         #(datetime(2016, 9, 30), datetime(2016, 10, 1)),
 157 |     ]
 158 |     boxcox_lambda = -1.0
 159 |     travel_times_train = load_travel_times_from_trajectories(
 160 |         path_trajectories_train, skip_date_ranges=[], load_frequent_info=False)
 161 | 
 162 |     #plot_travel_times_fix_hour(path_working_dir, travel_times_train)
 163 |     #plot_travel_times_fix_date(path_working_dir, travel_times_train)
 164 |     travel_times_val = load_travel_times_from_trajectories(
 165 |         path_trajectories_val, skip_date_ranges=[], load_frequent_info=False)
 166 | 
 167 |     X_train, y_train, dates_train, les_train = convert_into_X_y(
 168 |         travel_times_train, datetime_weather, link_data, routes_data, )
 169 | 
 170 |     X_val, y_val, dates_val, _ = convert_into_X_y(
 171 |         travel_times_val, datetime_weather, link_data, routes_data, les_train, is_skip_not_trainning_hours=False)
 172 | 
 173 |     route_ids = set()
 174 |     for route_id in travel_times_train:
 175 |         route_ids.add(route_id)
 176 |     route_ids = list(route_ids)
 177 |     travel_times_predict = collections.OrderedDict()
 178 |     finnal_predict_times1 = [
 179 |                                 (datetime(2016,10,i,8), datetime(2016,10,i,10))
 180 |                                 for i in range(25, 32)
 181 |                             ]
 182 |     finnal_predict_times2 = [
 183 |                                 (datetime(2016,10,i,17), datetime(2016,10,i,19))
 184 |                                 for i in range(25, 32)
 185 |                             ]
 186 |     finnal_predict_times = finnal_predict_times1 + finnal_predict_times2
 187 |     for route_id in route_ids:
 188 |         travel_times_predict[route_id] = collections.OrderedDict()
 189 |         predict_datetimes = []
 190 |         for time_range in finnal_predict_times:
 191 |             start_datetime = time_range[0]
 192 |             end_datetime = time_range[1]
 193 |             cur_datetime = start_datetime
 194 |             while cur_datetime < end_datetime:
 195 |                 predict_datetimes.append(cur_datetime)
 196 |                 cur_datetime = cur_datetime + timedelta(minutes=20)
 197 |         for predict_datetime in predict_datetimes:
 198 |             travel_times_predict[route_id][predict_datetime] = [0, ]
 199 |     print("*" * 60)
 200 |     X_final, y_final, dates_final, _, raw_info = convert_into_X_y(
 201 |         travel_times_predict, datetime_weather, link_data, routes_data, les_train, True, verbose=False)
 202 | 
 203 |     Configurations = [
 204 |         #{
 205 |         #    "model": BoxcoxModel,
 206 |         #    "tuned_parameters": [
 207 |         #        {
 208 |         #           "model": [GradientBoostingRegressor],
 209 |         #           "loss": ["lad"],
 210 |         #           "learning_rate": [0.1,],
 211 |         #           "n_estimators": [200,],
 212 |         #           "is_boxcox": [True, ],
 213 |         #           "boxcox_lambda": [-1.0, -0.6, -0.3, 0, 0.1, 0.5, 1.0, 2.0],
 214 |         #        },
 215 |         #        {
 216 |         #           "model": [GradientBoostingRegressor,],
 217 |         #           "loss": ["lad"],
 218 |         #           "learning_rate": [0.1,],
 219 |         #           "n_estimators": [200,],
 220 |         #           "is_boxcox": [False, ],
 221 |         #           "boxcox_lambda": [1,],
 222 |         #        },
 223 |         #   ],
 224 |         #},
 225 |         #{
 226 |         #    "model": XGBoost,
 227 |         #    "tuned_parameters": [],
 228 |         #},
 229 | #        {
 230 | #            "model": GradientBoostingRegressor,
 231 | #            "tuned_parameters":[
 232 | #                {
 233 | #                    "n_estimators": [50, ],
 234 | #                    "criterion": ["mae", "mse", "friedman_mse",],
 235 | #                    "verbose": [3, ],
 236 | #                    'loss': ['ls', 'lad', 'huber', 'quantile', ],
 237 | #                },
 238 | #            ],
 239 | #        },
 240 | 
 241 | ### CombineModels.... final models..
 242 |         {
 243 |             "model": CombineModes,
 244 |             "tuned_parameters": [
 245 |                 {
 246 |                     'models': [
 247 |                         [
 248 |                             ## global
 249 |                             ## 1
 250 |                             DaterangeModel(
 251 |                                 model=MedianModel,
 252 |                                 train_days=19,
 253 |                                 ft_pos=[0, 1, 3, 5],
 254 |                                 is_y_log=True,
 255 |                                 y_log_e=np.e,
 256 |                                 #ft_pos=[0, 1, 3, 5],
 257 |                                 ft_select=[0, 1, 2, 4, 5, 7],
 258 |                                 is_rm_outliers=True,
 259 |                                 rm_outliers_m=4.0,
 260 |                                 rm_outliers_key=[0, 1, 2, ],
 261 |                                 is_avg_or_median=1, 
 262 |                             ),
 263 |                             ## 2
 264 |                             DaterangeModel(
 265 |                                 model=LGBM,
 266 |                                 rm_n_head_days=0,
 267 |                                 # subsample=0.6, not necessary
 268 |                                 colsample_bytree=0.8,
 269 |                                 learning_rate=0.1,
 270 |                                 num_leaves=8,
 271 |                                 ft_select=[0, 1, 2, 4, 5, 7, 9, 10],
 272 |                                 n_estimators=1000,
 273 |                                 ft_th=[(9, 0.25), (10, 0.55)],
 274 |                                 objective="regression",
 275 | 
 276 |                                 train_days=50,
 277 |                                 is_y_log=True,
 278 |                                 y_log_e=np.e,
 279 |                                 is_rm_outliers=True,
 280 |                                 rm_outliers_key=[0,1,2,],
 281 |                                 rm_outliers_m=4.0,
 282 |                                 is_avg_or_median=1,
 283 |                             ),
 284 |                             ## 3
 285 |                             DaterangeModel(
 286 |                                 model=LGBM,
 287 |                                 rm_n_head_days=0,
 288 |                                 # subsample=0.6, not necessary
 289 |                                 colsample_bytree=0.8,
 290 |                                 learning_rate=0.1,
 291 |                                 num_leaves=8,
 292 |                                 ft_select=[0, 1, 2, 3],
 293 |                                 n_estimators=200,
 294 |                                 ft_th=[(9, 0.25), (10, 0.55)],
 295 |                                 objective="regression",
 296 | 
 297 |                                 train_days=21,
 298 |                                 is_y_log=True,
 299 |                                 y_log_e=np.e,
 300 |                                 is_rm_outliers=True,
 301 |                                 rm_outliers_key=[0,1,2,],
 302 |                                 rm_outliers_m=4.0,
 303 |                                 is_avg_or_median=1,
 304 |                             ),
 305 |                             ### 4
 306 |                             DaterangeModel(
 307 |                                 model=NonparametricKNN,
 308 |                                 train_days=21,
 309 |                                 n_neighbors=3,
 310 |                                 loss="SMAPE",
 311 |                                 ft_th=[
 312 |                                          (9, 0.25),
 313 |                                          (10, 0.55),
 314 |                                          #(11, i/100.0),
 315 |                                       ],
 316 |                                 random_state=0,
 317 |                                 is_y_log=True,
 318 |                                 y_log_e=np.e,
 319 |                                 #is_one_hot_encode=is_one_hot_encode,
 320 |                                 #ft_pos=[0, 1, 3, 5],
 321 |                                 ft_select=[0, 1, 3, 4],
 322 |                                 is_rm_outliers=True,
 323 |                                 rm_outliers_m=4.0,
 324 |                                 rm_outliers_key=[0, 1, 2, ],
 325 |                                 is_avg_or_median=1,
 326 |                             ), 
 327 |                             ### 5
 328 |                             DaterangeModel(
 329 |                                 model=NonparametricKNN,
 330 |                                 train_days=50,
 331 |                                 n_neighbors=5,
 332 |                                 loss="SMAPE",
 333 |                                 ft_th=[
 334 |                                          (9, 0.25),
 335 |                                          (10, 0.55),
 336 |                                          #(11, i/100.0),
 337 |                                       ],
 338 |                                 random_state=0,
 339 |                                 is_y_log=True,
 340 |                                 y_log_e=np.e,
 341 |                                 #is_one_hot_encode=is_one_hot_encode,
 342 |                                 #ft_pos=[0, 1, 3, 5],
 343 |                                 ft_select=[0, 1, 2, 4, 5, 7, 12, 13, 14, 15, 16, 17, 18],
 344 |                                 is_rm_outliers=True,
 345 |                                 rm_outliers_m=4.0,
 346 |                                 rm_outliers_key=[0, 1, 2, ],
 347 |                                 is_avg_or_median=1,
 348 |                             ), 
 349 |                             ### 6
 350 |                             DaterangeModel(
 351 |                                 model=NonparametricKNN,
 352 |                                 train_days=90,
 353 |                                 n_neighbors=7,
 354 |                                 loss="L2",
 355 |                                 ft_th=[
 356 |                                          (9, 0.25),
 357 |                                          (10, 0.55),
 358 |                                          #(11, i/100.0),
 359 |                                       ],
 360 |                                 random_state=0,
 361 |                                 is_y_log=True,
 362 |                                 y_log_e=np.e,
 363 |                                 #is_one_hot_encode=is_one_hot_encode,
 364 |                                 #ft_pos=[0, 1, 3, 5],
 365 |                                 ft_select=[0, 1, 3, 4],
 366 |                                 is_rm_outliers=True,
 367 |                                 rm_outliers_m=4.0,
 368 |                                 rm_outliers_key=[0, 1, 2, ],
 369 |                                 is_avg_or_median=1,
 370 |                             ),
 371 |                             #### 7
 372 |                             DaterangeModel(
 373 |                                 model=NonparametricKNN,
 374 |                                 train_days=50,
 375 |                                 n_neighbors=5,
 376 |                                 loss="L1",
 377 |                                 ft_th=[
 378 |                                          (9, 0.25),
 379 |                                          (10, 0.55),
 380 |                                          #(11, i/100.0),
 381 |                                       ],
 382 |                                 random_state=0,
 383 |                                 is_y_log=True,
 384 |                                 y_log_e=np.e,
 385 |                                 #is_one_hot_encode=is_one_hot_encode,
 386 |                                 #ft_pos=[0, 1, 3, 5],
 387 |                                 ft_select=[0, 1, 3, 4],
 388 |                                 is_rm_outliers=True,
 389 |                                 rm_outliers_m=4.0,
 390 |                                 rm_outliers_key=[0, 1, 2, ],
 391 |                                 is_avg_or_median=1,
 392 |                             ), 
 393 |                             #### 8
 394 |                             DaterangeModel(
 395 |                                 model=MLPRegressor,
 396 |                                 train_days=21,
 397 |                                 early_stopping=True,
 398 |                                 ft_th=[
 399 |                                          (9, 0.25),
 400 |                                          (10, 0.55),
 401 |                                          #(11, i/100.0),
 402 |                                       ],
 403 |                                 random_state=0,
 404 |                                 is_y_log=True,
 405 |                                 y_log_e=np.e,
 406 |                                 #is_one_hot_encode=is_one_hot_encode,
 407 |                                 #ft_pos=[0, 1, 3, 5],
 408 |                                 ft_select=[0, 1, 2, 4, 5, 7, 9],
 409 |                                 is_rm_outliers=True,
 410 |                                 rm_outliers_m=4.0,
 411 |                                 rm_outliers_key=[0, 1, 2, ],
 412 |                                 is_avg_or_median=1,
 413 |                             ), 
 414 |                             #### 9
 415 |                             DaterangeModel(
 416 |                                 model=KNeighborsRegressor,
 417 |                                 train_days=21,
 418 |                                 n_neighbors=5,
 419 |                                 ft_th=[
 420 |                                          (9, 0.25),
 421 |                                          (10, 0.55),
 422 |                                          #(11, i/100.0),
 423 |                                       ],
 424 |                                 random_state=0,
 425 |                                 is_y_log=True,
 426 |                                 y_log_e=np.e,
 427 |                                 #is_one_hot_encode=is_one_hot_encode,
 428 |                                 #ft_pos=[0, 1, 3, 5],
 429 |                                 ft_select=[0, 1, 2, 4, 5, 7, 9],
 430 |                                 is_rm_outliers=True,
 431 |                                 rm_outliers_m=4.0,
 432 |                                 rm_outliers_key=[0, 1, 2, ],
 433 |                                 is_avg_or_median=1,
 434 |                             ), 
 435 |                             ### 10
 436 |                             DaterangeModel(
 437 |                                 model=GradientBoostingRegressor,
 438 |                                 ft_select=[0, 1, 2, 4, 5, 7, 9],
 439 |                                 rm_n_head_days=0,
 440 |                                 rm_n_head_days_hours=[(0, 6), (12, 15), (21, 22)],
 441 |                                 n_estimators=200,
 442 |                                 loss='ls',
 443 |                                 is_y_log=True,
 444 |                                 y_log_e=3.0,
 445 |                                 learning_rate=0.15,
 446 |                                 train_days=90,
 447 |                                 is_rm_outliers=True,
 448 |                                 rm_outliers_key=[0,1,2,],
 449 |                                 rm_outliers_m=4.0,
 450 |                                 max_depth=3,
 451 |                                 is_sample_weight=29,
 452 |                             ), 
 453 |                             #DaterangeModel(
 454 |                             #    model=GradientBoostingRegressor,
 455 |                             #    ft_select=ft_select,
 456 |                             #    rm_n_head_days=0,
 457 |                             #    rm_n_head_days_hours=[(0, 6), (12, 15), (21, 22)],
 458 |                             #    n_estimators=200,
 459 |                             #    loss='ls',
 460 |                             #    is_y_log=True,
 461 |                             #    y_log_e=3.0,
 462 |                             #    learning_rate=0.15,
 463 |                             #    train_days=model_day,
 464 |                             #    is_rm_outliers=True,
 465 |                             #    rm_outliers_key=[0,1,2,],
 466 |                             #    rm_outliers_m=4.0,
 467 |                             #    max_depth=3,
 468 |                             #    is_sample_weight=29,
 469 |                             #),
 470 |                             #### 11
 471 |                             #DaterangeModel(
 472 |                             #    model=XGBoost,
 473 |                             #    ft_select=ft_select,
 474 |                             #    num_round=6000,
 475 |                             #    early_stopping_rounds=10,
 476 |                             #    eta=0.02,
 477 |                             #    colsample_bytree=0.9,
 478 |                             #    subsample=0.6,
 479 |                             #    max_depth=3,
 480 |                             #    eval_metric='rmse',
 481 |                             #    objective='reg:gamma',
 482 |                             #    rm_n_head_days=0,
 483 |                             #    n_estimators=200,
 484 |                             #    booster='gbtree',
 485 | 
 486 |                             #    is_y_log=True,
 487 |                             #    y_log_e=3.0,
 488 |                             #    train_days=model_day,
 489 |                             #    is_rm_outliers=True,
 490 |                             #    rm_outliers_key=[0,1,2,],
 491 |                             #    rm_outliers_m=4.0,
 492 |                             #), 
 493 |                             ### 12
 494 |                            #DaterangeModel(
 495 |                             #    model=LGBM,
 496 |                             #    rm_n_head_days=0,
 497 |                             #    # subsample=0.6, not necessary
 498 |                             #    colsample_bytree=0.8,
 499 |                             #    learning_rate=0.1,
 500 |                             #    num_leaves=8,
 501 |                             #    ft_select=ft_select,
 502 |                             #    n_estimators=200,
 503 |                             #    ft_th=[(9, 0.25), (10, 0.55)],
 504 |                             #    objective="regression",
 505 | 
 506 |                             #    train_days=model_day,
 507 |                             #    is_y_log=True,
 508 |                             #    y_log_e=np.e,
 509 |                             #    is_rm_outliers=True,
 510 |                             #    rm_outliers_key=[0,1,2,],
 511 |                             #    rm_outliers_m=4.0,
 512 |                             #    is_avg_or_median=1,
 513 |                             #),
 514 |                          ]
 515 |                          #for n_neighbors in [3, 5, 7]
 516 |                          #for num_leaves in [8, 16, 32]
 517 | #                        ## LGBM
 518 | #                        #for loss in ["SMAPE", "L1", "L2"]
 519 | #                        #for loss in ["lad"]
 520 | #                        for n_neighbors in range(3, 5, 7)
 521 | #                        for model in [ LGBM,
 522 | #                                       #MedianModel
 523 | #                                       #GradientBoostingRegressor, 
 524 | #                                       #NonparametricKNN,
 525 | #                                       #KNeighborsRegressor,
 526 | #                                       #MLPRegressor,               
 527 | #                                       #HuberRegressor, PassiveAggressiveRegressor, RANSACRegressor, SGDRegressor, 
 528 | #                                       #TheilSenRegressor,
 529 | #                                       #RadiusNeighborsRegressor, PLSRegression, LinearSVR, #NuSVR,
 530 | #                                     ]
 531 |                          #for model_day in range(7, 90, 3)
 532 | #                        #for is_one_hot_encode in [True, False]
 533 |                          #for model_day in [21, 50, 90]
 534 | #                        for objective in ["regression", "regression_l2", "regression_l1", "huber", "fair", "poisson"]
 535 | #                        #for ft_select in [range(10)]
 536 | #                        #for ft_pos in [[0,1] + list(one) for one in findsubsets2(range(4, 8))]
 537 |                          #for ft_select in list( [[0, 1, ] + list(one) for one in findsubsets2(range(2, 10))])
 538 |                         #for ft_select in [[0, 1, 2, 4, 5, 7, 9, 10],]
 539 |                          #for ft_select in [[0, 1, 2, 4, 5, 7, 9], [0, 1, 2, 4, 5, 7] + range(12, 19), [0, 1, 2, 3], [0, 1, 2, 6], [0, 1, 3, 4], [0, 1, 2, 4, 5, 7],]
 540 | #                        #for ft_select in [[0, 1, 2, 4, 5, 7] + range(19, 43), ]
 541 |                      ],
 542 |                      #'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
 543 |                      'subsample': [1.0, ],
 544 |                      'combine_method': [2,],
 545 |                      'weights': [
 546 |                          [0.4, 1.4, 1.1, 1.0, x1/100.0, x2/100.0, 0.4, 0.1, 0.5, 1.2]  
 547 |                          for x1 in range(0, 201, 10)
 548 |                          for x2 in range(0, 201, 10)
 549 |                      ],
 550 |                 },
 551 |             ],
 552 |         },
 553 | #### CombineModels.... final models..
 554 | #        {
 555 | #            "model": CombineModes,
 556 | #            "tuned_parameters": [
 557 | #                {
 558 | #                    'models': [
 559 | #                        [
 560 | #                            ## global
 561 | #                            ## 1
 562 | #                            DaterangeModel(
 563 | #                                model=GradientBoostingRegressor,
 564 | #                                ft_select=[0, 1, 2, 4, 5, 7,],
 565 | #                                rm_n_head_days=10,
 566 | #                                rm_n_head_days_hours=[(0, 6), (12, 15), (21, 22)],
 567 | #                                n_estimators=200,
 568 | #                                loss='ls',
 569 | #                                is_y_log=True,
 570 | #                                y_log_e=3.0,
 571 | #                                learning_rate=0.15,
 572 | #                                train_days=50,
 573 | #                                is_rm_outliers=True,
 574 | #                                rm_outliers_key=[0,1,2,],
 575 | #                                rm_outliers_m=4.0,
 576 | #                                max_depth=3,
 577 | #                                is_sample_weight=29,
 578 | #                            ),
 579 | #                            # 2
 580 | #                            DaterangeModel(
 581 | #                                model=MedianModel,
 582 | #                                train_days=19,
 583 | #                                ft_pos=[0, 1, 3, 5],
 584 | #                                is_y_log=True,
 585 | #                                y_log_e=np.e,
 586 | #                                #ft_pos=[0, 1, 3, 5],
 587 | #                                ft_select=[0, 1, 2, 4, 5, 7],
 588 | #                                is_rm_outliers=True,
 589 | #                                rm_outliers_m=4.0,
 590 | #                                rm_outliers_key=[0, 1, 2, ],
 591 | #                                is_avg_or_median=1, 
 592 | #                            ),
 593 | #                            # 3
 594 | #                            DaterangeModel(
 595 | #                 
 596 | #                                train_days=23,
 597 | #                                random_state=0,
 598 | #                                is_y_log=True,
 599 | #                                y_log_e=np.e,
 600 | #                                #ft_pos=[0, 1, 3, 5],
 601 | #                                ft_select=[0, 1, 2, 4, 5, 7],
 602 | #                                is_rm_outliers=True,
 603 | #                                rm_outliers_m=4.0,
 604 | #                                rm_outliers_key=[0, 1, 2, ],
 605 | #                                is_avg_or_median=1, 
 606 | #                            ),
 607 | #                            # 4
 608 | #                            DaterangeModel(
 609 | #                                model=KNeighborsRegressor,
 610 | #                                train_days=10,
 611 | #                                is_y_log=True,
 612 | #                                y_log_e=np.e,
 613 | #                                #ft_pos=[0, 1, 3, 5],
 614 | #                                ft_select=[0, 1, 2, 4, 5, 7],
 615 | #                                is_rm_outliers=True,
 616 | #                                rm_outliers_m=4.0,
 617 | #                                rm_outliers_key=[0, 1, 2, ],
 618 | #                                is_avg_or_median=1, 
 619 | #                            ), 
 620 | #                            # 5
 621 | #                            DaterangeModel(
 622 | #                                model=MLPRegressor,
 623 | #                                train_days=35,
 624 | #                                random_state=0,
 625 | #                                is_y_log=True,
 626 | #                                y_log_e=np.e,
 627 | #                                #ft_pos=[0, 1, 3, 5],
 628 | #                                ft_select=[0, 1, 2, 4, 5, 7],
 629 | #                                is_rm_outliers=True,
 630 | #                                rm_outliers_m=4.0,
 631 | #                                rm_outliers_key=[0, 1, 2, ],
 632 | #                                is_avg_or_median=1, 
 633 | #                            ),
 634 | #                            # 6
 635 | #                            DaterangeModel(
 636 | #                                model=TheilSenRegressor,
 637 | #                                train_days=30,
 638 | #                                is_y_log=True,
 639 | #                                y_log_e=np.e,
 640 | #                                #ft_pos=[0, 1, 3, 5],
 641 | #                                ft_select=[0, 1, 2, 4, 5, 7],
 642 | #                                is_rm_outliers=True,
 643 | #                                rm_outliers_m=4.0,
 644 | #                                rm_outliers_key=[0, 1, 2, ],
 645 | #                                is_avg_or_median=1, 
 646 | #                            ),
 647 | #                            # 7 
 648 | #                            DaterangeModel(
 649 | #                                model=KNeighborsRegressor,
 650 | #                                train_days=60,
 651 | #                                is_y_log=True,
 652 | #                                y_log_e=np.e,
 653 | #                                #ft_pos=[0, 1, 3, 5],
 654 | #                                ft_select=[0, 1, 2, 4, 5, 7],
 655 | #                                is_rm_outliers=True,
 656 | #                                rm_outliers_m=4.0,
 657 | #                                rm_outliers_key=[0, 1, 2, ],
 658 | #                                is_avg_or_median=1, 
 659 | #                            ), 
 660 | #                            # 8  ## start to mix different features
 661 | #                            DaterangeModel(
 662 | #                                model=KNeighborsRegressor,
 663 | #                                train_days=14,
 664 | #                                is_y_log=True,
 665 | #                                y_log_e=np.e,
 666 | #                                #ft_pos=[0, 1, 3, 5],
 667 | #                                ft_select=[0, 1, 3, 4],
 668 | #                                is_rm_outliers=True,
 669 | #                                rm_outliers_m=4.0,
 670 | #                                rm_outliers_key=[0, 1, 2, ],
 671 | #                                is_avg_or_median=1, 
 672 | #                            ), 
 673 | #                            # 9
 674 | #                            DaterangeModel(
 675 | #                                model=MLPRegressor,
 676 | #                                train_days=56,
 677 | #                                random_state=0,
 678 | #                                is_y_log=True,
 679 | #                                y_log_e=np.e,
 680 | #                                ft_select=[0, 1, 3, 4, 7],
 681 | #                                is_rm_outliers=True,
 682 | #                                rm_outliers_m=4.0,
 683 | #                                rm_outliers_key=[0, 1, 2, ],
 684 | #                                is_avg_or_median=1,
 685 | #                            ), 
 686 | #                            ## 10
 687 | #                            DaterangeModel(
 688 | #                                model=KNeighborsRegressor,
 689 | #                                train_days=21,
 690 | #                                is_y_log=True,
 691 | #                                y_log_e=np.e,
 692 | #                                #ft_pos=[0, 1, 3, 5],
 693 | #                                ft_select=[0, 1, 3, 4],
 694 | #                                is_rm_outliers=True,
 695 | #                                rm_outliers_m=4.0,
 696 | #                                rm_outliers_key=[0, 1, 2, ],
 697 | #                                is_avg_or_median=1,
 698 | #                            ),
 699 | #                            ## 11
 700 | #                            DaterangeModel(
 701 | #                                model=MLPRegressor,
 702 | #                                train_days=7,
 703 | #                                random_state=0,
 704 | #                                is_y_log=True,
 705 | #                                y_log_e=np.e,
 706 | #                                #ft_pos=[0, 1, 3, 5],
 707 | #                                ft_select=[0, 1, 2, 6],
 708 | #                                is_rm_outliers=True,
 709 | #                                rm_outliers_m=4.0,
 710 | #                                rm_outliers_key=[0, 1, 2, ],
 711 | #                                is_avg_or_median=1,
 712 | #                            ), 
 713 | #                            ## 12
 714 | #                            DaterangeModel(
 715 | #                                model=KNeighborsRegressor,
 716 | #                                n_neighbors=6,
 717 | #                                train_days=84,
 718 | #                                #random_state=0,
 719 | #                                is_y_log=True,
 720 | #                                y_log_e=np.e,
 721 | #                                #ft_pos=[0, 1, 3, 5],
 722 | #                                ft_select=[0, 1, 2, 3],
 723 | #                                is_rm_outliers=True,
 724 | #                                rm_outliers_m=4.0,
 725 | #                                rm_outliers_key=[0, 1, 2, ],
 726 | #                                is_avg_or_median=1,
 727 | #                            ),
 728 | #                            ## 13
 729 | #                            DaterangeModel(
 730 | #                                model=KNeighborsRegressor,
 731 | #                                n_neighbors=4,
 732 | #                                train_days=63,
 733 | #                                ft_th=[
 734 | #                                         (9, 0.25),
 735 | #                                         (10, 0.55),
 736 | #                                         #(11, i/100.0),
 737 | #                                      ],
 738 | #                                #random_state=0,
 739 | #                                is_y_log=True,
 740 | #                                y_log_e=np.e,
 741 | #                                #ft_pos=[0, 1, 3, 5],
 742 | #                                ft_select=[0, 1, 2, 4, 5, 7, 9],
 743 | #                                is_rm_outliers=True,
 744 | #                                rm_outliers_m=4.0,
 745 | #                                rm_outliers_key=[0, 1, 2, ],
 746 | #                                is_avg_or_median=1,
 747 | #                            ), 
 748 | #                            ## 14
 749 | #                            DaterangeModel(
 750 | #                                model=KNeighborsRegressor,
 751 | #                                train_days=49,
 752 | #                                n_neighbors=3,
 753 | #                                ft_th=[
 754 | #                                         (9, 0.25),
 755 | #                                         (10, 0.55),
 756 | #                                         #(11, i/100.0),
 757 | #                                      ],
 758 | #                                is_y_log=True,
 759 | #                                y_log_e=np.e,
 760 | #                                #ft_pos=[0, 1, 3, 5],
 761 | #                                ft_select=[0, 1, 2, 4, 5, 7] + range(12, 19),
 762 | #                                is_rm_outliers=True,
 763 | #                                rm_outliers_m=4.0,
 764 | #                                rm_outliers_key=[0, 1, 2, ],
 765 | #                                is_avg_or_median=1,
 766 | #                            ), 
 767 | #                            ## 15
 768 | #                            DaterangeModel(
 769 | #                                model=KNeighborsRegressor,
 770 | #                                train_days=70,
 771 | #                                ft_th=[
 772 | #                                         (9, 0.25),
 773 | #                                         (10, 0.55),
 774 | #                                         #(11, i/100.0),
 775 | #                                      ],
 776 | #                                random_state=0,
 777 | #                                is_y_log=True,
 778 | #                                y_log_e=np.e,
 779 | #                                #ft_pos=[0, 1, 3, 5],
 780 | #                                ft_select=[0, 1, 3, 4],
 781 | #                                is_rm_outliers=True,
 782 | #                                rm_outliers_m=4.0,
 783 | #                                rm_outliers_key=[0, 1, 2, ],
 784 | #                                is_avg_or_median=1,
 785 | #                            ),
 786 | #                            ## 16
 787 | #                            DaterangeModel(
 788 | #                                model=NonparametricKNN,
 789 | #                                train_days=28,
 790 | #                                ft_th=[
 791 | #                                         (9, 0.25),
 792 | #                                         (10, 0.55),
 793 | #                                         #(11, i/100.0),
 794 | #                                      ],
 795 | #                                random_state=0,
 796 | #                                is_y_log=True,
 797 | #                                y_log_e=np.e,
 798 | #                                #is_one_hot_encode=is_one_hot_encode,
 799 | #                                #ft_pos=[0, 1, 3, 5],
 800 | #                                ft_select=[0, 1, 3, 4],
 801 | #                                is_rm_outliers=True,
 802 | #                                rm_outliers_m=4.0,
 803 | #                                rm_outliers_key=[0, 1, 2, ],
 804 | #                                is_avg_or_median=1,
 805 | #                            ), 
 806 | #                            ## 17
 807 | #                            DaterangeModel(
 808 | #                                model=NonparametricKNN,
 809 | #                                train_days=14,
 810 | #                                n_neighbors=3,
 811 | #                                loss="SMAPE",
 812 | #                                ft_th=[
 813 | #                                         (9, 0.25),
 814 | #                                         (10, 0.55),
 815 | #                                         #(11, i/100.0),
 816 | #                                      ],
 817 | #                                random_state=0,
 818 | #                                is_y_log=True,
 819 | #                                y_log_e=np.e,
 820 | #                                #is_one_hot_encode=is_one_hot_encode,
 821 | #                                #ft_pos=[0, 1, 3, 5],
 822 | #                                ft_select=[0, 1, 3, 4],
 823 | #                                is_rm_outliers=True,
 824 | #                                rm_outliers_m=4.0,
 825 | #                                rm_outliers_key=[0, 1, 2, ],
 826 | #                                is_avg_or_median=1,
 827 | #                            ), 
 828 | #                            ## 18
 829 | #                            DaterangeModel(
 830 | #                                model=NonparametricKNN,
 831 | #                                train_days=56,
 832 | #                                n_neighbors=3,
 833 | #                                loss="L1",
 834 | #                                ft_th=[
 835 | #                                         (9, 0.25),
 836 | #                                         (10, 0.55),
 837 | #                                         #(11, i/100.0),
 838 | #                                      ],
 839 | #                                random_state=0,
 840 | #                                is_y_log=True,
 841 | #                                y_log_e=np.e,
 842 | #                                #is_one_hot_encode=is_one_hot_encode,
 843 | #                                #ft_pos=[0, 1, 3, 5],
 844 | #                                ft_select=[0, 1, 2, 4, 5, 7, 9],
 845 | #                                is_rm_outliers=True,
 846 | #                                rm_outliers_m=4.0,
 847 | #                                rm_outliers_key=[0, 1, 2, ],
 848 | #                                is_avg_or_median=1,
 849 | #                            ), 
 850 | #                            ## 19
 851 | #                            DaterangeModel(
 852 | #                                model=NonparametricKNN,
 853 | #                                train_days=50,
 854 | #                                #loss=loss,
 855 | #                                ft_th=[
 856 | #                                         (9, 0.25),
 857 | #                                         (10, 0.55),
 858 | #                                         #(11, i/100.0),
 859 | #                                      ],
 860 | #                                random_state=0,
 861 | #                                is_y_log=True,
 862 | #                                y_log_e=np.e,
 863 | #                                #is_one_hot_encode=is_one_hot_encode,
 864 | #                                #ft_pos=[0, 1, 3, 5],
 865 | #                                ft_select=[0, 1, 3, 4],
 866 | #                                is_rm_outliers=True,
 867 | #                                rm_outliers_m=4.0,
 868 | #                                rm_outliers_key=[0, 1, 2, ],
 869 | #                                is_avg_or_median=1,
 870 | #                            ), 
 871 | #                            ## 20
 872 | #                            DaterangeModel(
 873 | #                                model=LGBM,
 874 | #                                train_days=21,
 875 | #                                #loss=loss,
 876 | #                                ft_th=[
 877 | #                                         (9, 0.25),
 878 | #                                         (10, 0.55),
 879 | #                                         #(11, i/100.0),
 880 | #                                      ],
 881 | #                                random_state=0,
 882 | #                                is_y_log=True,
 883 | #                                y_log_e=np.e,
 884 | #                                #is_one_hot_encode=is_one_hot_encode,
 885 | #                                #ft_pos=[0, 1, 3, 5],
 886 | #                                ft_select=[0, 1, 3, 4, 5, 6],
 887 | #                                is_rm_outliers=True,
 888 | #                                rm_outliers_m=4.0,
 889 | #                                rm_outliers_key=[0, 1, 2, ],
 890 | #                                is_avg_or_median=1,
 891 | #                            ), 
 892 | #                            DaterangeModel(
 893 | #                                model=model,
 894 | #                                train_days=model_day,
 895 | #                                #eval_metric="rmse",
 896 | #                                #early_stopping_rounds=10,
 897 | #                                #num_round=6000,
 898 | #                                #booster='gbtree',
 899 | #                                #colsample_bytree=0.9,
 900 | #                                #objective="reg:gamma",
 901 | #                                #subsample=0.6,
 902 | #                                #eta=0.02,
 903 | #                                #max_depth=3,
 904 | #                                learning_rate=0.1,
 905 | #                                n_estimators=1200,
 906 | #                                num_leaves=8,
 907 | #                                colsample_bytree=0.8,
 908 | #                                subsample=0.9,
 909 | #                                objective=objective,
 910 | #                                #loss=loss,
 911 | #                                ft_th=[
 912 | #                                         (9, 0.25),
 913 | #                                         (10, 0.55),
 914 | #                                         #(11, i/100.0),
 915 | #                                      ],
 916 | #                                random_state=0,
 917 | #                                is_y_log=True,
 918 | #                                y_log_e=np.e,
 919 | #                                #is_one_hot_encode=is_one_hot_encode,
 920 | #                                #ft_pos=[0, 1, 3, 5],
 921 | #                                ft_select=ft_select,
 922 | #                                is_rm_outliers=True,
 923 | #                                rm_outliers_m=4.0,
 924 | #                                rm_outliers_key=[0, 1, 2, ],
 925 | #                                is_avg_or_median=1,
 926 | #                            ), 
 927 | #                        ]
 928 | #                        ## LGBM
 929 | #                        #for loss in ["SMAPE", "L1", "L2"]
 930 | #                        #for loss in ["lad"]
 931 | #                        ##for n_neighbors in range(3, 5, 7)
 932 | #                        for model in [ LGBM,
 933 | #                                       #MedianModel
 934 | #                                       #GradientBoostingRegressor, 
 935 | #                                       #NonparametricKNN,
 936 | #                                       #KNeighborsRegressor,
 937 | #                                       #MLPRegressor,               
 938 | #                                       #HuberRegressor, PassiveAggressiveRegressor, RANSACRegressor, SGDRegressor, 
 939 | #                                       #TheilSenRegressor,
 940 | #                                       #RadiusNeighborsRegressor, PLSRegression, LinearSVR, #NuSVR,
 941 | #                                     ]
 942 | #                        #for model_day in range(7, 90, 3)
 943 | #                        #for is_one_hot_encode in [True, False]
 944 | #                        for model_day in [21, 50]
 945 | #                        for objective in ["regression", "regression_l2", "regression_l1", "huber", "fair", "poisson"]
 946 | #                        #for ft_select in [range(10)]
 947 | #                        #for ft_pos in [[0,1] + list(one) for one in findsubsets2(range(4, 8))]
 948 | #                        for ft_select in list( [[0, 1, ] + list(one) for one in findsubsets2(range(2, 8))])
 949 | #                        #for ft_select in [[0, 1, 2, 4, 5, 7, 9, 10],]
 950 | #                        #for ft_select in [[0, 1, 2, 4, 5, 7, 9], [0, 1, 2, 4, 5, 7] + range(12, 19), [0, 1, 2, 3], [0, 1, 2, 6], [0, 1, 3, 4], [0, 1, 2, 4, 5, 7],]
 951 | #                        #for ft_select in [[0, 1, 2, 4, 5, 7] + range(19, 43), ]
 952 | #                     ],
 953 | #                     #'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
 954 | #                     'subsample': [1.0, ],
 955 | #                     'combine_method': [2,],
 956 | #                     'weights': [
 957 | #                         #[1.0, 0.25, 0.05, 0.5, 0.1, 0.05, 0.1, 0.3, 0.8, 0.25, 0.2, 0.35, 0.25, 0.1, ]
 958 | #                         [1.0, 0.25, 0.05, 0.5, 0.1, 
 959 | #                          0.05, 0.1, 0.3, 0.8, 0.25, 
 960 | #                          0.2, 0.6, 0.25, 0.2, 0.2, 
 961 | #                          0.25, 1.0, 0.3, 0.25, 0.9,
 962 | #                         x1/100.0]  #for x1 in range(0, 201, 10)
 963 | #                         #[1.0, x1/100.0, x2/100.0, x3/100.0, x4/100.0, x5/100.0, x6/100.0, x7/100.0, x8/100.0, x9/100.0, x10/100.0, x11/100.0, x12/100.0, x13/100.0, ] #x1/100.0,  ] # x1/100.0]
 964 | #                         for x1 in range(0, 101,  5) 
 965 | #                         #for x2 in range(0, 101,  10) 
 966 | #                         #for x3 in range(0, 101,  5)
 967 | #                         #for x4 in range(0, 101,  50) 
 968 | #                         #for x5 in range(0, 101,  50) 
 969 | #                         #for x6 in range(0, 101,  50)
 970 | #                         #for x7 in range(0, 101,  50)
 971 | #                         #for x8 in range(0, 101,  50)
 972 | #                         #for x9 in range(0, 101,  50)
 973 | #                         #for x10 in range(0, 101, 50) 
 974 | #                         #for x11 in range(0, 101, 50) 
 975 | #                         #for x12 in range(0, 101, 50)
 976 | #                         #for x13 in range(0, 101, 50)
 977 | #                         #[1.0, 0.31, 0.2, 0.35, 0.05, 0.05],
 978 | #                         #[1.0, 0.25, 0.2, 0.4, 0.05, 0.05] ## 0.17551018223761861,
 979 | #                         #[1.0, 0.25, 0.25, 0.5, 0.0, 0.0] ## 0.17573313467167548,
 980 | #                         #[1.0, 0.25, 0.2, 0.4, 0.0, 0.0] ## 0.17566198457914264,
 981 | #                         #[1.0, 0.15, 0.1, 0.1, 0.0]
 982 | #                     ],
 983 | #                },
 984 | #            ],
 985 | #        },
 986 |         {
 987 |             "model": DaterangeModel,
 988 |              
 989 |             "tuned_parameters": [
 990 | #              { ## baseline
 991 | #                 'model':[GradientBoostingRegressor,],
 992 | #                 'n_estimators': [200,],
 993 | #                 'is_y_log': [True,],
 994 | #                 'ft_select': [[0, 1, 2, 4, 5, 7, ], ],
 995 | #                 'y_log_e': [3.0, ],
 996 | #                 'norm_y': [False, ],
 997 | #                 'loss': ["ls", ],
 998 | #                 'learning_rate': [0.15, ],
 999 | #                 'train_days': [43, ],
1000 | #                 'is_rm_outliers': [True,],
1001 | #                 'max_depth': [3, ],
1002 | #                 'criterion': ['friedman_mse', ],
1003 | #                 'rm_outliers_m': [4.0, ],
1004 | #                 'rm_outliers_key': [[0, 1, 2, ], ],
1005 | #                 'is_avg_or_median': [1, ],
1006 | #                 'anova_filter': [0],
1007 | #                 'is_one_hot_encode': [False,],
1008 | #              },
1009 | 
1010 | #              { ## tune val part
1011 | #                 'model':[GradientBoostingRegressor,],
1012 | #                 'n_estimators': [200,],
1013 | #                 'is_y_log': [True,],
1014 | #                 'ft_select': [[0, 1, 2, 4, 5, 7], ],
1015 | #                 'y_log_e': [3.0, ],
1016 | #                 'norm_y': [False, ],
1017 | #                 'loss': ["ls", ],
1018 | #                 'learning_rate': [0.15, ],
1019 | #                 'train_days': [43, ],
1020 | #                 'is_rm_outliers': [True,],
1021 | #                 'max_depth': [3, ],
1022 | #                 'criterion': ['friedman_mse', ],
1023 | #                 'rm_outliers_m': [4.0, ],
1024 | #                 'rm_outliers_key': [[0, 1, 2, ], ],
1025 | #                 'is_avg_or_median': [1, ],
1026 | #                 'anova_filter': [0],
1027 | #                 'is_one_hot_encode': [False,],
1028 | #              },
1029 | #              {
1030 | #                 'model':[Pipeline,],
1031 | #                 'models':[[
1032 | #                             GradientBoostingRegressor(n_estimators=200,  loss='ls', learning_rate=0.1, subsample=0.9, max_depth=2),
1033 | #                             GradientBoostingRegressor(n_estimators=200,  loss='ls', learning_rate=0.1, max_depth=4),
1034 | #                          ]],
1035 | #                 'is_y_log': [True, ],
1036 | #                 'ft_select': [[0, 1, 2, 4, 5, 7,],],
1037 | #                 'train_days': range(20, 90, 1),
1038 | #              },
1039 | #              {
1040 | #                 'model':[GradientBoostingRegressor,],
1041 | #                 'is_ignore_skip_date_count': [True,],
1042 | #                 'n_estimators': [200,],
1043 | #                 #'rm_n_head_days': range(31),
1044 | #                 #'rm_n_head_days': [10, ],
1045 | #                 'rm_n_head_days': [4, ],
1046 | #                 'rm_n_head_days_hours': [
1047 | #                     #[],
1048 | #                     #[(0, 6), (21, 22)],
1049 | #                     #[(0, 5), (12, 15), (22, 24)],
1050 | #                     #[(0, 6), (12, 15), (21, 22)], ### 0.1750 public leader board
1051 | #                     #[(0, 7), (11, 16), (20, 24)],
1052 | #                     [(0, 7), (12, 15), (22, 24)],
1053 | #                     #[one, two , three]
1054 | #                     #[(0, 7), (11, 16), (20, 22)], ### 
1055 | #                     #[(0, 8), (10, 17), (19, 22)],
1056 | #                     #for one in [(0, 6), (0, 5), (0, 7), (0, 8),]
1057 | #                     #for two in [(10, 17), (11, 16), (12, 14), ]
1058 | #                     #for three in [(19, 24), (20, 24), (21, 24), (22, 24)]
1059 | #                 ],
1060 | #                 #'is_boxcox': [True,],
1061 | #                 #'boxcox_lambda': [-5.0, -4.0, -3.0, -2.0, -1.0, -0.5, ],
1062 | #                 'is_y_log': [True, ],
1063 | #                 #'ft_select': list([[0, 1, ] + list(one) for one in findsubsets2(set(range(2, 8)))]),
1064 | #                 'ft_th': [[
1065 | #                             #(9, 0.5),
1066 | #                             (10, 0.3),
1067 | #                             #(11, i/10.0),
1068 | #                             #(42, i/10.0),
1069 | #                           ]  #for i in range(0, 11, 1)
1070 | #                          ],
1071 | #                 'ft_select': [
1072 | #                     #[0, 1, 2, 4, 5, 7, ],
1073 | #                    [0, 1, 2, 4, 5, 7, ],
1074 | #                     #[0, 1, 2, 4, 5, 7, 42],
1075 | #                     #[0, 1, 2, 4, 5, 7, 9, 10, ] + range(19, 43),
1076 | #                 ],
1077 | #                 #'remove_outliers_by_classifier': [
1078 | #                 #    {
1079 | #                 #        "model": GradientBoostingRegressor(
1080 | #                 #           max_depth=x,
1081 | #                 #        ),
1082 | #                 #        "m": v/100.,
1083 | #                 #    }
1084 | #                 #    for v in range(80, 101, 1) for x in range(2,5)
1085 | #                 #],
1086 | #                 #'ft_select': [range(8)],
1087 | #                 #'ft_select': [[0, 1] + list(one) for one in findsubsets2(set(range(2, 8)))],
1088 | #                 #'ft_select': [range(8), [0, 1, 2, 4, 5, 7,] + range(12, 19)],
1089 | #                 #'ft_weights': [[1, 1, 1, i/100.0, 1, 1, 1, 1, 1,] for i in range(0,101, 10) ],
1090 | #                 #'ft_select': [[0, 1, 4, 6], [0, 1, 2, 3, 7], [0, 1, 2, 4, 5, 6, 7], [0, 1, 2, 4, 5, 7], [0, 1, 2, 3, 4, 5, 6, 7], ],
1091 | #                 #'y_log_e': [2.0, np.e, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ],
1092 | #                 'y_log_e': [3.0, ],
1093 | #                 #'max_features': ['sqrt', 'auto', 'log2', ],
1094 | #                 #'max_leaf_nodes': [5, ],
1095 | #                 #'max_leaf_nodes': [ 2, 3, 5, 10, 100, 1000, 10000, 100000, None,],
1096 | #                 #'min_impurity_split': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, ],
1097 | #                 'norm_y': [False, ],
1098 | #                 #'loss': ["ls", "huber", "quantile"],
1099 | #                 'loss': ["ls", ],
1100 | #                 #'learning_rate': [v/100.0 for v in range(1, 101, 1)],
1101 | #                 'learning_rate': [0.15, ],
1102 | #                 #'train_days': range(10, 90, 1),
1103 | #                 #'train_days': range(35, 70),
1104 | #                 'train_days': [50, ],
1105 | #                 'is_rm_outliers': [True, ],
1106 | #                 #'max_depth': range(2, 10),
1107 | #                 'max_depth': [3, ],
1108 | #                 'criterion': ['friedman_mse', ],
1109 | #                 #'rm_outliers_m': [i/10.0 for i in range(5, 61, 5)],
1110 | #                 'rm_outliers_m': [4.0, ],
1111 | #                 #'rm_outliers_m': [2.5, ],
1112 | #                 #'rm_outliers_key': [[0, 1, ] + list(one) for one in findsubsets2(set([2, 4, 6, 7,]), add_origin=False)],
1113 | #                 #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4,5]))),
1114 | #                 'rm_outliers_key': [[0, 1, 2, ], ],
1115 | #                 #'rm_outliers_key': [[0, 1, 2], ],
1116 | #                 #'is_avg_or_median': [1, 2],
1117 | #                 'is_avg_or_median': [1,],
1118 | #                 'anova_filter': [0],
1119 | #                 'is_one_hot_encode': [False,],
1120 | #                 #'remove_non_predict_hour_range': [True, ],
1121 | #                 #'predict_hour_range':[
1122 | #                 #     [
1123 | #                 #          [[8, 0], [10, 0]],
1124 | #                 #          [[17, 0], [19, 0]],
1125 | #                 #     ],
1126 | #                 #     #[
1127 | #                 #     #     [[7, 0], [11, 0]],
1128 | #                 #     #     [[16, 0], [20, 0]],    
1129 | #                 #     #],
1130 | #                 #], 
1131 | #                 #'is_sample_weight': range(40),
1132 | #                 'is_sample_weight': [29, ],
1133 | #                 ## avoid overfitting....
1134 | #                 #'max_features' : ["log2",],
1135 | #                 #'subsample': [0.8, ],
1136 | #                 #'max_features': [i/10.0 for i in range(1, 9)],
1137 | #                 #'subsample': [0.9, ],
1138 | #                 #'is_ft_union': [ FeatureUnion(
1139 | #                 #                   [
1140 | #                 #                       #("pca", PCA(n_components=1)),
1141 | #                 #                       ("skb", SelectKBest(k=3)),
1142 | #                 #                   ]
1143 | #                 #                 ),
1144 | #                 #               ],
1145 | #                 'skip_date_ranges': [
1146 | #                     #[
1147 | #                     #  (datetime(2016, 10, i, 6), datetime(2016, 10, i, 8)) for i in range (11, 18)
1148 | #                     #], 
1149 | #                     #[
1150 | #                     #  (datetime(2016, 10, i, 6), datetime(2016, 10, i, 8)) for i in range (4, 11)
1151 | #                     #],  
1152 | #                     #[
1153 | #                     #  (datetime(2016, 9, 15) + timedelta(days=i), datetime(2016, 9, 15) + timedelta(days=i+1))
1154 | #                     #] for i in range(30)
1155 | #                     #[
1156 | #                     #  (datetime(2016, 9, 14), datetime(2016, 9, 17)),
1157 | #                     #  (datetime(2016, 9, 30), datetime(2016, 10, 3)),
1158 | #                     #],
1159 | #                     #[
1160 | #                     #  (datetime(2016, 9, 30), datetime(2016, 10, 8)),
1161 | #                     #],
1162 | #                     #[
1163 | #                     #  (datetime(2016, 9, 17), datetime(2016, 9, 18)),
1164 | #                     #], 
1165 | #                     #[
1166 | #                     #  (datetime(2016, 10, 7), datetime(2016, 10, 8)),
1167 | #                     #],
1168 | #                     ##[
1169 | #                     ##  (datetime(2016, 9, 15), datetime(2016, 9, 16)),
1170 | #                     ##  (datetime(2016, 10, 7), datetime(2016, 10, 8)),
1171 | #                     ##],
1172 | #                     #[
1173 | #                     #  (datetime(2016, 9, 14), datetime(2016, 9, 19)),
1174 | #                     #  (datetime(2016, 9, 30), datetime(2016, 10, 10)),                       
1175 | #                     #],
1176 | #                     [],
1177 | #                 ],
1178 | #              },
1179 | #              {
1180 | #                 'model':[BaggingRegressor,],
1181 | #                 'base_estimator': [
1182 | #                                      GradientBoostingRegressor(
1183 | #		                          n_estimators=50,
1184 | #                                          learning_rate=0.15,
1185 | # 					  loss='ls',
1186 | #                                      ),
1187 | #                                   ],
1188 | #                 'n_estimators': [50,],
1189 | #                 'is_y_log': [True,],
1190 | #                 'y_log_e': [3.0, ],
1191 | #                 'norm_y': [False, ],
1192 | #                 #'train_days': [43, ],
1193 | #                 'train_days': range(7, 60, 1),
1194 | #                 'is_rm_outliers': [True,],
1195 | #                 'rm_outliers_m': [4.0, ],
1196 | #                 'rm_outliers_key': [[0, 1, 2,], ],
1197 | #                 'is_avg_or_median': [1, ],
1198 | #                 'anova_filter': [0],
1199 | #                 'is_one_hot_encode': [False,],
1200 | #              },
1201 | #
1202 | #              {
1203 | #                 'model':[GradientBoostingRegressor,],
1204 | #                 'n_estimators': [200,],
1205 | #                 'loss': ["lad", ],
1206 | #                 'learning_rate': [0.5,],
1207 | #                 'train_days': [35, ],
1208 | #                 'is_rm_outliers': [True,],
1209 | #                 #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
1210 | #                 'rm_outliers_m': [5.0],
1211 | #                 'max_depth': [3, ],
1212 | #                 # 'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4,5]))),
1213 | #                 'rm_outliers_key': [[2,3,4,],],
1214 | #                 'is_avg_or_median': [True, ],
1215 | #                 'is_ignore_skip_date_count': [True,],
1216 | #              },
1217 | #               {
1218 | #                  'model':[LGBM,],
1219 | #                  #'objective':["regression", "regression_l2", "regression_l1", "huber", "fair", "poisson"],
1220 | #                  'objective':["fair", ],
1221 | #                  'subsample': [i/10.0 for i in range(6, 11, 1)],
1222 | #                  'use_mspe': [False],
1223 | #                  #'use_mspe':[True, False],
1224 | #                  #'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8, 1.0],
1225 | #                  'learning_rate': [0.1,],
1226 | #                  #'drop_rate': [0.3, ],
1227 | #                  #'max_bin': range(100, 400, 5),
1228 | #                  #'boosting_type': ['gbdt', 'dart', ],
1229 | #                  #'xgboost_dart_mode': [True, False,],
1230 | #                  #'boosting_type': ['gbdt', ],
1231 | #                  'xgboost_dart_mode':[False, ],
1232 | #                  #'uniform_drop': [True, False],
1233 | #                  'ft_th': [[
1234 | #                              (9, 0.25),
1235 | #                              (10, 0.55),
1236 | #                              #(11, i/100.0),
1237 | #                           ] ],
1238 | #                  #'ft_th': [[
1239 | #                  #            (9, 0.25),
1240 | #                  #            (10, 0.5),
1241 | #                  #            (11, 0.5),
1242 | #                  #          ]
1243 | #                  #         ],
1244 | #                  #'max_bin': [100,],
1245 | #                  'num_leaves': [8, ],
1246 | #                  #'num_leaves': range(3, 60),
1247 | #                  #'subsample': [i/10.0 for i in range(1, 11)],
1248 | #                  #'colsample_bytree': [i/10.0 for i in range(5, 11)],
1249 | #                  'colsample_bytree': [0.8, ],
1250 | #                  #'lambda_l1': [],
1251 | #                  'subsample': [0.9,],
1252 | #                  'n_estimators': [200, ],
1253 | #                  'is_y_log': [True, ],
1254 | #                  'y_log_e': [np.e, ],
1255 | #                  'norm_y': [False, ],
1256 | #                  #'ft_select': [[0, 1, 2, 4, 5, 7], ],
1257 | #                  'ft_select': [[0, 1, 2, 4, 5, 7, 9, 10], ],
1258 | #                  #'ft_select': list([[0, 1, 2, 4, 5, 7, ] + list(one) for one in findsubsets2(set(range(12,19)))]),
1259 | #                  'train_days': [49, ],
1260 | #                  #'train_days': range(7, 90, 7),
1261 | #                  'is_rm_outliers': [True, False],
1262 | #                  #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
1263 | #                  'rm_outliers_m': [4.0, ],
1264 | #                  'rm_outliers_key': [[0, 1, 2, ], ],
1265 | #                  #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4]))),
1266 | #                  'is_avg_or_median': [1, ],
1267 | #               },
1268 | #               {
1269 | #                 'model':[XGBoost,],
1270 | #                 'rm_n_head_days': [0, ],
1271 | #                 'rm_n_head_days_hours': [
1272 | #                     #[],
1273 | #                     [(0, 6), (12, 15), (21, 22)],
1274 | #                     #[(0, 7), (11, 16), (20, 22)],
1275 | #                     #[(0, 8), (10, 17), (19, 22)],
1276 | #                 ],
1277 | #                 'is_y_log': [True, ],
1278 | #                 'use_mspe': [False],
1279 | #                 #'y_log_e': [np.e, ] + [pow(10, i) for i in range(1, 10)],
1280 | #                 #'y_log_e': [2.0, np.e, 3.0, 4.0, 5.0, 6.0, 7.0,],
1281 | #                 'y_log_e': [np.e, ],
1282 | #                 'norm_y': [False, ],
1283 | #                 #'early_stopping_rounds' : [5, 8, 10, 12, 15],
1284 | #                 'early_stopping_rounds' : [10, ],
1285 | #                 #'gamma': [0.0, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0,],
1286 | #                 #'eta': [v/10.0 for v in range(2, 6, 1)] + [v/100.0 for v in range(1, 11, 1)],
1287 | #                 'eta': [0.02, ],
1288 | #                 #'max_depth': range(3, 10),
1289 | #                 #'min_child_weight': [1,3,5,7,],
1290 | #                 #'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
1291 | #                 'colsample_bytree': [0.9, ],
1292 | #                 'max_depth': [3, ],
1293 | #                 'eval_metric': ['rmse'],
1294 | #                 #'eval_metric': ['mape'],
1295 | #                 #'objective': ["reg:gamma", "reg:linear"],
1296 | #                 'objective': ["reg:gamma", ],
1297 | #                 #'subsample': [v/10.0 for v in range(5, 11)],
1298 | #                 'subsample': [0.6, ],
1299 | #                 'is_ignore_skip_date_count': [True, ],
1300 | #                 'booster': ['gbtree', ],
1301 | #                 #'ft_th': [[
1302 | #                 #            #(9, 0.25),
1303 | #                 #            #(10, 0.55),
1304 | #                 #            #(11, 0.5),
1305 | #                 #          ] #for i in range(0, 101, 5)
1306 | #                 #         ],
1307 | #                 'ft_select': [[0, 1, 2, 4, 5, 7, ]],
1308 | #                 #'ft_norm': [range(3), [], ],
1309 | #                 #'ft_select': [[0, 1, 2, 4, 5, 7], ],
1310 | #                 #'ft_select': [[0, 1, 2, 4, 5, 7, i] for i in range(12, 19) ] + [[0, 1, 2, 4, 5, 7, ],],
1311 | #                 #'ft_select': [[0, 1, 2, 3, 4, 5, 7], [0, 1, 2, 4, 5, 7], [0, 1, 2, 4, 5, 6, 7], ],
1312 | #                 #'ft_select': list([[0, 1, 2, 4, 5, 7] + list(one) for one in findsubsets2(set(range(12, 19)))]),
1313 | #                 #'ft_select': list([[0, 1, ] + list(one) for one in findsubsets2(set(range(2, 8)))]),
1314 | #                 #'train_days': range(7, 91, 7),
1315 | #                 'train_days': [50, ],
1316 | #                 'is_rm_outliers': [True, ],
1317 | #                 #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
1318 | #                 'rm_outliers_m': [4.0, ],
1319 | #                 #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4,5]))),
1320 | #                 'rm_outliers_key': [[0, 1, 2, ], ],
1321 | #                 'is_avg_or_median': [1, ],
1322 | #                 #'is_sample_weight': range(5, 41, 1),
1323 | #                 'is_sample_weight': [29, 0],
1324 | #                 #'num_round': range(1000, 6001, 1000),
1325 | #                 'num_round': [6000, ],
1326 | #                 'skip_date_ranges': [
1327 | #                     #[
1328 | #                     #  (datetime(2016, 10, i, 6), datetime(2016, 10, i, 8)) for i in range (11, 18)
1329 | #                     #], 
1330 | #                     #[
1331 | #                     #  (datetime(2016, 10, i, 6), datetime(2016, 10, i, 8)) for i in range (4, 11)
1332 | #                     #],  
1333 | #                     #[
1334 | #                     #  (datetime(2016, 9, 15) + timedelta(days=i), datetime(2016, 9, 15) + timedelta(days=i+1))
1335 | #                     #] for i in range(30)
1336 | #                     #[
1337 | #                     #  (datetime(2016, 9, 15), datetime(2016, 9, 16)),
1338 | #                     #],
1339 | #                     #[
1340 | #                     #  (datetime(2016, 9, 14), datetime(2016, 9, 16)),
1341 | #                     #],
1342 | #                     #[
1343 | #                     #  (datetime(2016, 9, 17), datetime(2016, 9, 18)),
1344 | #                     #], 
1345 | #                     #[
1346 | #                     #  (datetime(2016, 10, 7), datetime(2016, 10, 8)),
1347 | #                     #],
1348 | #                     ##[
1349 | #                     ##  (datetime(2016, 9, 15), datetime(2016, 9, 16)),
1350 | #                     ##  (datetime(2016, 10, 7), datetime(2016, 10, 8)),
1351 | #                     ##],
1352 | #                     #[
1353 | #                     #  (datetime(2016, 9, 14), datetime(2016, 9, 19)),
1354 | #                     #  (datetime(2016, 9, 30), datetime(2016, 10, 10)),                       
1355 | #                     #],
1356 | #                     [],
1357 | #                 ],
1358 | #                 'remove_non_predict_hour_range': [False],
1359 | #                 'predict_hour_range':[
1360 | #                      [
1361 | #                           [[7, 0], [11, 0]],
1362 | #                           [[16, 0], [21, 0]],
1363 | #                      ],
1364 | #                      #[
1365 | #                      #     [[7, 0], [11, 0]],
1366 | #                      #     [[16, 0], [20, 0]],    
1367 | #                      #],
1368 | #                 ], 
1369 | #              },
1370 |               {
1371 |                  'model':[MedianModel,],
1372 |                  'is_y_log': [True,],
1373 |                  'ft_pos':[[0, 1, 3, 5], ],
1374 |                  'ft_select': [[0, 1, 2, 4, 5, 7], ],
1375 |                  #'train_days': range(14, 90, 1),
1376 |                  'train_days': [45, ],
1377 |                  'is_rm_outliers': [True,],
1378 |                  #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
1379 |                  'rm_outliers_m': [4.0,],
1380 |                  #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4]))),
1381 |                  'rm_outliers_key': [[0,1,2], ],
1382 |                  'is_avg_or_median': [1,],
1383 |               },
1384 | #              {
1385 | #                 'model':[
1386 | #                            #LinearRegression,
1387 | #                            #ARDRegression,
1388 | #                            HuberRegressor,
1389 | #                            #LogisticRegression,
1390 | #                            #LogisticRegressionCV,
1391 | #                            PassiveAggressiveRegressor,
1392 | #                            #RandomizedLogisticRegression,
1393 | #                            RANSACRegressor,
1394 | #                            SGDRegressor,
1395 | #                            TheilSenRegressor,
1396 | #                            KNeighborsRegressor,
1397 | #                            RadiusNeighborsRegressor,
1398 | #                            MLPRegressor,
1399 | #                            PLSRegression,
1400 | #                            SVR,
1401 | #                            LinearSVR,
1402 | #                            NuSVR,
1403 | #                            Lasso,
1404 | #                          ],
1405 | #                 'is_y_log': [False, True],
1406 | #                 'train_days': [14,],
1407 | #              },
1408 | #              {
1409 | #                 'model':[
1410 | #                            ExtraTreeRegressor,
1411 | #                            AdaBoostRegressor,
1412 | #                            BaggingRegressor,
1413 | #                            RandomForestRegressor,
1414 | #                          ],
1415 | #                 'is_y_log': [False, True],
1416 | #                 'train_days': range(7, 130, 7),
1417 | #                 #'train_days': [56, ],
1418 | #              },
1419 | #              {
1420 | #                 'model':[
1421 | #                            AdaBoostRegressor,
1422 | #                         ],
1423 | #                 'loss': ['linear', ],
1424 | #                 'is_y_log': [True,],
1425 | #                 'y_log_e': [3.0,],
1426 | #                 #'train_days': range(14, 130, 7),
1427 | #                 'train_days': [35, ],
1428 | #                 #'train_days': [43, ],
1429 | #                 'is_rm_outliers': [True,],
1430 | #                  #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
1431 | #                  'rm_outliers_m': [2.0, ],
1432 | #              },
1433 | #              {
1434 | #                'model': [RandomForestRegressor,BaggingRegressor,
1435 | #                          AdaBoostRegressor,ExtraTreesRegressor,],
1436 | #                'n_estimators': [50,],
1437 | #                 'is_y_log': [False, True],
1438 | #                 'train_days': [7,14,21],
1439 | #                 'is_rm_outliers': [True,],
1440 | #                 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
1441 | #              },
1442 | #              {
1443 | #                'model': [
1444 | #                    CombineModes
1445 | #                ],
1446 | #                'models':[
1447 | #                    [
1448 | #                        DaterangeModel(model=GradientBoostingRegressor,
1449 | #                                       n_estimators=200,
1450 | #                                       is_y_log=True,
1451 | #                                       y_log_e=3.0,
1452 | #                                       loss='ls',
1453 | #                                       learning_rate=0.15,
1454 | #                                       train_days=43,
1455 | #                                       is_rm_outliers=True,
1456 | #                                       rm_outliers_m=4.0,
1457 | #                                       max_depth=3,
1458 | #                                       rm_outliers_key=[0,1,2,],
1459 | #                                       is_avg_or_median=True,
1460 | #                                       dates_train=copy.deepcopy(dates_train),
1461 | #                                       ),
1462 | #                        DaterangeModel(model=GradientBoostingRegressor,
1463 | #                                       n_estimators=200,
1464 | #                                       loss='lad',
1465 | #                                       learning_rate=0.1,
1466 | #                                       train_days=35,
1467 | #                                       is_rm_outliers=True,
1468 | #                                       rm_outliers_m=4.0,
1469 | #                                       max_depth=3,
1470 | #                                       rm_outliers_key=[0,1,2,],
1471 | #                                       is_avg_or_median=True,
1472 | #                                       dates_train=copy.deepcopy(dates_train),
1473 | #                                       ),
1474 | #                    ],
1475 | #                ],
1476 | #                'weights':[
1477 | #                    [
1478 | #                        0.8,
1479 | #                        0.2,
1480 | #                    ],
1481 | #                    [
1482 | #                        0.9,
1483 | #                        0.1,
1484 | #                    ],
1485 | #                    [
1486 | #                        0.7,
1487 | #                        0.3,
1488 | #                    ],
1489 | #                    [
1490 | #                        0.6,
1491 | #                        0.4,
1492 | #                    ],
1493 | #
1494 | #                ],
1495 | #                'train_days': [200,],
1496 | #                'dates_train': [copy.deepcopy(dates_train),],
1497 | #              },
1498 | #              {
1499 | #                  "model": [SVR,],
1500 | #                  "ft_select": [[0, 1, 2, 4, 5, 7,],],
1501 | #                  "kernel": ['rbf', ],
1502 | #                  'C': [1000,  ],
1503 | #                  'gamma': [0.001, ],
1504 | #                  'is_y_log': [True,],
1505 | #                  #'train_days': range(3, 60, 2),
1506 | #                  'train_days': [41,],
1507 | #                  'is_one_hot_encode': [True,],
1508 | #                  'is_rm_outliers': [True,],
1509 | #                  'rm_outliers_m': [ 4.0,],
1510 | #                  'rm_outliers_key': [[0, 1, 2], ],
1511 | #
1512 | #              },
1513 | #              {
1514 | #                  "model": [Lasso,],
1515 | #                  "ft_select": [[0, 1, 2, 4, 5, 7,],],
1516 | #                  'is_y_log': [True,],
1517 | #                  #'train_days': range(3, 60, 2),
1518 | #                  'train_days': [1,2,3,7, 14, 28,],
1519 | #                  'is_one_hot_encode': [True,],
1520 | #                  'is_rm_outliers': [True,],
1521 | #                  'rm_outliers_m': [ 4.0,],
1522 | #                  'rm_outliers_key': [[0, 1, 2], ],
1523 | #              },
1524 | #              {
1525 | #                 'model':[SVR,],
1526 | #                 'train_days': [10],
1527 | #                 'ft_select': [[0, 1, 2, 4, 5, 7], ],
1528 | #                 'is_rm_outliers': [False,],
1529 | #                 #'rm_outliers_m': [i/10.0 for i in range(5, 60, 5)],
1530 | #                 'rm_outliers_m': [4.0],
1531 | #                 #'rm_outliers_key': list(findsubsets2(set([0,1,2,3,4,5]))),
1532 | #                 'rm_outliers_key': [[2,3,4,],],
1533 | #                 'is_avg_or_median': [True, ],
1534 | #                 ##'rm_n_head_days': range(31),
1535 | #                 #'rm_n_head_days': [10, ],
1536 | #                 #'rm_n_head_days_hours': [
1537 | #                 #    #[],
1538 | #                 #    #[(0, 6), (21, 22)],
1539 | #                 #    #[(0, 5), (12, 15), (22, 24)],
1540 | #                 #    [(0, 6), (12, 15), (21, 22)], ### 0.1750 public leader board
1541 | #                 #    #[(0, 7), (11, 16), (20, 24)],
1542 | #                 #    #[(0, 7), (12, 15), (22, 24)],
1543 | #                 #    #[one, two , three]
1544 | #                 #    #[(0, 7), (11, 16), (20, 22)], ### 
1545 | #                 #    #[(0, 8), (10, 17), (19, 22)],
1546 | #                 #    #for one in [(0, 6), (0, 5), (0, 7), (0, 8),]
1547 | #                 #    #for two in [(10, 17), (11, 16), (12, 14), ]
1548 | #                 #    #for three in [(19, 24), (20, 24), (21, 24), (22, 24)]
1549 | #                 #],
1550 | #              },
1551 | 
1552 | #              {
1553 | #                  "model": [
1554 | #                            #LinearRegression,
1555 | #                            #ARDRegression,
1556 | #                            HuberRegressor,
1557 | #                            #LogisticRegression,
1558 | #                            #LogisticRegressionCV,
1559 | #                            PassiveAggressiveRegressor,
1560 | #                            #RandomizedLogisticRegression,
1561 | #                            RANSACRegressor,
1562 | #                            SGDRegressor,
1563 | #                            TheilSenRegressor,
1564 | #                            KNeighborsRegressor,
1565 | #                            RadiusNeighborsRegressor,
1566 | #                            MLPRegressor,
1567 | #                            PLSRegression,
1568 | #                            SVR,
1569 | #                            LinearSVR,
1570 | #                            NuSVR,
1571 | #                            ],
1572 | #                  'is_y_log': [True,],
1573 | #                  'train_days': [45,],
1574 | #                  'is_one_hot_encode': [False,],
1575 | #                  'ft_select': [[0, 1, 2, 4, 5, 7,],],
1576 | #                  #'is_rm_outliers': [True,],
1577 | #                  #'rm_outliers_m': [ 4.0,],
1578 | #              },
1579 | #              {
1580 | #                  "model": [BaggingRegressor, AdaBoostRegressor,],
1581 | #                  "base_estimator": [
1582 | #                      GradientBoostingRegressor(n_estimators=50,
1583 | #                                                loss="lad",
1584 | #                                                learning_rate=0.1,),
1585 | #                      RandomForestRegressor(),
1586 | #                                     ],
1587 | #                  'train_days': [7,14,21,28,35,42,49,56,150],
1588 | #                  #'train_days': [7,],
1589 | #                  'is_rm_outliers': [True,],
1590 | #                  'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
1591 | #                  #'rm_outliers_m': [3.0,],
1592 | #              },
1593 | #              {
1594 | #                 'model': [ExtraTreesRegressor,],
1595 | #                 'n_estimators': [200,],
1596 | #                 'train_days': [7,14,21,28,35,42,49,56,150],
1597 | #                 'is_rm_outliers': [True,],
1598 | #                 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
1599 | #                 'max_features': ['auto', 'sqrt', 'log2', None,]
1600 | #              },
1601 | #              {
1602 | #                'model': [RandomForestRegressor,],
1603 | #                'n_estimators': [50,],
1604 | #                 'train_days': [7,14,21,28,35,42,49,56,150],
1605 | #                 'is_rm_outliers': [True,],
1606 | #                 'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
1607 | #                 "dates_train": [dates_train,],
1608 | #              },
1609 | #              {
1610 | #                 'model': [NonparametricKNN, ],
1611 | #                 "n_neighbors": [1, 3, 5, 10, 15, 20,],
1612 | #                 #"n_neighbors": [20,],
1613 | #                 "loss": ["SMAPE", "L1"],
1614 | #                 #"loss": ["SMAPE", ],
1615 | #                  'train_days': [7, 14, 28,],
1616 | #                  'is_rm_outliers': [True,],
1617 | #                  'rm_outliers_m': [ 4.0,],
1618 | #              },
1619 | #              {
1620 | #                  'model': [Lasso, ],
1621 | #                  "alpha": [0.1,],
1622 | #                  'train_days': [1,2,3,7, 14, 28,],
1623 | #                  'is_rm_outliers': [True,],
1624 | #                  'rm_outliers_m': [ 4.0,],
1625 | #              },
1626 | #              {
1627 | #                  "model": [BaggingRegressor,],
1628 | #                  "base_estimator": [
1629 | #                      GradientBoostingRegressor(n_estimators=200,
1630 | #                                                loss="lad",
1631 | #                                                learning_rate=0.1,),
1632 | #                                     ],
1633 | #                  'train_days': [7,14,21,28,35,42,49,56,150],
1634 | #                  'is_rm_outliers': [True,],
1635 | #                  'rm_outliers_m': [0.5, 1.0, 2.0, 3.0, 4.0,],
1636 | #              },
1637 |             ],
1638 |         },
1639 | #        {
1640 | #            "model": BaggingRegressor,
1641 | #            "tuned_parameters":[
1642 | #              {
1643 | #                 "base_estimator": [None,
1644 | #                    GradientBoostingRegressor(n_estimators=200, loss='lad', learning_rate=0.1),
1645 | #                    ExtraTreesRegressor(), RandomForestRegressor()],
1646 | #              },
1647 | #            ],
1648 | #        },
1649 | #        {
1650 | #            "model": AdaBoostRegressor,
1651 | #            "tuned_parameters": [],
1652 | #        },
1653 | #        {
1654 | #            "model": ExtraTreesRegressor,
1655 | #            "tuned_parameters": [
1656 | #              {
1657 | #                #"criterion": ["mse", "mae"],
1658 | #                "n_estimators": [200,]
1659 | #              },
1660 | #            ]
1661 | #        },
1662 | #        {
1663 | #            "model": RandomForestRegressor,
1664 | #            "tuned_parameters": [
1665 | #              {"n_estimators": [200,]},
1666 | #            ],
1667 | #        },
1668 |         #{
1669 |         #    "model": SVR,
1670 |         #    "tuned_parameters": [
1671 |         #        #{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10,]},
1672 |         #        {y'kernel': ['linear'], 'C': [1, ]}]
1673 |         #        #{'kernel': ['linear'], 'C': [1, 10, ]}]
1674 |         #}
1675 |     ]
1676 |     print("GridSearchCVDatesWithVal X_train.shape=", X_train.shape)
1677 |     predict_y_final = GridSearchCVDatesWithVal(
1678 |         Configurations,
1679 |         X_train, y_train, dates_train,
1680 |         X_val, y_val, dates_val,
1681 |         X_final,
1682 |         is_y_log=is_y_log, is_boxcox=is_boxcox, boxcox_lambda=boxcox_lambda,
1683 |         is_include_val_loss_for_eval=False, cv=6, days_to_test=7, skip_cvs=skip_cvs,
1684 |         is_estimate_val=is_estimate_val, estimate_val_w=estimate_val_w,
1685 |         is_include_future_training=is_include_future_training, remove_future_training_test=remove_future_training_test,
1686 |         test_date_ranges=test_date_ranges, n_cores=n_cores)
1687 | 
1688 |     ##output final output
1689 |     path_final_res = os.path.join(
1690 |         path_working_dir,
1691 |         "travel_time_from_intersection_to_tollgates.csv")
1692 | 
1693 |     print("*" * 60)
1694 |     print("writing out final results...")
1695 |     file_final_res = open(path_final_res, "w+")
1696 |     file_final_res.writelines(','.join(['"intersection_id"',
1697 |             '"tollgate_id"', '"time_window"', '"avg_travel_time"']) + '\n')
1698 |     for iy, rinfo in enumerate(raw_info):
1699 |         route_id = rinfo[0]
1700 |         start_datetime = rinfo[1]
1701 |         intersection_id, tollgate_id = route_id.split("-")
1702 |         end_datetime = start_datetime + timedelta(minutes=20)
1703 |         timewindow = "["+str(start_datetime) + "," + str(end_datetime)+ ")"
1704 |         words = [intersection_id, tollgate_id, timewindow, str(predict_y_final[iy])]
1705 |         line = '","'.join(words)
1706 |         line = '"' + line + '"\n'
1707 |         file_final_res.write(line)
1708 |     file_final_res.close()
1709 | 


--------------------------------------------------------------------------------