├── CONSTANT.py
├── LICENSE
├── README.md
├── automl.py
├── hyperopt_class.py
├── merge.py
├── metadata
├── model.py
├── preprocess.py
└── util.py


/CONSTANT.py:
--------------------------------------------------------------------------------
 1 | 
 2 | NUMERICAL_TYPE = "num"
 3 | NUMERICAL_PREFIX = "n_"
 4 | 
 5 | CATEGORY_TYPE = "cat"
 6 | CATEGORY_PREFIX = "c_"
 7 | 
 8 | TIME_TYPE = "time"
 9 | TIME_PREFIX = "t_"
10 | 
11 | MULTI_CAT_TYPE = "multi-cat"
12 | MULTI_CAT_PREFIX = "m_"
13 | MULTI_CAT_DELIMITER = ","
14 | 
15 | POS_MULTI_CAT_PREFIX = "cm_"
16 | 
17 | 
18 | MAIN_TABLE_NAME = "main"
19 | MAIN_TABLE_TEST_NAME = "main_test"
20 | TABLE_PREFIX = "table_"
21 | 
22 | LABEL = "label"
23 | 
24 | HASH_MAX = 200
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 shuyao95
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # kddcup2019-automl
2 | Code for KDD CUP 2019 Auto-ML track
3 | 


--------------------------------------------------------------------------------
/automl.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, List
  2 | 
  3 | import hyperopt
  4 | import lightgbm as lgb
  5 | import numpy as np
  6 | import pandas as pd
  7 | from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe
  8 | from sklearn.metrics import roc_auc_score
  9 | from sklearn.model_selection import train_test_split
 10 | from Our_model.hyperopt_class import train_hyperopt
 11 | from util import Config, log, timeit
 12 | 
 13 | 
 14 | @timeit
 15 | def train(X: pd.DataFrame, y: pd.Series, config: Config):
 16 |     train_lightgbm(X, y, config)
 17 | 
 18 | 
 19 | @timeit
 20 | def predict(X: pd.DataFrame, config: Config) -> List:
 21 |     preds = predict_lightgbm(X, config)
 22 |     return preds
 23 | 
 24 | 
 25 | @timeit
 26 | def validate(preds, y_path) -> np.float64:
 27 |     score = roc_auc_score(pd.read_csv(y_path)['label'].values, preds)
 28 |     log("Score: {:0.4f}".format(score))
 29 |     return score
 30 | 
 31 | 
 32 | @timeit
 33 | def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
 34 |     params = {
 35 |         "boosting": "gbdt",
 36 |         "objective": "binary",
 37 |         "metric": "auc",
 38 |         "verbosity": -1,
 39 |         "seed": 1,
 40 |         "num_threads": 4,
 41 |         # "scale_pos_weight": 5
 42 |     }
 43 | 
 44 |     n_samples = int(0.1*len(X))
 45 |     print('number of sample for hyperopt', n_samples)
 46 |     X_sample, y_sample = data_sample(X, y, n_samples)
 47 |     hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)
 48 | 
 49 |     X_train, X_val, y_train, y_val = data_split(X, y, 0.1)
 50 |     train_data = lgb.Dataset(X_train, label=y_train)
 51 |     valid_data = lgb.Dataset(X_val, label=y_val)
 52 | 
 53 |     config["model"] = lgb.train({**params, **hyperparams},
 54 |                                 train_data,
 55 |                                 1200,
 56 |                                 [train_data, valid_data],
 57 |                                 early_stopping_rounds=45,
 58 |                                 verbose_eval=100)
 59 | 
 60 | 
 61 | @timeit
 62 | def predict_lightgbm(X: pd.DataFrame, config: Config) -> List:
 63 |     return config["model"].predict(X)
 64 | 
 65 | 
 66 | @timeit
 67 | def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
 68 |     X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5)
 69 |     train_data = lgb.Dataset(X_train, label=y_train)
 70 |     valid_data = lgb.Dataset(X_val, label=y_val)
 71 | 
 72 |     space = {
 73 |         "max_depth": hp.choice("max_depth", np.arange(2, 10, 1, dtype=int)),
 74 |         # smaller than 2^(max_depth)
 75 |         "num_leaves": hp.choice("num_leaves", np.arange(4, 200, 4, dtype=int)),
 76 |         "feature_fraction": hp.quniform("feature_fraction", 0.2, 0.8, 0.1),
 77 |         # "bagging_fraction": hp.quniform("bagging_fraction", 0.2, 0.8, 0.1),
 78 |         # "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 10, 2, dtype=int)),
 79 |         # "scale_pos_weight":hp.uniform('scale_pos_weight',1.0, 10.0),
 80 |         # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0),
 81 |         "min_child_weight": hp.quniform('min_child_weight', 2, 50, 2),
 82 |         "reg_alpha": hp.uniform("reg_alpha", 2.0, 8.0),
 83 |         "reg_lambda": hp.uniform("reg_lambda", 2.0, 8.0),
 84 |         "learning_rate": hp.quniform("learning_rate", 0.05, 0.4, 0.01),
 85 |         # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)),
 86 |         #
 87 |         "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(200, 2000, 100, dtype=int)),
 88 |         #"is_unbalance": hp.choice("is_unbalance", [True])
 89 |     }
 90 | 
 91 |     def objective(hyperparams):
 92 |         model = lgb.train({**params, **hyperparams}, train_data, 300,
 93 |                           valid_data, early_stopping_rounds=45, verbose_eval=0)
 94 | 
 95 |         score = model.best_score["valid_0"][params["metric"]]
 96 | 
 97 |         # in classification, less is better
 98 |         return {'loss': -score, 'status': STATUS_OK}
 99 | 
100 |     trials = Trials()
101 |     best = hyperopt.fmin(fn=objective, space=space, trials=trials,
102 |                          algo=tpe.suggest, max_evals=150, verbose=1,
103 |                          rstate=np.random.RandomState(1))
104 | 
105 |     hyperparams = space_eval(space, best)
106 |     log(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}")
107 |     return hyperparams
108 | 
109 | 
110 | def data_split(X: pd.DataFrame, y: pd.Series, test_size: float=0.2):
111 |     #  -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series):
112 |     return train_test_split(X, y, test_size=test_size, random_state=1)
113 | 
114 | 
115 | def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int=5000):
116 |     # -> (pd.DataFrame, pd.Series):
117 |     if len(X) > nrows:
118 |         X_sample = X.sample(nrows, random_state=1)
119 |         y_sample = y[X_sample.index]
120 |     else:
121 |         X_sample = X
122 |         y_sample = y
123 | 
124 |     return X_sample, y_sample
125 | 
126 | 
127 | ######################################with time limitation
128 | 
129 | @timeit
130 | def timetrain(X: pd.DataFrame, y: pd.Series, config: Config,Time_info):
131 | 
132 |     time_limitation_for_hp= Time_info['time_ramain_so_far']-Time_info['For_safe']
133 | 
134 |     new_y = train_hyperopt(Time_info)
135 | 
136 |     new_y.train_lightgbm(X,y,config,time_limitation_for_hp)
137 | 
138 | 
139 | @timeit
140 | def timepredict(X: pd.DataFrame, config: Config) -> List:
141 | 
142 |     preds = predict_configmodel(X, config)
143 | 
144 |     return preds
145 | 
146 | @timeit
147 | def predict_configmodel(X: pd.DataFrame, config: Config) -> List:
148 |     return config["model"].predict(X)
149 | 


--------------------------------------------------------------------------------
/hyperopt_class.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | from sklearn.model_selection import  train_test_split,cross_val_score,KFold
  3 | import lightgbm as lgb
  4 | 
  5 | from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe
  6 | 
  7 | from util import Config,log,timeit
  8 | from typing import Dict,List
  9 | from sklearn.metrics import roc_auc_score,auc
 10 | 
 11 | import time
 12 | 
 13 | import hyperopt
 14 | import numpy as np
 15 | 
 16 | 
 17 | def data_split(X: pd.DataFrame, y: pd.Series, test_size: float=0.2):
 18 |     #  -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series):
 19 |     return train_test_split(X, y, test_size=test_size, random_state=1)
 20 | 
 21 | 
 22 | def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int=5000):
 23 |     # -> (pd.DataFrame, pd.Series):
 24 |     if len(X) > nrows:
 25 |         X_sample = X.sample(nrows, random_state=1)
 26 |         y_sample = y[X_sample.index]
 27 |     else:
 28 |         X_sample = X
 29 |         y_sample = y
 30 | 
 31 |     return X_sample, y_sample
 32 | 
 33 | 
 34 | class train_hyperopt:
 35 | 
 36 |     def __init__(self,Time_info):
 37 |         self.Time_info = Time_info
 38 | 
 39 |     @timeit
 40 |     def train_lightgbm(self,X: pd.DataFrame, y: pd.Series, config: Config,time_limitation):
 41 | 
 42 |         params = {
 43 |             'boosting_type':'gbdt',
 44 |             #'boosting_type': 'dart',
 45 |             "objective": "binary",
 46 |             "metric": "auc",
 47 |             "verbosity": -1,
 48 |             "seed": 1,
 49 |             "num_threads": 4,
 50 |             'feature_fraction':0.9,
 51 |         }
 52 | 
 53 |         n_samples = int(0.1 * len(X))
 54 |         print('number of sample for hyperopt', n_samples)
 55 |         X_sample, y_sample = data_sample(X, y, n_samples)
 56 | 
 57 | 
 58 |         time_for_hp = (1 * time_limitation / 2)
 59 | 
 60 |         HYPEROPT_START=time.time()
 61 |         hyperparams = self.hyperopt_lightgbm(X_sample, y_sample, params, config,time_for_hp)
 62 |         HYPEROPT_end=time.time()
 63 | 
 64 |         print("time hyperopt:",HYPEROPT_end-HYPEROPT_START)
 65 | 
 66 | 
 67 |         time_for_train = time_limitation - (HYPEROPT_end - HYPEROPT_START)
 68 | 
 69 |         X_train, X_val, y_train, y_val = data_split(X, y, 0.1)
 70 | 
 71 |         train_data = lgb.Dataset(X_train, label=y_train)
 72 |         valid_data = lgb.Dataset(X_val, label=y_val)
 73 |         # set it to a big value
 74 | 
 75 |         train_time_start = time.time()
 76 |         clf1= lgb.train({**params, **hyperparams},
 77 |                                     train_data,
 78 |                                     30,
 79 |                                     valid_data,
 80 |                                     early_stopping_rounds=30,
 81 |                                     verbose_eval=100)
 82 |         train_time_end = time.time()
 83 |         del clf1
 84 | 
 85 |         _30_boost_rounds_for_train_time = train_time_end - train_time_start
 86 | 
 87 |         leave_num_boost_rounds = int(0.9*(30*((time_for_train-_30_boost_rounds_for_train_time)/_30_boost_rounds_for_train_time)))
 88 | 
 89 |         print("leave_num_boost_rounds",leave_num_boost_rounds)
 90 | 
 91 |         #for test
 92 |         clf = lgb.train({**params, **hyperparams},
 93 |                                     train_set=train_data,
 94 |                                     num_boost_round=leave_num_boost_rounds,
 95 |                                     valid_sets=[train_data, valid_data],
 96 |                                     early_stopping_rounds=200,
 97 |                                     verbose_eval=100)
 98 | 
 99 | 
100 | 
101 |         config["model"]=clf
102 | 
103 |         #importance = config["model"].feature_importance(importance_type='split')
104 |         #feature_name = config["model"].feature_name()
105 |         #feature_importance = pd.DataFrame({'feature_name': feature_name, 'importance': importance})
106 |         #feature_importance.to_csv('feature_importance.csv', index=False)
107 | 
108 |     @timeit
109 |     def predict_lightgbm(self,X: pd.DataFrame, config: Config) -> List:
110 | 
111 |         return config["model"].predict(X)
112 | 
113 |     @timeit
114 |     def hyperopt_lightgbm(self,X: pd.DataFrame, y: pd.Series, params: Dict, config: Config,time_limitation):
115 | 
116 |         time_start_dfl = time.time()
117 | 
118 |         X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5)
119 | 
120 |         train_data = lgb.Dataset(X_train, label=y_train)
121 |         valid_data = lgb.Dataset(X_val, label=y_val)
122 | 
123 |         if params['boosting_type']=='dart':
124 |             space = {
125 |                 "max_depth": hp.choice("max_depth", np.arange(2, 8, 1, dtype=int)),
126 |                 # smaller than 2^(max_depth)
127 |                 "num_leaves": hp.choice("num_leaves", np.arange(4, 400, 4, dtype=int)),
128 |                 "feature_fraction": hp.quniform("feature_fraction", 0.5, 0.9, 0.1),
129 |                 "bagging_fraction": hp.quniform("bagging_fraction", 0.5, 0.9, 0.1),
130 |                 "bagging_freq": hp.choice("bagging_freq", np.linspace(1,10,2, dtype=int)),
131 |                 # "scale_pos_weight":hp.uniform('scale_pos_weight',1.0,10.0),
132 |                 # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0),
133 |                 "min_child_weight": hp.quniform('min_child_weight', 2, 50, 2),
134 |                 "reg_alpha": hp.uniform("reg_alpha", 0.5, 5.0),
135 |                 "reg_lambda": hp.uniform("reg_lambda", 0.5, 5.0),
136 |                 "learning_rate": hp.quniform("learning_rate", 0.05, 0.2, 0.02),
137 |                 # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)),
138 |                 #
139 |                 "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(200, 2000, 200, dtype=int)),
140 |                 "is_unbalance": hp.choice("is_unbalance", [True])
141 |             }
142 |         else:
143 |             space = {
144 |                 "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
145 |                 # smaller than 2^(max_depth)
146 |                 "num_leaves": hp.choice("num_leaves", np.arange(4, 160, 4, dtype=int)),
147 |                 # "feature_fraction": hp.quniform("feature_fraction", 0.8, 0.9, 0.05),
148 |                 # "bagging_fraction": hp.quniform("bagging_fraction", 0.2, 0.8, 0.1),
149 |                 # "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 10, 2, dtype=int)),
150 |                 # "scale_pos_weight": hp.uniform('scale_pos_weight',1.0, 10.0),
151 |                 "scale_pos_weight": hp.choice("scale_pos_weight", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
152 |                 # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0),
153 | 
154 |                 "min_child_weight": hp.uniform('min_child_weight', 1, 10),
155 |                 # "min_child_weight": hp.quniform('min_child_weight', 0.0002, 0.005, 0.0005),
156 | 
157 |                 "reg_alpha": hp.uniform("reg_alpha", 0.0, 10.0),
158 |                 "reg_lambda": hp.uniform("reg_lambda", 0.0, 10.0),
159 |                 "learning_rate": hp.quniform("learning_rate", 0.01, 0.1, 0.01),
160 |                 # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)),
161 |                 #
162 |                 "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(10, 2000, 10, dtype=int)),
163 |                 # "is_unbalance": hp.choice("is_unbalance", [True])
164 |             }
165 | 
166 |             """
167 |             space = {
168 |                 "max_depth": hp.choice("max_depth", np.arange(2, 11, 1, dtype=int)),
169 |                 # smaller than 2^(max_depth)
170 |                 #160
171 |                 "num_leaves": hp.choice("num_leaves", np.arange(8, 200, 8, dtype=int)),
172 |                 "feature_fraction": hp.quniform("feature_fraction", 0.6, 0.9, 0.05),
173 |                 # "bagging_fraction": hp.quniform("bagging_fraction", 0.2, 0.8, 0.1),
174 |                 # "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 10, 2, dtype=int)),
175 |                 # "scale_pos_weight":hp.uniform('scale_pos_weight',1.0, 10.0),
176 |                 # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0),
177 | 
178 |                 "min_child_weight": hp.quniform('min_child_weight', 0.5, 80, 0.5),
179 |                 #"min_child_weight": hp.quniform('min_child_weight', 0.0002, 0.005, 0.0005),
180 | 
181 |                 "reg_alpha": hp.uniform("reg_alpha", 3.0, 11.0),
182 |                 "reg_lambda": hp.uniform("reg_lambda", 3.0, 11.0),
183 |                 "learning_rate": hp.quniform("learning_rate", 0.02, 0.4, 0.01),
184 |                 # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)),
185 |                 #
186 |                 "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(200, 2000, 80, dtype=int)),
187 |                 # "is_unbalance": hp.choice("is_unbalance", [True])
188 |             }
189 |             """
190 | 
191 |         def objective(hyperparams):
192 |             model = lgb.train({**params, **hyperparams}, train_data, 300,
193 |                               valid_data, early_stopping_rounds=45, verbose_eval=0)
194 | 
195 |             score = model.best_score["valid_0"][params["metric"]]
196 |             # in classification, less is better
197 |             return {'loss': -score, 'status': STATUS_OK}
198 | 
199 | 
200 |         trials = Trials()
201 |         time_10evals_start = time.time()
202 | 
203 |         best = hyperopt.fmin(fn=objective, space=space, trials=trials,
204 |                              algo=tpe.suggest, max_evals=10, verbose=1,
205 |                              rstate=np.random.RandomState(1))
206 | 
207 |         time_10evals_end = time.time()
208 | 
209 |         time_10_eval = (time_10evals_end - time_10evals_start)
210 |         time_end_dfl = time.time()
211 | 
212 | 
213 |         #400 = dim*50
214 |         #avoid error
215 |         #0.8*20=16
216 | 
217 |         #for test
218 |         #400
219 |         evals_num = min(int(8 * ((time_limitation - (time_end_dfl - time_start_dfl)) / time_10_eval)),1000)
220 | 
221 |         best = hyperopt.fmin(fn=objective, space=space, trials=trials,
222 |                              algo=tpe.suggest, max_evals=evals_num, verbose=1,
223 |                              rstate=np.random.RandomState(1))
224 |         hyperparams = space_eval(space, best)
225 | 
226 |         log(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}")
227 | 
228 |         del trials
229 | 
230 |         return hyperparams
231 | 


--------------------------------------------------------------------------------
/merge.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | from collections import defaultdict, deque
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | 
  8 | import CONSTANT
  9 | from util import Config, Timer, log, timeit
 10 | 
 11 | NUM_OP = [np.std, np.mean]
 12 | 
 13 | def join_name(a):
 14 |     if 'm_cat' in a[1]:
 15 |         prefix = CONSTANT.MULTI_CAT_PREFIX
 16 |     elif 'cat' in a[1]:
 17 |         prefix = CONSTANT.CATEGORY_PREFIX
 18 |     else:
 19 |         prefix = CONSTANT.NUMERICAL_PREFIX
 20 |     return f"{prefix}JOIN_{a[1].upper()}({a[0]})"
 21 | 
 22 | 
 23 | def bfs(root_name, graph, tconfig):
 24 |     tconfig[CONSTANT.MAIN_TABLE_NAME]['depth'] = 0
 25 |     queue = deque([root_name])
 26 |     while queue:
 27 |         u_name = queue.popleft()
 28 |         for edge in graph[u_name]:
 29 |             v_name = edge['to']
 30 |             if 'depth' not in tconfig[v_name]:
 31 |                 tconfig[v_name]['depth'] = tconfig[u_name]['depth'] + 1
 32 |                 queue.append(v_name)
 33 | 
 34 | 
 35 | @timeit
 36 | def join(u, v, v_name, key, type_):
 37 |     if type_.split("_")[2] == 'many':
 38 |         agg_funcs = {col: Config.aggregate_op(col) for col in v if col != key
 39 |                      and not col.startswith(CONSTANT.TIME_PREFIX)
 40 |                      and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)}
 41 |         v = v.groupby(key).agg(agg_funcs)
 42 |         v.columns = v.columns.map(join_name)
 43 |     else:
 44 |         v = v.set_index(key)
 45 |     v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")
 46 |     return u.join(v, on=key)
 47 | 
 48 | # @timeit
 49 | # def join(u, v, v_name, key, type_):
 50 | # #    if isinstance(key, list):
 51 | # #        assert len(key) == 1
 52 | # #        key = key[0]
 53 | #    if type_.split("_")[2] == 'many':
 54 | #        agg_funcs = {col: Config.aggregate_op(col) for col in v if col != key
 55 | #                     and not col.startswith(CONSTANT.TIME_PREFIX)
 56 | #                     and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)}
 57 | #        rehash_key = f'rehash_{key}'
 58 | #        v[rehash_key] = v[key].apply(lambda x: hash(x) % 200000)
 59 | #        v = v.groupby(rehash_key).agg(agg_funcs)
 60 | #        v.columns = v.columns.map(join_name)
 61 | #    else:
 62 | #        v = v.set_index(key)
 63 | #    v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")
 64 | #    return u.join(v, on=key)
 65 | 
 66 | 
 67 | @timeit
 68 | def temporal_join(u, v, v_name, key, time_col):
 69 |     timer = Timer()
 70 | 
 71 |     if isinstance(key, list):
 72 |         assert len(key) == 1
 73 |         key = key[0]
 74 | 
 75 |     tmp_u = u[[time_col, key]]
 76 |     timer.check("select")
 77 | 
 78 |     tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
 79 |     timer.check("concat")
 80 | 
 81 |     rehash_key = f'rehash_{key}'
 82 |     tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
 83 |     timer.check("rehash_key")
 84 | 
 85 |     tmp_u.sort_values(time_col, inplace=True)
 86 |     timer.check("sort")
 87 | 
 88 |     agg_funcs = {col: Config.aggregate_op(col) for col in v if col != key
 89 |                  and not col.startswith(CONSTANT.TIME_PREFIX)
 90 |                  and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)}
 91 | 
 92 |     tmp_u = tmp_u.groupby(rehash_key).rolling(5).agg(agg_funcs)
 93 |     timer.check("group & rolling & agg")
 94 | 
 95 |     tmp_u.reset_index(0, drop=True, inplace=True)  # drop rehash index
 96 |     timer.check("reset_index")
 97 | 
 98 |     tmp_u.columns = tmp_u.columns.map(lambda a:
 99 |         f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")
100 | 
101 |     if tmp_u.empty:
102 |         log("empty tmp_u, return u")
103 |         return u
104 | 
105 |     ret = pd.concat([u, tmp_u.loc['u']], axis=1, sort=False)
106 |     timer.check("final concat")
107 | 
108 |     del tmp_u
109 | 
110 |     return ret
111 | 
112 | def dfs(u_name, config, tables, graph):
113 |     u = tables[u_name]
114 |     log(f"enter {u_name}")
115 |     for edge in graph[u_name]:
116 |         v_name = edge['to']
117 |         if config['tables'][v_name]['depth'] <= config['tables'][u_name]['depth']:
118 |             continue
119 | 
120 |         v = dfs(v_name, config, tables, graph)
121 |         key = edge['key']
122 |         type_ = edge['type']
123 | 
124 |         if config['time_col'] not in u and config['time_col'] in v:
125 |             continue
126 | 
127 |         if config['time_col'] in u and config['time_col'] in v:
128 |             log(f"join {u_name} <--{type_}--t {v_name}")
129 |             u = temporal_join(u, v, v_name, key, config['time_col'])
130 |         else:
131 |             log(f"join {u_name} <--{type_}--nt {v_name}")
132 |             u = join(u, v, v_name, key, type_)
133 | 
134 |         del v
135 | 
136 |     log(f"leave {u_name}")
137 |     return u
138 | 
139 | 
140 | @timeit
141 | def merge_table(tables, config):
142 |     graph = defaultdict(list)
143 |     for rel in config['relations']:
144 |         ta = rel['table_A']
145 |         tb = rel['table_B']
146 |         graph[ta].append({
147 |             "to": tb,
148 |             "key": rel['key'],
149 |             "type": rel['type']
150 |         })
151 |         graph[tb].append({
152 |             "to": ta,
153 |             "key": rel['key'],
154 |             "type": '_'.join(rel['type'].split('_')[::-1])
155 |         })
156 |     bfs(CONSTANT.MAIN_TABLE_NAME, graph, config['tables'])
157 |     return dfs(CONSTANT.MAIN_TABLE_NAME, config, tables, graph)
158 | 


--------------------------------------------------------------------------------
/metadata:
--------------------------------------------------------------------------------
1 | description: Provides prediction model to be executed by the ingestion program


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | os.system("pip3 install hyperopt")
  4 | os.system("pip3 install lightgbm")
  5 | os.system("pip3 install pandas==0.24.2")
  6 | 
  7 | import copy
  8 | import numpy as np
  9 | import pandas as pd
 10 | from sklearn.metrics import roc_auc_score
 11 | 
 12 | 
 13 | from automl import predict, train, validate,timetrain,timepredict
 14 | from CONSTANT import MAIN_TABLE_NAME
 15 | from merge import merge_table
 16 | from preprocess import clean_df, clean_tables, feature_engineer
 17 | from util import Config, log, show_dataframe, timeit
 18 | import random
 19 | import time
 20 | 
 21 | class Model:
 22 |     def __init__(self, info):
 23 |         self.config = Config(info)
 24 |         self.tables = None
 25 |         self.targets = None
 26 | 
 27 |         self.Time_data_info={
 28 |             #time
 29 |             'total_time':0,
 30 |             'time_ramain_so_far':0,
 31 |             'time_for_feature_engineering':0,
 32 |             'time_for_hyperparams_searching':0,
 33 |             'time_for_model_train':0,
 34 |             'time_for_model_prediction':0,
 35 | 
 36 |             #size
 37 |             'feature_engineering_input_size':0,
 38 |             'data_rows_for_hp':0,
 39 |             'data_cols_for_hp':0,
 40 |             'test_data_rows':0,
 41 |             'test_data_columns':0,
 42 |             'For_safe':50,
 43 |         }
 44 |         self.randomintvalue = random.randint(1, 100)
 45 | 
 46 |     @timeit
 47 |     def fit(self, Xs, y, time_ramain):
 48 |         self.Time_data_info['total_time'] = time_ramain
 49 |         self.Time_data_info['For_safe'] = (self.Time_data_info['total_time'] / 12)
 50 | 
 51 |         self.tables = Xs
 52 |         self.targets = y
 53 | 
 54 |     @timeit
 55 |     def predict(self, X_test, time_remain):
 56 |         self.Time_data_info['time_ramain_so_far'] = time_remain
 57 | 
 58 |         start_feature = time.time()
 59 | 
 60 |         Xs = self.tables
 61 |         main_table = Xs[MAIN_TABLE_NAME]
 62 | 
 63 |         log(f"Merge train and test tables...")
 64 |         main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
 65 |         main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
 66 |         Xs[MAIN_TABLE_NAME] = main_table
 67 | 
 68 |         log(f"Feature engineering...")
 69 |         clean_tables(Xs)
 70 |         X = merge_table(Xs, self.config)
 71 |         X = clean_df(X)
 72 |         X = feature_engineer(X, self.config)
 73 | 
 74 | 
 75 |         X_train = X[X.index.str.startswith("train")]
 76 |         X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
 77 |         X_train.sort_index(inplace=True)
 78 |         y_train = self.targets
 79 | 
 80 |         end_feature = time.time()
 81 | 
 82 |         self.Time_data_info['time_for_feature_engineering'] = (end_feature - start_feature)
 83 | 
 84 |         self.Time_data_info['time_ramain_so_far'] = self.Time_data_info['time_ramain_so_far'] - self.Time_data_info[
 85 |             'time_for_feature_engineering']
 86 | 
 87 |         print(f"TIME info:", self.Time_data_info)
 88 | 
 89 |         # train model
 90 |         log(f"Training...")
 91 |         train_start = time.time()
 92 | 
 93 |         timetrain(X_train, y_train, self.config,self.Time_data_info)
 94 | 
 95 |         train_end = time.time()
 96 | 
 97 |         self.Time_data_info['time_ramain_so_far'] = self.Time_data_info['time_ramain_so_far']-(train_end-train_start)
 98 |         self.Time_data_info['time_for_model_train'] = (train_end-train_start)
 99 | 
100 |         print("TIME info:", self.Time_data_info)
101 | 
102 |         # predict
103 |         log(f"Predicting...")
104 |         X_test = X[X.index.str.startswith("test")]
105 |         X_test.index = X_test.index.map(lambda x: int(x.split('_')[1]))
106 |         X_test.sort_index(inplace=True)
107 |         result = predict(X_test, self.config)
108 | 
109 |         return pd.Series(result)
110 | 


--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import pandas as pd
  3 | import numpy as np
  4 | from multiprocessing import Pool
  5 | 
  6 | import CONSTANT
  7 | from util import log, timeit
  8 | 
  9 | 
 10 | uni_ops = {
 11 |     CONSTANT.TIME_PREFIX: {
 12 |         'week': lambda df: df.dt.week,
 13 |         'year': lambda df: df.dt.year,
 14 |         'month': lambda df: df.dt.month,
 15 |         'day': lambda df: df.dt.day,
 16 |         'hour': lambda df: df.dt.hour,
 17 |         # 'minute': lambda df: df.dt.minute,
 18 |         'dayofweek': lambda df: df.dt.dayofweek,
 19 |         'dayofyear': lambda df: df.dt.dayofyear,
 20 |     },
 21 | }
 22 | 
 23 | @timeit
 24 | def compress_df(df, num=True, cat=True):
 25 |     if num:
 26 |         num_cols = [c for c in df if c.startswith(CONSTANT.NUMERICAL_PREFIX)]
 27 |         if len(num_cols) > 0:
 28 |             df[num_cols] = df[num_cols].astype('float32')
 29 |     if cat:
 30 |         cat_cols = [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)]
 31 |         if len(cat_cols) > 0:
 32 |             df[cat_cols] = df[cat_cols].astype('category')
 33 | 
 34 | 
 35 | @timeit
 36 | def parallelize_apply(func, df, cols):
 37 |    num_threads=4
 38 |    pool = Pool(processes=num_threads)
 39 |    col_num = int(np.ceil(len(cols) / num_threads))
 40 |    res1 = pool.apply_async(func, args=(df,cols[:col_num]))
 41 |    res2 = pool.apply_async(func, args=(df,cols[col_num:2 * col_num]))
 42 |    res3 = pool.apply_async(func, args=(df,cols[2 * col_num:3 * col_num]))
 43 |    res4 = pool.apply_async(func, args=(df,cols[3 * col_num:]))
 44 |    pool.close()
 45 |    pool.join()
 46 |    df = pd.concat([df,res1.get(),res2.get(),res3.get(),res4.get()],axis=1)
 47 |    return df
 48 | 
 49 | 
 50 | @timeit
 51 | def normal_apply(func, df, cols):
 52 |     return pd.concat([df, func(df, cols)], axis=1)
 53 | 
 54 | 
 55 | @timeit
 56 | def clean_tables(tables):
 57 |     for tname in tables:
 58 |         log(f"cleaning table {tname}")
 59 |         df = tables[tname]
 60 |         fillna(df)
 61 |         num_cols = [c for c in df if c.startswith(CONSTANT.NUMERICAL_PREFIX)]
 62 |         cat_cols = [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)]
 63 |         m_cat_cols = [c for c in df if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]
 64 |         time_cols = [c for c in df if c.startswith(CONSTANT.TIME_PREFIX)]
 65 | 
 66 |         if len(cat_cols) > 0:
 67 |             df = normal_apply(count_cat, df, cat_cols)
 68 |         if len(m_cat_cols) > 3:
 69 |             df = parallelize_apply(count_m_cat, df, m_cat_cols)
 70 |         elif len(m_cat_cols) > 0:
 71 |             df = normal_apply(count_m_cat, df, m_cat_cols)
 72 |         if len(time_cols) > 0:
 73 |             df = normal_apply(transform_datetime, df, time_cols)
 74 |         # drop columns
 75 |         df.drop(m_cat_cols+time_cols, axis=1, inplace=True)
 76 | 
 77 |         compress_df(df)
 78 |         tables[tname] = df
 79 | 
 80 | @timeit
 81 | def clean_df(df):
 82 |     compress_df(df, num=False)
 83 |     df_fillna_with_mean(df)
 84 |     hash_cat(df)
 85 |     return df
 86 | 
 87 | 
 88 | @timeit
 89 | def fillna(df):
 90 |     for c in [c for c in df if c.startswith(CONSTANT.NUMERICAL_PREFIX)]:
 91 |         df[c].fillna(-1, inplace=True)
 92 |     for c in [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)]:
 93 |         df[c].fillna("0", inplace=True)
 94 |     for c in [c for c in df if c.startswith(CONSTANT.TIME_PREFIX)]:
 95 |         df[c].fillna(datetime.datetime(1970, 1, 1), inplace=True)
 96 |     for c in [c for c in df if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
 97 |         df[c].fillna("0", inplace=True)
 98 | 
 99 | 
100 | @timeit
101 | def df_fillna_with_mean(df):
102 |     for c in [c for c in df if c.startswith(CONSTANT.NUMERICAL_PREFIX)]:
103 |         df[c].fillna(df[c].mean(), inplace=True)
104 |     for c in [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)]:
105 |         if "0" not in df[c].cat.categories:
106 |             df[c] = df[c].cat.add_categories(["0"])
107 |         df[c].fillna("0")
108 |     for c in [c for c in df if c.startswith(CONSTANT.TIME_PREFIX)]:
109 |         mean = pd.to_timedelta(df[c]).mean() + pd.Timestamp(0)
110 |         df[c].fillna(mean, inplace=True)
111 |     for c in [c for c in df if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
112 |         df[c].fillna("0", inplace=True)
113 | 
114 | 
115 | @timeit
116 | def feature_engineer(df, config):
117 |    return df
118 | 
119 | 
120 | def count_cat(df, cat_cols):
121 |     prefix_n = CONSTANT.NUMERICAL_PREFIX
122 |     prefix_c = CONSTANT.CATEGORY_PREFIX
123 |     op = "frequency"
124 |     new_df=pd.DataFrame()
125 |     for c in cat_cols:
126 |         dic = df[c].value_counts().to_dict()
127 |         new_df[f"{prefix_n}{op.upper()}({c})"] = df[c].apply(lambda x: dic[x])
128 |     return new_df
129 | 
130 | def hash_cat(df):
131 |     for c in [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)]:
132 |         df[c] = df[c].apply(lambda x: int(x))
133 | 
134 | def frequent_cat(x):
135 |     data = x.split(',')
136 |     item, freq = np.unique(data, return_counts=True)
137 |     return item[np.argmax(freq)]
138 | 
139 | def weighted_cat(dic):
140 |     def freq(x):
141 |         data = x.split(',')
142 |         item, freq = np.unique(data, return_counts=True)
143 |         global_freq = np.array([dic[i] for i in item])
144 |         return item[np.argmax(global_freq*freq)]
145 |     return freq
146 | 
147 | def count_m_cat(df,m_cat_cols):
148 |     prefix_n = CONSTANT.NUMERICAL_PREFIX
149 |     prefix_c = CONSTANT.CATEGORY_PREFIX
150 |     op_l = 'length'
151 |     op_f = 'frequent_cat'
152 |     op_fw = 'frequent_weighted_cat'
153 |     new_df=pd.DataFrame()
154 |     for c in m_cat_cols:
155 |         new_df[f"{prefix_c}{op_f.upper()}RANK(1)({c})"] = df[c].apply(frequent_cat)
156 |         new_df[f"{prefix_n}{op_l.upper()}({c})"] = df[c].apply(lambda x: len(x.split(',')))
157 |         all_item = ','.join(df[c].values).split(',')
158 |         item, freq = np.unique(all_item, return_counts=True)
159 |         dic = dict(zip(item, freq))
160 |         new_df[f"{prefix_c}{op_fw.upper()}RANK(1)({c})"] = df[c].apply(weighted_cat(dic))
161 |     return new_df
162 | 
163 | 
164 | def transform_datetime(df, time_cols):
165 |     prefix_n = CONSTANT.NUMERICAL_PREFIX
166 |     ops = uni_ops[CONSTANT.TIME_PREFIX]
167 |     new_dfs = []
168 |     for c in time_cols:
169 |         new_df = df[c].agg(ops.values())
170 |         new_df.columns = [f"{prefix_n}{op.upper()}({c})" for op in ops]
171 |         new_dfs += [new_df]
172 |     return pd.concat(new_dfs, axis=1)
173 | 
174 | 
175 | 


--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | from typing import Any
  4 | import pandas as pd
  5 | import numpy as np
  6 | 
  7 | import CONSTANT
  8 | 
  9 | nesting_level = 0
 10 | is_start = None
 11 | 
 12 | class Timer:
 13 |     def __init__(self):
 14 |         self.start = time.time()
 15 |         self.history = [self.start]
 16 | 
 17 |     def check(self, info):
 18 |         current = time.time()
 19 |         log(f"[{info}] spend {current - self.history[-1]:0.2f} sec")
 20 |         self.history.append(current)
 21 | 
 22 | def timeit(method, start_log=None):
 23 |     def timed(*args, **kw):
 24 |         global is_start
 25 |         global nesting_level
 26 | 
 27 |         if not is_start:
 28 |             print()
 29 | 
 30 |         is_start = True
 31 |         log(f"Start [{method.__name__}]:" + (start_log if start_log else ""))
 32 |         nesting_level += 1
 33 | 
 34 |         start_time = time.time()
 35 |         result = method(*args, **kw)
 36 |         end_time = time.time()
 37 | 
 38 |         nesting_level -= 1
 39 |         log(f"End   [{method.__name__}]. Time elapsed: {end_time - start_time:0.2f} sec.")
 40 |         is_start = False
 41 | 
 42 |         return result
 43 | 
 44 |     return timed
 45 | 
 46 | 
 47 | def log(entry: Any):
 48 |     global nesting_level
 49 |     space = "-" * (4 * nesting_level)
 50 |     print(f"{space}{entry}")
 51 | 
 52 | def show_dataframe(df):
 53 |     if len(df) <= 30:
 54 |         print(f"content=\n"
 55 |               f"{df}")
 56 |     else:
 57 |         print(f"dataframe is too large to show the content, over {len(df)} rows")
 58 | 
 59 |     if len(df.dtypes) <= 100:
 60 |         print(f"types=\n"
 61 |               f"{df.dtypes}\n")
 62 |     else:
 63 |         print(f"dataframe is too wide to show the dtypes, over {len(df.dtypes)} columns")
 64 | 
 65 | 
 66 | class Config:
 67 |     def __init__(self, info):
 68 |         self.data = {
 69 |             "start_time": time.time(),
 70 |             **info
 71 |         }
 72 |         self.data["tables"] = {}
 73 |         for tname, ttype in info['tables'].items():
 74 |             self.data['tables'][tname] = {}
 75 |             self.data['tables'][tname]['type'] = ttype
 76 | 
 77 |     @staticmethod
 78 |     def aggregate_op(col):
 79 | 
 80 |         def freq_cat(x):
 81 |             item, freq = np.unique(x, return_counts=True)
 82 |             return item[np.argmax(freq)]
 83 |         
 84 |         def freq_m_cat(x):
 85 |             data = ','.join(x.values).split(',')
 86 |             item, freq = np.unique(data, return_counts=True)
 87 |             return item[np.argmax(freq)]
 88 | 
 89 |         def m_cat_concat(x):
 90 |             return ','.join(x.values)
 91 | 
 92 |         ops = {
 93 |             CONSTANT.NUMERICAL_TYPE: ["sum", "mean"],
 94 |             CONSTANT.CATEGORY_TYPE: ["count"],
 95 |             #  TIME_TYPE: ["max"],
 96 |             # CONSTANT.MULTI_CAT_TYPE: [m_cat_concat],
 97 |         }
 98 |         if col.startswith(CONSTANT.NUMERICAL_PREFIX):
 99 |             return ops[CONSTANT.NUMERICAL_TYPE]
100 |         if col.startswith(CONSTANT.CATEGORY_PREFIX):
101 |             return ops[CONSTANT.CATEGORY_TYPE]
102 |         if col.startswith(CONSTANT.MULTI_CAT_PREFIX):
103 |             assert False, f"MultiCategory type feature's aggregate op are not supported."
104 |             return ops[CONSTANT.MULTI_CAT_TYPE]
105 |         if col.startswith(CONSTANT.TIME_PREFIX):
106 |             assert False, f"Time type feature's aggregate op are not implemented."
107 |         assert False, f"Unknown col type {col}"
108 | 
109 |     def time_left(self):
110 |         return self["time_budget"] - (time.time() - self["start_time"])
111 | 
112 |     def __getitem__(self, key):
113 |         return self.data[key]
114 | 
115 |     def __setitem__(self, key, value):
116 |         self.data[key] = value
117 | 
118 |     def __delitem__(self, key):
119 |         del self.data[key]
120 | 
121 |     def __contains__(self, key):
122 |         return key in self.data
123 | 
124 |     def __len__(self):
125 |         return len(self.data)
126 | 
127 |     def __repr__(self):
128 |         return repr(self.data)
129 | 


--------------------------------------------------------------------------------