├── CONSTANT.py ├── LICENSE ├── README.md ├── automl.py ├── hyperopt_class.py ├── merge.py ├── metadata ├── model.py ├── preprocess.py └── util.py /CONSTANT.py: -------------------------------------------------------------------------------- 1 | 2 | NUMERICAL_TYPE = "num" 3 | NUMERICAL_PREFIX = "n_" 4 | 5 | CATEGORY_TYPE = "cat" 6 | CATEGORY_PREFIX = "c_" 7 | 8 | TIME_TYPE = "time" 9 | TIME_PREFIX = "t_" 10 | 11 | MULTI_CAT_TYPE = "multi-cat" 12 | MULTI_CAT_PREFIX = "m_" 13 | MULTI_CAT_DELIMITER = "," 14 | 15 | POS_MULTI_CAT_PREFIX = "cm_" 16 | 17 | 18 | MAIN_TABLE_NAME = "main" 19 | MAIN_TABLE_TEST_NAME = "main_test" 20 | TABLE_PREFIX = "table_" 21 | 22 | LABEL = "label" 23 | 24 | HASH_MAX = 200 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 shuyao95 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kddcup2019-automl 2 | Code for KDD CUP 2019 Auto-ML track 3 | -------------------------------------------------------------------------------- /automl.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import hyperopt 4 | import lightgbm as lgb 5 | import numpy as np 6 | import pandas as pd 7 | from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe 8 | from sklearn.metrics import roc_auc_score 9 | from sklearn.model_selection import train_test_split 10 | from Our_model.hyperopt_class import train_hyperopt 11 | from util import Config, log, timeit 12 | 13 | 14 | @timeit 15 | def train(X: pd.DataFrame, y: pd.Series, config: Config): 16 | train_lightgbm(X, y, config) 17 | 18 | 19 | @timeit 20 | def predict(X: pd.DataFrame, config: Config) -> List: 21 | preds = predict_lightgbm(X, config) 22 | return preds 23 | 24 | 25 | @timeit 26 | def validate(preds, y_path) -> np.float64: 27 | score = roc_auc_score(pd.read_csv(y_path)['label'].values, preds) 28 | log("Score: {:0.4f}".format(score)) 29 | return score 30 | 31 | 32 | @timeit 33 | def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config): 34 | params = { 35 | "boosting": "gbdt", 36 | "objective": "binary", 37 | "metric": "auc", 38 | "verbosity": -1, 39 | "seed": 1, 40 | "num_threads": 4, 41 | # "scale_pos_weight": 5 42 | } 43 | 44 | n_samples = int(0.1*len(X)) 45 | print('number of sample for hyperopt', n_samples) 46 | X_sample, y_sample = data_sample(X, y, n_samples) 47 | hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config) 48 | 49 | X_train, X_val, y_train, y_val = data_split(X, y, 0.1) 50 | train_data = lgb.Dataset(X_train, label=y_train) 51 | valid_data = lgb.Dataset(X_val, label=y_val) 52 | 53 | config["model"] = lgb.train({**params, **hyperparams}, 54 | train_data, 55 | 1200, 56 | [train_data, valid_data], 57 | early_stopping_rounds=45, 58 | verbose_eval=100) 59 | 60 | 61 | @timeit 62 | def predict_lightgbm(X: pd.DataFrame, config: Config) -> List: 63 | return config["model"].predict(X) 64 | 65 | 66 | @timeit 67 | def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config): 68 | X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5) 69 | train_data = lgb.Dataset(X_train, label=y_train) 70 | valid_data = lgb.Dataset(X_val, label=y_val) 71 | 72 | space = { 73 | "max_depth": hp.choice("max_depth", np.arange(2, 10, 1, dtype=int)), 74 | # smaller than 2^(max_depth) 75 | "num_leaves": hp.choice("num_leaves", np.arange(4, 200, 4, dtype=int)), 76 | "feature_fraction": hp.quniform("feature_fraction", 0.2, 0.8, 0.1), 77 | # "bagging_fraction": hp.quniform("bagging_fraction", 0.2, 0.8, 0.1), 78 | # "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 10, 2, dtype=int)), 79 | # "scale_pos_weight":hp.uniform('scale_pos_weight',1.0, 10.0), 80 | # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0), 81 | "min_child_weight": hp.quniform('min_child_weight', 2, 50, 2), 82 | "reg_alpha": hp.uniform("reg_alpha", 2.0, 8.0), 83 | "reg_lambda": hp.uniform("reg_lambda", 2.0, 8.0), 84 | "learning_rate": hp.quniform("learning_rate", 0.05, 0.4, 0.01), 85 | # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)), 86 | # 87 | "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(200, 2000, 100, dtype=int)), 88 | #"is_unbalance": hp.choice("is_unbalance", [True]) 89 | } 90 | 91 | def objective(hyperparams): 92 | model = lgb.train({**params, **hyperparams}, train_data, 300, 93 | valid_data, early_stopping_rounds=45, verbose_eval=0) 94 | 95 | score = model.best_score["valid_0"][params["metric"]] 96 | 97 | # in classification, less is better 98 | return {'loss': -score, 'status': STATUS_OK} 99 | 100 | trials = Trials() 101 | best = hyperopt.fmin(fn=objective, space=space, trials=trials, 102 | algo=tpe.suggest, max_evals=150, verbose=1, 103 | rstate=np.random.RandomState(1)) 104 | 105 | hyperparams = space_eval(space, best) 106 | log(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}") 107 | return hyperparams 108 | 109 | 110 | def data_split(X: pd.DataFrame, y: pd.Series, test_size: float=0.2): 111 | # -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series): 112 | return train_test_split(X, y, test_size=test_size, random_state=1) 113 | 114 | 115 | def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int=5000): 116 | # -> (pd.DataFrame, pd.Series): 117 | if len(X) > nrows: 118 | X_sample = X.sample(nrows, random_state=1) 119 | y_sample = y[X_sample.index] 120 | else: 121 | X_sample = X 122 | y_sample = y 123 | 124 | return X_sample, y_sample 125 | 126 | 127 | ######################################with time limitation 128 | 129 | @timeit 130 | def timetrain(X: pd.DataFrame, y: pd.Series, config: Config,Time_info): 131 | 132 | time_limitation_for_hp= Time_info['time_ramain_so_far']-Time_info['For_safe'] 133 | 134 | new_y = train_hyperopt(Time_info) 135 | 136 | new_y.train_lightgbm(X,y,config,time_limitation_for_hp) 137 | 138 | 139 | @timeit 140 | def timepredict(X: pd.DataFrame, config: Config) -> List: 141 | 142 | preds = predict_configmodel(X, config) 143 | 144 | return preds 145 | 146 | @timeit 147 | def predict_configmodel(X: pd.DataFrame, config: Config) -> List: 148 | return config["model"].predict(X) 149 | -------------------------------------------------------------------------------- /hyperopt_class.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split,cross_val_score,KFold 3 | import lightgbm as lgb 4 | 5 | from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe 6 | 7 | from util import Config,log,timeit 8 | from typing import Dict,List 9 | from sklearn.metrics import roc_auc_score,auc 10 | 11 | import time 12 | 13 | import hyperopt 14 | import numpy as np 15 | 16 | 17 | def data_split(X: pd.DataFrame, y: pd.Series, test_size: float=0.2): 18 | # -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series): 19 | return train_test_split(X, y, test_size=test_size, random_state=1) 20 | 21 | 22 | def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int=5000): 23 | # -> (pd.DataFrame, pd.Series): 24 | if len(X) > nrows: 25 | X_sample = X.sample(nrows, random_state=1) 26 | y_sample = y[X_sample.index] 27 | else: 28 | X_sample = X 29 | y_sample = y 30 | 31 | return X_sample, y_sample 32 | 33 | 34 | class train_hyperopt: 35 | 36 | def __init__(self,Time_info): 37 | self.Time_info = Time_info 38 | 39 | @timeit 40 | def train_lightgbm(self,X: pd.DataFrame, y: pd.Series, config: Config,time_limitation): 41 | 42 | params = { 43 | 'boosting_type':'gbdt', 44 | #'boosting_type': 'dart', 45 | "objective": "binary", 46 | "metric": "auc", 47 | "verbosity": -1, 48 | "seed": 1, 49 | "num_threads": 4, 50 | 'feature_fraction':0.9, 51 | } 52 | 53 | n_samples = int(0.1 * len(X)) 54 | print('number of sample for hyperopt', n_samples) 55 | X_sample, y_sample = data_sample(X, y, n_samples) 56 | 57 | 58 | time_for_hp = (1 * time_limitation / 2) 59 | 60 | HYPEROPT_START=time.time() 61 | hyperparams = self.hyperopt_lightgbm(X_sample, y_sample, params, config,time_for_hp) 62 | HYPEROPT_end=time.time() 63 | 64 | print("time hyperopt:",HYPEROPT_end-HYPEROPT_START) 65 | 66 | 67 | time_for_train = time_limitation - (HYPEROPT_end - HYPEROPT_START) 68 | 69 | X_train, X_val, y_train, y_val = data_split(X, y, 0.1) 70 | 71 | train_data = lgb.Dataset(X_train, label=y_train) 72 | valid_data = lgb.Dataset(X_val, label=y_val) 73 | # set it to a big value 74 | 75 | train_time_start = time.time() 76 | clf1= lgb.train({**params, **hyperparams}, 77 | train_data, 78 | 30, 79 | valid_data, 80 | early_stopping_rounds=30, 81 | verbose_eval=100) 82 | train_time_end = time.time() 83 | del clf1 84 | 85 | _30_boost_rounds_for_train_time = train_time_end - train_time_start 86 | 87 | leave_num_boost_rounds = int(0.9*(30*((time_for_train-_30_boost_rounds_for_train_time)/_30_boost_rounds_for_train_time))) 88 | 89 | print("leave_num_boost_rounds",leave_num_boost_rounds) 90 | 91 | #for test 92 | clf = lgb.train({**params, **hyperparams}, 93 | train_set=train_data, 94 | num_boost_round=leave_num_boost_rounds, 95 | valid_sets=[train_data, valid_data], 96 | early_stopping_rounds=200, 97 | verbose_eval=100) 98 | 99 | 100 | 101 | config["model"]=clf 102 | 103 | #importance = config["model"].feature_importance(importance_type='split') 104 | #feature_name = config["model"].feature_name() 105 | #feature_importance = pd.DataFrame({'feature_name': feature_name, 'importance': importance}) 106 | #feature_importance.to_csv('feature_importance.csv', index=False) 107 | 108 | @timeit 109 | def predict_lightgbm(self,X: pd.DataFrame, config: Config) -> List: 110 | 111 | return config["model"].predict(X) 112 | 113 | @timeit 114 | def hyperopt_lightgbm(self,X: pd.DataFrame, y: pd.Series, params: Dict, config: Config,time_limitation): 115 | 116 | time_start_dfl = time.time() 117 | 118 | X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5) 119 | 120 | train_data = lgb.Dataset(X_train, label=y_train) 121 | valid_data = lgb.Dataset(X_val, label=y_val) 122 | 123 | if params['boosting_type']=='dart': 124 | space = { 125 | "max_depth": hp.choice("max_depth", np.arange(2, 8, 1, dtype=int)), 126 | # smaller than 2^(max_depth) 127 | "num_leaves": hp.choice("num_leaves", np.arange(4, 400, 4, dtype=int)), 128 | "feature_fraction": hp.quniform("feature_fraction", 0.5, 0.9, 0.1), 129 | "bagging_fraction": hp.quniform("bagging_fraction", 0.5, 0.9, 0.1), 130 | "bagging_freq": hp.choice("bagging_freq", np.linspace(1,10,2, dtype=int)), 131 | # "scale_pos_weight":hp.uniform('scale_pos_weight',1.0,10.0), 132 | # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0), 133 | "min_child_weight": hp.quniform('min_child_weight', 2, 50, 2), 134 | "reg_alpha": hp.uniform("reg_alpha", 0.5, 5.0), 135 | "reg_lambda": hp.uniform("reg_lambda", 0.5, 5.0), 136 | "learning_rate": hp.quniform("learning_rate", 0.05, 0.2, 0.02), 137 | # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)), 138 | # 139 | "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(200, 2000, 200, dtype=int)), 140 | "is_unbalance": hp.choice("is_unbalance", [True]) 141 | } 142 | else: 143 | space = { 144 | "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), 145 | # smaller than 2^(max_depth) 146 | "num_leaves": hp.choice("num_leaves", np.arange(4, 160, 4, dtype=int)), 147 | # "feature_fraction": hp.quniform("feature_fraction", 0.8, 0.9, 0.05), 148 | # "bagging_fraction": hp.quniform("bagging_fraction", 0.2, 0.8, 0.1), 149 | # "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 10, 2, dtype=int)), 150 | # "scale_pos_weight": hp.uniform('scale_pos_weight',1.0, 10.0), 151 | "scale_pos_weight": hp.choice("scale_pos_weight", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), 152 | # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0), 153 | 154 | "min_child_weight": hp.uniform('min_child_weight', 1, 10), 155 | # "min_child_weight": hp.quniform('min_child_weight', 0.0002, 0.005, 0.0005), 156 | 157 | "reg_alpha": hp.uniform("reg_alpha", 0.0, 10.0), 158 | "reg_lambda": hp.uniform("reg_lambda", 0.0, 10.0), 159 | "learning_rate": hp.quniform("learning_rate", 0.01, 0.1, 0.01), 160 | # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)), 161 | # 162 | "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(10, 2000, 10, dtype=int)), 163 | # "is_unbalance": hp.choice("is_unbalance", [True]) 164 | } 165 | 166 | """ 167 | space = { 168 | "max_depth": hp.choice("max_depth", np.arange(2, 11, 1, dtype=int)), 169 | # smaller than 2^(max_depth) 170 | #160 171 | "num_leaves": hp.choice("num_leaves", np.arange(8, 200, 8, dtype=int)), 172 | "feature_fraction": hp.quniform("feature_fraction", 0.6, 0.9, 0.05), 173 | # "bagging_fraction": hp.quniform("bagging_fraction", 0.2, 0.8, 0.1), 174 | # "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 10, 2, dtype=int)), 175 | # "scale_pos_weight":hp.uniform('scale_pos_weight',1.0, 10.0), 176 | # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0), 177 | 178 | "min_child_weight": hp.quniform('min_child_weight', 0.5, 80, 0.5), 179 | #"min_child_weight": hp.quniform('min_child_weight', 0.0002, 0.005, 0.0005), 180 | 181 | "reg_alpha": hp.uniform("reg_alpha", 3.0, 11.0), 182 | "reg_lambda": hp.uniform("reg_lambda", 3.0, 11.0), 183 | "learning_rate": hp.quniform("learning_rate", 0.02, 0.4, 0.01), 184 | # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)), 185 | # 186 | "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(200, 2000, 80, dtype=int)), 187 | # "is_unbalance": hp.choice("is_unbalance", [True]) 188 | } 189 | """ 190 | 191 | def objective(hyperparams): 192 | model = lgb.train({**params, **hyperparams}, train_data, 300, 193 | valid_data, early_stopping_rounds=45, verbose_eval=0) 194 | 195 | score = model.best_score["valid_0"][params["metric"]] 196 | # in classification, less is better 197 | return {'loss': -score, 'status': STATUS_OK} 198 | 199 | 200 | trials = Trials() 201 | time_10evals_start = time.time() 202 | 203 | best = hyperopt.fmin(fn=objective, space=space, trials=trials, 204 | algo=tpe.suggest, max_evals=10, verbose=1, 205 | rstate=np.random.RandomState(1)) 206 | 207 | time_10evals_end = time.time() 208 | 209 | time_10_eval = (time_10evals_end - time_10evals_start) 210 | time_end_dfl = time.time() 211 | 212 | 213 | #400 = dim*50 214 | #avoid error 215 | #0.8*20=16 216 | 217 | #for test 218 | #400 219 | evals_num = min(int(8 * ((time_limitation - (time_end_dfl - time_start_dfl)) / time_10_eval)),1000) 220 | 221 | best = hyperopt.fmin(fn=objective, space=space, trials=trials, 222 | algo=tpe.suggest, max_evals=evals_num, verbose=1, 223 | rstate=np.random.RandomState(1)) 224 | hyperparams = space_eval(space, best) 225 | 226 | log(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}") 227 | 228 | del trials 229 | 230 | return hyperparams 231 | -------------------------------------------------------------------------------- /merge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from collections import defaultdict, deque 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import CONSTANT 9 | from util import Config, Timer, log, timeit 10 | 11 | NUM_OP = [np.std, np.mean] 12 | 13 | def join_name(a): 14 | if 'm_cat' in a[1]: 15 | prefix = CONSTANT.MULTI_CAT_PREFIX 16 | elif 'cat' in a[1]: 17 | prefix = CONSTANT.CATEGORY_PREFIX 18 | else: 19 | prefix = CONSTANT.NUMERICAL_PREFIX 20 | return f"{prefix}JOIN_{a[1].upper()}({a[0]})" 21 | 22 | 23 | def bfs(root_name, graph, tconfig): 24 | tconfig[CONSTANT.MAIN_TABLE_NAME]['depth'] = 0 25 | queue = deque([root_name]) 26 | while queue: 27 | u_name = queue.popleft() 28 | for edge in graph[u_name]: 29 | v_name = edge['to'] 30 | if 'depth' not in tconfig[v_name]: 31 | tconfig[v_name]['depth'] = tconfig[u_name]['depth'] + 1 32 | queue.append(v_name) 33 | 34 | 35 | @timeit 36 | def join(u, v, v_name, key, type_): 37 | if type_.split("_")[2] == 'many': 38 | agg_funcs = {col: Config.aggregate_op(col) for col in v if col != key 39 | and not col.startswith(CONSTANT.TIME_PREFIX) 40 | and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)} 41 | v = v.groupby(key).agg(agg_funcs) 42 | v.columns = v.columns.map(join_name) 43 | else: 44 | v = v.set_index(key) 45 | v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}") 46 | return u.join(v, on=key) 47 | 48 | # @timeit 49 | # def join(u, v, v_name, key, type_): 50 | # # if isinstance(key, list): 51 | # # assert len(key) == 1 52 | # # key = key[0] 53 | # if type_.split("_")[2] == 'many': 54 | # agg_funcs = {col: Config.aggregate_op(col) for col in v if col != key 55 | # and not col.startswith(CONSTANT.TIME_PREFIX) 56 | # and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)} 57 | # rehash_key = f'rehash_{key}' 58 | # v[rehash_key] = v[key].apply(lambda x: hash(x) % 200000) 59 | # v = v.groupby(rehash_key).agg(agg_funcs) 60 | # v.columns = v.columns.map(join_name) 61 | # else: 62 | # v = v.set_index(key) 63 | # v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}") 64 | # return u.join(v, on=key) 65 | 66 | 67 | @timeit 68 | def temporal_join(u, v, v_name, key, time_col): 69 | timer = Timer() 70 | 71 | if isinstance(key, list): 72 | assert len(key) == 1 73 | key = key[0] 74 | 75 | tmp_u = u[[time_col, key]] 76 | timer.check("select") 77 | 78 | tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False) 79 | timer.check("concat") 80 | 81 | rehash_key = f'rehash_{key}' 82 | tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX) 83 | timer.check("rehash_key") 84 | 85 | tmp_u.sort_values(time_col, inplace=True) 86 | timer.check("sort") 87 | 88 | agg_funcs = {col: Config.aggregate_op(col) for col in v if col != key 89 | and not col.startswith(CONSTANT.TIME_PREFIX) 90 | and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)} 91 | 92 | tmp_u = tmp_u.groupby(rehash_key).rolling(5).agg(agg_funcs) 93 | timer.check("group & rolling & agg") 94 | 95 | tmp_u.reset_index(0, drop=True, inplace=True) # drop rehash index 96 | timer.check("reset_index") 97 | 98 | tmp_u.columns = tmp_u.columns.map(lambda a: 99 | f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})") 100 | 101 | if tmp_u.empty: 102 | log("empty tmp_u, return u") 103 | return u 104 | 105 | ret = pd.concat([u, tmp_u.loc['u']], axis=1, sort=False) 106 | timer.check("final concat") 107 | 108 | del tmp_u 109 | 110 | return ret 111 | 112 | def dfs(u_name, config, tables, graph): 113 | u = tables[u_name] 114 | log(f"enter {u_name}") 115 | for edge in graph[u_name]: 116 | v_name = edge['to'] 117 | if config['tables'][v_name]['depth'] <= config['tables'][u_name]['depth']: 118 | continue 119 | 120 | v = dfs(v_name, config, tables, graph) 121 | key = edge['key'] 122 | type_ = edge['type'] 123 | 124 | if config['time_col'] not in u and config['time_col'] in v: 125 | continue 126 | 127 | if config['time_col'] in u and config['time_col'] in v: 128 | log(f"join {u_name} <--{type_}--t {v_name}") 129 | u = temporal_join(u, v, v_name, key, config['time_col']) 130 | else: 131 | log(f"join {u_name} <--{type_}--nt {v_name}") 132 | u = join(u, v, v_name, key, type_) 133 | 134 | del v 135 | 136 | log(f"leave {u_name}") 137 | return u 138 | 139 | 140 | @timeit 141 | def merge_table(tables, config): 142 | graph = defaultdict(list) 143 | for rel in config['relations']: 144 | ta = rel['table_A'] 145 | tb = rel['table_B'] 146 | graph[ta].append({ 147 | "to": tb, 148 | "key": rel['key'], 149 | "type": rel['type'] 150 | }) 151 | graph[tb].append({ 152 | "to": ta, 153 | "key": rel['key'], 154 | "type": '_'.join(rel['type'].split('_')[::-1]) 155 | }) 156 | bfs(CONSTANT.MAIN_TABLE_NAME, graph, config['tables']) 157 | return dfs(CONSTANT.MAIN_TABLE_NAME, config, tables, graph) 158 | -------------------------------------------------------------------------------- /metadata: -------------------------------------------------------------------------------- 1 | description: Provides prediction model to be executed by the ingestion program -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.system("pip3 install hyperopt") 4 | os.system("pip3 install lightgbm") 5 | os.system("pip3 install pandas==0.24.2") 6 | 7 | import copy 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.metrics import roc_auc_score 11 | 12 | 13 | from automl import predict, train, validate,timetrain,timepredict 14 | from CONSTANT import MAIN_TABLE_NAME 15 | from merge import merge_table 16 | from preprocess import clean_df, clean_tables, feature_engineer 17 | from util import Config, log, show_dataframe, timeit 18 | import random 19 | import time 20 | 21 | class Model: 22 | def __init__(self, info): 23 | self.config = Config(info) 24 | self.tables = None 25 | self.targets = None 26 | 27 | self.Time_data_info={ 28 | #time 29 | 'total_time':0, 30 | 'time_ramain_so_far':0, 31 | 'time_for_feature_engineering':0, 32 | 'time_for_hyperparams_searching':0, 33 | 'time_for_model_train':0, 34 | 'time_for_model_prediction':0, 35 | 36 | #size 37 | 'feature_engineering_input_size':0, 38 | 'data_rows_for_hp':0, 39 | 'data_cols_for_hp':0, 40 | 'test_data_rows':0, 41 | 'test_data_columns':0, 42 | 'For_safe':50, 43 | } 44 | self.randomintvalue = random.randint(1, 100) 45 | 46 | @timeit 47 | def fit(self, Xs, y, time_ramain): 48 | self.Time_data_info['total_time'] = time_ramain 49 | self.Time_data_info['For_safe'] = (self.Time_data_info['total_time'] / 12) 50 | 51 | self.tables = Xs 52 | self.targets = y 53 | 54 | @timeit 55 | def predict(self, X_test, time_remain): 56 | self.Time_data_info['time_ramain_so_far'] = time_remain 57 | 58 | start_feature = time.time() 59 | 60 | Xs = self.tables 61 | main_table = Xs[MAIN_TABLE_NAME] 62 | 63 | log(f"Merge train and test tables...") 64 | main_table = pd.concat([main_table, X_test], keys=['train', 'test']) 65 | main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}") 66 | Xs[MAIN_TABLE_NAME] = main_table 67 | 68 | log(f"Feature engineering...") 69 | clean_tables(Xs) 70 | X = merge_table(Xs, self.config) 71 | X = clean_df(X) 72 | X = feature_engineer(X, self.config) 73 | 74 | 75 | X_train = X[X.index.str.startswith("train")] 76 | X_train.index = X_train.index.map(lambda x: int(x.split('_')[1])) 77 | X_train.sort_index(inplace=True) 78 | y_train = self.targets 79 | 80 | end_feature = time.time() 81 | 82 | self.Time_data_info['time_for_feature_engineering'] = (end_feature - start_feature) 83 | 84 | self.Time_data_info['time_ramain_so_far'] = self.Time_data_info['time_ramain_so_far'] - self.Time_data_info[ 85 | 'time_for_feature_engineering'] 86 | 87 | print(f"TIME info:", self.Time_data_info) 88 | 89 | # train model 90 | log(f"Training...") 91 | train_start = time.time() 92 | 93 | timetrain(X_train, y_train, self.config,self.Time_data_info) 94 | 95 | train_end = time.time() 96 | 97 | self.Time_data_info['time_ramain_so_far'] = self.Time_data_info['time_ramain_so_far']-(train_end-train_start) 98 | self.Time_data_info['time_for_model_train'] = (train_end-train_start) 99 | 100 | print("TIME info:", self.Time_data_info) 101 | 102 | # predict 103 | log(f"Predicting...") 104 | X_test = X[X.index.str.startswith("test")] 105 | X_test.index = X_test.index.map(lambda x: int(x.split('_')[1])) 106 | X_test.sort_index(inplace=True) 107 | result = predict(X_test, self.config) 108 | 109 | return pd.Series(result) 110 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import pandas as pd 3 | import numpy as np 4 | from multiprocessing import Pool 5 | 6 | import CONSTANT 7 | from util import log, timeit 8 | 9 | 10 | uni_ops = { 11 | CONSTANT.TIME_PREFIX: { 12 | 'week': lambda df: df.dt.week, 13 | 'year': lambda df: df.dt.year, 14 | 'month': lambda df: df.dt.month, 15 | 'day': lambda df: df.dt.day, 16 | 'hour': lambda df: df.dt.hour, 17 | # 'minute': lambda df: df.dt.minute, 18 | 'dayofweek': lambda df: df.dt.dayofweek, 19 | 'dayofyear': lambda df: df.dt.dayofyear, 20 | }, 21 | } 22 | 23 | @timeit 24 | def compress_df(df, num=True, cat=True): 25 | if num: 26 | num_cols = [c for c in df if c.startswith(CONSTANT.NUMERICAL_PREFIX)] 27 | if len(num_cols) > 0: 28 | df[num_cols] = df[num_cols].astype('float32') 29 | if cat: 30 | cat_cols = [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)] 31 | if len(cat_cols) > 0: 32 | df[cat_cols] = df[cat_cols].astype('category') 33 | 34 | 35 | @timeit 36 | def parallelize_apply(func, df, cols): 37 | num_threads=4 38 | pool = Pool(processes=num_threads) 39 | col_num = int(np.ceil(len(cols) / num_threads)) 40 | res1 = pool.apply_async(func, args=(df,cols[:col_num])) 41 | res2 = pool.apply_async(func, args=(df,cols[col_num:2 * col_num])) 42 | res3 = pool.apply_async(func, args=(df,cols[2 * col_num:3 * col_num])) 43 | res4 = pool.apply_async(func, args=(df,cols[3 * col_num:])) 44 | pool.close() 45 | pool.join() 46 | df = pd.concat([df,res1.get(),res2.get(),res3.get(),res4.get()],axis=1) 47 | return df 48 | 49 | 50 | @timeit 51 | def normal_apply(func, df, cols): 52 | return pd.concat([df, func(df, cols)], axis=1) 53 | 54 | 55 | @timeit 56 | def clean_tables(tables): 57 | for tname in tables: 58 | log(f"cleaning table {tname}") 59 | df = tables[tname] 60 | fillna(df) 61 | num_cols = [c for c in df if c.startswith(CONSTANT.NUMERICAL_PREFIX)] 62 | cat_cols = [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)] 63 | m_cat_cols = [c for c in df if c.startswith(CONSTANT.MULTI_CAT_PREFIX)] 64 | time_cols = [c for c in df if c.startswith(CONSTANT.TIME_PREFIX)] 65 | 66 | if len(cat_cols) > 0: 67 | df = normal_apply(count_cat, df, cat_cols) 68 | if len(m_cat_cols) > 3: 69 | df = parallelize_apply(count_m_cat, df, m_cat_cols) 70 | elif len(m_cat_cols) > 0: 71 | df = normal_apply(count_m_cat, df, m_cat_cols) 72 | if len(time_cols) > 0: 73 | df = normal_apply(transform_datetime, df, time_cols) 74 | # drop columns 75 | df.drop(m_cat_cols+time_cols, axis=1, inplace=True) 76 | 77 | compress_df(df) 78 | tables[tname] = df 79 | 80 | @timeit 81 | def clean_df(df): 82 | compress_df(df, num=False) 83 | df_fillna_with_mean(df) 84 | hash_cat(df) 85 | return df 86 | 87 | 88 | @timeit 89 | def fillna(df): 90 | for c in [c for c in df if c.startswith(CONSTANT.NUMERICAL_PREFIX)]: 91 | df[c].fillna(-1, inplace=True) 92 | for c in [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)]: 93 | df[c].fillna("0", inplace=True) 94 | for c in [c for c in df if c.startswith(CONSTANT.TIME_PREFIX)]: 95 | df[c].fillna(datetime.datetime(1970, 1, 1), inplace=True) 96 | for c in [c for c in df if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]: 97 | df[c].fillna("0", inplace=True) 98 | 99 | 100 | @timeit 101 | def df_fillna_with_mean(df): 102 | for c in [c for c in df if c.startswith(CONSTANT.NUMERICAL_PREFIX)]: 103 | df[c].fillna(df[c].mean(), inplace=True) 104 | for c in [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)]: 105 | if "0" not in df[c].cat.categories: 106 | df[c] = df[c].cat.add_categories(["0"]) 107 | df[c].fillna("0") 108 | for c in [c for c in df if c.startswith(CONSTANT.TIME_PREFIX)]: 109 | mean = pd.to_timedelta(df[c]).mean() + pd.Timestamp(0) 110 | df[c].fillna(mean, inplace=True) 111 | for c in [c for c in df if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]: 112 | df[c].fillna("0", inplace=True) 113 | 114 | 115 | @timeit 116 | def feature_engineer(df, config): 117 | return df 118 | 119 | 120 | def count_cat(df, cat_cols): 121 | prefix_n = CONSTANT.NUMERICAL_PREFIX 122 | prefix_c = CONSTANT.CATEGORY_PREFIX 123 | op = "frequency" 124 | new_df=pd.DataFrame() 125 | for c in cat_cols: 126 | dic = df[c].value_counts().to_dict() 127 | new_df[f"{prefix_n}{op.upper()}({c})"] = df[c].apply(lambda x: dic[x]) 128 | return new_df 129 | 130 | def hash_cat(df): 131 | for c in [c for c in df if c.startswith(CONSTANT.CATEGORY_PREFIX)]: 132 | df[c] = df[c].apply(lambda x: int(x)) 133 | 134 | def frequent_cat(x): 135 | data = x.split(',') 136 | item, freq = np.unique(data, return_counts=True) 137 | return item[np.argmax(freq)] 138 | 139 | def weighted_cat(dic): 140 | def freq(x): 141 | data = x.split(',') 142 | item, freq = np.unique(data, return_counts=True) 143 | global_freq = np.array([dic[i] for i in item]) 144 | return item[np.argmax(global_freq*freq)] 145 | return freq 146 | 147 | def count_m_cat(df,m_cat_cols): 148 | prefix_n = CONSTANT.NUMERICAL_PREFIX 149 | prefix_c = CONSTANT.CATEGORY_PREFIX 150 | op_l = 'length' 151 | op_f = 'frequent_cat' 152 | op_fw = 'frequent_weighted_cat' 153 | new_df=pd.DataFrame() 154 | for c in m_cat_cols: 155 | new_df[f"{prefix_c}{op_f.upper()}RANK(1)({c})"] = df[c].apply(frequent_cat) 156 | new_df[f"{prefix_n}{op_l.upper()}({c})"] = df[c].apply(lambda x: len(x.split(','))) 157 | all_item = ','.join(df[c].values).split(',') 158 | item, freq = np.unique(all_item, return_counts=True) 159 | dic = dict(zip(item, freq)) 160 | new_df[f"{prefix_c}{op_fw.upper()}RANK(1)({c})"] = df[c].apply(weighted_cat(dic)) 161 | return new_df 162 | 163 | 164 | def transform_datetime(df, time_cols): 165 | prefix_n = CONSTANT.NUMERICAL_PREFIX 166 | ops = uni_ops[CONSTANT.TIME_PREFIX] 167 | new_dfs = [] 168 | for c in time_cols: 169 | new_df = df[c].agg(ops.values()) 170 | new_df.columns = [f"{prefix_n}{op.upper()}({c})" for op in ops] 171 | new_dfs += [new_df] 172 | return pd.concat(new_dfs, axis=1) 173 | 174 | 175 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any 4 | import pandas as pd 5 | import numpy as np 6 | 7 | import CONSTANT 8 | 9 | nesting_level = 0 10 | is_start = None 11 | 12 | class Timer: 13 | def __init__(self): 14 | self.start = time.time() 15 | self.history = [self.start] 16 | 17 | def check(self, info): 18 | current = time.time() 19 | log(f"[{info}] spend {current - self.history[-1]:0.2f} sec") 20 | self.history.append(current) 21 | 22 | def timeit(method, start_log=None): 23 | def timed(*args, **kw): 24 | global is_start 25 | global nesting_level 26 | 27 | if not is_start: 28 | print() 29 | 30 | is_start = True 31 | log(f"Start [{method.__name__}]:" + (start_log if start_log else "")) 32 | nesting_level += 1 33 | 34 | start_time = time.time() 35 | result = method(*args, **kw) 36 | end_time = time.time() 37 | 38 | nesting_level -= 1 39 | log(f"End [{method.__name__}]. Time elapsed: {end_time - start_time:0.2f} sec.") 40 | is_start = False 41 | 42 | return result 43 | 44 | return timed 45 | 46 | 47 | def log(entry: Any): 48 | global nesting_level 49 | space = "-" * (4 * nesting_level) 50 | print(f"{space}{entry}") 51 | 52 | def show_dataframe(df): 53 | if len(df) <= 30: 54 | print(f"content=\n" 55 | f"{df}") 56 | else: 57 | print(f"dataframe is too large to show the content, over {len(df)} rows") 58 | 59 | if len(df.dtypes) <= 100: 60 | print(f"types=\n" 61 | f"{df.dtypes}\n") 62 | else: 63 | print(f"dataframe is too wide to show the dtypes, over {len(df.dtypes)} columns") 64 | 65 | 66 | class Config: 67 | def __init__(self, info): 68 | self.data = { 69 | "start_time": time.time(), 70 | **info 71 | } 72 | self.data["tables"] = {} 73 | for tname, ttype in info['tables'].items(): 74 | self.data['tables'][tname] = {} 75 | self.data['tables'][tname]['type'] = ttype 76 | 77 | @staticmethod 78 | def aggregate_op(col): 79 | 80 | def freq_cat(x): 81 | item, freq = np.unique(x, return_counts=True) 82 | return item[np.argmax(freq)] 83 | 84 | def freq_m_cat(x): 85 | data = ','.join(x.values).split(',') 86 | item, freq = np.unique(data, return_counts=True) 87 | return item[np.argmax(freq)] 88 | 89 | def m_cat_concat(x): 90 | return ','.join(x.values) 91 | 92 | ops = { 93 | CONSTANT.NUMERICAL_TYPE: ["sum", "mean"], 94 | CONSTANT.CATEGORY_TYPE: ["count"], 95 | # TIME_TYPE: ["max"], 96 | # CONSTANT.MULTI_CAT_TYPE: [m_cat_concat], 97 | } 98 | if col.startswith(CONSTANT.NUMERICAL_PREFIX): 99 | return ops[CONSTANT.NUMERICAL_TYPE] 100 | if col.startswith(CONSTANT.CATEGORY_PREFIX): 101 | return ops[CONSTANT.CATEGORY_TYPE] 102 | if col.startswith(CONSTANT.MULTI_CAT_PREFIX): 103 | assert False, f"MultiCategory type feature's aggregate op are not supported." 104 | return ops[CONSTANT.MULTI_CAT_TYPE] 105 | if col.startswith(CONSTANT.TIME_PREFIX): 106 | assert False, f"Time type feature's aggregate op are not implemented." 107 | assert False, f"Unknown col type {col}" 108 | 109 | def time_left(self): 110 | return self["time_budget"] - (time.time() - self["start_time"]) 111 | 112 | def __getitem__(self, key): 113 | return self.data[key] 114 | 115 | def __setitem__(self, key, value): 116 | self.data[key] = value 117 | 118 | def __delitem__(self, key): 119 | del self.data[key] 120 | 121 | def __contains__(self, key): 122 | return key in self.data 123 | 124 | def __len__(self): 125 | return len(self.data) 126 | 127 | def __repr__(self): 128 | return repr(self.data) 129 | --------------------------------------------------------------------------------