├── metadata.json
├── preprocess.py
├── feature_selection.py
├── predict.py
├── validate.py
├── README.md
├── utils.py
└── train.py

/metadata.json:
--------------------------------------------------------------------------------
{
    "image": "sberbank/python",
    "entry_points": {
        "train_classification": "python train.py --mode classification --train-csv {train_csv} --model-dir {model_dir}",
        "train_regression": "python train.py --mode regression --train-csv {train_csv} --model-dir {model_dir}",
        "predict": "python predict.py --test-csv {test_csv} --prediction-csv {prediction_csv} --model-dir {model_dir}"
    }
}
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
import numpy as np
from utils import transform_datetime_features
from utils import drop_const_cols, filter_columns, std_scaler
from utils import count_encoding


def preprocess(df, model_config, type='train'):
    """preprocessing and feature engineering for input data"""

    print('preprocess data..')

    # extract datetime features
    df = transform_datetime_features(df)
    print('datetime features extracted')

    # categorical count encoding
    if type == 'train':
        df, categorical_values = count_encoding(df)
        model_config['categorical_values'] = categorical_values
    elif type == 'test':
        df = count_encoding(df, model_config['categorical_values'])
    print('count encoding of categorical features added')

    # drop constant features
    if type == 'train':
        df = drop_const_cols(df)

    # scaling
    # if type == 'train':
    #     df, scaler_mean, scaler_std = std_scaler(df)
    #     model_config['scaler_mean'] = scaler_mean
    #     model_config['scaler_std'] = scaler_std
    # elif type == 'test':
    #     df = std_scaler(df, model_config['scaler_mean'], model_config['scaler_std'])

    # filter columns
    if type == 'train':
        df, used_columns = filter_columns(df, groups=['number', 'count'])
        model_config['used_columns'] = used_columns
    elif type == 'test':
        df_pred = df[['line_id']]
        df = df[model_config['used_columns']]

    # missing values
    df.fillna(-1, inplace=True)

    # convert if the dataframe is too big
    # if model_config['is_big']:
    #     df = df.astype(np.float16)

    if type == 'train':
        return df, model_config
    else:
        return df, df_pred
--------------------------------------------------------------------------------
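
For orientation, a minimal sketch (not part of the baseline) of how preprocess() is wired between the train and predict stages. The toy frame below is made up; only the column-name prefixes (line_id, number_, string_, datetime_) follow the SDSJ data format:

```python
import pandas as pd
from preprocess import preprocess

# toy data standing in for an SDSJ train/test split
df_train = pd.DataFrame({
    'line_id': [1, 2, 3],
    'number_feat': [0.1, 0.2, 0.3],
    'string_city': ['msk', 'msk', 'spb'],
    'datetime_reg': pd.to_datetime(['2018-01-01', '2018-02-01', '2018-03-05']),
})
df_test = df_train.copy()

model_config = {}
# train stage: fits count encodings, drops constants and records the used columns
X_train, model_config = preprocess(df_train, model_config, type='train')
# test stage: reuses the stored encodings/columns and returns line_id for the submission
X_test, df_pred = preprocess(df_test, model_config, type='test')
print(model_config['used_columns'])
```
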
/feature_selection.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import lightgbm as lgb


def lgb_model(params, mode):
    """build a lightgbm regressor or classifier depending on the mode"""

    if mode == 'regression':
        model = lgb.LGBMRegressor(**params)
    else:
        model = lgb.LGBMClassifier(**params)
    return model


def lgb_importance_fs(df, y, mode, BIG_DATASET_SIZE):
    """choose the best features based on lightgbm feature importance"""

    print('lightgbm feature selection..')

    # fraction coefficient used when subsampling, to stay clear of memory errors
    coef = 0.5

    # dataframe size in bytes
    df_size = df.memory_usage(deep=True).sum()

    # take a subset of rows if the dataframe is too big
    subset_size = min(df.shape[0], int(coef * df.shape[0] / (df_size / BIG_DATASET_SIZE)))
    print('subset_size {}'.format(subset_size))
    idx = np.random.choice(df.index, size=subset_size, replace=False)

    # define model
    params = {'n_estimators': 100, 'learning_rate': 0.05, 'num_leaves': 200,
              'subsample': 1, 'colsample_bytree': 1, 'random_state': 42, 'n_jobs': -1}
    model = lgb_model(params, mode)

    # train model
    model.fit(df.loc[idx], y.loc[idx])

    # feature importance
    feature_importance = pd.Series(model.booster_.feature_importance('gain'),
                                   index=df.columns).fillna(0).sort_values(ascending=False)
    # print(feature_importance.head(50))
    # print(feature_importance.tail(10))

    # remove totally unimportant features
    best_features = feature_importance[feature_importance > 0]

    # keep only the most relevant features for a big dataset
    if df_size > BIG_DATASET_SIZE:
        new_feature_count = min(df.shape[1], int(coef * df.shape[1] / (df_size / BIG_DATASET_SIZE)))
        best_features = best_features.head(new_feature_count)

    # select features
    used_columns = best_features.index.tolist()
    df = df[used_columns]

    print('feature selection done')
    print('number of selected features {}'.format(len(used_columns)))

    return df, used_columns
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
import argparse
import os
import pickle
import time
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from preprocess import preprocess

import h2o
h2o.init()

# use this to stop the algorithm before the time limit is exceeded
TIME_LIMIT = int(os.environ.get('TIME_LIMIT', 5*60))

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--test-csv', type=argparse.FileType('r'), required=True)
    parser.add_argument('--prediction-csv', type=argparse.FileType('w'), required=True)
    parser.add_argument('--model-dir', required=True)
    args = parser.parse_args()

    start_time = time.time()

    # load config
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    # read data with the dtypes and datetime columns saved at train time
    # df = pd.read_csv(args.test_csv)
    df = pd.read_csv(args.test_csv, dtype=model_config['dtypes'],
                     parse_dates=model_config['datetime_cols'])
    print('Dataset read, shape {}'.format(df.shape))
    print('time elapsed: {}'.format(time.time()-start_time))

    # preprocessing
    df, df_pred = preprocess(df, model_config, type='test')
    print('time elapsed: {}'.format(time.time()-start_time))

    # final data shape
    print('final df shape {}'.format(df.shape))

    # convert data to h2o format
    print('convert data to h2o format..')
    test = h2o.H2OFrame(df)
    print('time elapsed: {}'.format(time.time()-start_time))

    # make prediction with the saved leader model
    aml = h2o.load_model(model_config['model_path'])
    if model_config['mode'] == 'regression':
        df_pred['prediction'] = aml.predict(test).as_data_frame().squeeze()
    if model_config['mode'] == 'classification':
        df_pred['prediction'] = aml.predict(test)['p1'].as_data_frame().squeeze()

    df_pred[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)

    print('Prediction time: {}'.format(time.time() - start_time))
--------------------------------------------------------------------------------
/validate.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, roc_auc_score
import timeit
import mlflow

datasets = ['check_1_r', 'check_2_r', 'check_3_r', 'check_4_c', 'check_5_c', 'check_6_c', 'check_7_c', 'check_8_c']
# datasets = ['check_8_c']
result_dir = '../../res'
data_dir = '../../data'

mlflow.set_tracking_uri('../../mlruns')
mlflow.set_experiment('h2o')

with mlflow.start_run():

    for i, dataset in enumerate(datasets):

        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        if not os.path.exists('{}/{}'.format(result_dir, dataset)):
            os.mkdir('{}/{}'.format(result_dir, dataset))

        print('\n### Check dataset', dataset, '\n')

        train_time = timeit.default_timer()
        os.system('python train.py --mode {} --train-csv {} --model-dir {}'.format(
            'regression' if dataset[-1] == 'r' else 'classification',
            '{}/{}/train.csv'.format(data_dir, dataset),
            '{}/{}/'.format(result_dir, dataset)
        ))
        train_time = timeit.default_timer() - train_time

        pred_time = timeit.default_timer()
        os.system('python predict.py --prediction-csv {} --test-csv {} --model-dir {}'.format(
            '{}/{}/pred.csv'.format(result_dir, dataset),
            '{}/{}/test.csv'.format(data_dir, dataset),
            '{}/{}/'.format(result_dir, dataset)
        ))
        pred_time = timeit.default_timer() - pred_time

        # join predictions to the ground truth on line_id
        df = pd.read_csv('{}/{}/test-target.csv'.format(data_dir, dataset))
        df_pred = pd.read_csv('{}/{}/pred.csv'.format(result_dir, dataset))
        df = pd.merge(df, df_pred, on='line_id')

        score = roc_auc_score(df.target.values, df.prediction.values) if dataset[-1] == 'c' else \
            np.sqrt(mean_squared_error(df.target.values, df.prediction.values))
        print('Score {:0.5f}'.format(score))

        n = dataset.split('_')[1]
        mlflow.log_metric('score_{}'.format(n), score)
        mlflow.log_metric('train_time_{}'.format(n), train_time)
        mlflow.log_metric('test_time_{}'.format(n), pred_time)

    mlflow.log_artifacts('./')
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Sberbank Data Science Journey 2018: H2O AutoML Baseline

A baseline for the [SDSJ 2018 AutoML](http://sdsj.sberbank.ai/) competition built on H2O AutoML.

H2O AutoML documentation and examples:
- http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
- http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/modeling.html#h2oautoml
- https://github.com/h2oai/h2o-tutorials/blob/master/h2o-world-2017/automl/Python/automl_regression_powerplant_output.ipynb

Within the allotted time limit, H2O AutoML builds a series of models on the data, drawn from:
- GLM - generalized linear models
- GBM - gradient boosting machines
- DRF (Distributed Random Forest) - Random Forest and Extremely Randomized Trees
- Deep Learning

The best model is then used for prediction.
There are two training modes - with and without cross-validation. With cross-validation, a Stacked Ensemble is built on top of the base models, and it is usually the best model.
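
For orientation, a minimal sketch of the H2OAutoML call behind train.py, run on a made-up toy frame; the time budget, model count and data here are placeholders, see train.py for the real values:

```python
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

h2o.init()

# toy frame standing in for the preprocessed training data
df = pd.DataFrame({'number_x': [0.1, 0.4, 0.35, 0.8] * 25,
                   'target':   [0,   1,   0,    1] * 25})
train = h2o.H2OFrame(df)
train['target'] = train['target'].asfactor()   # classification only

# fast mode (what train.py uses): no cross-validation, so no Stacked Ensemble
aml = H2OAutoML(max_runtime_secs=60, max_models=5, nfolds=0, seed=42)
# CV mode: cross-validation adds a Stacked Ensemble, which usually tops the leaderboard
# aml = H2OAutoML(max_runtime_secs=60, max_models=5, nfolds=5, seed=42)

aml.train(y='target', training_frame=train)
print(aml.leaderboard)   # models ranked by the default metric
model = aml.leader       # the best model, used for prediction
```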

As a bonus, validate.py tests the pipeline on local datasets, following the baseline at https://github.com/vlarine/sdsj2018_lightgbm_baseline, with experiment logging added via [mlflow](https://mlflow.org/).
mlflow stores the parameters, results and source code of each experiment and lets you browse and compare runs in a web UI. See
https://mlflow.org/docs/latest/tutorial.html for details.


---

#### Possible improvements

- Choose the mode based on dataset size: run cross-validation with stacking for small datasets and train without cross-validation for large ones.
- The exclude_algos parameter controls which algorithms are used during training. You can, for example, restrict the search to boosting only, so no time is spent on other algorithms (boosting usually wins anyway), or, on the contrary, try algorithms less prone to overfitting - GLM, DRF - on very small datasets.
- By default, H2O splits the training data into train/validation/test (leaderboard) in an 80%/10%/10% ratio. In some tasks 10% for the test split may be too little and lead to overfitting to it; validation_frame and leaderboard_frame (the frame on which models are scored and the best one is picked) can be set manually, as in the sketch at the end of this README.


---

### Update

The current version handles the eighth (largest) dataset.
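
A sketch of the last two suggestions above (restricting algorithms and setting the validation/leaderboard frames by hand); it reuses the toy `train` frame from the earlier sketch, and the split ratios and excluded algorithms are only examples:

```python
from h2o.automl import H2OAutoML

# manual 70/15/15 split instead of the default 80/10/10
train_part, valid_part, lb_part = train.split_frame(ratios=[0.7, 0.15], seed=42)

aml = H2OAutoML(max_runtime_secs=60,
                nfolds=0,
                exclude_algos=['DeepLearning', 'DRF'],   # e.g. spend the budget mostly on boosting
                seed=42)
aml.train(y='target',
          training_frame=train_part,
          validation_frame=valid_part,    # used for early stopping when nfolds=0
          leaderboard_frame=lb_part)      # models are scored and ranked on this frame
print(aml.leaderboard)
```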
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import pandas as pd


def transform_datetime_features(df):
    """extract datetime features"""

    datetime_columns = [
        col_name
        for col_name in df.columns
        if col_name.startswith('datetime')
    ]

    for col_name in datetime_columns:
        if len(datetime_columns) < 10:
            # df[col_name] = pd.to_datetime(df[col_name])
            df['number_weekday_{}'.format(col_name)] = df[col_name].dt.weekday
            df['number_month_{}'.format(col_name)] = df[col_name].dt.month
            df['number_day_{}'.format(col_name)] = df[col_name].dt.day
            df['number_hour_{}'.format(col_name)] = df[col_name].dt.hour
            df['number_hour_of_week_{}'.format(col_name)] = df[col_name].dt.hour + df[col_name].dt.weekday * 24
            df['number_minute_of_day_{}'.format(col_name)] = df[col_name].dt.minute + df[col_name].dt.hour * 60
        else:
            # df[col_name] = pd.to_datetime(df[col_name])
            df['number_weekday_{}'.format(col_name)] = df[col_name].dt.weekday
            df['number_month_{}'.format(col_name)] = df[col_name].dt.month
            df['number_day_{}'.format(col_name)] = df[col_name].dt.day
            df['number_hour_{}'.format(col_name)] = df[col_name].dt.hour

    return df


def drop_const_cols(df):
    """drop constant columns"""

    constant_columns = [
        col_name
        for col_name in df.columns
        if df[col_name].nunique() == 1
    ]
    df.drop(constant_columns, axis=1, inplace=True)

    return df


def count_encoding(df, categorical_values=None):
    """count encoding of categorical features"""

    # train stage
    if categorical_values is None:
        categorical_values = {}
        for col_name in list(df.columns):
            if col_name.startswith('id') or col_name.startswith('string'):
                categorical_values[col_name] = df[col_name].value_counts().to_dict()
                df['count_{}'.format(col_name)] = df[col_name] \
                    .map(lambda x: categorical_values[col_name].get(x, 0))
        return df, categorical_values

    # test stage
    else:
        for col_name in list(df.columns):
            if col_name in categorical_values:
                df['count_{}'.format(col_name)] = df[col_name] \
                    .map(lambda x: categorical_values[col_name].get(x, 0))
        return df


def filter_columns(df, groups=['number']):
    """filter columns to use in the model"""

    used_columns = []
    for gr in groups:
        used_columns += [col_name for col_name in df.columns
                         if col_name.startswith(gr)]
    cols_to_drop = df.columns[~df.columns.isin(used_columns)]
    df.drop(cols_to_drop, axis=1, inplace=True)

    return df, used_columns


def std_scaler(df, scaler_mean=None, scaler_std=None):
    """standard scaler"""

    # train stage
    if scaler_mean is None:

        scaler_mean = {}
        scaler_std = {}
        for col in df.columns:
            mean = df[col].mean()
            std = df[col].std()
            df[col] = (df[col] - mean) / std
            scaler_mean[col] = mean
            scaler_std[col] = std

        return df, scaler_mean, scaler_std

    # test stage
    else:

        for col in df.columns:
            df[col] = (df[col] - scaler_mean[col]) / scaler_std[col]

        return df
--------------------------------------------------------------------------------
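
A quick illustration (not part of the pipeline) of how the count_encoding helper behaves across the train and test stages; the values below are made up, only the string_ prefix matters:

```python
import pandas as pd
from utils import count_encoding

df_tr = pd.DataFrame({'string_city': ['msk', 'msk', 'spb'], 'number_x': [1.0, 2.0, 3.0]})
df_tr, cat_values = count_encoding(df_tr)       # train stage: stores value counts per column
print(df_tr['count_string_city'].tolist())      # [2, 2, 1]

df_te = pd.DataFrame({'string_city': ['spb', 'kzn'], 'number_x': [4.0, 5.0]})
df_te = count_encoding(df_te, cat_values)       # test stage: the unseen 'kzn' maps to 0
print(df_te['count_string_city'].tolist())      # [1, 0]
```
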
/train.py:
--------------------------------------------------------------------------------
import argparse
import os
import pickle
import time
import pandas as pd
import gc

import warnings
warnings.filterwarnings("ignore")

from preprocess import preprocess
from feature_selection import lgb_importance_fs

import h2o
from h2o.automl import H2OAutoML
h2o.init()


# use this to stop the algorithm before the time limit is exceeded
TIME_LIMIT = int(os.environ.get('TIME_LIMIT', 5*60))
BIG_DATASET_SIZE = 300 * 1024 * 1024


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--train-csv', required=True)
    parser.add_argument('--model-dir', required=True)
    parser.add_argument('--mode', choices=['classification', 'regression'], required=True)
    args = parser.parse_args()

    start_time = time.time()

    # read a small sample to infer dtypes and find datetime columns
    df0 = pd.read_csv(args.train_csv, nrows=5000)
    dtypes = df0.dtypes.map(lambda x: 'float32' if x == 'float64' else x).to_dict()
    datetime_cols = df0.columns[df0.columns.str.contains('datetime')].tolist()
    # read the full data with float32 instead of float64, parsing datetime columns
    df = pd.read_csv(args.train_csv, dtype=dtypes, parse_dates=datetime_cols)
    # df = pd.read_csv(args.train_csv)

    y = df.target
    df.drop('target', axis=1, inplace=True)
    is_big = df.memory_usage(deep=True).sum() > BIG_DATASET_SIZE

    print('Dataset read, shape {}'.format(df.shape))
    print('time elapsed: {}'.format(time.time()-start_time))

    # dict with data necessary to make predictions
    model_config = {}
    model_config['is_big'] = is_big
    model_config['mode'] = args.mode
    model_config['dtypes'] = dtypes
    model_config['datetime_cols'] = datetime_cols

    # preprocessing
    df, model_config = preprocess(df, model_config, type='train')
    print('number of features {}'.format(len(model_config['used_columns'])))
    print('time elapsed: {}'.format(time.time()-start_time))

    gc.collect()

    # feature selection
    if is_big or len(model_config['used_columns']) > 500:
        df, used_columns = lgb_importance_fs(df, y, args.mode, BIG_DATASET_SIZE)
        model_config['used_columns'] = used_columns
        print('time elapsed: {}'.format(time.time()-start_time))

    # final data shape
    print('final df shape {}'.format(df.shape))

    gc.collect()

    # convert data to h2o format
    print('convert data to h2o format..')
    df['target'] = y
    train = h2o.H2OFrame(df)
    if args.mode == 'classification':
        train['target'] = train['target'].asfactor()
    print('time elapsed: {}'.format(time.time()-start_time))

    del df
    gc.collect()

    # training
    elapsed = time.time()-start_time
    # main parameters of H2OAutoML:
    # max_runtime_secs - time limit for the run
    # max_models - maximum number of models to build
    # nfolds - number of folds for cross-validation;
    #   with nfolds=0 there is no cross-validation and performance is checked on a
    #   validation set, but then no Stacked Ensemble is built either;
    #   cross-validation with a stacked ensemble scores better, but takes too long
    # exclude_algos - list of algorithms to skip during model building, options:
    #   "GLM", "GBM", "DRF" (Random Forest and ExtraTrees), "DeepLearning" and "StackedEnsemble"
    aml = H2OAutoML(max_runtime_secs=int((TIME_LIMIT-elapsed)*0.9),
                    max_models=50, nfolds=0,
                    exclude_algos=None,
                    seed=42)
    aml.train(y='target', training_frame=train, validation_frame=None)
    print(aml.leaderboard)

    # save the leader model to file
    model_path = h2o.save_model(model=aml.leader,
                                path=os.path.join(args.model_dir, 'aml'), force=True)
    model_config['model_path'] = model_path

    # save config to file
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))
--------------------------------------------------------------------------------
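
Finally, a sketch of running the two entry points from metadata.json by hand on one local check dataset; the data/check_1_r and res/check_1_r paths are placeholders for wherever the SDSJ check data and output directory live (validate.py automates the same loop for all eight datasets and adds mlflow logging):

```python
import os
import subprocess

os.makedirs('res/check_1_r', exist_ok=True)   # the model dir must exist before training
# os.environ['TIME_LIMIT'] = '600'            # optional: raise the time budget from the default 5 minutes

subprocess.run(['python', 'train.py',
                '--mode', 'regression',
                '--train-csv', 'data/check_1_r/train.csv',
                '--model-dir', 'res/check_1_r/'], check=True)

subprocess.run(['python', 'predict.py',
                '--test-csv', 'data/check_1_r/test.csv',
                '--prediction-csv', 'res/check_1_r/pred.csv',
                '--model-dir', 'res/check_1_r/'], check=True)
```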