├── metadata.json
├── preprocess.py
├── feature_selection.py
├── predict.py
├── validate.py
├── README.md
├── utils.py
└── train.py

/metadata.json:
--------------------------------------------------------------------------------
{
    "image": "sberbank/python",
    "entry_points": {
        "train_classification": "python train.py --mode classification --train-csv {train_csv} --model-dir {model_dir}",
        "train_regression": "python train.py --mode regression --train-csv {train_csv} --model-dir {model_dir}",
        "predict": "python predict.py --test-csv {test_csv} --prediction-csv {prediction_csv} --model-dir {model_dir}"
    }
}
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
import numpy as np
from utils import transform_datetime_features
from utils import drop_const_cols, filter_columns, std_scaler
from utils import count_encoding


def preprocess(df, model_config, type='train'):
    """preprocessing and feature engineering for input data"""

    print('preprocess data..')

    # extract datetime features
    df = transform_datetime_features(df)
    print('datetime features extracted')

    # categorical count encoding
    if type == 'train':
        df, categorical_values = count_encoding(df)
        model_config['categorical_values'] = categorical_values
    elif type == 'test':
        df = count_encoding(df, model_config['categorical_values'])
    print('count encoding of categorical features added')

    # drop constant features
    if type == 'train':
        df = drop_const_cols(df)

    # scaling
    # if type == 'train':
    #     df, scaler_mean, scaler_std = std_scaler(df)
    #     model_config['scaler_mean'] = scaler_mean
    #     model_config['scaler_std'] = scaler_std
    # elif type == 'test':
    #     df = std_scaler(df, model_config['scaler_mean'], model_config['scaler_std'])

    # filter columns
    if type == 'train':
        df, used_columns = filter_columns(df, groups=['number', 'count'])
        model_config['used_columns'] = used_columns
    elif type == 'test':
        df_pred = df[['line_id']]
        df = df[model_config['used_columns']]

    # missing values
    df.fillna(-1, inplace=True)

    # convert if the dataframe is too big
    # if model_config['is_big']:
    #     df = df.astype(np.float16)

    if type == 'train':
        return df, model_config
    else:
        return df, df_pred
--------------------------------------------------------------------------------
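
For orientation, a minimal sketch (not part of the baseline) of how preprocess() is wired between the train and predict stages. The toy frame below is made up; only the column-name prefixes (line_id, number_, string_, datetime_) follow the SDSJ data format:

```python
import pandas as pd
from preprocess import preprocess

# toy data standing in for an SDSJ train/test split
df_train = pd.DataFrame({
    'line_id': [1, 2, 3],
    'number_feat': [0.1, 0.2, 0.3],
    'string_city': ['msk', 'msk', 'spb'],
    'datetime_reg': pd.to_datetime(['2018-01-01', '2018-02-01', '2018-03-05']),
})
df_test = df_train.copy()

model_config = {}
# train stage: fits count encodings, drops constants and records the used columns
X_train, model_config = preprocess(df_train, model_config, type='train')
# test stage: reuses the stored encodings/columns and returns line_id for the submission
X_test, df_pred = preprocess(df_test, model_config, type='test')
print(model_config['used_columns'])
```
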
/feature_selection.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import lightgbm as lgb


def lgb_model(params, mode):
    """build a lightgbm regressor or classifier depending on the mode"""

    if mode == 'regression':
        model = lgb.LGBMRegressor(**params)
    else:
        model = lgb.LGBMClassifier(**params)
    return model


def lgb_importance_fs(df, y, mode, BIG_DATASET_SIZE):
    """choose the best features based on lightgbm feature importance"""

    print('lightgbm feature selection..')

    # fraction coefficient used when subsampling, to stay clear of memory errors
    coef = 0.5

    # dataframe size in bytes
    df_size = df.memory_usage(deep=True).sum()

    # take a subset of rows if the dataframe is too big
    subset_size = min(df.shape[0], int(coef * df.shape[0] / (df_size / BIG_DATASET_SIZE)))
    print('subset_size {}'.format(subset_size))
    idx = np.random.choice(df.index, size=subset_size, replace=False)

    # define model
    params = {'n_estimators': 100, 'learning_rate': 0.05, 'num_leaves': 200,
              'subsample': 1, 'colsample_bytree': 1, 'random_state': 42, 'n_jobs': -1}
    model = lgb_model(params, mode)

    # train model
    model.fit(df.loc[idx], y.loc[idx])

    # feature importance
    feature_importance = pd.Series(model.booster_.feature_importance('gain'),
                                   index=df.columns).fillna(0).sort_values(ascending=False)
    # print(feature_importance.head(50))
    # print(feature_importance.tail(10))

    # remove totally unimportant features
    best_features = feature_importance[feature_importance > 0]

    # keep only the most relevant features for a big dataset
    if df_size > BIG_DATASET_SIZE:
        new_feature_count = min(df.shape[1], int(coef * df.shape[1] / (df_size / BIG_DATASET_SIZE)))
        best_features = best_features.head(new_feature_count)

    # select features
    used_columns = best_features.index.tolist()
    df = df[used_columns]

    print('feature selection done')
    print('number of selected features {}'.format(len(used_columns)))

    return df, used_columns
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
import argparse
import os
import pickle
import time
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from preprocess import preprocess

import h2o
h2o.init()

# use this to stop the algorithm before the time limit is exceeded
TIME_LIMIT = int(os.environ.get('TIME_LIMIT', 5*60))

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--test-csv', type=argparse.FileType('r'), required=True)
    parser.add_argument('--prediction-csv', type=argparse.FileType('w'), required=True)
    parser.add_argument('--model-dir', required=True)
    args = parser.parse_args()

    start_time = time.time()

    # load config
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    # read data with the dtypes and datetime columns saved at train time
    # df = pd.read_csv(args.test_csv)
    df = pd.read_csv(args.test_csv, dtype=model_config['dtypes'],
                     parse_dates=model_config['datetime_cols'])
    print('Dataset read, shape {}'.format(df.shape))
    print('time elapsed: {}'.format(time.time()-start_time))

    # preprocessing
    df, df_pred = preprocess(df, model_config, type='test')
    print('time elapsed: {}'.format(time.time()-start_time))

    # final data shape
    print('final df shape {}'.format(df.shape))

    # convert data to h2o format
    print('convert data to h2o format..')
    test = h2o.H2OFrame(df)
    print('time elapsed: {}'.format(time.time()-start_time))

    # make prediction with the saved leader model
    aml = h2o.load_model(model_config['model_path'])
    if model_config['mode'] == 'regression':
        df_pred['prediction'] = aml.predict(test).as_data_frame().squeeze()
    if model_config['mode'] == 'classification':
        df_pred['prediction'] = aml.predict(test)['p1'].as_data_frame().squeeze()

    df_pred[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)

    print('Prediction time: {}'.format(time.time() - start_time))
--------------------------------------------------------------------------------
/validate.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, roc_auc_score
import timeit
import mlflow

datasets = ['check_1_r', 'check_2_r', 'check_3_r', 'check_4_c', 'check_5_c', 'check_6_c', 'check_7_c', 'check_8_c']
# datasets = ['check_8_c']
result_dir = '../../res'
data_dir = '../../data'

mlflow.set_tracking_uri('../../mlruns')
mlflow.set_experiment('h2o')

with mlflow.start_run():

    for i, dataset in enumerate(datasets):

        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        if not os.path.exists('{}/{}'.format(result_dir, dataset)):
            os.mkdir('{}/{}'.format(result_dir, dataset))

        print('\n### Check dataset', dataset, '\n')

        train_time = timeit.default_timer()
        os.system('python train.py --mode {} --train-csv {} --model-dir {}'.format(
            'regression' if dataset[-1] == 'r' else 'classification',
            '{}/{}/train.csv'.format(data_dir, dataset),
            '{}/{}/'.format(result_dir, dataset)
        ))
        train_time = timeit.default_timer() - train_time

        pred_time = timeit.default_timer()
        os.system('python predict.py --prediction-csv {} --test-csv {} --model-dir {}'.format(
            '{}/{}/pred.csv'.format(result_dir, dataset),
            '{}/{}/test.csv'.format(data_dir, dataset),
            '{}/{}/'.format(result_dir, dataset)
        ))
        pred_time = timeit.default_timer() - pred_time

        # join predictions to the ground truth on line_id
        df = pd.read_csv('{}/{}/test-target.csv'.format(data_dir, dataset))
        df_pred = pd.read_csv('{}/{}/pred.csv'.format(result_dir, dataset))
        df = pd.merge(df, df_pred, on='line_id')

        score = roc_auc_score(df.target.values, df.prediction.values) if dataset[-1] == 'c' else \
            np.sqrt(mean_squared_error(df.target.values, df.prediction.values))
        print('Score {:0.5f}'.format(score))

        n = dataset.split('_')[1]
        mlflow.log_metric('score_{}'.format(n), score)
        mlflow.log_metric('train_time_{}'.format(n), train_time)
        mlflow.log_metric('test_time_{}'.format(n), pred_time)

    mlflow.log_artifacts('./')
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Sberbank Data Science Journey 2018: H2O AutoML Baseline

A baseline for the [SDSJ 2018 AutoML](http://sdsj.sberbank.ai/) competition built on H2O AutoML.

H2O AutoML documentation and examples:
- http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
- http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/modeling.html#h2oautoml
- https://github.com/h2oai/h2o-tutorials/blob/master/h2o-world-2017/automl/Python/automl_regression_powerplant_output.ipynb

Within the allotted time limit, H2O AutoML builds a series of models on the data, drawn from:
- GLM - generalized linear models
- GBM - gradient boosting machines
- DRF (Distributed Random Forest) - Random Forest and Extremely Randomized Trees
- Deep Learning

The best model is then used for prediction.
There are two training modes - with and without cross-validation. With cross-validation, a Stacked Ensemble is built on top of the base models, and it is usually the best model.
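
For orientation, a minimal sketch of the H2OAutoML call behind train.py, run on a made-up toy frame; the time budget, model count and data here are placeholders, see train.py for the real values:

```python
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

h2o.init()

# toy frame standing in for the preprocessed training data
df = pd.DataFrame({'number_x': [0.1, 0.4, 0.35, 0.8] * 25,
                   'target':   [0,   1,   0,    1] * 25})
train = h2o.H2OFrame(df)
train['target'] = train['target'].asfactor()   # classification only

# fast mode (what train.py uses): no cross-validation, so no Stacked Ensemble
aml = H2OAutoML(max_runtime_secs=60, max_models=5, nfolds=0, seed=42)
# CV mode: cross-validation adds a Stacked Ensemble, which usually tops the leaderboard
# aml = H2OAutoML(max_runtime_secs=60, max_models=5, nfolds=5, seed=42)

aml.train(y='target', training_frame=train)
print(aml.leaderboard)   # models ranked by the default metric
model = aml.leader       # the best model, used for prediction
```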

As a bonus, validate.py tests the pipeline on local datasets, following the baseline at https://github.com/vlarine/sdsj2018_lightgbm_baseline, with experiment logging added via [mlflow](https://mlflow.org/).
mlflow stores the parameters, results and source code of each experiment and lets you browse and compare runs in a web UI. See
https://mlflow.org/docs/latest/tutorial.html for details.


---

#### Possible improvements

- Choose the mode based on dataset size: run cross-validation with stacking for small datasets and train without cross-validation for large ones.
- The exclude_algos parameter controls which algorithms are used during training. You can, for example, restrict the search to boosting only, so no time is spent on other algorithms (boosting usually wins anyway), or, on the contrary, try algorithms less prone to overfitting - GLM, DRF - on very small datasets.
- By default, H2O splits the training data into train/validation/test (leaderboard) in an 80%/10%/10% ratio. In some tasks 10% for the test split may be too little and lead to overfitting to it; validation_frame and leaderboard_frame (the frame on which models are scored and the best one is picked) can be set manually, as in the sketch at the end of this README.


---

### Update

The current version handles the eighth (largest) dataset.
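
A sketch of the last two suggestions above (restricting algorithms and setting the validation/leaderboard frames by hand); it reuses the toy `train` frame from the earlier sketch, and the split ratios and excluded algorithms are only examples:

```python
from h2o.automl import H2OAutoML

# manual 70/15/15 split instead of the default 80/10/10
train_part, valid_part, lb_part = train.split_frame(ratios=[0.7, 0.15], seed=42)

aml = H2OAutoML(max_runtime_secs=60,
                nfolds=0,
                exclude_algos=['DeepLearning', 'DRF'],   # e.g. spend the budget mostly on boosting
                seed=42)
aml.train(y='target',
          training_frame=train_part,
          validation_frame=valid_part,    # used for early stopping when nfolds=0
          leaderboard_frame=lb_part)      # models are scored and ranked on this frame
print(aml.leaderboard)
```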
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import pandas as pd


def transform_datetime_features(df):
    """extract datetime features"""

    datetime_columns = [
        col_name
        for col_name in df.columns
        if col_name.startswith('datetime')
    ]

    for col_name in datetime_columns:
        if len(datetime_columns) < 10:
            # df[col_name] = pd.to_datetime(df[col_name])
            df['number_weekday_{}'.format(col_name)] = df[col_name].dt.weekday
            df['number_month_{}'.format(col_name)] = df[col_name].dt.month
            df['number_day_{}'.format(col_name)] = df[col_name].dt.day
            df['number_hour_{}'.format(col_name)] = df[col_name].dt.hour
            df['number_hour_of_week_{}'.format(col_name)] = df[col_name].dt.hour + df[col_name].dt.weekday * 24
            df['number_minute_of_day_{}'.format(col_name)] = df[col_name].dt.minute + df[col_name].dt.hour * 60
        else:
            # df[col_name] = pd.to_datetime(df[col_name])
            df['number_weekday_{}'.format(col_name)] = df[col_name].dt.weekday
            df['number_month_{}'.format(col_name)] = df[col_name].dt.month
            df['number_day_{}'.format(col_name)] = df[col_name].dt.day
            df['number_hour_{}'.format(col_name)] = df[col_name].dt.hour

    return df


def drop_const_cols(df):
    """drop constant columns"""

    constant_columns = [
        col_name
        for col_name in df.columns
        if df[col_name].nunique() == 1
    ]
    df.drop(constant_columns, axis=1, inplace=True)

    return df


def count_encoding(df, categorical_values=None):
    """count encoding of categorical features"""

    # train stage
    if categorical_values is None:
        categorical_values = {}
        for col_name in list(df.columns):
            if col_name.startswith('id') or col_name.startswith('string'):
                categorical_values[col_name] = df[col_name].value_counts().to_dict()
                df['count_{}'.format(col_name)] = df[col_name] \
                    .map(lambda x: categorical_values[col_name].get(x, 0))
        return df, categorical_values

    # test stage
    else:
        for col_name in list(df.columns):
            if col_name in categorical_values:
                df['count_{}'.format(col_name)] = df[col_name] \
                    .map(lambda x: categorical_values[col_name].get(x, 0))
        return df


def filter_columns(df, groups=['number']):
    """filter columns to use in the model"""

    used_columns = []
    for gr in groups:
        used_columns += [col_name for col_name in df.columns
                         if col_name.startswith(gr)]
    cols_to_drop = df.columns[~df.columns.isin(used_columns)]
    df.drop(cols_to_drop, axis=1, inplace=True)

    return df, used_columns


def std_scaler(df, scaler_mean=None, scaler_std=None):
    """standard scaler"""

    # train stage
    if scaler_mean is None:

        scaler_mean = {}
        scaler_std = {}
        for col in df.columns:
            mean = df[col].mean()
            std = df[col].std()
            df[col] = (df[col] - mean) / std
            scaler_mean[col] = mean
            scaler_std[col] = std

        return df, scaler_mean, scaler_std

    # test stage
    else:

        for col in df.columns:
            df[col] = (df[col] - scaler_mean[col]) / scaler_std[col]

        return df
--------------------------------------------------------------------------------
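
A quick illustration (not part of the pipeline) of how the count_encoding helper behaves across the train and test stages; the values below are made up, only the string_ prefix matters:

```python
import pandas as pd
from utils import count_encoding

df_tr = pd.DataFrame({'string_city': ['msk', 'msk', 'spb'], 'number_x': [1.0, 2.0, 3.0]})
df_tr, cat_values = count_encoding(df_tr)       # train stage: stores value counts per column
print(df_tr['count_string_city'].tolist())      # [2, 2, 1]

df_te = pd.DataFrame({'string_city': ['spb', 'kzn'], 'number_x': [4.0, 5.0]})
df_te = count_encoding(df_te, cat_values)       # test stage: the unseen 'kzn' maps to 0
print(df_te['count_string_city'].tolist())      # [1, 0]
```
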
/train.py:
--------------------------------------------------------------------------------
import argparse
import os
import pickle
import time
import pandas as pd
import gc

import warnings
warnings.filterwarnings("ignore")

from preprocess import preprocess
from feature_selection import lgb_importance_fs

import h2o
from h2o.automl import H2OAutoML
h2o.init()


# use this to stop the algorithm before the time limit is exceeded
TIME_LIMIT = int(os.environ.get('TIME_LIMIT', 5*60))
BIG_DATASET_SIZE = 300 * 1024 * 1024


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--train-csv', required=True)
    parser.add_argument('--model-dir', required=True)
    parser.add_argument('--mode', choices=['classification', 'regression'], required=True)
    args = parser.parse_args()

    start_time = time.time()

    # read a small sample to infer dtypes and find datetime columns
    df0 = pd.read_csv(args.train_csv, nrows=5000)
    dtypes = df0.dtypes.map(lambda x: 'float32' if x == 'float64' else x).to_dict()
    datetime_cols = df0.columns[df0.columns.str.contains('datetime')].tolist()
    # read the full data with float32 instead of float64, parsing datetime columns
    df = pd.read_csv(args.train_csv, dtype=dtypes, parse_dates=datetime_cols)
    # df = pd.read_csv(args.train_csv)

    y = df.target
    df.drop('target', axis=1, inplace=True)
    is_big = df.memory_usage(deep=True).sum() > BIG_DATASET_SIZE

    print('Dataset read, shape {}'.format(df.shape))
    print('time elapsed: {}'.format(time.time()-start_time))

    # dict with data necessary to make predictions
    model_config = {}
    model_config['is_big'] = is_big
    model_config['mode'] = args.mode
    model_config['dtypes'] = dtypes
    model_config['datetime_cols'] = datetime_cols

    # preprocessing
    df, model_config = preprocess(df, model_config, type='train')
    print('number of features {}'.format(len(model_config['used_columns'])))
    print('time elapsed: {}'.format(time.time()-start_time))

    gc.collect()

    # feature selection
    if is_big or len(model_config['used_columns']) > 500:
        df, used_columns = lgb_importance_fs(df, y, args.mode, BIG_DATASET_SIZE)
        model_config['used_columns'] = used_columns
        print('time elapsed: {}'.format(time.time()-start_time))

    # final data shape
    print('final df shape {}'.format(df.shape))

    gc.collect()

    # convert data to h2o format
    print('convert data to h2o format..')
    df['target'] = y
    train = h2o.H2OFrame(df)
    if args.mode == 'classification':
        train['target'] = train['target'].asfactor()
    print('time elapsed: {}'.format(time.time()-start_time))

    del df
    gc.collect()

    # training
    elapsed = time.time()-start_time
    # main parameters of H2OAutoML:
    # max_runtime_secs - time limit for the run
    # max_models - maximum number of models to build
    # nfolds - number of folds for cross-validation;
    #   with nfolds=0 there is no cross-validation and performance is checked on a
    #   validation set, but then no Stacked Ensemble is built either;
    #   cross-validation with a stacked ensemble scores better, but takes too long
    # exclude_algos - list of algorithms to skip during model building, options:
    #   "GLM", "GBM", "DRF" (Random Forest and ExtraTrees), "DeepLearning" and "StackedEnsemble"
    aml = H2OAutoML(max_runtime_secs=int((TIME_LIMIT-elapsed)*0.9),
                    max_models=50, nfolds=0,
                    exclude_algos=None,
                    seed=42)
    aml.train(y='target', training_frame=train, validation_frame=None)
    print(aml.leaderboard)

    # save the leader model to file
    model_path = h2o.save_model(model=aml.leader,
                                path=os.path.join(args.model_dir, 'aml'), force=True)
    model_config['model_path'] = model_path

    # save config to file
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))
--------------------------------------------------------------------------------
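
Finally, a sketch of running the two entry points from metadata.json by hand on one local check dataset; the data/check_1_r and res/check_1_r paths are placeholders for wherever the SDSJ check data and output directory live (validate.py automates the same loop for all eight datasets and adds mlflow logging):

```python
import os
import subprocess

os.makedirs('res/check_1_r', exist_ok=True)   # the model dir must exist before training
# os.environ['TIME_LIMIT'] = '600'            # optional: raise the time budget from the default 5 minutes

subprocess.run(['python', 'train.py',
                '--mode', 'regression',
                '--train-csv', 'data/check_1_r/train.csv',
                '--model-dir', 'res/check_1_r/'], check=True)

subprocess.run(['python', 'predict.py',
                '--test-csv', 'data/check_1_r/test.csv',
                '--prediction-csv', 'res/check_1_r/pred.csv',
                '--model-dir', 'res/check_1_r/'], check=True)
```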