├── data └── .gitkeep ├── models └── .gitkeep ├── predictions └── .gitkeep ├── submissions └── .gitkeep ├── .dockerignore ├── .gitignore ├── scores └── scores.csv ├── metadata.json ├── main.py ├── lib ├── features.py ├── read.py ├── automl.py ├── model_other.py ├── util.py ├── model.py └── preprocess.py ├── score.py ├── Dockerfile ├── Makefile └── README.md /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /predictions/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /submissions/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data 2 | models 3 | predictions 4 | submissions 5 | scores 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | .ipynb_checkpoints 4 | __pycache__ 5 | data 6 | models 7 | predictions 8 | submissions 9 | scores 10 | -------------------------------------------------------------------------------- /scores/scores.csv: -------------------------------------------------------------------------------- 1 | ,dataset,score,time 2 | 0,1_r,11.699119328400574,7.91574501991272 3 | 1,2_r,1.2025001668490136,51.469507694244385 4 | 2,3_r,13258.658787686656,74.08733296394348 5 | 3,4_c,0.9998686888554368,150.34715294837952 6 | 4,5_c,0.7892876687105762,292.5374937057495 7 | 5,6_c,0.6603385075889981,141.35956001281738 8 | 6,7_c,0.8206793915381698,519.9199469089508 9 | 7,8_c,0.8826053456934709,969.1855833530426 10 | -------------------------------------------------------------------------------- /metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "{image}", 3 | "entry_points": { 4 | "train_classification": "python3 main.py --mode classification --train-csv {train_csv} --model-dir {model_dir}", 5 | "train_regression": "python3 main.py --mode regression --train-csv {train_csv} --model-dir {model_dir}", 6 | "predict": "python3 main.py --test-csv {test_csv} --prediction-csv {prediction_csv} --model-dir {model_dir}" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from lib.util import Log 3 | from lib.automl import AutoML 4 | 5 | 6 | @Log.timeit 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--mode', choices=['classification', 'regression']) 10 | parser.add_argument('--model-dir') 11 | parser.add_argument('--train-csv') 12 | parser.add_argument('--test-csv') 13 | parser.add_argument('--prediction-csv') 14 | args = parser.parse_args() 15 | 16 | automl = AutoML(args.model_dir) 17 | 18 | if args.train_csv is not None: 19 | automl.train(args.train_csv, args.mode) 20 | automl.save() 21 | elif args.test_csv is not None: 22 | 
automl.load() 23 | automl.predict(args.test_csv, args.prediction_csv) 24 | else: 25 | exit(1) 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /lib/features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import lightgbm as lgb 3 | from boruta import BorutaPy 4 | from typing import List, Optional 5 | 6 | 7 | class LGBMFeatureEstimator(): 8 | def __init__(self, params, n_estimators: int=50): 9 | self.params = params 10 | self.n_estimators = n_estimators 11 | 12 | def get_params(self): 13 | return self.params 14 | 15 | def set_params(self, n_estimators:Optional[int]=None, random_state:Optional[int]=None): 16 | if n_estimators is not None: 17 | self.n_estimators = n_estimators 18 | 19 | def fit(self, X: pd.DataFrame, y: pd.Series): 20 | train_data = lgb.Dataset(X, label=y) 21 | model = lgb.train(self.params, train_data, self.n_estimators) 22 | self.feature_importances_ = model.feature_importance(importance_type="gain") 23 | 24 | 25 | def select_features(X: pd.DataFrame, y: pd.Series, mode: str, n_estimators: int=50, max_iter: int=50, perc: int=75) -> List[str]: 26 | feat_estimator = LGBMFeatureEstimator({ 27 | "objective": "regression" if mode == "regression" else "binary", 28 | "metric": "rmse" if mode == "regression" else "auc", 29 | "learning_rate": 0.01, 30 | "verbosity": -1, 31 | "seed": 1, 32 | "max_depth": 7, 33 | "min_data_in_leaf": 3, 34 | }, n_estimators) 35 | 36 | feat_selector = BorutaPy(feat_estimator, n_estimators=n_estimators, max_iter=max_iter, verbose=2, random_state=1, perc=perc) 37 | 38 | try: 39 | feat_selector.fit(X.values, y.values.ravel()) 40 | except: 41 | pass 42 | 43 | return X.columns[feat_selector.support_].tolist() 44 | -------------------------------------------------------------------------------- /score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import time 4 | from lib.automl import AutoML 5 | from lib.util import Log 6 | 7 | DATASETS = [ 8 | ("1", "regression", 300), 9 | ("2", "regression", 300), 10 | ("3", "regression", 300), 11 | ("4", "classification", 300), 12 | ("5", "classification", 300), 13 | ("6", "classification", 600), 14 | ("7", "classification", 1800), 15 | ("8", "classification", 1800), 16 | ] 17 | 18 | 19 | @Log.timeit 20 | def validate_dataset(alias: str, mode: str, train_limit: int) -> np.float64: 21 | Log.print(alias) 22 | 23 | automl = AutoML("models/check_{}".format(alias)) 24 | 25 | automl.config["time_limit"] = train_limit 26 | automl.train("data/check_{}/train.csv".format(alias), mode) 27 | 28 | automl.config["time_limit"] = 300 29 | automl.config["start_time"] = time.time() 30 | _, score = automl.predict("data/check_{}/test.csv".format(alias), "predictions/check_{}.csv".format(alias)) 31 | 32 | return score 33 | 34 | 35 | if __name__ == '__main__': 36 | scores = { 37 | "dataset": [], 38 | "score": [], 39 | "time": [], 40 | } 41 | 42 | for i, mode, train_limit in DATASETS: 43 | alias = "{}_{}".format(i, mode[0]) 44 | 45 | start_time = time.time() 46 | score = validate_dataset(alias, mode, train_limit) 47 | end_time = time.time() 48 | 49 | scores["dataset"].append(alias) 50 | scores["score"].append(score) 51 | scores["time"].append(end_time - start_time) 52 | 53 | scores = pd.DataFrame(scores) 54 | scores.to_csv("scores/{}.csv".format(int(time.time()))) 55 | Log.print(scores, nesting=False) 56 
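For orientation, here is roughly how the pieces above and below fit together in a local run. This is an illustrative sketch: the commands are the Makefile targets defined further down, dataset 1 is used only as an example, and the `check_1_r` paths follow the naming convention the Makefile derives from the dataset number (datasets 1-3 are regression, hence the `_r` suffix). Inside the container, `make train` and `make predict` execute the same `python3 main.py` invocations that the `metadata.json` entry points describe.

```sh
# Build the image and download the public check datasets into data/
make docker-build
make download

# Train on dataset 1 and produce predictions; inside the container this runs e.g.
#   python3 main.py --mode regression --train-csv data/check_1_r/train.csv --model-dir models/check_1_r
#   python3 main.py --test-csv data/check_1_r/test.csv --prediction-csv predictions/check_1_r.csv --model-dir models/check_1_r
make train DATASET=1
make predict DATASET=1

# Validate on all eight check datasets (score.py above) and write a timestamped CSV into scores/
make score
```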
| -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | ENV LANG=C.UTF-8 4 | 5 | # Common packages 6 | RUN apt-get update && \ 7 | apt-get install -y --no-install-recommends \ 8 | software-properties-common \ 9 | build-essential \ 10 | vim \ 11 | wget \ 12 | curl \ 13 | git \ 14 | zip \ 15 | unzip && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | # Python 3.6 19 | RUN add-apt-repository -y ppa:deadsnakes/ppa && \ 20 | apt-get update && \ 21 | apt-get install -y --no-install-recommends \ 22 | python3-pip \ 23 | python3-setuptools \ 24 | python3.6 \ 25 | python3.6-dev \ 26 | python3.6-venv && \ 27 | pip3 install --no-cache-dir --upgrade pip && \ 28 | rm -rf /var/lib/apt/lists/* 29 | 30 | # Vowpal Wabbit 31 | ENV PATH="/opt/vowpal_wabbit/utl:${PATH}" \ 32 | CPLUS_INCLUDE_PATH=/usr/lib/jvm/java-8-openjdk-amd64/include/linux:/usr/lib/jvm/java-1.8.0-openjdk-amd64/include 33 | RUN apt-get update && \ 34 | apt-get install -y --no-install-recommends \ 35 | libboost-python-dev\ 36 | libboost-program-options-dev \ 37 | zlib1g-dev \ 38 | openjdk-8-jdk && \ 39 | rm -rf /var/lib/apt/lists/* &&\ 40 | cd opt && git clone git://github.com/JohnLangford/vowpal_wabbit.git && cd vowpal_wabbit && make && make install && \ 41 | cd python && python3 setup.py install 42 | 43 | # H2O AutoML 44 | RUN mkdir /tmp/h2o && cd /tmp/h2o && \ 45 | wget http://h2o-release.s3.amazonaws.com/h2o/rel-wright/9/h2o-3.20.0.9.zip && \ 46 | unzip -j h2o-3.20.0.9.zip && \ 47 | pip3 install h2o-3.20.0.9-py2.py3-none-any.whl && \ 48 | rm -rf /tmp/h2o 49 | 50 | # Python packages 51 | RUN pip3 install --no-cache-dir --upgrade \ 52 | pandas \ 53 | jupyter \ 54 | lightgbm \ 55 | catboost \ 56 | xgboost \ 57 | hyperopt \ 58 | Boruta \ 59 | category_encoders \ 60 | memory_profiler 61 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE=tyz910/sdsj2018 2 | DOWNLOAD_URL=https://s3.eu-central-1.amazonaws.com/sdsj2018-automl/public/sdsj2018_automl_check_datasets.zip 3 | 4 | ifeq ($(OS), Windows_NT) 5 | DOCKER_BUILD=docker build -t ${IMAGE} . 6 | else 7 | DOCKER_BUILD=docker build -t ${IMAGE} . 
&& (docker ps -q -f status=exited | xargs docker rm) && (docker images -qf dangling=true | xargs docker rmi) && docker images 8 | endif 9 | 10 | ifeq ($(DATASET),) 11 | DATASET=1 12 | endif 13 | 14 | DATASET_MODE=_c 15 | ifeq ($(DATASET), $(filter $(DATASET), 1 2 3)) 16 | DATASET_MODE=_r 17 | endif 18 | 19 | DATASET_NAME=${DATASET}${DATASET_MODE} 20 | 21 | ifeq ($(DATASET_MODE), _r) 22 | TRAIN_MODE=regression 23 | else 24 | TRAIN_MODE=classification 25 | endif 26 | 27 | TRAIN_CSV=data/check_${DATASET_NAME}/train.csv 28 | TEST_CSV=data/check_${DATASET_NAME}/test.csv 29 | PREDICTIONS_CSV=predictions/check_${DATASET_NAME}.csv 30 | MODEL_DIR=models/check_${DATASET_NAME} 31 | 32 | DOCKER_RUN=docker run --rm -it -v ${CURDIR}:/app -w /app ${IMAGE} 33 | 34 | download: 35 | ${DOCKER_RUN} /bin/bash -c "test -f ${TRAIN_CSV} || (cd data && curl ${DOWNLOAD_URL} > data.zip && unzip data.zip && rm data.zip)" 36 | 37 | train: 38 | ${DOCKER_RUN} python3 main.py --mode ${TRAIN_MODE} --train-csv ${TRAIN_CSV} --model-dir ${MODEL_DIR} 39 | 40 | predict: 41 | ${DOCKER_RUN} python3 main.py --test-csv ${TEST_CSV} --prediction-csv ${PREDICTIONS_CSV} --model-dir ${MODEL_DIR} 42 | 43 | score: 44 | ${DOCKER_RUN} python3 score.py 45 | 46 | docker-build: 47 | ${DOCKER_BUILD} 48 | 49 | docker-push: 50 | docker push ${IMAGE} 51 | 52 | run-bash: 53 | ${DOCKER_RUN} /bin/bash 54 | 55 | run-jupyter: 56 | docker run --rm -it -v ${CURDIR}:/app -w /app -p 8888:8888 ${IMAGE} jupyter notebook --ip=0.0.0.0 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password='' 57 | 58 | submission: 59 | ${DOCKER_RUN} /bin/bash -c "sed -i.bak 's~{image}~${IMAGE}~g' metadata.json && zip -9 -r submissions/submission_`date '+%Y%m%d_%H%M%S'`.zip main.py lib/*.py metadata.json && mv metadata.json.bak metadata.json" 60 | -------------------------------------------------------------------------------- /lib/read.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from lib.util import Log, Config 3 | 4 | 5 | @Log.timeit 6 | def read_df(csv_path: str, config: Config) -> pd.DataFrame: 7 | if "dtype" not in config: 8 | preview_df(csv_path, config) 9 | 10 | df = pandas_read_csv(csv_path, config) 11 | 12 | if "sort_values" in config: 13 | df.sort_values(config["sort_values"], inplace=True) 14 | 15 | return df 16 | 17 | 18 | @Log.timeit 19 | def pandas_read_csv(csv_path: str, config: Config) -> pd.DataFrame: 20 | return pd.read_csv(csv_path, encoding="utf-8", low_memory=False, dtype=config["dtype"], parse_dates=config["parse_dates"]) 21 | 22 | 23 | @Log.timeit 24 | def preview_df(train_csv: str, config: Config, nrows: int=3000): 25 | num_rows = sum(1 for line in open(train_csv)) - 1 26 | Log.print("Rows in train: {}".format(num_rows)) 27 | 28 | df = pd.read_csv(train_csv, encoding="utf-8", low_memory=False, nrows=nrows) 29 | mem_per_row = df.memory_usage(deep=True).sum() / nrows 30 | Log.print("Memory per row: {:0.2f} Kb".format(mem_per_row / 1024)) 31 | 32 | df_size = (num_rows * mem_per_row) / 1024 / 1024 33 | Log.print("Approximate dataset size: {:0.2f} Mb".format(df_size)) 34 | 35 | config["parse_dates"] = [] 36 | config["dtype"] = { 37 | "line_id": int, 38 | } 39 | 40 | counters = { 41 | "id": 0, 42 | "number": 0, 43 | "string": 0, 44 | "datetime": 0, 45 | } 46 | 47 | for c in df: 48 | if c.startswith("number_"): 49 | counters["number"] += 1 50 | elif c.startswith("string_"): 51 | counters["string"] += 1 52 | config["dtype"][c] = str 53 | elif c.startswith("datetime_"): 54 | 
counters["datetime"] += 1 55 | config["dtype"][c] = str 56 | config["parse_dates"].append(c) 57 | elif c.startswith("id_"): 58 | counters["id"] += 1 59 | 60 | Log.print("Number columns: {}".format(counters["number"])) 61 | Log.print("String columns: {}".format(counters["string"])) 62 | Log.print("Datetime columns: {}".format(counters["datetime"])) 63 | 64 | config["counters"] = counters 65 | -------------------------------------------------------------------------------- /lib/automl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from lib.util import Config 5 | from lib.read import read_df 6 | from lib.preprocess import preprocess 7 | from lib.model import train, predict, validate 8 | from typing import Optional 9 | 10 | 11 | class AutoML: 12 | def __init__(self, model_dir: str): 13 | os.makedirs(model_dir, exist_ok=True) 14 | self.config = Config(model_dir) 15 | 16 | def train(self, train_csv: str, mode: str): 17 | self.config["task"] = "train" 18 | self.config["mode"] = mode 19 | self.config.tmp_dir = self.config.model_dir + "/tmp" 20 | os.makedirs(self.config.tmp_dir, exist_ok=True) 21 | 22 | df = read_df(train_csv, self.config) 23 | preprocess(df, self.config) 24 | 25 | y = df["target"] 26 | X = df.drop("target", axis=1) 27 | train(X, y, self.config) 28 | 29 | def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]): 30 | self.config["task"] = "predict" 31 | self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp" 32 | os.makedirs(self.config.tmp_dir, exist_ok=True) 33 | 34 | df = read_df(test_csv, self.config) 35 | result = { 36 | "line_id": list(df["line_id"]), 37 | "prediction": [], 38 | } 39 | 40 | def chunker(seq, size): 41 | return (seq[pos:pos+size] for pos in range(0, len(seq), size)) 42 | 43 | for chunk in chunker(df, 100000): 44 | X = chunk.copy() 45 | preprocess(X, self.config) 46 | result["prediction"] += list(predict(X, self.config)) 47 | 48 | result = pd.DataFrame(result) 49 | result.sort_values("line_id", inplace=True) 50 | result.to_csv(prediction_csv, index=False) 51 | 52 | target_csv = test_csv.replace("test", "test-target") 53 | if os.path.exists(target_csv): 54 | score = validate(result, target_csv, self.config["mode"]) 55 | else: 56 | score = None 57 | 58 | return result, score 59 | 60 | def save(self): 61 | self.config.save() 62 | 63 | def load(self): 64 | self.config.load() 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sberbank Data Science Journey 2018: Docker-friendly baseline 2 | 3 | Для работы необходим `make` и `docker`. Перед началом работы нужно скачать датасет в папку `data`. 4 | 5 | ## Особенности решения 6 | 7 | * LightGBM с подбором гиперпараметров через hyperopt. 8 | * Mean target Encoding для категориальных фич. 9 | * Для 8-го датасета отбор фич через [BorutaPy](https://github.com/scikit-learn-contrib/boruta_py). 10 | * Лик от [bagxi](https://github.com/bagxi/sdsj2018_lightgbm_baseline). 11 | 12 | Так же есть, но не используются: Vowpal Wabbit, H2O AutoML. 13 | Скор на ЛБ: `5,30072`. 14 | 15 | ## Make-команды для работы с Docker :whale: 16 | 17 | `make download` - cкачать датасет в папку data. 18 | `make train DATASET=1` - обучение модели на датасете с указанным номером [1-8]. 19 | `make predict DATASET=1` - валидация модели на датасете с указанным номером [1-8]. 
20 | `make score` - validate the model on all datasets and save the results to the scores folder. 21 | `make docker-build` - build the Docker image. 22 | `make docker-push` - push the Docker image to Docker Hub. 23 | `make run-bash` - start a terminal inside the Docker container. 24 | `make run-jupyter` - start Jupyter inside the Docker container at http://localhost:8888. 25 | `make submission` - create a submission file in the submissions directory. 26 | 27 | ## Building your own Docker image 28 | 29 | 1. Sign up on [Docker Hub](https://hub.docker.com/). 30 | 2. Edit the [Makefile](https://github.com/tyz910/sdsj2018/blob/master/Makefile) and set the image name on the first line: `IMAGE=username/image`. 31 | 3. Edit the [Dockerfile](https://github.com/tyz910/sdsj2018/blob/master/Dockerfile) and add the installation of any packages you need. 32 | 4. Build the image: `make docker-build`. 33 | 5. Push the Docker image to Docker Hub: `make docker-push`. 34 | 6. Make sure the created repository is public (Public), not private (Private). Visibility is configured at `https://hub.docker.com/r/username/image/~/settings/`. 35 | 36 | ## Running on Windows 37 | 38 | 1. Install [Docker](https://download.docker.com/win/stable/Docker%20for%20Windows%20Installer.exe). 39 | 2. Install [Make](http://gnuwin32.sourceforge.net/downlinks/make.php) and add it to [PATH](https://ru.stackoverflow.com/questions/153628/Как-добавить-путь-в-переменную-окружения-path-на-windows). 40 | 3. Start Docker. In its settings, allocate enough RAM to Docker. 41 | 4. Run PowerShell as administrator. When running the make commands, allow Docker to mount the working directory. 42 | -------------------------------------------------------------------------------- /lib/model_other.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import pandas as pd 3 | import numpy as np 4 | import h2o 5 | from h2o.automl import H2OAutoML 6 | from vowpalwabbit.sklearn_vw import tovw 7 | from sklearn.linear_model import LogisticRegression, Ridge 8 | from lib.util import Log, Config 9 | from typing import List 10 | timeit = Log.timeit  # lib.util only exposes the decorator as Log.timeit 11 | 12 | @timeit 13 | def train_vw(X: pd.DataFrame, y: pd.Series, config: Config): 14 | cache_file = config.tmp_dir + "/.vw_cache" 15 | data_file = config.tmp_dir + "/vw_data_train.csv" 16 | 17 | cmd = " ".join([ 18 | "rm -f {cache} && vw", 19 | "-f {f}", 20 | "--cache_file {cache}", 21 | "--passes {passes}", 22 | "-l {l}", 23 | "--early_terminate {early_terminate}", 24 | "{df}" 25 | ]).format( 26 | cache=cache_file, 27 | df=data_file, 28 | f=config.model_dir + "/vw.model", 29 | passes=max(20, int(1000000/len(X))), 30 | l=25, 31 | early_terminate=1, 32 | ) 33 | 34 | if config["mode"] == "classification": 35 | cmd += " --loss_function logistic --link logistic" 36 | y = y.replace({0: -1}) 37 | 38 | save_to_vw(data_file, X, y) 39 | subprocess.Popen(cmd, shell=True).communicate() 40 | 41 | 42 | @timeit 43 | def predict_vw(X: pd.DataFrame, config: Config) -> List: 44 | preds_file = config.tmp_dir + "/.vw_preds" 45 | data_file = config.tmp_dir + "/vw_data_test.csv" 46 | save_to_vw(data_file, X) 47 | 48 | subprocess.Popen("vw -i {i} -p {p} {df}".format( 49 | df=data_file, 50 | i=config.model_dir + "/vw.model", 51 | p=preds_file 52 | ), shell=True).communicate() 53 | 54 | return [np.float64(line) for line in open(preds_file, "r")] 55 | 56 | 57 | @timeit 58 | def save_to_vw(filepath: str, X: pd.DataFrame, y: pd.Series=None,
chunk_size=1000): 59 | with open(filepath, "w+") as f: 60 | for pos in range(0, len(X), chunk_size): 61 | chunk_X = X.iloc[pos:pos + chunk_size, :] 62 | chunk_y = y.iloc[pos:pos + chunk_size] if y is not None else None 63 | for row in tovw(chunk_X, chunk_y): 64 | f.write(row + "\n") 65 | 66 | 67 | @timeit 68 | def train_lm(X: pd.DataFrame, y: pd.Series, config: Config): 69 | if config["mode"] == "regression": 70 | model = Ridge() 71 | else: 72 | model = LogisticRegression(solver="liblinear") 73 | 74 | config["model_lm"] = model.fit(X, y) 75 | 76 | 77 | @timeit 78 | def predict_lm(X: pd.DataFrame, config: Config) -> List: 79 | if config["mode"] == "regression": 80 | return config["model_lm"].predict(X) 81 | else: 82 | return config["model_lm"].predict_proba(X)[:, 1] 83 | 84 | 85 | @timeit 86 | def train_h2o(X: pd.DataFrame, y: pd.Series, config: Config): 87 | h2o.init() 88 | 89 | X["target"] = y 90 | train = h2o.H2OFrame(X) 91 | train_x = train.columns 92 | train_y = "target" 93 | train_x.remove(train_y) 94 | 95 | if config["mode"] == "classification": 96 | train[train_y] = train[train_y].asfactor() 97 | 98 | aml = H2OAutoML(max_runtime_secs=60) 99 | aml.train(x=train_x, y=train_y, training_frame=train) 100 | 101 | config["model_h2o"] = h2o.save_model(model=aml.leader, path=config.model_dir + "/h2o.model", force=True) 102 | print(aml.leaderboard) 103 | 104 | X.drop("target", axis=1, inplace=True) 105 | 106 | 107 | @timeit 108 | def predict_h2o(X: pd.DataFrame, config: Config) -> List: 109 | h2o.init() 110 | model = h2o.load_model(config["model_h2o"]) 111 | 112 | return model.predict(h2o.H2OFrame(X)).as_data_frame()["predict"].tolist() 113 | -------------------------------------------------------------------------------- /lib/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pickle 4 | import signal 5 | from contextlib import contextmanager 6 | from typing import Any 7 | 8 | 9 | class Log: 10 | nesting_level = 0 11 | is_silent = False 12 | is_method_start = None 13 | 14 | @staticmethod 15 | def silent(silent: bool): 16 | Log.is_silent = silent 17 | 18 | @staticmethod 19 | def print(entry: Any="", nesting: bool=True): 20 | if Log.is_silent: 21 | return 22 | 23 | space = "." * (4 * Log.nesting_level) if nesting else "" 24 | print("{}{}".format(space, entry)) 25 | 26 | @staticmethod 27 | def nest(n: int): 28 | Log.nesting_level += n 29 | 30 | @staticmethod 31 | def timeit(method): 32 | def timed(*args, **kw): 33 | if not Log.is_method_start: 34 | Log.print(nesting=False) 35 | 36 | Log.is_method_start = True 37 | Log.print("Start {}.".format(method.__name__)) 38 | Log.nest(1) 39 | 40 | start_time = time.time() 41 | result = method(*args, **kw) 42 | end_time = time.time() 43 | 44 | Log.nest(-1) 45 | Log.print("End {}. 
Time: {:0.2f} sec.".format(method.__name__, end_time - start_time)) 46 | Log.is_method_start = False 47 | 48 | return result 49 | 50 | return timed 51 | 52 | 53 | class Config: 54 | def __init__(self, model_dir: str): 55 | self.model_dir = model_dir 56 | self.tmp_dir = model_dir 57 | self.current_time_limit = 0 58 | self.current_time_limit_start = 0 59 | self.data = { 60 | "start_time": time.time(), 61 | "time_limit": int(os.environ.get("TIME_LIMIT", 5 * 60)), 62 | } 63 | 64 | def is_train(self) -> bool: 65 | return self["task"] == "train" 66 | 67 | def is_predict(self) -> bool: 68 | return self["task"] == "predict" 69 | 70 | def is_regression(self) -> bool: 71 | return self["mode"] == "regression" 72 | 73 | def is_classification(self) -> bool: 74 | return self["mode"] == "classification" 75 | 76 | def limit_time_fraction(self, fraction: float=0.1): 77 | self.current_time_limit = int(self["time_limit"] * fraction) 78 | self.current_time_limit_start = self.time_left() 79 | 80 | def is_time_fraction_limit(self) -> bool: 81 | return self.current_time_limit_start - self.time_left() >= self.current_time_limit 82 | 83 | def time_left(self) -> float: 84 | return self["time_limit"] - (time.time() - self["start_time"]) 85 | 86 | def save(self): 87 | with open(os.path.join(self.model_dir, "config.pkl"), "wb") as f: 88 | pickle.dump(self.data, f, protocol=pickle.HIGHEST_PROTOCOL) 89 | 90 | def load(self): 91 | with open(os.path.join(self.model_dir, "config.pkl"), "rb") as f: 92 | data = pickle.load(f) 93 | 94 | self.data = {**data, **self.data} 95 | 96 | def __getitem__(self, key): 97 | return self.data[key] 98 | 99 | def __setitem__(self, key, value): 100 | self.data[key] = value 101 | 102 | def __delitem__(self, key): 103 | del self.data[key] 104 | 105 | def __contains__(self, key): 106 | return key in self.data 107 | 108 | def __len__(self): 109 | return len(self.data) 110 | 111 | def __repr__(self): 112 | return repr(self.data) 113 | 114 | 115 | class TimeoutException(Exception): pass 116 | 117 | 118 | @contextmanager 119 | def time_limit(seconds): 120 | seconds = int(seconds) 121 | def signal_handler(signum, frame): 122 | raise TimeoutException("Timed out!") 123 | signal.signal(signal.SIGALRM, signal_handler) 124 | signal.alarm(seconds) 125 | try: 126 | yield 127 | finally: 128 | signal.alarm(0) 129 | -------------------------------------------------------------------------------- /lib/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import lightgbm as lgb 4 | import hyperopt 5 | from hyperopt import hp, tpe, STATUS_OK, space_eval, Trials 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import mean_squared_error, roc_auc_score 8 | from lib.util import Log, Config, time_limit, TimeoutException 9 | from typing import List, Dict 10 | 11 | 12 | @Log.timeit 13 | def train(X: pd.DataFrame, y: pd.Series, config: Config): 14 | train_lightgbm(X, y, config) 15 | 16 | 17 | @Log.timeit 18 | def predict(X: pd.DataFrame, config: Config) -> List: 19 | preds = predict_lightgbm(X, config) 20 | 21 | if config["non_negative_target"]: 22 | preds = [max(0, p) for p in preds] 23 | 24 | return preds 25 | 26 | @Log.timeit 27 | def validate(preds: pd.DataFrame, target_csv: str, mode: str) -> np.float64: 28 | df = pd.merge(preds, pd.read_csv(target_csv), on="line_id", left_index=True) 29 | score = roc_auc_score(df.target.values, df.prediction.values) if mode == "classification" else \ 30 | 
np.sqrt(mean_squared_error(df.target.values, df.prediction.values)) 31 | Log.print("Score: {:0.4f}".format(score)) 32 | return score 33 | 34 | 35 | @Log.timeit 36 | def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config): 37 | params = { 38 | "objective": "regression" if config.is_regression() else "binary", 39 | "metric": "rmse" if config.is_regression() else "auc", 40 | "verbosity": -1, 41 | "seed": 1, 42 | } 43 | 44 | X_sample, y_sample = data_sample(X, y, config, nrows=20000) 45 | hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config) 46 | 47 | X_train, X_val, y_train, y_val = data_split(X, y, config) 48 | 49 | config["model"] = lgb.train( 50 | {**params, **hyperparams}, 51 | lgb.Dataset(X_train, label=y_train), 52 | 5000, 53 | lgb.Dataset(X_val, label=y_val), 54 | early_stopping_rounds=100, 55 | verbose_eval=100, 56 | ) 57 | config.save() 58 | 59 | try: 60 | with time_limit(config.time_left() - 10): 61 | config["model"] = lgb.train( 62 | {**params, **hyperparams}, 63 | lgb.Dataset(X, label=y), 64 | int(1.2 * config["model"].best_iteration), 65 | ) 66 | except TimeoutException: 67 | Log.print("Timed out!") 68 | 69 | 70 | @Log.timeit 71 | def predict_lightgbm(X: pd.DataFrame, config: Config) -> List: 72 | return config["model"].predict(X) 73 | 74 | 75 | @Log.timeit 76 | def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config): 77 | X_train, X_val, y_train, y_val = data_split(X, y, config, test_size=0.5) 78 | train_data = lgb.Dataset(X_train, label=y_train) 79 | valid_data = lgb.Dataset(X_val, label=y_val) 80 | 81 | space = { 82 | "learning_rate": hp.choice("learning_rate", np.arange(0.01, 0.05, 0.01)), 83 | "boost_from_average": hp.choice("boost_from_average", [True, False]), 84 | "is_unbalance": hp.choice("is_unbalance", [True, False]), 85 | "zero_as_missing": hp.choice("zero_as_missing", [True, False]), 86 | "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 7]), 87 | "num_leaves": hp.choice("num_leaves", [11, 31, 51, 101, 151, 201]), 88 | "feature_fraction": hp.choice("feature_fraction", np.arange(0.5, 1.0, 0.1)), 89 | "bagging_fraction": hp.choice("bagging_fraction", np.arange(0.5, 1.0, 0.1)), 90 | "bagging_freq": hp.choice("bagging_freq", [1, 3, 5, 10, 20, 50]), 91 | "reg_alpha": hp.uniform("reg_alpha", 0, 10), 92 | "reg_lambda": hp.uniform("reg_lambda", 0, 10), 93 | "min_child_weight": hp.uniform("min_child_weight", 0, 10), 94 | } 95 | 96 | config.limit_time_fraction(0.15) 97 | 98 | def objective(hyperparams): 99 | if config.is_time_fraction_limit(): 100 | score = np.inf if config.is_regression() else 0 101 | return {'loss': score, 'status': STATUS_OK} 102 | 103 | model = lgb.train({**params, **hyperparams}, train_data, 300, valid_data, 104 | early_stopping_rounds=100, verbose_eval=False) 105 | 106 | score = model.best_score["valid_0"][params["metric"]] 107 | Log.print(score) 108 | if config.is_classification(): 109 | score = -score 110 | 111 | return {'loss': score, 'status': STATUS_OK} 112 | 113 | trials = Trials() 114 | best = hyperopt.fmin(fn=objective, space=space, trials=trials, algo=tpe.suggest, max_evals=100, verbose=1, 115 | rstate= np.random.RandomState(1)) 116 | 117 | hyperparams = space_eval(space, best) 118 | Log.print("{:0.4f} {}".format(trials.best_trial['result']['loss'], hyperparams)) 119 | return hyperparams 120 | 121 | 122 | def ts_split(X: pd.DataFrame, y: pd.Series, test_size: float) -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series): 123 | test_len = int(len(X) * test_size) 124 | return X[:-test_len], 
X[-test_len:], y[:-test_len], y[-test_len:] 125 | 126 | 127 | def data_split(X: pd.DataFrame, y: pd.Series, config: Config, test_size: float=0.2) -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series): 128 | if "sort_values" in config: 129 | return ts_split(X, y, test_size=test_size) 130 | else: 131 | return train_test_split(X, y, test_size=test_size, random_state=1) 132 | 133 | 134 | def data_sample(X: pd.DataFrame, y: pd.Series, config: Config, nrows: int=10000) -> (pd.DataFrame, pd.Series): 135 | if len(X) > nrows: 136 | if "sort_values" in config: 137 | X_sample = X.iloc[:nrows] 138 | y_sample = y.iloc[:nrows] 139 | else: 140 | X_sample = X.sample(nrows, random_state=1) 141 | y_sample = y[X_sample.index] 142 | else: 143 | X_sample = X 144 | y_sample = y 145 | 146 | return X_sample, y_sample 147 | -------------------------------------------------------------------------------- /lib/preprocess.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import datetime 3 | import warnings 4 | import numpy as np 5 | import pandas as pd 6 | import lightgbm as lgb 7 | from sklearn.exceptions import DataConversionWarning 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.model_selection import train_test_split 10 | from lib.features import select_features 11 | from lib.util import Log, Config 12 | from typing import Optional, List 13 | 14 | 15 | @Log.timeit 16 | def preprocess(df: pd.DataFrame, config: Config): 17 | non_negative_target_detect(df, config) 18 | drop_columns(df, config) 19 | fillna(df, config) 20 | to_int8(df, config) 21 | 22 | time_series_detect(df, config) 23 | feature_selection(df, config) 24 | 25 | subsample(df, config, max_size_mb=2 * 1024) 26 | transform(df, config) 27 | subsample(df, config, max_size_mb=2 * 1024) 28 | 29 | 30 | @Log.timeit 31 | def transform(df: pd.DataFrame, config: Config): 32 | transform_datetime(df, config) 33 | transform_categorical(df, config) 34 | scale(df, config) 35 | 36 | 37 | @Log.timeit 38 | def drop_columns(df: pd.DataFrame, config: Config): 39 | df.drop([c for c in ["is_test", "line_id"] if c in df], axis=1, inplace=True) 40 | drop_constant_columns(df, config) 41 | 42 | 43 | @Log.timeit 44 | def fillna(df: pd.DataFrame, config: Config): 45 | for c in [c for c in df if c.startswith("number_")]: 46 | df[c].fillna(-1, inplace=True) 47 | 48 | for c in [c for c in df if c.startswith("string_")]: 49 | df[c].fillna("", inplace=True) 50 | 51 | for c in [c for c in df if c.startswith("datetime_")]: 52 | df[c].fillna(datetime.datetime(1970, 1, 1), inplace=True) 53 | 54 | 55 | @Log.timeit 56 | def drop_constant_columns(df: pd.DataFrame, config: Config): 57 | if "constant_columns" not in config: 58 | config["constant_columns"] = [c for c in df if c.startswith("number_") and not (df[c] != df[c].iloc[0]).any()] 59 | Log.print("Constant columns: {}".format(config["constant_columns"])) 60 | 61 | if len(config["constant_columns"]) > 0: 62 | df.drop(config["constant_columns"], axis=1, inplace=True) 63 | 64 | 65 | @Log.timeit 66 | def transform_datetime(df: pd.DataFrame, config: Config): 67 | date_parts = ["year", "weekday", "month", "day", "hour"] 68 | 69 | if "date_columns" not in config: 70 | config["date_columns"] = {} 71 | 72 | for c in [c for c in df if c.startswith("datetime_")]: 73 | config["date_columns"][c] = [] 74 | for part in date_parts: 75 | part_col = c + "_" + part 76 | df[part_col] = getattr(df[c].dt, part).astype(np.uint16 if part == "year" else np.uint8).values 77 | 78 | if not 
(df[part_col] != df[part_col].iloc[0]).any(): 79 | Log.print(part_col + " is constant") 80 | df.drop(part_col, axis=1, inplace=True) 81 | else: 82 | config["date_columns"][c].append(part) 83 | 84 | df.drop(c, axis=1, inplace=True) 85 | else: 86 | for c, parts in config["date_columns"].items(): 87 | for part in parts: 88 | part_col = c + "_" + part 89 | df[part_col] = getattr(df[c].dt, part) 90 | df.drop(c, axis=1, inplace=True) 91 | 92 | 93 | @Log.timeit 94 | def transform_categorical(df: pd.DataFrame, config: Config): 95 | if "categorical_columns" not in config: 96 | config["categorical_columns"] = [] 97 | 98 | # https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features 99 | prior = config["categorical_prior"] = df["target"].mean() 100 | min_samples_leaf = 10 101 | smoothing = 5 102 | 103 | config["categorical_columns_string"] = {} 104 | for c in [c for c in df if c.startswith("string_")]: 105 | Log.print(c) 106 | config["categorical_columns"].append(c) 107 | 108 | averages = df[[c, "target"]].groupby(c)["target"].agg(["mean", "count"]) 109 | smooth = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing)) 110 | averages["target"] = prior * (1 - smooth) + averages["mean"] * smooth 111 | config["categorical_columns_string"][c] = averages["target"].to_dict() 112 | 113 | config["categorical_columns_id"] = {} 114 | for c in [c for c in df if c.startswith("id_")]: 115 | Log.print(c) 116 | config["categorical_columns"].append(c) 117 | 118 | if df[c].dtype == str or df[c].dtype == object: 119 | config["categorical_columns_id"][c] = {v: i for i, v in enumerate(df[c].unique())} 120 | 121 | for c, values in config["categorical_columns_string"].items(): 122 | df.loc[:, c] = df[c].apply(lambda x: values[x] if x in values else config["categorical_prior"]) 123 | 124 | for c, values in config["categorical_columns_id"].items(): 125 | df.loc[:, c] = df[c].apply(lambda x: values[x] if x in values else -1) 126 | 127 | 128 | @Log.timeit 129 | def scale(df: pd.DataFrame, config: Config): 130 | warnings.filterwarnings(action='ignore', category=DataConversionWarning) 131 | scale_columns = [c for c in df if c.startswith("number_") and df[c].dtype != np.int8 and c not in config["categorical_columns"]] 132 | 133 | if len(scale_columns) > 0: 134 | if "scaler" not in config: 135 | config["scaler"] = StandardScaler(copy=False) 136 | config["scaler"].fit(df[scale_columns]) 137 | 138 | df[scale_columns] = config["scaler"].transform(df[scale_columns]) 139 | 140 | 141 | @Log.timeit 142 | def to_int8(df: pd.DataFrame, config: Config): 143 | if "int8_columns" not in config: 144 | config["int8_columns"] = [] 145 | vals = [-1, 0, 1] 146 | 147 | for c in [c for c in df if c.startswith("number_")]: 148 | if (~df[c].isin(vals)).any(): 149 | continue 150 | config["int8_columns"].append(c) 151 | 152 | Log.print("Num columns: {}".format(len(config["int8_columns"]))) 153 | 154 | if len(config["int8_columns"]) > 0: 155 | df.loc[:, config["int8_columns"]] = df.loc[:, config["int8_columns"]].astype(np.int8) 156 | 157 | 158 | @Log.timeit 159 | def subsample(df: pd.DataFrame, config: Config, max_size_mb: float=2.0): 160 | if config.is_train(): 161 | df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024 162 | if df_size_mb > max_size_mb: 163 | mem_per_row = df_size_mb / len(df) 164 | sample_rows = int(max_size_mb / mem_per_row) 165 | 166 | Log.print("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. 
Subsample to {} rows.".format(df_size_mb, len(df), sample_rows)) 167 | _, df_drop = train_test_split(df, train_size=sample_rows, random_state=1) 168 | df.drop(df_drop.index, inplace=True) 169 | 170 | config["nrows"] = sample_rows 171 | else: 172 | config["nrows"] = len(df) 173 | 174 | 175 | def shift_columns(df: pd.DataFrame, group: Optional[str]=None, number_columns: Optional[List[str]]=None): 176 | if number_columns is None: 177 | number_columns = [c for c in df if c.startswith("number_")] 178 | shift_columns = [c + "_shift" for c in number_columns] 179 | 180 | if group is not None: 181 | shifted = df.groupby([group])[number_columns].shift(-1) 182 | else: 183 | shifted = df[number_columns].shift(-1) 184 | 185 | df[shift_columns] = shifted.fillna(-1) 186 | 187 | 188 | @Log.timeit 189 | def time_series_detect(df: pd.DataFrame, config: Config): 190 | sample_size = 10000 191 | model_params = { 192 | "objective": "regression" if config["mode"] == "regression" else "binary", 193 | "metric": "rmse" if config["mode"] == "regression" else "auc", 194 | "learning_rate": 0.01, 195 | "verbosity": -1, 196 | "seed": 1, 197 | "max_depth": -1, 198 | } 199 | 200 | if config.is_train(): 201 | datetime_columns = [c for c in df if c.startswith("datetime_")] 202 | id_columns = [c for c in df if c.startswith("id_")] 203 | 204 | sort_columns = [] 205 | for dc in datetime_columns: 206 | sort_columns.append([dc]) 207 | for ic in id_columns: 208 | sort_columns.append([ic, dc]) 209 | else: 210 | for ic in id_columns: 211 | sort_columns.append([ic]) 212 | 213 | scores = [] 214 | config.limit_time_fraction(0.1) 215 | for sc in sort_columns: 216 | if config.is_time_fraction_limit(): 217 | break 218 | 219 | Log.silent(True) 220 | df.sort_values(sc, inplace=True) 221 | 222 | config_sample = copy.deepcopy(config) 223 | df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy() 224 | df_sample = df_sample[[c for c in df_sample if c.startswith("number_") or c == "target" or c in sc]] 225 | shift_columns(df_sample, group= sc[0] if len(sc) > 1 else None) 226 | transform(df_sample, config_sample) 227 | 228 | y = df_sample["target"] 229 | X = df_sample.drop("target", axis=1) 230 | X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5) 231 | 232 | model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train), 3000, lgb.Dataset(X_test, label=y_test), 233 | early_stopping_rounds=100, verbose_eval=False) 234 | score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]] 235 | 236 | sampled_columns = [c for c in X if "_shift" not in c] 237 | model_sampled = lgb.train(model_params, lgb.Dataset(X_train[sampled_columns], label=y_train), 3000, lgb.Dataset(X_test[sampled_columns], label=y_test), 238 | early_stopping_rounds=100, verbose_eval=False) 239 | score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]] 240 | 241 | if config.is_classification(): 242 | score_sorted = -score_sorted 243 | score_sampled = -score_sampled 244 | 245 | Log.silent(False) 246 | Log.print("Sort: {}. Score sorted: {:0.4f}. 
Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled)) 247 | score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled) 248 | if score_ratio >= 1.03: 249 | Log.print(score_ratio) 250 | scores.append((score_sorted, sc)) 251 | 252 | if len(scores) > 0: 253 | scores = sorted(scores, key=lambda x: x[0]) 254 | Log.print("Scores: {}".format(scores)) 255 | config["sort_values"] = scores[0][1] 256 | df.sort_values(config["sort_values"], inplace=True) 257 | 258 | config_sample = copy.deepcopy(config) 259 | df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy() 260 | shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None) 261 | transform(df_sample, config_sample) 262 | 263 | y = df_sample["target"] 264 | X = df_sample.drop("target", axis=1) 265 | 266 | model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000) 267 | fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns) 268 | fi = fi[fi > 0].sort_values() 269 | selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist() 270 | 271 | selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c] 272 | if len(selected_shift_columns) > 0: 273 | Log.print("Shift columns: {}".format(selected_shift_columns)) 274 | config["shift_columns"] = selected_shift_columns 275 | 276 | if "shift_columns" in config: 277 | shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None, number_columns=config["shift_columns"]) 278 | 279 | 280 | @Log.timeit 281 | def feature_selection(df: pd.DataFrame, config: Config): 282 | if config.is_train(): 283 | df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024 284 | if df_size_mb < 2 * 1024: 285 | return 286 | 287 | selected_columns = [] 288 | config_sample = copy.deepcopy(config) 289 | config.limit_time_fraction(0.1) 290 | for i in range(20): 291 | if config.is_time_fraction_limit(): 292 | break 293 | 294 | df_sample = df.sample(min(3000, len(df)), random_state=i).copy() 295 | transform(df_sample, config_sample) 296 | y = df_sample["target"] 297 | X = df_sample.drop("target", axis=1) 298 | 299 | if len(selected_columns) > 0: 300 | X = X.drop(selected_columns, axis=1) 301 | 302 | if len(X.columns) > 0: 303 | selected_columns += select_features(X, y, config["mode"]) 304 | else: 305 | break 306 | 307 | Log.print("Selected columns: {}".format(selected_columns)) 308 | 309 | drop_number_columns = [c for c in df if c.startswith("number_") and c not in selected_columns] 310 | if len(drop_number_columns) > 0: 311 | config["drop_number_columns"] = drop_number_columns 312 | 313 | config["date_columns"] = {} 314 | for c in [c for c in selected_columns if c.startswith("datetime_")]: 315 | d = c.split("_") 316 | date_col = d[0] + "_" + d[1] 317 | date_part = d[2] 318 | 319 | if date_col not in config["date_columns"]: 320 | config["date_columns"][date_col] = [] 321 | 322 | config["date_columns"][date_col].append(date_part) 323 | 324 | drop_datetime_columns = [c for c in df if c.startswith("datetime_") and c not in config["date_columns"]] 325 | if len(drop_datetime_columns) > 0: 326 | config["drop_datetime_columns"] = drop_datetime_columns 327 | 328 | if "drop_number_columns" in config: 329 | Log.print("Drop number columns: {}".format(config["drop_number_columns"])) 330 | df.drop(config["drop_number_columns"], axis=1, inplace=True) 331 | 332 | if "drop_datetime_columns" in config: 333 | Log.print("Drop datetime 
columns: {}".format(config["drop_datetime_columns"])) 334 | df.drop(config["drop_datetime_columns"], axis=1, inplace=True) 335 | 336 | 337 | @Log.timeit 338 | def non_negative_target_detect(df: pd.DataFrame, config: Config): 339 | if config.is_train(): 340 | config["non_negative_target"] = df["target"].lt(0).sum() == 0 341 | 342 | 343 | def ts_split(X: pd.DataFrame, y: pd.Series, test_size: float) -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series): 344 | test_len = int(len(X) * test_size) 345 | return X[:-test_len], X[-test_len:], y[:-test_len], y[-test_len:] 346 | --------------------------------------------------------------------------------