├── data └── .gitkeep ├── models └── .gitkeep ├── predictions └── .gitkeep ├── submissions └── .gitkeep ├── .dockerignore ├── .gitignore ├── scores └── scores.csv ├── metadata.json ├── main.py ├── lib ├── features.py ├── read.py ├── automl.py ├── model_other.py ├── util.py ├── model.py └── preprocess.py ├── score.py ├── Dockerfile ├── Makefile └── README.md /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /predictions/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /submissions/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data 2 | models 3 | predictions 4 | submissions 5 | scores 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | .ipynb_checkpoints 4 | __pycache__ 5 | data 6 | models 7 | predictions 8 | submissions 9 | scores 10 | -------------------------------------------------------------------------------- /scores/scores.csv: -------------------------------------------------------------------------------- 1 | ,dataset,score,time 2 | 0,1_r,11.699119328400574,7.91574501991272 3 | 1,2_r,1.2025001668490136,51.469507694244385 4 | 2,3_r,13258.658787686656,74.08733296394348 5 | 3,4_c,0.9998686888554368,150.34715294837952 6 | 4,5_c,0.7892876687105762,292.5374937057495 7 | 5,6_c,0.6603385075889981,141.35956001281738 8 | 6,7_c,0.8206793915381698,519.9199469089508 9 | 7,8_c,0.8826053456934709,969.1855833530426 10 | -------------------------------------------------------------------------------- /metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "{image}", 3 | "entry_points": { 4 | "train_classification": "python3 main.py --mode classification --train-csv {train_csv} --model-dir {model_dir}", 5 | "train_regression": "python3 main.py --mode regression --train-csv {train_csv} --model-dir {model_dir}", 6 | "predict": "python3 main.py --test-csv {test_csv} --prediction-csv {prediction_csv} --model-dir {model_dir}" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from lib.util import Log 3 | from lib.automl import AutoML 4 | 5 | 6 | @Log.timeit 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--mode', choices=['classification', 'regression']) 10 | parser.add_argument('--model-dir') 11 | parser.add_argument('--train-csv') 12 | parser.add_argument('--test-csv') 13 | parser.add_argument('--prediction-csv') 14 | args = parser.parse_args() 15 | 16 | automl = AutoML(args.model_dir) 17 | 18 | if args.train_csv is not None: 19 | automl.train(args.train_csv, args.mode) 20 | automl.save() 21 | elif args.test_csv is not None: 22 | 
automl.load() 23 | automl.predict(args.test_csv, args.prediction_csv) 24 | else: 25 | exit(1) 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /lib/features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import lightgbm as lgb 3 | from boruta import BorutaPy 4 | from typing import List, Optional 5 | 6 | 7 | class LGBMFeatureEstimator(): 8 | def __init__(self, params, n_estimators: int=50): 9 | self.params = params 10 | self.n_estimators = n_estimators 11 | 12 | def get_params(self): 13 | return self.params 14 | 15 | def set_params(self, n_estimators:Optional[int]=None, random_state:Optional[int]=None): 16 | if n_estimators is not None: 17 | self.n_estimators = n_estimators 18 | 19 | def fit(self, X: pd.DataFrame, y: pd.Series): 20 | train_data = lgb.Dataset(X, label=y) 21 | model = lgb.train(self.params, train_data, self.n_estimators) 22 | self.feature_importances_ = model.feature_importance(importance_type="gain") 23 | 24 | 25 | def select_features(X: pd.DataFrame, y: pd.Series, mode: str, n_estimators: int=50, max_iter: int=50, perc: int=75) -> List[str]: 26 | feat_estimator = LGBMFeatureEstimator({ 27 | "objective": "regression" if mode == "regression" else "binary", 28 | "metric": "rmse" if mode == "regression" else "auc", 29 | "learning_rate": 0.01, 30 | "verbosity": -1, 31 | "seed": 1, 32 | "max_depth": 7, 33 | "min_data_in_leaf": 3, 34 | }, n_estimators) 35 | 36 | feat_selector = BorutaPy(feat_estimator, n_estimators=n_estimators, max_iter=max_iter, verbose=2, random_state=1, perc=perc) 37 | 38 | try: 39 | feat_selector.fit(X.values, y.values.ravel()) 40 | except: 41 | pass 42 | 43 | return X.columns[feat_selector.support_].tolist() 44 | -------------------------------------------------------------------------------- /score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import time 4 | from lib.automl import AutoML 5 | from lib.util import Log 6 | 7 | DATASETS = [ 8 | ("1", "regression", 300), 9 | ("2", "regression", 300), 10 | ("3", "regression", 300), 11 | ("4", "classification", 300), 12 | ("5", "classification", 300), 13 | ("6", "classification", 600), 14 | ("7", "classification", 1800), 15 | ("8", "classification", 1800), 16 | ] 17 | 18 | 19 | @Log.timeit 20 | def validate_dataset(alias: str, mode: str, train_limit: int) -> np.float64: 21 | Log.print(alias) 22 | 23 | automl = AutoML("models/check_{}".format(alias)) 24 | 25 | automl.config["time_limit"] = train_limit 26 | automl.train("data/check_{}/train.csv".format(alias), mode) 27 | 28 | automl.config["time_limit"] = 300 29 | automl.config["start_time"] = time.time() 30 | _, score = automl.predict("data/check_{}/test.csv".format(alias), "predictions/check_{}.csv".format(alias)) 31 | 32 | return score 33 | 34 | 35 | if __name__ == '__main__': 36 | scores = { 37 | "dataset": [], 38 | "score": [], 39 | "time": [], 40 | } 41 | 42 | for i, mode, train_limit in DATASETS: 43 | alias = "{}_{}".format(i, mode[0]) 44 | 45 | start_time = time.time() 46 | score = validate_dataset(alias, mode, train_limit) 47 | end_time = time.time() 48 | 49 | scores["dataset"].append(alias) 50 | scores["score"].append(score) 51 | scores["time"].append(end_time - start_time) 52 | 53 | scores = pd.DataFrame(scores) 54 | scores.to_csv("scores/{}.csv".format(int(time.time()))) 55 | Log.print(scores, nesting=False) 56 
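For orientation, here is roughly how the pieces above and below fit together in a local run. This is an illustrative sketch: the commands are the Makefile targets defined further down, dataset 1 is used only as an example, and the `check_1_r` paths follow the naming convention the Makefile derives from the dataset number (datasets 1-3 are regression, hence the `_r` suffix). Inside the container, `make train` and `make predict` execute the same `python3 main.py` invocations that the `metadata.json` entry points describe.

```sh
# Build the image and download the public check datasets into data/
make docker-build
make download

# Train on dataset 1 and produce predictions; inside the container this runs e.g.
#   python3 main.py --mode regression --train-csv data/check_1_r/train.csv --model-dir models/check_1_r
#   python3 main.py --test-csv data/check_1_r/test.csv --prediction-csv predictions/check_1_r.csv --model-dir models/check_1_r
make train DATASET=1
make predict DATASET=1

# Validate on all eight check datasets (score.py above) and write a timestamped CSV into scores/
make score
```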
| -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | ENV LANG=C.UTF-8 4 | 5 | # Common packages 6 | RUN apt-get update && \ 7 | apt-get install -y --no-install-recommends \ 8 | software-properties-common \ 9 | build-essential \ 10 | vim \ 11 | wget \ 12 | curl \ 13 | git \ 14 | zip \ 15 | unzip && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | # Python 3.6 19 | RUN add-apt-repository -y ppa:deadsnakes/ppa && \ 20 | apt-get update && \ 21 | apt-get install -y --no-install-recommends \ 22 | python3-pip \ 23 | python3-setuptools \ 24 | python3.6 \ 25 | python3.6-dev \ 26 | python3.6-venv && \ 27 | pip3 install --no-cache-dir --upgrade pip && \ 28 | rm -rf /var/lib/apt/lists/* 29 | 30 | # Vowpal Wabbit 31 | ENV PATH="/opt/vowpal_wabbit/utl:${PATH}" \ 32 | CPLUS_INCLUDE_PATH=/usr/lib/jvm/java-8-openjdk-amd64/include/linux:/usr/lib/jvm/java-1.8.0-openjdk-amd64/include 33 | RUN apt-get update && \ 34 | apt-get install -y --no-install-recommends \ 35 | libboost-python-dev\ 36 | libboost-program-options-dev \ 37 | zlib1g-dev \ 38 | openjdk-8-jdk && \ 39 | rm -rf /var/lib/apt/lists/* &&\ 40 | cd opt && git clone git://github.com/JohnLangford/vowpal_wabbit.git && cd vowpal_wabbit && make && make install && \ 41 | cd python && python3 setup.py install 42 | 43 | # H2O AutoML 44 | RUN mkdir /tmp/h2o && cd /tmp/h2o && \ 45 | wget http://h2o-release.s3.amazonaws.com/h2o/rel-wright/9/h2o-3.20.0.9.zip && \ 46 | unzip -j h2o-3.20.0.9.zip && \ 47 | pip3 install h2o-3.20.0.9-py2.py3-none-any.whl && \ 48 | rm -rf /tmp/h2o 49 | 50 | # Python packages 51 | RUN pip3 install --no-cache-dir --upgrade \ 52 | pandas \ 53 | jupyter \ 54 | lightgbm \ 55 | catboost \ 56 | xgboost \ 57 | hyperopt \ 58 | Boruta \ 59 | category_encoders \ 60 | memory_profiler 61 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE=tyz910/sdsj2018 2 | DOWNLOAD_URL=https://s3.eu-central-1.amazonaws.com/sdsj2018-automl/public/sdsj2018_automl_check_datasets.zip 3 | 4 | ifeq ($(OS), Windows_NT) 5 | DOCKER_BUILD=docker build -t ${IMAGE} . 6 | else 7 | DOCKER_BUILD=docker build -t ${IMAGE} . 
&& (docker ps -q -f status=exited | xargs docker rm) && (docker images -qf dangling=true | xargs docker rmi) && docker images 8 | endif 9 | 10 | ifeq ($(DATASET),) 11 | DATASET=1 12 | endif 13 | 14 | DATASET_MODE=_c 15 | ifeq ($(DATASET), $(filter $(DATASET), 1 2 3)) 16 | DATASET_MODE=_r 17 | endif 18 | 19 | DATASET_NAME=${DATASET}${DATASET_MODE} 20 | 21 | ifeq ($(DATASET_MODE), _r) 22 | TRAIN_MODE=regression 23 | else 24 | TRAIN_MODE=classification 25 | endif 26 | 27 | TRAIN_CSV=data/check_${DATASET_NAME}/train.csv 28 | TEST_CSV=data/check_${DATASET_NAME}/test.csv 29 | PREDICTIONS_CSV=predictions/check_${DATASET_NAME}.csv 30 | MODEL_DIR=models/check_${DATASET_NAME} 31 | 32 | DOCKER_RUN=docker run --rm -it -v ${CURDIR}:/app -w /app ${IMAGE} 33 | 34 | download: 35 | ${DOCKER_RUN} /bin/bash -c "test -f ${TRAIN_CSV} || (cd data && curl ${DOWNLOAD_URL} > data.zip && unzip data.zip && rm data.zip)" 36 | 37 | train: 38 | ${DOCKER_RUN} python3 main.py --mode ${TRAIN_MODE} --train-csv ${TRAIN_CSV} --model-dir ${MODEL_DIR} 39 | 40 | predict: 41 | ${DOCKER_RUN} python3 main.py --test-csv ${TEST_CSV} --prediction-csv ${PREDICTIONS_CSV} --model-dir ${MODEL_DIR} 42 | 43 | score: 44 | ${DOCKER_RUN} python3 score.py 45 | 46 | docker-build: 47 | ${DOCKER_BUILD} 48 | 49 | docker-push: 50 | docker push ${IMAGE} 51 | 52 | run-bash: 53 | ${DOCKER_RUN} /bin/bash 54 | 55 | run-jupyter: 56 | docker run --rm -it -v ${CURDIR}:/app -w /app -p 8888:8888 ${IMAGE} jupyter notebook --ip=0.0.0.0 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password='' 57 | 58 | submission: 59 | ${DOCKER_RUN} /bin/bash -c "sed -i.bak 's~{image}~${IMAGE}~g' metadata.json && zip -9 -r submissions/submission_`date '+%Y%m%d_%H%M%S'`.zip main.py lib/*.py metadata.json && mv metadata.json.bak metadata.json" 60 | -------------------------------------------------------------------------------- /lib/read.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from lib.util import Log, Config 3 | 4 | 5 | @Log.timeit 6 | def read_df(csv_path: str, config: Config) -> pd.DataFrame: 7 | if "dtype" not in config: 8 | preview_df(csv_path, config) 9 | 10 | df = pandas_read_csv(csv_path, config) 11 | 12 | if "sort_values" in config: 13 | df.sort_values(config["sort_values"], inplace=True) 14 | 15 | return df 16 | 17 | 18 | @Log.timeit 19 | def pandas_read_csv(csv_path: str, config: Config) -> pd.DataFrame: 20 | return pd.read_csv(csv_path, encoding="utf-8", low_memory=False, dtype=config["dtype"], parse_dates=config["parse_dates"]) 21 | 22 | 23 | @Log.timeit 24 | def preview_df(train_csv: str, config: Config, nrows: int=3000): 25 | num_rows = sum(1 for line in open(train_csv)) - 1 26 | Log.print("Rows in train: {}".format(num_rows)) 27 | 28 | df = pd.read_csv(train_csv, encoding="utf-8", low_memory=False, nrows=nrows) 29 | mem_per_row = df.memory_usage(deep=True).sum() / nrows 30 | Log.print("Memory per row: {:0.2f} Kb".format(mem_per_row / 1024)) 31 | 32 | df_size = (num_rows * mem_per_row) / 1024 / 1024 33 | Log.print("Approximate dataset size: {:0.2f} Mb".format(df_size)) 34 | 35 | config["parse_dates"] = [] 36 | config["dtype"] = { 37 | "line_id": int, 38 | } 39 | 40 | counters = { 41 | "id": 0, 42 | "number": 0, 43 | "string": 0, 44 | "datetime": 0, 45 | } 46 | 47 | for c in df: 48 | if c.startswith("number_"): 49 | counters["number"] += 1 50 | elif c.startswith("string_"): 51 | counters["string"] += 1 52 | config["dtype"][c] = str 53 | elif c.startswith("datetime_"): 54 | 
counters["datetime"] += 1 55 | config["dtype"][c] = str 56 | config["parse_dates"].append(c) 57 | elif c.startswith("id_"): 58 | counters["id"] += 1 59 | 60 | Log.print("Number columns: {}".format(counters["number"])) 61 | Log.print("String columns: {}".format(counters["string"])) 62 | Log.print("Datetime columns: {}".format(counters["datetime"])) 63 | 64 | config["counters"] = counters 65 | -------------------------------------------------------------------------------- /lib/automl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from lib.util import Config 5 | from lib.read import read_df 6 | from lib.preprocess import preprocess 7 | from lib.model import train, predict, validate 8 | from typing import Optional 9 | 10 | 11 | class AutoML: 12 | def __init__(self, model_dir: str): 13 | os.makedirs(model_dir, exist_ok=True) 14 | self.config = Config(model_dir) 15 | 16 | def train(self, train_csv: str, mode: str): 17 | self.config["task"] = "train" 18 | self.config["mode"] = mode 19 | self.config.tmp_dir = self.config.model_dir + "/tmp" 20 | os.makedirs(self.config.tmp_dir, exist_ok=True) 21 | 22 | df = read_df(train_csv, self.config) 23 | preprocess(df, self.config) 24 | 25 | y = df["target"] 26 | X = df.drop("target", axis=1) 27 | train(X, y, self.config) 28 | 29 | def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]): 30 | self.config["task"] = "predict" 31 | self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp" 32 | os.makedirs(self.config.tmp_dir, exist_ok=True) 33 | 34 | df = read_df(test_csv, self.config) 35 | result = { 36 | "line_id": list(df["line_id"]), 37 | "prediction": [], 38 | } 39 | 40 | def chunker(seq, size): 41 | return (seq[pos:pos+size] for pos in range(0, len(seq), size)) 42 | 43 | for chunk in chunker(df, 100000): 44 | X = chunk.copy() 45 | preprocess(X, self.config) 46 | result["prediction"] += list(predict(X, self.config)) 47 | 48 | result = pd.DataFrame(result) 49 | result.sort_values("line_id", inplace=True) 50 | result.to_csv(prediction_csv, index=False) 51 | 52 | target_csv = test_csv.replace("test", "test-target") 53 | if os.path.exists(target_csv): 54 | score = validate(result, target_csv, self.config["mode"]) 55 | else: 56 | score = None 57 | 58 | return result, score 59 | 60 | def save(self): 61 | self.config.save() 62 | 63 | def load(self): 64 | self.config.load() 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sberbank Data Science Journey 2018: Docker-friendly baseline 2 | 3 | Для работы необходим `make` и `docker`. Перед началом работы нужно скачать датасет в папку `data`. 4 | 5 | ## Особенности решения 6 | 7 | * LightGBM с подбором гиперпараметров через hyperopt. 8 | * Mean target Encoding для категориальных фич. 9 | * Для 8-го датасета отбор фич через [BorutaPy](https://github.com/scikit-learn-contrib/boruta_py). 10 | * Лик от [bagxi](https://github.com/bagxi/sdsj2018_lightgbm_baseline). 11 | 12 | Так же есть, но не используются: Vowpal Wabbit, H2O AutoML. 13 | Скор на ЛБ: `5,30072`. 14 | 15 | ## Make-команды для работы с Docker :whale: 16 | 17 | `make download` - cкачать датасет в папку data. 18 | `make train DATASET=1` - обучение модели на датасете с указанным номером [1-8]. 19 | `make predict DATASET=1` - валидация модели на датасете с указанным номером [1-8]. 
20 | `make score` - validate the model on all datasets and save the results to the scores folder. 21 | `make docker-build` - build the Docker image. 22 | `make docker-push` - push the Docker image to Docker Hub. 23 | `make run-bash` - start a terminal inside the Docker container. 24 | `make run-jupyter` - start Jupyter inside the Docker container at http://localhost:8888. 25 | `make submission` - create a submission file in the submissions directory. 26 | 27 | ## Building your own Docker image 28 | 29 | 1. Sign up on [Docker Hub](https://hub.docker.com/). 30 | 2. Edit the [Makefile](https://github.com/tyz910/sdsj2018/blob/master/Makefile) and set the image name on the first line: `IMAGE=username/image`. 31 | 3. Edit the [Dockerfile](https://github.com/tyz910/sdsj2018/blob/master/Dockerfile) and add the installation of any packages you need. 32 | 4. Build the image: `make docker-build`. 33 | 5. Push the Docker image to Docker Hub: `make docker-push`. 34 | 6. Make sure the created repository is public (Public), not private (Private). Visibility is configured at `https://hub.docker.com/r/username/image/~/settings/`. 35 | 36 | ## Running on Windows 37 | 38 | 1. Install [Docker](https://download.docker.com/win/stable/Docker%20for%20Windows%20Installer.exe). 39 | 2. Install [Make](http://gnuwin32.sourceforge.net/downlinks/make.php) and add it to [PATH](https://ru.stackoverflow.com/questions/153628/Как-добавить-путь-в-переменную-окружения-path-на-windows). 40 | 3. Start Docker. In its settings, allocate enough RAM to Docker. 41 | 4. Run PowerShell as administrator. When running the make commands, allow Docker to mount the working directory. 42 | -------------------------------------------------------------------------------- /lib/model_other.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import pandas as pd 3 | import numpy as np 4 | import h2o 5 | from h2o.automl import H2OAutoML 6 | from vowpalwabbit.sklearn_vw import tovw 7 | from sklearn.linear_model import LogisticRegression, Ridge 8 | from lib.util import Log, Config 9 | from typing import List 10 | timeit = Log.timeit  # lib.util only exposes the decorator as Log.timeit 11 | 12 | @timeit 13 | def train_vw(X: pd.DataFrame, y: pd.Series, config: Config): 14 | cache_file = config.tmp_dir + "/.vw_cache" 15 | data_file = config.tmp_dir + "/vw_data_train.csv" 16 | 17 | cmd = " ".join([ 18 | "rm -f {cache} && vw", 19 | "-f {f}", 20 | "--cache_file {cache}", 21 | "--passes {passes}", 22 | "-l {l}", 23 | "--early_terminate {early_terminate}", 24 | "{df}" 25 | ]).format( 26 | cache=cache_file, 27 | df=data_file, 28 | f=config.model_dir + "/vw.model", 29 | passes=max(20, int(1000000/len(X))), 30 | l=25, 31 | early_terminate=1, 32 | ) 33 | 34 | if config["mode"] == "classification": 35 | cmd += " --loss_function logistic --link logistic" 36 | y = y.replace({0: -1}) 37 | 38 | save_to_vw(data_file, X, y) 39 | subprocess.Popen(cmd, shell=True).communicate() 40 | 41 | 42 | @timeit 43 | def predict_vw(X: pd.DataFrame, config: Config) -> List: 44 | preds_file = config.tmp_dir + "/.vw_preds" 45 | data_file = config.tmp_dir + "/vw_data_test.csv" 46 | save_to_vw(data_file, X) 47 | 48 | subprocess.Popen("vw -i {i} -p {p} {df}".format( 49 | df=data_file, 50 | i=config.model_dir + "/vw.model", 51 | p=preds_file 52 | ), shell=True).communicate() 53 | 54 | return [np.float64(line) for line in open(preds_file, "r")] 55 | 56 | 57 | @timeit 58 | def save_to_vw(filepath: str, X: pd.DataFrame, y: pd.Series=None,
chunk_size=1000): 59 | with open(filepath, "w+") as f: 60 | for pos in range(0, len(X), chunk_size): 61 | chunk_X = X.iloc[pos:pos + chunk_size, :] 62 | chunk_y = y.iloc[pos:pos + chunk_size] if y is not None else None 63 | for row in tovw(chunk_X, chunk_y): 64 | f.write(row + "\n") 65 | 66 | 67 | @timeit 68 | def train_lm(X: pd.DataFrame, y: pd.Series, config: Config): 69 | if config["mode"] == "regression": 70 | model = Ridge() 71 | else: 72 | model = LogisticRegression(solver="liblinear") 73 | 74 | config["model_lm"] = model.fit(X, y) 75 | 76 | 77 | @timeit 78 | def predict_lm(X: pd.DataFrame, config: Config) -> List: 79 | if config["mode"] == "regression": 80 | return config["model_lm"].predict(X) 81 | else: 82 | return config["model_lm"].predict_proba(X)[:, 1] 83 | 84 | 85 | @timeit 86 | def train_h2o(X: pd.DataFrame, y: pd.Series, config: Config): 87 | h2o.init() 88 | 89 | X["target"] = y 90 | train = h2o.H2OFrame(X) 91 | train_x = train.columns 92 | train_y = "target" 93 | train_x.remove(train_y) 94 | 95 | if config["mode"] == "classification": 96 | train[train_y] = train[train_y].asfactor() 97 | 98 | aml = H2OAutoML(max_runtime_secs=60) 99 | aml.train(x=train_x, y=train_y, training_frame=train) 100 | 101 | config["model_h2o"] = h2o.save_model(model=aml.leader, path=config.model_dir + "/h2o.model", force=True) 102 | print(aml.leaderboard) 103 | 104 | X.drop("target", axis=1, inplace=True) 105 | 106 | 107 | @timeit 108 | def predict_h2o(X: pd.DataFrame, config: Config) -> List: 109 | h2o.init() 110 | model = h2o.load_model(config["model_h2o"]) 111 | 112 | return model.predict(h2o.H2OFrame(X)).as_data_frame()["predict"].tolist() 113 | -------------------------------------------------------------------------------- /lib/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pickle 4 | import signal 5 | from contextlib import contextmanager 6 | from typing import Any 7 | 8 | 9 | class Log: 10 | nesting_level = 0 11 | is_silent = False 12 | is_method_start = None 13 | 14 | @staticmethod 15 | def silent(silent: bool): 16 | Log.is_silent = silent 17 | 18 | @staticmethod 19 | def print(entry: Any="", nesting: bool=True): 20 | if Log.is_silent: 21 | return 22 | 23 | space = "." * (4 * Log.nesting_level) if nesting else "" 24 | print("{}{}".format(space, entry)) 25 | 26 | @staticmethod 27 | def nest(n: int): 28 | Log.nesting_level += n 29 | 30 | @staticmethod 31 | def timeit(method): 32 | def timed(*args, **kw): 33 | if not Log.is_method_start: 34 | Log.print(nesting=False) 35 | 36 | Log.is_method_start = True 37 | Log.print("Start {}.".format(method.__name__)) 38 | Log.nest(1) 39 | 40 | start_time = time.time() 41 | result = method(*args, **kw) 42 | end_time = time.time() 43 | 44 | Log.nest(-1) 45 | Log.print("End {}. 
Time: {:0.2f} sec.".format(method.__name__, end_time - start_time)) 46 | Log.is_method_start = False 47 | 48 | return result 49 | 50 | return timed 51 | 52 | 53 | class Config: 54 | def __init__(self, model_dir: str): 55 | self.model_dir = model_dir 56 | self.tmp_dir = model_dir 57 | self.current_time_limit = 0 58 | self.current_time_limit_start = 0 59 | self.data = { 60 | "start_time": time.time(), 61 | "time_limit": int(os.environ.get("TIME_LIMIT", 5 * 60)), 62 | } 63 | 64 | def is_train(self) -> bool: 65 | return self["task"] == "train" 66 | 67 | def is_predict(self) -> bool: 68 | return self["task"] == "predict" 69 | 70 | def is_regression(self) -> bool: 71 | return self["mode"] == "regression" 72 | 73 | def is_classification(self) -> bool: 74 | return self["mode"] == "classification" 75 | 76 | def limit_time_fraction(self, fraction: float=0.1): 77 | self.current_time_limit = int(self["time_limit"] * fraction) 78 | self.current_time_limit_start = self.time_left() 79 | 80 | def is_time_fraction_limit(self) -> bool: 81 | return self.current_time_limit_start - self.time_left() >= self.current_time_limit 82 | 83 | def time_left(self) -> float: 84 | return self["time_limit"] - (time.time() - self["start_time"]) 85 | 86 | def save(self): 87 | with open(os.path.join(self.model_dir, "config.pkl"), "wb") as f: 88 | pickle.dump(self.data, f, protocol=pickle.HIGHEST_PROTOCOL) 89 | 90 | def load(self): 91 | with open(os.path.join(self.model_dir, "config.pkl"), "rb") as f: 92 | data = pickle.load(f) 93 | 94 | self.data = {**data, **self.data} 95 | 96 | def __getitem__(self, key): 97 | return self.data[key] 98 | 99 | def __setitem__(self, key, value): 100 | self.data[key] = value 101 | 102 | def __delitem__(self, key): 103 | del self.data[key] 104 | 105 | def __contains__(self, key): 106 | return key in self.data 107 | 108 | def __len__(self): 109 | return len(self.data) 110 | 111 | def __repr__(self): 112 | return repr(self.data) 113 | 114 | 115 | class TimeoutException(Exception): pass 116 | 117 | 118 | @contextmanager 119 | def time_limit(seconds): 120 | seconds = int(seconds) 121 | def signal_handler(signum, frame): 122 | raise TimeoutException("Timed out!") 123 | signal.signal(signal.SIGALRM, signal_handler) 124 | signal.alarm(seconds) 125 | try: 126 | yield 127 | finally: 128 | signal.alarm(0) 129 | -------------------------------------------------------------------------------- /lib/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import lightgbm as lgb 4 | import hyperopt 5 | from hyperopt import hp, tpe, STATUS_OK, space_eval, Trials 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import mean_squared_error, roc_auc_score 8 | from lib.util import Log, Config, time_limit, TimeoutException 9 | from typing import List, Dict 10 | 11 | 12 | @Log.timeit 13 | def train(X: pd.DataFrame, y: pd.Series, config: Config): 14 | train_lightgbm(X, y, config) 15 | 16 | 17 | @Log.timeit 18 | def predict(X: pd.DataFrame, config: Config) -> List: 19 | preds = predict_lightgbm(X, config) 20 | 21 | if config["non_negative_target"]: 22 | preds = [max(0, p) for p in preds] 23 | 24 | return preds 25 | 26 | @Log.timeit 27 | def validate(preds: pd.DataFrame, target_csv: str, mode: str) -> np.float64: 28 | df = pd.merge(preds, pd.read_csv(target_csv), on="line_id", left_index=True) 29 | score = roc_auc_score(df.target.values, df.prediction.values) if mode == "classification" else \ 30 | 
np.sqrt(mean_squared_error(df.target.values, df.prediction.values)) 31 | Log.print("Score: {:0.4f}".format(score)) 32 | return score 33 | 34 | 35 | @Log.timeit 36 | def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config): 37 | params = { 38 | "objective": "regression" if config.is_regression() else "binary", 39 | "metric": "rmse" if config.is_regression() else "auc", 40 | "verbosity": -1, 41 | "seed": 1, 42 | } 43 | 44 | X_sample, y_sample = data_sample(X, y, config, nrows=20000) 45 | hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config) 46 | 47 | X_train, X_val, y_train, y_val = data_split(X, y, config) 48 | 49 | config["model"] = lgb.train( 50 | {**params, **hyperparams}, 51 | lgb.Dataset(X_train, label=y_train), 52 | 5000, 53 | lgb.Dataset(X_val, label=y_val), 54 | early_stopping_rounds=100, 55 | verbose_eval=100, 56 | ) 57 | config.save() 58 | 59 | try: 60 | with time_limit(config.time_left() - 10): 61 | config["model"] = lgb.train( 62 | {**params, **hyperparams}, 63 | lgb.Dataset(X, label=y), 64 | int(1.2 * config["model"].best_iteration), 65 | ) 66 | except TimeoutException: 67 | Log.print("Timed out!") 68 | 69 | 70 | @Log.timeit 71 | def predict_lightgbm(X: pd.DataFrame, config: Config) -> List: 72 | return config["model"].predict(X) 73 | 74 | 75 | @Log.timeit 76 | def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config): 77 | X_train, X_val, y_train, y_val = data_split(X, y, config, test_size=0.5) 78 | train_data = lgb.Dataset(X_train, label=y_train) 79 | valid_data = lgb.Dataset(X_val, label=y_val) 80 | 81 | space = { 82 | "learning_rate": hp.choice("learning_rate", np.arange(0.01, 0.05, 0.01)), 83 | "boost_from_average": hp.choice("boost_from_average", [True, False]), 84 | "is_unbalance": hp.choice("is_unbalance", [True, False]), 85 | "zero_as_missing": hp.choice("zero_as_missing", [True, False]), 86 | "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 7]), 87 | "num_leaves": hp.choice("num_leaves", [11, 31, 51, 101, 151, 201]), 88 | "feature_fraction": hp.choice("feature_fraction", np.arange(0.5, 1.0, 0.1)), 89 | "bagging_fraction": hp.choice("bagging_fraction", np.arange(0.5, 1.0, 0.1)), 90 | "bagging_freq": hp.choice("bagging_freq", [1, 3, 5, 10, 20, 50]), 91 | "reg_alpha": hp.uniform("reg_alpha", 0, 10), 92 | "reg_lambda": hp.uniform("reg_lambda", 0, 10), 93 | "min_child_weight": hp.uniform("min_child_weight", 0, 10), 94 | } 95 | 96 | config.limit_time_fraction(0.15) 97 | 98 | def objective(hyperparams): 99 | if config.is_time_fraction_limit(): 100 | score = np.inf if config.is_regression() else 0 101 | return {'loss': score, 'status': STATUS_OK} 102 | 103 | model = lgb.train({**params, **hyperparams}, train_data, 300, valid_data, 104 | early_stopping_rounds=100, verbose_eval=False) 105 | 106 | score = model.best_score["valid_0"][params["metric"]] 107 | Log.print(score) 108 | if config.is_classification(): 109 | score = -score 110 | 111 | return {'loss': score, 'status': STATUS_OK} 112 | 113 | trials = Trials() 114 | best = hyperopt.fmin(fn=objective, space=space, trials=trials, algo=tpe.suggest, max_evals=100, verbose=1, 115 | rstate= np.random.RandomState(1)) 116 | 117 | hyperparams = space_eval(space, best) 118 | Log.print("{:0.4f} {}".format(trials.best_trial['result']['loss'], hyperparams)) 119 | return hyperparams 120 | 121 | 122 | def ts_split(X: pd.DataFrame, y: pd.Series, test_size: float) -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series): 123 | test_len = int(len(X) * test_size) 124 | return X[:-test_len], 
X[-test_len:], y[:-test_len], y[-test_len:] 125 | 126 | 127 | def data_split(X: pd.DataFrame, y: pd.Series, config: Config, test_size: float=0.2) -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series): 128 | if "sort_values" in config: 129 | return ts_split(X, y, test_size=test_size) 130 | else: 131 | return train_test_split(X, y, test_size=test_size, random_state=1) 132 | 133 | 134 | def data_sample(X: pd.DataFrame, y: pd.Series, config: Config, nrows: int=10000) -> (pd.DataFrame, pd.Series): 135 | if len(X) > nrows: 136 | if "sort_values" in config: 137 | X_sample = X.iloc[:nrows] 138 | y_sample = y.iloc[:nrows] 139 | else: 140 | X_sample = X.sample(nrows, random_state=1) 141 | y_sample = y[X_sample.index] 142 | else: 143 | X_sample = X 144 | y_sample = y 145 | 146 | return X_sample, y_sample 147 | -------------------------------------------------------------------------------- /lib/preprocess.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import datetime 3 | import warnings 4 | import numpy as np 5 | import pandas as pd 6 | import lightgbm as lgb 7 | from sklearn.exceptions import DataConversionWarning 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.model_selection import train_test_split 10 | from lib.features import select_features 11 | from lib.util import Log, Config 12 | from typing import Optional, List 13 | 14 | 15 | @Log.timeit 16 | def preprocess(df: pd.DataFrame, config: Config): 17 | non_negative_target_detect(df, config) 18 | drop_columns(df, config) 19 | fillna(df, config) 20 | to_int8(df, config) 21 | 22 | time_series_detect(df, config) 23 | feature_selection(df, config) 24 | 25 | subsample(df, config, max_size_mb=2 * 1024) 26 | transform(df, config) 27 | subsample(df, config, max_size_mb=2 * 1024) 28 | 29 | 30 | @Log.timeit 31 | def transform(df: pd.DataFrame, config: Config): 32 | transform_datetime(df, config) 33 | transform_categorical(df, config) 34 | scale(df, config) 35 | 36 | 37 | @Log.timeit 38 | def drop_columns(df: pd.DataFrame, config: Config): 39 | df.drop([c for c in ["is_test", "line_id"] if c in df], axis=1, inplace=True) 40 | drop_constant_columns(df, config) 41 | 42 | 43 | @Log.timeit 44 | def fillna(df: pd.DataFrame, config: Config): 45 | for c in [c for c in df if c.startswith("number_")]: 46 | df[c].fillna(-1, inplace=True) 47 | 48 | for c in [c for c in df if c.startswith("string_")]: 49 | df[c].fillna("", inplace=True) 50 | 51 | for c in [c for c in df if c.startswith("datetime_")]: 52 | df[c].fillna(datetime.datetime(1970, 1, 1), inplace=True) 53 | 54 | 55 | @Log.timeit 56 | def drop_constant_columns(df: pd.DataFrame, config: Config): 57 | if "constant_columns" not in config: 58 | config["constant_columns"] = [c for c in df if c.startswith("number_") and not (df[c] != df[c].iloc[0]).any()] 59 | Log.print("Constant columns: {}".format(config["constant_columns"])) 60 | 61 | if len(config["constant_columns"]) > 0: 62 | df.drop(config["constant_columns"], axis=1, inplace=True) 63 | 64 | 65 | @Log.timeit 66 | def transform_datetime(df: pd.DataFrame, config: Config): 67 | date_parts = ["year", "weekday", "month", "day", "hour"] 68 | 69 | if "date_columns" not in config: 70 | config["date_columns"] = {} 71 | 72 | for c in [c for c in df if c.startswith("datetime_")]: 73 | config["date_columns"][c] = [] 74 | for part in date_parts: 75 | part_col = c + "_" + part 76 | df[part_col] = getattr(df[c].dt, part).astype(np.uint16 if part == "year" else np.uint8).values 77 | 78 | if not 
(df[part_col] != df[part_col].iloc[0]).any(): 79 | Log.print(part_col + " is constant") 80 | df.drop(part_col, axis=1, inplace=True) 81 | else: 82 | config["date_columns"][c].append(part) 83 | 84 | df.drop(c, axis=1, inplace=True) 85 | else: 86 | for c, parts in config["date_columns"].items(): 87 | for part in parts: 88 | part_col = c + "_" + part 89 | df[part_col] = getattr(df[c].dt, part) 90 | df.drop(c, axis=1, inplace=True) 91 | 92 | 93 | @Log.timeit 94 | def transform_categorical(df: pd.DataFrame, config: Config): 95 | if "categorical_columns" not in config: 96 | config["categorical_columns"] = [] 97 | 98 | # https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features 99 | prior = config["categorical_prior"] = df["target"].mean() 100 | min_samples_leaf = 10 101 | smoothing = 5 102 | 103 | config["categorical_columns_string"] = {} 104 | for c in [c for c in df if c.startswith("string_")]: 105 | Log.print(c) 106 | config["categorical_columns"].append(c) 107 | 108 | averages = df[[c, "target"]].groupby(c)["target"].agg(["mean", "count"]) 109 | smooth = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing)) 110 | averages["target"] = prior * (1 - smooth) + averages["mean"] * smooth 111 | config["categorical_columns_string"][c] = averages["target"].to_dict() 112 | 113 | config["categorical_columns_id"] = {} 114 | for c in [c for c in df if c.startswith("id_")]: 115 | Log.print(c) 116 | config["categorical_columns"].append(c) 117 | 118 | if df[c].dtype == str or df[c].dtype == object: 119 | config["categorical_columns_id"][c] = {v: i for i, v in enumerate(df[c].unique())} 120 | 121 | for c, values in config["categorical_columns_string"].items(): 122 | df.loc[:, c] = df[c].apply(lambda x: values[x] if x in values else config["categorical_prior"]) 123 | 124 | for c, values in config["categorical_columns_id"].items(): 125 | df.loc[:, c] = df[c].apply(lambda x: values[x] if x in values else -1) 126 | 127 | 128 | @Log.timeit 129 | def scale(df: pd.DataFrame, config: Config): 130 | warnings.filterwarnings(action='ignore', category=DataConversionWarning) 131 | scale_columns = [c for c in df if c.startswith("number_") and df[c].dtype != np.int8 and c not in config["categorical_columns"]] 132 | 133 | if len(scale_columns) > 0: 134 | if "scaler" not in config: 135 | config["scaler"] = StandardScaler(copy=False) 136 | config["scaler"].fit(df[scale_columns]) 137 | 138 | df[scale_columns] = config["scaler"].transform(df[scale_columns]) 139 | 140 | 141 | @Log.timeit 142 | def to_int8(df: pd.DataFrame, config: Config): 143 | if "int8_columns" not in config: 144 | config["int8_columns"] = [] 145 | vals = [-1, 0, 1] 146 | 147 | for c in [c for c in df if c.startswith("number_")]: 148 | if (~df[c].isin(vals)).any(): 149 | continue 150 | config["int8_columns"].append(c) 151 | 152 | Log.print("Num columns: {}".format(len(config["int8_columns"]))) 153 | 154 | if len(config["int8_columns"]) > 0: 155 | df.loc[:, config["int8_columns"]] = df.loc[:, config["int8_columns"]].astype(np.int8) 156 | 157 | 158 | @Log.timeit 159 | def subsample(df: pd.DataFrame, config: Config, max_size_mb: float=2.0): 160 | if config.is_train(): 161 | df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024 162 | if df_size_mb > max_size_mb: 163 | mem_per_row = df_size_mb / len(df) 164 | sample_rows = int(max_size_mb / mem_per_row) 165 | 166 | Log.print("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. 
Subsample to {} rows.".format(df_size_mb, len(df), sample_rows)) 167 | _, df_drop = train_test_split(df, train_size=sample_rows, random_state=1) 168 | df.drop(df_drop.index, inplace=True) 169 | 170 | config["nrows"] = sample_rows 171 | else: 172 | config["nrows"] = len(df) 173 | 174 | 175 | def shift_columns(df: pd.DataFrame, group: Optional[str]=None, number_columns: Optional[List[str]]=None): 176 | if number_columns is None: 177 | number_columns = [c for c in df if c.startswith("number_")] 178 | shift_columns = [c + "_shift" for c in number_columns] 179 | 180 | if group is not None: 181 | shifted = df.groupby([group])[number_columns].shift(-1) 182 | else: 183 | shifted = df[number_columns].shift(-1) 184 | 185 | df[shift_columns] = shifted.fillna(-1) 186 | 187 | 188 | @Log.timeit 189 | def time_series_detect(df: pd.DataFrame, config: Config): 190 | sample_size = 10000 191 | model_params = { 192 | "objective": "regression" if config["mode"] == "regression" else "binary", 193 | "metric": "rmse" if config["mode"] == "regression" else "auc", 194 | "learning_rate": 0.01, 195 | "verbosity": -1, 196 | "seed": 1, 197 | "max_depth": -1, 198 | } 199 | 200 | if config.is_train(): 201 | datetime_columns = [c for c in df if c.startswith("datetime_")] 202 | id_columns = [c for c in df if c.startswith("id_")] 203 | 204 | sort_columns = [] 205 | for dc in datetime_columns: 206 | sort_columns.append([dc]) 207 | for ic in id_columns: 208 | sort_columns.append([ic, dc]) 209 | else: 210 | for ic in id_columns: 211 | sort_columns.append([ic]) 212 | 213 | scores = [] 214 | config.limit_time_fraction(0.1) 215 | for sc in sort_columns: 216 | if config.is_time_fraction_limit(): 217 | break 218 | 219 | Log.silent(True) 220 | df.sort_values(sc, inplace=True) 221 | 222 | config_sample = copy.deepcopy(config) 223 | df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy() 224 | df_sample = df_sample[[c for c in df_sample if c.startswith("number_") or c == "target" or c in sc]] 225 | shift_columns(df_sample, group= sc[0] if len(sc) > 1 else None) 226 | transform(df_sample, config_sample) 227 | 228 | y = df_sample["target"] 229 | X = df_sample.drop("target", axis=1) 230 | X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5) 231 | 232 | model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train), 3000, lgb.Dataset(X_test, label=y_test), 233 | early_stopping_rounds=100, verbose_eval=False) 234 | score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]] 235 | 236 | sampled_columns = [c for c in X if "_shift" not in c] 237 | model_sampled = lgb.train(model_params, lgb.Dataset(X_train[sampled_columns], label=y_train), 3000, lgb.Dataset(X_test[sampled_columns], label=y_test), 238 | early_stopping_rounds=100, verbose_eval=False) 239 | score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]] 240 | 241 | if config.is_classification(): 242 | score_sorted = -score_sorted 243 | score_sampled = -score_sampled 244 | 245 | Log.silent(False) 246 | Log.print("Sort: {}. Score sorted: {:0.4f}. 
Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled)) 247 | score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled) 248 | if score_ratio >= 1.03: 249 | Log.print(score_ratio) 250 | scores.append((score_sorted, sc)) 251 | 252 | if len(scores) > 0: 253 | scores = sorted(scores, key=lambda x: x[0]) 254 | Log.print("Scores: {}".format(scores)) 255 | config["sort_values"] = scores[0][1] 256 | df.sort_values(config["sort_values"], inplace=True) 257 | 258 | config_sample = copy.deepcopy(config) 259 | df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy() 260 | shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None) 261 | transform(df_sample, config_sample) 262 | 263 | y = df_sample["target"] 264 | X = df_sample.drop("target", axis=1) 265 | 266 | model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000) 267 | fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns) 268 | fi = fi[fi > 0].sort_values() 269 | selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist() 270 | 271 | selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c] 272 | if len(selected_shift_columns) > 0: 273 | Log.print("Shift columns: {}".format(selected_shift_columns)) 274 | config["shift_columns"] = selected_shift_columns 275 | 276 | if "shift_columns" in config: 277 | shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None, number_columns=config["shift_columns"]) 278 | 279 | 280 | @Log.timeit 281 | def feature_selection(df: pd.DataFrame, config: Config): 282 | if config.is_train(): 283 | df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024 284 | if df_size_mb < 2 * 1024: 285 | return 286 | 287 | selected_columns = [] 288 | config_sample = copy.deepcopy(config) 289 | config.limit_time_fraction(0.1) 290 | for i in range(20): 291 | if config.is_time_fraction_limit(): 292 | break 293 | 294 | df_sample = df.sample(min(3000, len(df)), random_state=i).copy() 295 | transform(df_sample, config_sample) 296 | y = df_sample["target"] 297 | X = df_sample.drop("target", axis=1) 298 | 299 | if len(selected_columns) > 0: 300 | X = X.drop(selected_columns, axis=1) 301 | 302 | if len(X.columns) > 0: 303 | selected_columns += select_features(X, y, config["mode"]) 304 | else: 305 | break 306 | 307 | Log.print("Selected columns: {}".format(selected_columns)) 308 | 309 | drop_number_columns = [c for c in df if c.startswith("number_") and c not in selected_columns] 310 | if len(drop_number_columns) > 0: 311 | config["drop_number_columns"] = drop_number_columns 312 | 313 | config["date_columns"] = {} 314 | for c in [c for c in selected_columns if c.startswith("datetime_")]: 315 | d = c.split("_") 316 | date_col = d[0] + "_" + d[1] 317 | date_part = d[2] 318 | 319 | if date_col not in config["date_columns"]: 320 | config["date_columns"][date_col] = [] 321 | 322 | config["date_columns"][date_col].append(date_part) 323 | 324 | drop_datetime_columns = [c for c in df if c.startswith("datetime_") and c not in config["date_columns"]] 325 | if len(drop_datetime_columns) > 0: 326 | config["drop_datetime_columns"] = drop_datetime_columns 327 | 328 | if "drop_number_columns" in config: 329 | Log.print("Drop number columns: {}".format(config["drop_number_columns"])) 330 | df.drop(config["drop_number_columns"], axis=1, inplace=True) 331 | 332 | if "drop_datetime_columns" in config: 333 | Log.print("Drop datetime 
columns: {}".format(config["drop_datetime_columns"])) 334 | df.drop(config["drop_datetime_columns"], axis=1, inplace=True) 335 | 336 | 337 | @Log.timeit 338 | def non_negative_target_detect(df: pd.DataFrame, config: Config): 339 | if config.is_train(): 340 | config["non_negative_target"] = df["target"].lt(0).sum() == 0 341 | 342 | 343 | def ts_split(X: pd.DataFrame, y: pd.Series, test_size: float) -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series): 344 | test_len = int(len(X) * test_size) 345 | return X[:-test_len], X[-test_len:], y[:-test_len], y[-test_len:] 346 | --------------------------------------------------------------------------------