├── test
│   ├── __init__.py
│   └── unit_test
│       ├── __init__.py
│       └── test_cache.py
├── src
│   ├── models
│   │   ├── pytorch_tabnet
│   │   │   ├── __init__.py
│   │   │   ├── tab_model.py
│   │   │   ├── multitask.py
│   │   │   ├── metrics.py
│   │   │   ├── callbacks.py
│   │   │   ├── sparsemax.py
│   │   │   ├── utils.py
│   │   │   ├── multiclass_utils.py
│   │   │   ├── tab_network.py
│   │   │   └── abstract_model.py
│   │   ├── blending.py
│   │   ├── loss.py
│   │   ├── svm.py
│   │   ├── tabnet.py
│   │   ├── boosting_tree.py
│   │   ├── base.py
│   │   ├── optimizer.py
│   │   └── tabular_nn.py
│   ├── utils
│   │   ├── environment.py
│   │   ├── misc.py
│   │   ├── transformers.py
│   │   ├── cache.py
│   │   └── splitter.py
│   ├── metrics.py
│   ├── preprocess.py
│   └── experiment
│       └── experiment.py
├── .gitignore
├── Makefile
├── pyproject.toml
├── docker-compose.yaml
├── README.md
└── encode.py

/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/unit_test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/models/pytorch_tabnet/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .venv
2 | .build
3 | .vscode
4 | __pycache__/
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: build
2 | build:
3 | 	poetry run python encode.py
4 | 	cat .build/script.py | pbcopy
5 | 	echo 'copied to clipboard'
--------------------------------------------------------------------------------
/src/utils/environment.py:
--------------------------------------------------------------------------------
1 | try:
2 |     import mlflow
3 |     _has_mlflow = True
4 | except ImportError:
5 |     _has_mlflow = False
6 |
7 |
8 | def requires_mlflow():
9 |     if not _has_mlflow:
10 |         raise ImportError('You need to install mlflow before using this API.')
11 |
12 |
13 | try:
14 |     import torch
15 |     _has_torch = True
16 | except ImportError:
17 |     _has_torch = False
18 |
19 |
20 | def get_device():
21 |     if _has_torch:
22 |         return torch.device("cuda" if torch.cuda.is_available() else "cpu")
23 |     else:
24 |         return 'cpu'
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "moa"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["e-mon "]
6 |
7 | [tool.poetry.dependencies]
8 | python = ">=3.6"
9 | tqdm = "*"
10 | pandas = "*"
11 | ipython = "*"
12 | sklearn = "*"
13 | lightgbm = "*"
14 | catboost = "*"
15 | xgboost = "*"
16 | numpy = "*"
17 | scipy = "*"
18 | seaborn = "*"
19 | gitpython = "^3.1.11"
20 | hydra = "^2.5"
21 |
22 |
23 | [tool.poetry.dev-dependencies]
24 | flake8 = "^3.8.4"
25 | black = {version = "^20.8b1", allow-prereleases = true}
26 | yapf = "^0.30.0"
27 | [build-system]
28 | requires = ["poetry>=0.12"]
29 | build-backend = "poetry.masonry.api"
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 |   jupyter:
5 |     entrypoint: ""
6 |     command: jupyter
notebook --ip=0.0.0.0 --allow-root --no-browser --port 8080 --NotebookApp.token=hogehoge123 7 | image: gcr.io/kaggle-gpu-images/python:latest 8 | runtime: nvidia 9 | environment: 10 | LD_LIBRARY_PATH: "/usr/local/cuda/lib64::/opt/conda/lib" 11 | user: root 12 | ports: 13 | - "8080:8080" 14 | volumes: 15 | - ./notebooks:/notebooks 16 | - /data:/input 17 | - /usr/local/cuda:/usr/local/cuda 18 | - ~/.jupyter:/root/.jupyter 19 | - ~/.local/share/jupyter:/root/.local/share/jupyter 20 | working_dir: / 21 | 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mechanisms of Action (MoA) Prediction 2 | 3 | 4th place solution for Mechanisms of Action (MoA) Prediction https://www.kaggle.com/c/lish-moa/ 4 | 5 | Solution summary: [here](https://www.kaggle.com/c/lish-moa/discussion/200808) 6 | 7 | Kernel: [here](https://www.kaggle.com/kento1993/nn-svm-tabnet-xgb-with-pca-cnn-stacking-without-pp) 8 | 9 | ## Setup 10 | 11 | Since these codes are designed to be executed on Kaggle Kernel, so first get the BASE64-encoded codes by running the following command. 12 | 13 | (refer to: https://github.com/lopuhin/kaggle-imet-2019) 14 | ```shell 15 | $ make build 16 | ``` 17 | 18 | Please see below kernel, if you want to know the actual training & inference process. 19 | 20 | https://www.kaggle.com/kento1993/nn-svm-tabnet-xgb-with-pca-cnn-stacking-without-pp 21 | -------------------------------------------------------------------------------- /src/models/blending.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from scipy.optimize import minimize 4 | from sklearn.model_selection import KFold 5 | 6 | from src.metrics import calc_competition_metric_torch 7 | from src.utils.misc import LoggerFactory 8 | 9 | logger = LoggerFactory().getLogger(__name__) 10 | 11 | 12 | def get_best_weights(oof_1, oof_2, train_features_df, targets, n_splits=10): 13 | weight_list = [] 14 | weights = np.array([0.5]) 15 | for i in range(2): 16 | kf = KFold(n_splits=n_splits, random_state=i, shuffle=True) 17 | for fold, (train_idx, valid_idx) in enumerate(kf.split(X=oof_1)): 18 | res = minimize( 19 | get_score, 20 | weights, 21 | args=(train_features_df, train_idx, oof_1, oof_2, targets), 22 | method="Nelder-Mead", 23 | tol=1e-6, 24 | ) 25 | logger.info(f"i: {i} fold: {fold} res.x: {res.x}") 26 | weight_list.append(res.x) 27 | mean_weight = np.mean(weight_list) 28 | logger.info(f"optimized weight: {mean_weight}") 29 | return mean_weight 30 | 31 | 32 | def get_score(weights, train_features_df, train_idx, oof_1, oof_2, targets): 33 | _oof_1 = oof_1[train_idx, :].copy() 34 | _oof_2 = oof_2[train_idx, :].copy() 35 | blend = (_oof_1 * weights[0]) + (_oof_2 * (1 - weights[0])) 36 | return calc_competition_metric_torch(train_features_df, targets, blend, train_idx) -------------------------------------------------------------------------------- /src/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from sklearn.metrics import log_loss 5 | 6 | from src.utils.misc import LoggerFactory 7 | 8 | logger = LoggerFactory().getLogger(__name__) 9 | 10 | 11 | def calc_competition_metric_torch(train_features_df, target_cols, oof_arr, train_idx): 12 | # competition_metric = [log_loss(train_features_df.loc[train_idx, target_cols[i]], oof_arr[:, i]) for i in 
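The blending helper above (src/models/blending.py) searches for a single scalar weight with Nelder-Mead, fold by fold, and returns the mean of the per-fold optima. A minimal usage sketch on synthetic data follows; the shapes, column names and seeds are illustrative assumptions, not taken from the repository:

```python
import numpy as np
import pandas as pd

from src.models.blending import get_best_weights

# hypothetical tiny setup: 100 samples, 3 binary targets
rng = np.random.default_rng(42)
target_cols = ["t0", "t1", "t2"]
train_features_df = pd.DataFrame(rng.integers(0, 2, size=(100, 3)), columns=target_cols)

oof_a = rng.random((100, 3))          # OOF predictions from model A
oof_b = rng.random((100, 3))          # OOF predictions from model B

w = get_best_weights(oof_a, oof_b, train_features_df, target_cols, n_splits=5)
blend = w * oof_a + (1 - w) * oof_b   # blended OOF predictions
```

Because `get_score` blends as `w * oof_1 + (1 - w) * oof_2`, the returned scalar can be applied in exactly the same way to the test-time predictions of the same two models.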
range(len(target_cols))] 13 | y = torch.tensor(train_features_df.loc[train_idx, target_cols].values, dtype=float) 14 | p = torch.tensor(oof_arr, dtype=float) 15 | p = torch.clamp(p, 1e-9, 1 - (1e-9)) 16 | competition_metric = nn.BCELoss()(p, y).item() 17 | return np.mean(competition_metric) 18 | 19 | 20 | def calc_competition_metric_np(train_features_df, target_cols, oof_arr): 21 | competition_metric = [] 22 | for i in range(len(target_cols)): 23 | competition_metric.append(log_loss(train_features_df[:, target_cols[i]], oof_arr[:, i])) 24 | logger.info(f"competition metric: {np.mean(competition_metric)}") 25 | 26 | return np.mean(competition_metric) 27 | 28 | 29 | def logloss_for_multilabel(actual, preds, ignore_all_zeros: bool = True): 30 | """ 31 | actual, preds: [n_samples, n_classes] 32 | log_loss(actual[:, c], preds[:, c]) 33 | """ 34 | 35 | actual = torch.tensor(actual, dtype=float) 36 | preds = torch.tensor(preds, dtype=float) 37 | preds = torch.clamp(preds, 1e-9, 1 - (1e-9)) 38 | 39 | return np.mean(nn.BCELoss()(preds, actual).item()) 40 | -------------------------------------------------------------------------------- /encode.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import gzip 3 | from pathlib import Path 4 | from src.utils.misc import get_current_commit_hash 5 | 6 | template = """ 7 | import gzip 8 | import base64 9 | import os 10 | from pathlib import Path 11 | from typing import Dict 12 | 13 | 14 | # this is base64 encoded source code 15 | file_data: Dict = {file_data} 16 | 17 | for path, encoded in file_data.items(): 18 | print(path) 19 | path = Path(path) 20 | path.parent.mkdir(exist_ok=True) 21 | path.write_bytes(gzip.decompress(base64.b64decode(encoded))) 22 | 23 | 24 | def run(command): 25 | os.system('echo "from setuptools import setup; setup(name=\\'src\\', packages=[\\'src\\'],)" > setup.py') 26 | os.system('export PYTHONPATH=${PYTHONPATH}:/kaggle/working && ' + command) 27 | 28 | 29 | run('python setup.py develop --install-dir /kaggle/working') 30 | 31 | # output current commit hash 32 | print('{commit_hash}') 33 | """ 34 | 35 | 36 | def encode_file(path: Path) -> str: 37 | compressed = gzip.compress(path.read_bytes(), compresslevel=9) 38 | return base64.b64encode(compressed).decode('utf-8') 39 | 40 | 41 | def build_script(): 42 | to_encode = list(Path('src').glob('**/*.py')) 43 | file_data = {str(path): encode_file(path) for path in to_encode} 44 | output_path = Path('.build/script.py') 45 | output_path.parent.mkdir(exist_ok=True) 46 | output_path.write_text(template.replace('{file_data}', str(file_data)).replace('{commit_hash}', get_current_commit_hash()), encoding='utf8') 47 | 48 | 49 | if __name__ == '__main__': 50 | build_script() 51 | -------------------------------------------------------------------------------- /src/models/loss.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.loss import _WeightedLoss 2 | from src.models.pytorch_tabnet.metrics import Metric 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | 8 | class SmoothBCEwLogits(_WeightedLoss): 9 | def __init__(self, weight=None, reduction="mean", smoothing=0.0): 10 | super().__init__(weight=weight, reduction=reduction) 11 | self.smoothing = smoothing 12 | self.weight = weight 13 | self.reduction = reduction 14 | 15 | @staticmethod 16 | def _smooth(targets: torch.Tensor, n_labels: int, smoothing=0.0): 17 | assert 0 <= smoothing < 1 18 | with 
torch.no_grad(): 19 | targets = targets * (1.0 - smoothing) + 0.5 * smoothing 20 | return targets 21 | 22 | def forward(self, inputs, targets): 23 | targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1), self.smoothing) 24 | loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight) 25 | 26 | if self.reduction == "sum": 27 | loss = loss.sum() 28 | elif self.reduction == "mean": 29 | loss = loss.mean() 30 | 31 | return loss 32 | 33 | 34 | class LogitsLogLoss(Metric): 35 | """ 36 | LogLoss with sigmoid applied 37 | """ 38 | def __init__(self): 39 | self._name = "logits_ll" 40 | self._maximize = False 41 | 42 | def __call__(self, y_true, y_pred): 43 | """ 44 | Compute LogLoss of predictions. 45 | 46 | Parameters 47 | ---------- 48 | y_true: np.ndarray 49 | Target matrix or vector 50 | y_score: np.ndarray 51 | Score matrix or vector 52 | 53 | Returns 54 | ------- 55 | float 56 | LogLoss of predictions vs targets. 57 | """ 58 | logits = 1 / (1 + np.exp(-y_pred)) 59 | aux = (1 - y_true) * np.log(1 - logits + 1e-15) + y_true * np.log(logits + 1e-15) 60 | return np.mean(-aux) 61 | -------------------------------------------------------------------------------- /src/models/svm.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import pandas as pd 3 | import numpy as np 4 | from cuml.svm import SVC, SVR 5 | from tqdm import tqdm 6 | 7 | from src.utils.misc import LoggerFactory 8 | from src.models.base import MoaBaseOnline, AllZerosClassifier 9 | 10 | logger = LoggerFactory().getLogger(__name__) 11 | 12 | 13 | class SVMTrainer(MoaBaseOnline): 14 | def __init__(self, params: Optional[dict] = None, **kwargs): 15 | if params is None: 16 | self.params = {} 17 | else: 18 | self.params = params 19 | super().__init__(**kwargs) 20 | 21 | def _get_default_params(self): 22 | return { 23 | 'cache_size': 5000, 24 | 'probability': True, 25 | } 26 | 27 | def _train_predict(self, X: pd.DataFrame, y: pd.DataFrame, X_test: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, 28 | seed: int): 29 | _params = self._get_default_params() 30 | _params.update(self.params) 31 | 32 | X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx] 33 | y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] 34 | target_cols = y_valid.columns.tolist() 35 | 36 | pred_valid = np.zeros_like(y_valid).astype(float) 37 | preds = np.zeros(shape=(X_test.shape[0], y_train.shape[1])) 38 | 39 | # multilabel分回す 40 | for idx, target_col in tqdm(enumerate(target_cols), total=len(target_cols)): 41 | # Since cuml SVC calls CalibratedClassifierCV(n_folds=5), more than 5 positive samples is required 42 | if y_train[target_col].sum() < 5: 43 | logger.info(f'{target_col} is all zeros') 44 | clf = AllZerosClassifier() 45 | else: 46 | clf = SVC(**_params) 47 | clf.fit(X_train[predictors].values, y_train[target_col].values.astype(int), convert_dtype=False) 48 | pred_valid[:, idx] = clf.predict_proba(X_valid[predictors].values)[:, 1] 49 | preds[:, idx] = clf.predict_proba(X_test[predictors].values)[:, 1] 50 | 51 | return preds, pred_valid 52 | -------------------------------------------------------------------------------- /src/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import random 4 | import torch 5 | import logging 6 | import logging.handlers 7 | from contextlib import contextmanager 8 | import time 9 | import git 10 | from pathlib import 
Path 11 | 12 | 13 | def seed_everything(seed=42): 14 | random.seed(seed) 15 | os.environ["PYTHONHASHSEED"] = str(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed(seed) 19 | torch.backends.cudnn.deterministic = True 20 | 21 | 22 | class Singleton(type): 23 | _instances = {} 24 | 25 | def __call__(cls, *args, **kwargs): 26 | if cls not in cls._instances: 27 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 28 | return cls._instances[cls] 29 | 30 | 31 | class LoggerFactory(metaclass=Singleton): 32 | def __init__(self, log_path: str = None, loglevel=logging.INFO): 33 | self.loglevel = loglevel 34 | if log_path is None: 35 | self.log_path = Path('./log') 36 | else: 37 | self.log_path = Path(log_path) 38 | self.log_path.parent.mkdir(parents=True, exist_ok=True) 39 | 40 | def getLogger(self, log_name): 41 | fmt = '%(asctime)s [%(name)s|%(levelname)s] %(message)s' 42 | formatter = logging.Formatter(fmt) 43 | logger = logging.getLogger(log_name) 44 | 45 | # add stream Handler 46 | handler = logging.StreamHandler() 47 | handler.setFormatter(formatter) 48 | logger.addHandler(handler) 49 | 50 | # add file Handler 51 | handler = logging.handlers.RotatingFileHandler(filename=self.log_path, maxBytes=2 * 1024 * 1024 * 1024, backupCount=10) 52 | handler.setFormatter(formatter) 53 | logger.addHandler(handler) 54 | 55 | logger.setLevel(self.loglevel) 56 | 57 | return logger 58 | 59 | 60 | @contextmanager 61 | def timer(name, logger): 62 | t0 = time.time() 63 | logger.debug(f'[{name}] start') 64 | yield 65 | logger.debug(f'[{name}] done in {time.time() - t0:.0f} s') 66 | 67 | 68 | def get_current_commit_hash(): 69 | repo = git.Repo(search_parent_directories=True) 70 | return repo.head.object.hexsha 71 | -------------------------------------------------------------------------------- /test/unit_test/test_cache.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | from src.utils.cache import Cache 4 | from typing import List, Any 5 | import pandas as pd 6 | 7 | 8 | class TestCache(unittest.TestCase): 9 | def test_hash(self): 10 | test_case: List[Any] = [1, 'a', {1, 2, 3}, [1, 2, 3], (1, 2, 3), {'key': 'value'}] 11 | expecteds = [ 12 | 'c4ca4238a0b923820dcc509a6f75849b', '0cc175b9c0f1b6a831c399e269772661', '4c24e01fa26fc915e3f057d6c6bfd560', '49a5a960c5714c2e29dd1a7e7b950741', 13 | '49a5a960c5714c2e29dd1a7e7b950741', '88bac95f31528d13a072c05f2a1cf371' 14 | ] 15 | 16 | for obj, expected in zip(test_case, expecteds): 17 | result = Cache._get_hash(obj) 18 | self.assertEqual(result, expected) 19 | 20 | def test_dataframe(self): 21 | df = pd.DataFrame(dict(col_1=[1, 2, 3], col_2=['a', 'b', 'c'])) 22 | expected = '6b7f6abb1cfff565fafb7be863d2c62b' 23 | result = Cache._get_hash(df) 24 | 25 | self.assertEqual(result, expected) 26 | 27 | df = pd.DataFrame(dict(col_1=[1, 2, 3], col_3=['a', 'b', 'c'])) 28 | result = Cache._get_hash(df) 29 | self.assertNotEqual(result, expected) 30 | 31 | def test_unique_id(self): 32 | params = {'param_a': 123, 'param_b': [1, 2, 3], 'param_c': {'key': 'value'}} 33 | 34 | expected = '1fd1c9224dc3180dea4d058e90e095df' 35 | result = Cache._get_unique_id(params) 36 | 37 | self.assertEqual(result, expected) 38 | 39 | def test_with_no_param(self): 40 | def read_cache(path, rerun): 41 | self.path = path 42 | return path 43 | 44 | with patch('src.utils.cache.Cache._read_cache', read_cache): 45 | 46 | @Cache('test') 47 | def func(): 48 | return '' 
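For orientation, here is a small usage sketch (an assumption, not part of the test suite) of the helpers defined in src/utils/misc.py above. Note that `LoggerFactory` is a singleton, so the log path and level passed on the first call apply to every later `getLogger` call in the process:

```python
import logging

from src.utils.misc import LoggerFactory, seed_everything, timer

seed_everything(42)  # fix python / numpy / torch seeds for reproducibility

# first instantiation fixes the log file and level for the whole process
logger = LoggerFactory(log_path='./log/run.log', loglevel=logging.DEBUG).getLogger(__name__)

with timer('feature engineering', logger):
    pass  # expensive work goes here; elapsed time is logged at DEBUG level
```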
49 | 50 | expected = 'test/func_with_no_param' 51 | _ = func() 52 | self.assertEqual(str(self.path), expected) 53 | 54 | def test_read_path(self): 55 | def read_cache(path, rerun): 56 | self.path = path 57 | return path 58 | 59 | with patch('src.utils.cache.Cache._read_cache', read_cache): 60 | 61 | @Cache('test') 62 | def func(param): 63 | return param 64 | 65 | expected = 'test/func_b4216b72b74587638f054cc8e5e9825c' 66 | ret_1 = func('abc') 67 | self.assertEqual(str(self.path), expected) 68 | 69 | ret_2 = func('def') 70 | self.assertNotEqual(str(self.path), expected) 71 | self.assertNotEqual(ret_1, ret_2) 72 | -------------------------------------------------------------------------------- /src/models/tabnet.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional 2 | import numpy as np 3 | import pandas as pd 4 | import torch.optim as optim 5 | from torch.nn import functional as F 6 | from torch.optim.lr_scheduler import ReduceLROnPlateau 7 | 8 | from src.utils.misc import LoggerFactory 9 | from src.models.loss import SmoothBCEwLogits, LogitsLogLoss 10 | from src.models.base import MoaBase 11 | from src.models.pytorch_tabnet.tab_model import TabNetRegressor 12 | from src.utils.environment import get_device 13 | 14 | DEVICE = get_device() 15 | logger = LoggerFactory().getLogger(__name__) 16 | 17 | 18 | class Tabnet(MoaBase): 19 | def __init__(self, params: Optional[dict] = None, **kwargs): 20 | if params is None: 21 | self.params = {} 22 | else: 23 | self.params = params 24 | super().__init__(**kwargs) 25 | 26 | def _get_default_params(self): 27 | return dict(loss_fn='logloss', 28 | max_epoch=200, 29 | batch_size=1024, 30 | initialize_params=dict(n_d=32, 31 | n_a=32, 32 | n_steps=1, 33 | gamma=1.3, 34 | lambda_sparse=0, 35 | optimizer_fn=optim.Adam, 36 | optimizer_params=dict(lr=2e-2, weight_decay=1e-5), 37 | mask_type="entmax", 38 | scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9), 39 | scheduler_fn=ReduceLROnPlateau, 40 | seed=42, 41 | verbose=10)) 42 | 43 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 44 | X_train, y_train = X.iloc[train_idx][predictors].values, y.iloc[train_idx].values 45 | X_valid, y_valid = X.iloc[valid_idx][predictors].values, y.iloc[valid_idx].values 46 | 47 | logger.info(f"train shape: {X_train.shape}, positive frac: {y_train.sum()/y_train.shape[0]}") 48 | logger.info(f"valid shape: {X_valid.shape}, positive frac: {y_valid.sum()/y_valid.shape[0]}") 49 | 50 | _params = self._get_default_params() 51 | _params.update(self.params) 52 | _params['initialize_params']['seed'] = seed 53 | 54 | model = TabNetRegressor(**_params['initialize_params']) 55 | loss_fn = F.binary_cross_entropy_with_logits if _params['loss_fn'] == 'logloss' else SmoothBCEwLogits(smoothing=0.001) 56 | logger.info(loss_fn) 57 | 58 | model.fit( 59 | X_train=X_train, 60 | y_train=y_train, 61 | eval_set=[(X_valid, y_valid)], 62 | eval_name=["val"], 63 | eval_metric=["logits_ll"], 64 | max_epochs=_params['max_epoch'], 65 | patience=20, 66 | batch_size=_params['batch_size'], 67 | virtual_batch_size=32, 68 | num_workers=0, 69 | drop_last=False, 70 | # To use binary cross entropy because this is not a regression problem 71 | loss_fn=loss_fn) 72 | 73 | preds = self._sigmoid(model.predict(X_valid)) 74 | return preds, model 75 | 76 | def _predict(self, model: Any, X_valid: pd.DataFrame, predictors: List[str]): 77 | preds = 
model.predict(X_valid[predictors].values) 78 | return self._sigmoid(preds) 79 | 80 | @staticmethod 81 | def _sigmoid(preds: np.ndarray): 82 | return 1 / (1 + np.exp(-preds)) 83 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/tab_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.special import softmax 4 | from .utils import PredictDataset, filter_weights 5 | from .abstract_model import TabModel 6 | from .multiclass_utils import infer_output_dim, check_output_dim 7 | from torch.utils.data import DataLoader 8 | 9 | 10 | class TabNetClassifier(TabModel): 11 | def __post_init__(self): 12 | super(TabNetClassifier, self).__post_init__() 13 | self._task = 'classification' 14 | self._default_loss = torch.nn.functional.cross_entropy 15 | self._default_metric = 'accuracy' 16 | 17 | def weight_updater(self, weights): 18 | """ 19 | Updates weights dictionnary according to target_mapper. 20 | 21 | Parameters 22 | ---------- 23 | weights : bool or dict 24 | Given weights for balancing training. 25 | 26 | Returns 27 | ------- 28 | bool or dict 29 | Same bool if weights are bool, updated dict otherwise. 30 | 31 | """ 32 | if isinstance(weights, int): 33 | return weights 34 | elif isinstance(weights, dict): 35 | return {self.target_mapper[key]: value for key, value in weights.items()} 36 | else: 37 | return weights 38 | 39 | def prepare_target(self, y): 40 | return np.vectorize(self.target_mapper.get)(y) 41 | 42 | def compute_loss(self, y_pred, y_true): 43 | return self.loss_fn(y_pred, y_true.long()) 44 | 45 | def update_fit_params( 46 | self, 47 | X_train, 48 | y_train, 49 | eval_set, 50 | weights, 51 | ): 52 | output_dim, train_labels = infer_output_dim(y_train) 53 | for X, y in eval_set: 54 | check_output_dim(train_labels, y) 55 | self.output_dim = output_dim 56 | self._default_metric = ('auc' if self.output_dim == 2 else 'accuracy') 57 | self.classes_ = train_labels 58 | self.target_mapper = {class_label: index for index, class_label in enumerate(self.classes_)} 59 | self.preds_mapper = {index: class_label for index, class_label in enumerate(self.classes_)} 60 | self.updated_weights = self.weight_updater(weights) 61 | 62 | def stack_batches(self, list_y_true, list_y_score): 63 | y_true = np.hstack(list_y_true) 64 | y_score = np.vstack(list_y_score) 65 | y_score = softmax(y_score, axis=1) 66 | return y_true, y_score 67 | 68 | def predict_func(self, outputs): 69 | outputs = np.argmax(outputs, axis=1) 70 | return np.vectorize(self.preds_mapper.get)(outputs) 71 | 72 | def predict_proba(self, X): 73 | """ 74 | Make predictions for classification on a batch (valid) 75 | 76 | Parameters 77 | ---------- 78 | X : a :tensor: `torch.Tensor` 79 | Input data 80 | 81 | Returns 82 | ------- 83 | res : np.ndarray 84 | 85 | """ 86 | self.network.eval() 87 | 88 | dataloader = DataLoader( 89 | PredictDataset(X), 90 | batch_size=self.batch_size, 91 | shuffle=False, 92 | ) 93 | 94 | results = [] 95 | for batch_nb, data in enumerate(dataloader): 96 | data = data.to(self.device).float() 97 | 98 | output, M_loss = self.network(data) 99 | predictions = torch.nn.Softmax(dim=1)(output).cpu().detach().numpy() 100 | results.append(predictions) 101 | res = np.vstack(results) 102 | return res 103 | 104 | 105 | class TabNetRegressor(TabModel): 106 | def __post_init__(self): 107 | super(TabNetRegressor, self).__post_init__() 108 | self._task = 'regression' 109 | self._default_loss 
= torch.nn.functional.mse_loss 110 | self._default_metric = 'mse' 111 | 112 | def prepare_target(self, y): 113 | return y 114 | 115 | def compute_loss(self, y_pred, y_true): 116 | return self.loss_fn(y_pred, y_true) 117 | 118 | def update_fit_params(self, X_train, y_train, eval_set, weights): 119 | self.output_dim = y_train.shape[1] 120 | 121 | self.updated_weights = weights 122 | filter_weights(self.updated_weights) 123 | 124 | def predict_func(self, outputs): 125 | return outputs 126 | 127 | def stack_batches(self, list_y_true, list_y_score): 128 | y_true = np.vstack(list_y_true) 129 | y_score = np.vstack(list_y_score) 130 | return y_true, y_score 131 | -------------------------------------------------------------------------------- /src/models/boosting_tree.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import xgboost as xgb 3 | import pandas as pd 4 | import numpy as np 5 | from src.models.base import BaseModel, MoaBase, AllZerosClassifier 6 | from logging import getLogger 7 | from tqdm import tqdm 8 | import lightgbm as lgb 9 | 10 | logger = getLogger(__name__) 11 | 12 | 13 | class LGBModel(BaseModel): 14 | def __init__(self, params: dict, **kwargs): 15 | self.params = params 16 | super().__init__(**kwargs) 17 | 18 | def _get_default_params(self): 19 | return { 20 | "n_estimators": 5000, 21 | "boosting_type": "gbdt", 22 | "objective": "binary", 23 | "metric": "None", 24 | "first_metric": True, 25 | "subsample": 0.8, 26 | "subsample_freq": 1, 27 | "learning_rate": 0.01, 28 | "feature_fraction": 0.7, 29 | "num_leaves": 12, 30 | "max_depth": -1, 31 | "early_stopping_rounds": 300, 32 | "seed": 42, 33 | } 34 | 35 | def _train(self, train, test, targets, train_idx, valid_idx): 36 | predictors = [col for col in train.columns if col not in self.ignore_cols] 37 | logger.info(predictors) 38 | X_train, y_train = train.iloc[train_idx][predictors], targets.iloc[train_idx] 39 | X_valid, y_valid = train.iloc[valid_idx][predictors], targets.iloc[valid_idx] 40 | 41 | logger.info(f"train shape: {X_train.shape}, positive frac: {y_train.sum()/y_train.shape[0]}") 42 | logger.info(f"valid shape: {X_valid.shape}, positive frac: {y_valid.sum()/y_valid.shape[0]}") 43 | 44 | train_set = lgb.Dataset(X_train, y_train, categorical_feature=self.categorical_cols) 45 | val_set = lgb.Dataset(X_valid, y_valid, categorical_feature=self.categorical_cols) 46 | 47 | _params = self._get_default_params() 48 | _params.update(self.params) 49 | 50 | clf = lgb.train( 51 | _params, 52 | train_set, 53 | valid_sets=[train_set, val_set], 54 | verbose_eval=100, 55 | fobj=None, 56 | ) 57 | 58 | return clf.predict(X_valid), clf.predict(test[predictors]), clf 59 | 60 | 61 | class XGBTrainer(MoaBase): 62 | def __init__(self, params: Optional[dict] = None, **kwargs): 63 | if params is None: 64 | self.params = {} 65 | else: 66 | self.params = params 67 | super().__init__(**kwargs) 68 | 69 | def _get_default_params(self): 70 | return { 71 | 'objective': 'binary:logistic', 72 | 'eval_metric': 'logloss', 73 | 'tree_method': 'gpu_hist', 74 | 'verbosity': 0, 75 | 'colsample_bytree': 0.1818593017814899, 76 | 'eta': 0.012887963193108452, 77 | 'gamma': 6.576022976359221, 78 | 'max_depth': 8, 79 | 'min_child_weight': 8.876744371188476, 80 | 'subsample': 0.7813380253086911, 81 | } 82 | 83 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 84 | X_train, y_train = 
X.iloc[train_idx][predictors], y.iloc[train_idx] 85 | X_valid, y_valid = X.iloc[valid_idx][predictors], y.iloc[valid_idx] 86 | 87 | logger.info(f"train shape: {X_train.shape}, positive frac: {y_train.sum()/y_train.shape[0]}") 88 | logger.info(f"valid shape: {X_valid.shape}, positive frac: {y_valid.sum()/y_valid.shape[0]}") 89 | 90 | _params = self._get_default_params() 91 | _params.update(self.params) 92 | _params['seed'] = seed 93 | 94 | target_cols = y_valid.columns.tolist() 95 | pred_valid = np.zeros_like(y_valid).astype(float) 96 | models = [] 97 | 98 | for idx, target_col in tqdm(enumerate(target_cols), total=len(target_cols)): 99 | xgb_train = xgb.DMatrix(X_train.values, label=y_train[target_col].values.astype(int), nthread=-1) 100 | xgb_valid = xgb.DMatrix(X_valid.values, label=y_valid[target_col].values.astype(int), nthread=-1) 101 | clf = xgb.train(_params, xgb_train, 1000, [(xgb_valid, "eval")], early_stopping_rounds=25, verbose_eval=0) 102 | pred_valid[:, idx] = clf.predict(xgb_valid) 103 | models.append(clf) 104 | 105 | return pred_valid, models 106 | 107 | def _predict(self, model: List, X_valid: pd.DataFrame, predictors: List[str]): 108 | assert type(model) is list, 'model is not list' 109 | 110 | preds = np.zeros(shape=(X_valid.shape[0], len(model))) 111 | for idx, clf in enumerate(model): 112 | xgb_valid = xgb.DMatrix(X_valid[predictors].values, nthread=-1) 113 | preds[:, idx] = clf.predict(xgb_valid) 114 | 115 | return preds 116 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/multitask.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.special import softmax 4 | from .utils import PredictDataset, filter_weights 5 | from .abstract_model import TabModel 6 | from .multiclass_utils import infer_multitask_output, check_output_dim 7 | from torch.utils.data import DataLoader 8 | 9 | 10 | class TabNetMultiTaskClassifier(TabModel): 11 | def __post_init__(self): 12 | super(TabNetMultiTaskClassifier, self).__post_init__() 13 | self._task = 'classification' 14 | self._default_loss = torch.nn.functional.cross_entropy 15 | self._default_metric = 'logloss' 16 | 17 | def prepare_target(self, y): 18 | y_mapped = y.copy() 19 | for task_idx in range(y.shape[1]): 20 | task_mapper = self.target_mapper[task_idx] 21 | y_mapped[:, task_idx] = np.vectorize(task_mapper.get)(y[:, task_idx]) 22 | return y_mapped 23 | 24 | def compute_loss(self, y_pred, y_true): 25 | """ 26 | Computes the loss according to network output and targets 27 | 28 | Parameters 29 | ---------- 30 | y_pred : list of tensors 31 | Output of network 32 | y_true : LongTensor 33 | Targets label encoded 34 | 35 | Returns 36 | ------- 37 | loss : torch.Tensor 38 | output of loss function(s) 39 | 40 | """ 41 | loss = 0 42 | y_true = y_true.long() 43 | if isinstance(self.loss_fn, list): 44 | # if you specify a different loss for each task 45 | for task_loss, task_output, task_id in zip(self.loss_fn, y_pred, range(len(self.loss_fn))): 46 | loss += task_loss(task_output, y_true[:, task_id]) 47 | else: 48 | # same loss function is applied to all tasks 49 | for task_id, task_output in enumerate(y_pred): 50 | loss += self.loss_fn(task_output, y_true[:, task_id]) 51 | 52 | loss /= len(y_pred) 53 | return loss 54 | 55 | def stack_batches(self, list_y_true, list_y_score): 56 | y_true = np.vstack(list_y_true) 57 | y_score = [] 58 | for i in range(len(self.output_dim)): 59 | score = 
np.vstack([x[i] for x in list_y_score]) 60 | score = softmax(score, axis=1) 61 | y_score.append(score) 62 | return y_true, y_score 63 | 64 | def update_fit_params(self, X_train, y_train, eval_set, weights): 65 | output_dim, train_labels = infer_multitask_output(y_train) 66 | for _, y in eval_set: 67 | for task_idx in range(y.shape[1]): 68 | check_output_dim(train_labels[task_idx], y[:, task_idx]) 69 | self.output_dim = output_dim 70 | self.classes_ = train_labels 71 | self.target_mapper = [{class_label: index for index, class_label in enumerate(classes)} for classes in self.classes_] 72 | self.preds_mapper = [{index: class_label for index, class_label in enumerate(classes)} for classes in self.classes_] 73 | self.updated_weights = weights 74 | filter_weights(self.updated_weights) 75 | 76 | def predict(self, X): 77 | """ 78 | Make predictions on a batch (valid) 79 | 80 | Parameters 81 | ---------- 82 | X : a :tensor: `torch.Tensor` 83 | Input data 84 | 85 | Returns 86 | ------- 87 | results : np.array 88 | Predictions of the most probable class 89 | """ 90 | self.network.eval() 91 | dataloader = DataLoader( 92 | PredictDataset(X), 93 | batch_size=self.batch_size, 94 | shuffle=False, 95 | ) 96 | 97 | results = {} 98 | for data in dataloader: 99 | data = data.to(self.device).float() 100 | output, _ = self.network(data) 101 | predictions = [torch.argmax(torch.nn.Softmax(dim=1)(task_output), dim=1).cpu().detach().numpy().reshape(-1) for task_output in output] 102 | 103 | for task_idx in range(len(self.output_dim)): 104 | results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] 105 | # stack all task individually 106 | results = [np.hstack(task_res) for task_res in results.values()] 107 | # map all task individually 108 | results = [np.vectorize(self.preds_mapper[task_idx].get)(task_res) for task_idx, task_res in enumerate(results)] 109 | return results 110 | 111 | def predict_proba(self, X): 112 | """ 113 | Make predictions for classification on a batch (valid) 114 | 115 | Parameters 116 | ---------- 117 | X : a :tensor: `torch.Tensor` 118 | Input data 119 | 120 | Returns 121 | ------- 122 | res : list of np.ndarray 123 | 124 | """ 125 | self.network.eval() 126 | 127 | dataloader = DataLoader( 128 | PredictDataset(X), 129 | batch_size=self.batch_size, 130 | shuffle=False, 131 | ) 132 | 133 | results = {} 134 | for data in dataloader: 135 | data = data.to(self.device).float() 136 | output, _ = self.network(data) 137 | predictions = [torch.nn.Softmax(dim=1)(task_output).cpu().detach().numpy() for task_output in output] 138 | for task_idx in range(len(self.output_dim)): 139 | results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] 140 | res = [np.vstack(task_res) for task_res in results.values()] 141 | return res 142 | -------------------------------------------------------------------------------- /src/preprocess.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import itertools 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from src.utils.cache import Cache 7 | from src.utils.misc import LoggerFactory 8 | 9 | logger = LoggerFactory().getLogger(__name__) 10 | 11 | 12 | def get_cp_time_feature(s): 13 | if s == 72: 14 | return 2 15 | elif s == 48: 16 | return 1 17 | else: 18 | return 0 19 | 20 | 21 | def get_cp_dose_feature(s): 22 | return 1 if s == "D1" else 0 23 | 24 | 25 | def get_feature(df): 26 | features_g = list([x for x in df.columns if x.startswith("g-")]) 27 | features_c = list([x for x 
in df.columns if x.startswith("c-")]) 28 | 29 | df["g_sum"] = df[features_g].sum(axis=1) 30 | df["g_mean"] = df[features_g].mean(axis=1) 31 | df["g_median"] = df[features_g].median(axis=1) 32 | df["g_std"] = df[features_g].std(axis=1) 33 | df["g_kurt"] = df[features_g].kurtosis(axis=1) 34 | df["g_skew"] = df[features_g].skew(axis=1) 35 | df["c_sum"] = df[features_c].sum(axis=1) 36 | df["c_mean"] = df[features_c].mean(axis=1) 37 | df["c_std"] = df[features_c].std(axis=1) 38 | df["c_median"] = df[features_c].median(axis=1) 39 | df["c_kurt"] = df[features_c].kurtosis(axis=1) 40 | df["c_skew"] = df[features_c].skew(axis=1) 41 | df["gc_sum"] = df[features_g + features_c].sum(axis=1) 42 | df["gc_mean"] = df[features_g + features_c].mean(axis=1) 43 | df["gc_std"] = df[features_g + features_c].std(axis=1) 44 | df["gc_kurt"] = df[features_g + features_c].kurtosis(axis=1) 45 | df["gc_skew"] = df[features_g + features_c].skew(axis=1) 46 | df["gc_median"] = df[features_g + features_c].median(axis=1) 47 | 48 | return df 49 | 50 | 51 | @Cache(dir_path='./cache/') 52 | def preprocess_train(input_dir='../input/lish-moa/', sub: bool = False): 53 | train_features_df = pd.read_csv(f"{input_dir}/train_features.csv") 54 | train_drug_df = pd.read_csv(f"{input_dir}/train_drug.csv") 55 | train_targets_scored_df = pd.read_csv(f"{input_dir}/train_targets_scored.csv") 56 | train_targets_nonscored_df = pd.read_csv(f"{input_dir}/train_targets_nonscored.csv") 57 | 58 | logger.info(f""" 59 | train_features_df: {train_features_df.shape} 60 | train_drug_df: {train_drug_df.shape} 61 | train_targets_scored_df: {train_targets_scored_df.shape} 62 | train_targets_nonscored_df: {train_targets_nonscored_df.shape} 63 | """) 64 | 65 | drop_cols = list(train_targets_nonscored_df.columns[train_targets_nonscored_df.sum() == 0]) 66 | use_cols = [x for x in train_targets_nonscored_df.columns if x not in drop_cols] 67 | train_targets_nonscored_df = train_targets_nonscored_df.loc[:, use_cols] 68 | logger.info(f""" 69 | train_targets_nonscored_df: {train_targets_nonscored_df.shape} 70 | """) 71 | 72 | train_features_df = train_features_df.merge(train_targets_scored_df) 73 | train_features_df = train_features_df.merge(train_drug_df) 74 | train_features_df = train_features_df.merge(train_targets_nonscored_df) 75 | logger.info(f""" 76 | train_features_df: {train_features_df.shape} 77 | """) 78 | 79 | train_features_df = train_features_df[train_features_df.cp_type == "trt_cp"].reset_index(drop=True) 80 | 81 | train_features_df["cp_time_feature"] = train_features_df["cp_time"].map(get_cp_time_feature) 82 | train_features_df["cp_dose_feature"] = train_features_df["cp_dose"].map(get_cp_dose_feature) 83 | 84 | train_features_df = train_features_df.drop(columns=["cp_type", "cp_time", "cp_dose"]) 85 | 86 | features_g = list([x for x in train_features_df.columns if x.startswith("g-")]) 87 | features_c = list([x for x in train_features_df.columns if x.startswith("c-")]) 88 | 89 | var_list = [] 90 | for c in tqdm(list(itertools.combinations(features_g + features_c, 2))): 91 | col_name = f"{c[0]}_{c[1]}_diff" 92 | d = train_features_df[c[0]] - train_features_df[c[1]] 93 | diff_val = np.var(d) 94 | if diff_val > 15: 95 | train_features_df[col_name] = d 96 | var_list.append(diff_val) 97 | 98 | stage_1_2_target_cols = [x for x in train_targets_scored_df.columns if x not in ["sig_id", "drug_id"]] 99 | stage_1_1_target_cols = [x for x in train_targets_nonscored_df.columns if x not in ["sig_id", "drug_id"]] + stage_1_2_target_cols 100 | 101 | 
stage_1_train_features = [x for x in train_features_df.columns if x not in ["sig_id", "drug_id"] + stage_1_1_target_cols] 102 | 103 | return ( 104 | train_features_df, 105 | stage_1_1_target_cols, 106 | stage_1_2_target_cols, 107 | stage_1_train_features, 108 | ) 109 | 110 | 111 | def preprocess_test(train_features_df, input_dir='../input/lish-moa/'): 112 | test_features_df = pd.read_csv(f"{input_dir}/test_features.csv") 113 | sample_submission_df = pd.read_csv(f"{input_dir}/sample_submission.csv") 114 | test_features_df["cp_time_feature"] = test_features_df["cp_time"].map(get_cp_time_feature) 115 | test_features_df["cp_dose_feature"] = test_features_df["cp_dose"].map(get_cp_dose_feature) 116 | 117 | features_g = list([x for x in train_features_df.columns if x.startswith("g-")]) 118 | features_c = list([x for x in train_features_df.columns if x.startswith("c-")]) 119 | 120 | for c in tqdm(list(itertools.combinations(features_g + features_c, 2))): 121 | col_name = f"{c[0]}_{c[1]}_diff" 122 | if col_name in train_features_df.columns: 123 | test_features_df[col_name] = test_features_df[c[0]] - test_features_df[c[1]] 124 | 125 | return test_features_df, sample_submission_df 126 | -------------------------------------------------------------------------------- /src/utils/transformers.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from sklearn.preprocessing import QuantileTransformer, RobustScaler 3 | import pandas as pd 4 | import numpy as np 5 | from joblib import Parallel, delayed 6 | from scipy.interpolate import interp1d 7 | from scipy.special import erf, erfinv 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted 10 | 11 | from src.utils.cache import Cache 12 | 13 | 14 | class GaussRankScaler(BaseEstimator, TransformerMixin): 15 | """Transform features by scaling each feature to a normal distribution. 16 | Parameters 17 | ---------- 18 | epsilon : float, optional, default 1e-4 19 | A small amount added to the lower bound or subtracted 20 | from the upper bound. This value prevents infinite number 21 | from occurring when applying the inverse error function. 22 | copy : boolean, optional, default True 23 | If False, try to avoid a copy and do inplace scaling instead. 24 | This is not guaranteed to always work inplace; e.g. if the data is 25 | not a NumPy array, a copy may still be returned. 26 | n_jobs : int or None, optional, default None 27 | Number of jobs to run in parallel. 28 | ``None`` means 1 and ``-1`` means using all processors. 29 | interp_kind : str or int, optional, default 'linear' 30 | Specifies the kind of interpolation as a string 31 | ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 32 | 'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic' 33 | refer to a spline interpolation of zeroth, first, second or third 34 | order; 'previous' and 'next' simply return the previous or next value 35 | of the point) or as an integer specifying the order of the spline 36 | interpolator to use. 37 | interp_copy : bool, optional, default False 38 | If True, the interpolation function makes internal copies of x and y. 39 | If False, references to `x` and `y` are used. 40 | Attributes 41 | ---------- 42 | interp_func_ : list 43 | The interpolation function for each feature in the training set. 
44 | """ 45 | def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False): 46 | self.epsilon = epsilon 47 | self.copy = copy 48 | self.interp_kind = interp_kind 49 | self.interp_copy = interp_copy 50 | self.fill_value = 'extrapolate' 51 | self.n_jobs = n_jobs 52 | 53 | def fit(self, X, y=None): 54 | """Fit interpolation function to link rank with original data for future scaling 55 | Parameters 56 | ---------- 57 | X : array-like, shape (n_samples, n_features) 58 | The data used to fit interpolation function for later scaling along the features axis. 59 | y 60 | Ignored 61 | """ 62 | X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True) 63 | 64 | self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T) 65 | return self 66 | 67 | def _fit(self, x): 68 | x = self.drop_duplicates(x) 69 | rank = np.argsort(np.argsort(x)) 70 | bound = 1.0 - self.epsilon 71 | factor = np.max(rank) / 2.0 * bound 72 | scaled_rank = np.clip(rank / factor - bound, -bound, bound) 73 | return interp1d(x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value) 74 | 75 | def transform(self, X, copy=None): 76 | """Scale the data with the Gauss Rank algorithm 77 | Parameters 78 | ---------- 79 | X : array-like, shape (n_samples, n_features) 80 | The data used to scale along the features axis. 81 | copy : bool, optional (default: None) 82 | Copy the input X or not. 83 | """ 84 | check_is_fitted(self, 'interp_func_') 85 | 86 | copy = copy if copy is not None else self.copy 87 | X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True) 88 | 89 | X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T 90 | return X 91 | 92 | def _transform(self, i, x): 93 | return erfinv(self.interp_func_[i](x)) 94 | 95 | def inverse_transform(self, X, copy=None): 96 | """Scale back the data to the original representation 97 | Parameters 98 | ---------- 99 | X : array-like, shape [n_samples, n_features] 100 | The data used to scale along the features axis. 101 | copy : bool, optional (default: None) 102 | Copy the input X or not. 
103 | """ 104 | check_is_fitted(self, 'interp_func_') 105 | 106 | copy = copy if copy is not None else self.copy 107 | X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True) 108 | 109 | X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T 110 | return X 111 | 112 | def _inverse_transform(self, i, x): 113 | inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value) 114 | return inv_interp_func(erf(x)) 115 | 116 | @staticmethod 117 | def drop_duplicates(x): 118 | is_unique = np.zeros_like(x, dtype=bool) 119 | is_unique[np.unique(x, return_index=True)[1]] = True 120 | return x[is_unique] 121 | 122 | 123 | TRANSFORMERS = { 124 | 'quantile': QuantileTransformer, 125 | 'robust': RobustScaler, 126 | 'gauss_rank': GaussRankScaler, 127 | } 128 | 129 | 130 | # return transformer 131 | @Cache('./cache') 132 | def normalizer(transformer: str, df: pd.DataFrame, params: Optional[dict]): 133 | if params is None: 134 | params = dict() 135 | trans = TRANSFORMERS[transformer](**params) 136 | trans.fit(df) 137 | 138 | return trans 139 | -------------------------------------------------------------------------------- /src/utils/cache.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Union, Dict, List, Callable, Optional, ByteString, Tuple 2 | import hashlib 3 | import pickle 4 | from pandas.util import hash_pandas_object 5 | import pandas as pd 6 | from pathlib import Path 7 | from inspect import signature 8 | import numpy as np 9 | from collections import OrderedDict 10 | import operator 11 | import functools 12 | import json 13 | from json import JSONEncoder 14 | try: 15 | from collections.abc import Mapping 16 | except ImportError: 17 | from collections import Mapping 18 | 19 | from src.utils.misc import LoggerFactory 20 | 21 | logger = LoggerFactory().getLogger(__name__) 22 | 23 | 24 | def _hash(obj: ByteString) -> str: 25 | return hashlib.md5(obj).hexdigest() 26 | 27 | 28 | class Cache: 29 | def __init__(self, dir_path: str, rerun: bool = False, with_param: bool = False): 30 | self.dir_path = Path(dir_path) 31 | self.dir_path.mkdir(exist_ok=True) 32 | self.with_param = with_param 33 | self.rerun = rerun 34 | 35 | def __call__(self, func: Callable): 36 | func_name = func.__name__ 37 | 38 | def wrapper(*args, **kwargs): 39 | sig = signature(func) 40 | # ignore default value 41 | bound_args = sig.bind(*args, **kwargs) 42 | unique_id: str = self._get_unique_id(bound_args.arguments) 43 | path: Path = self.dir_path.joinpath(f'{func_name}_{unique_id}') 44 | 45 | logger.info(f'{func_name}_{unique_id} has been called') 46 | ret = Cache._read_cache(path, rerun=self.rerun) 47 | if ret is None: 48 | logger.info(f'{func_name}_{unique_id} cache not found') 49 | ret = func(*args, **kwargs) 50 | Cache._write(path, ret) 51 | return ret 52 | 53 | return wrapper 54 | 55 | @staticmethod 56 | def _write(path, obj: Union[pd.DataFrame, Any]): 57 | # TODO: FileProcessor 58 | if isinstance(obj, pd.DataFrame): 59 | path = f'{path}.feather' 60 | obj.to_feather(str(path)) 61 | else: 62 | path = f'{path}.pickle' 63 | with open(str(path), 'wb') as f: 64 | pickle.dump(obj, f, protocol=4) 65 | 66 | @staticmethod 67 | def _read_cache(path: Path, rerun: bool) -> Optional[Any]: 68 | if rerun: 69 | return None 70 | if Path(f'{path}.pickle').exists(): 71 | logger.info(f'cache hit: {path}.pickle') 72 
| return pickle.load(open(f'{path}.pickle', 'rb')) 73 | if Path(f'{path}.feather').exists(): 74 | logger.info(f'cache hit: {path}.feather') 75 | return pd.read_feather(f'{path}.feather') 76 | return None 77 | 78 | @classmethod 79 | def _get_unique_id(cls, params: Dict) -> str: 80 | if not params: 81 | return 'with_no_param' 82 | dependencies = [f'{key}_{cls._get_hash(param)}' for key, param in sorted(params.items(), key=lambda item: str(item[0]))] 83 | return hashlib.md5(str(dependencies).encode()).hexdigest() 84 | 85 | @classmethod 86 | def _get_hash(cls, obj: Any) -> str: 87 | if isinstance(obj, (str, int, float)): 88 | return cls._literals(obj) 89 | elif isinstance(obj, pd.DataFrame): 90 | return cls._data_frame(obj) 91 | elif isinstance(obj, np.ndarray): 92 | return cls._ndarray(obj) 93 | elif isinstance(obj, (list, dict, tuple)): 94 | return cls._containers(obj) 95 | else: 96 | return _hash(pickle.dumps(obj)) 97 | 98 | return '-1' 99 | 100 | @staticmethod 101 | def _data_frame(obj: pd.DataFrame): 102 | string = str(obj.columns.tolist()) + str(obj.index) + str(obj.shape) 103 | return _hash(string.encode()) 104 | # return hash_pandas_object(obj).sum() 105 | 106 | @staticmethod 107 | def _ndarray(obj: np.ndarray): 108 | return _hash(bytes(obj)) 109 | 110 | @staticmethod 111 | def _containers(obj: Union[List[Any], Dict[Any, Any], Tuple[Any, ...]]): 112 | return _hash(json.dumps(obj, cls=_DictParamEncoder).encode()) 113 | 114 | @staticmethod 115 | def _literals(obj: Union[int, str, float]): 116 | return _hash(str(obj).encode()) 117 | 118 | 119 | # from https://github.com/spotify/luigi/blob/master/luigi/parameter.py#L940 120 | 121 | 122 | class _DictParamEncoder(JSONEncoder): 123 | """ 124 | JSON encoder for :py:class:`~DictParameter`, which makes :py:class:`~FrozenOrderedDict` JSON serializable. 125 | """ 126 | def default(self, obj): 127 | if isinstance(obj, FrozenOrderedDict): 128 | return obj.get_wrapped() 129 | return json.JSONEncoder.default(self, obj) 130 | 131 | 132 | class FrozenOrderedDict(Mapping): 133 | """ 134 | It is an immutable wrapper around ordered dictionaries that implements the complete :py:class:`collections.Mapping` 135 | interface. It can be used as a drop-in replacement for dictionaries where immutability and ordering are desired. 136 | """ 137 | def __init__(self, *args, **kwargs): 138 | self.__dict = OrderedDict(*args, **kwargs) 139 | self.__hash = None 140 | 141 | def __getitem__(self, key): 142 | return self.__dict[key] 143 | 144 | def __iter__(self): 145 | return iter(self.__dict) 146 | 147 | def __len__(self): 148 | return len(self.__dict) 149 | 150 | def __repr__(self): 151 | # We should use short representation for beautiful console output 152 | return repr(dict(self.__dict)) 153 | 154 | def __hash__(self): 155 | if self.__hash is None: 156 | hashes = map(hash, self.items()) 157 | self.__hash = functools.reduce(operator.xor, hashes, 0) 158 | 159 | return self.__hash 160 | 161 | def get_wrapped(self): 162 | return self.__dict 163 | 164 | 165 | def recursively_freeze(value): 166 | """ 167 | Recursively walks ``Mapping``s and ``list``s and converts them to ``FrozenOrderedDict`` and ``tuples``, respectively. 
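A usage sketch for the `Cache` decorator (assumed, not part of this module): the cache key is the function name plus an md5 digest of the bound arguments, results are written to `<dir_path>/<func>_<hash>.feather` for DataFrames and `.pickle` for everything else, and later calls with the same arguments are read back from disk:

```python
import pandas as pd

from src.utils.cache import Cache


@Cache(dir_path='./cache', rerun=False)
def load_features(path: str) -> pd.DataFrame:
    # illustrative cached loader; the input path mirrors the default in src/preprocess.py
    return pd.read_csv(path)


df = load_features('../input/lish-moa/train_features.csv')  # computed and written to cache
df = load_features('../input/lish-moa/train_features.csv')  # second call hits the cache
```

`preprocess_train` in src/preprocess.py and `normalizer` in src/utils/transformers.py are the in-repository users of this decorator.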
168 | """ 169 | if isinstance(value, Mapping): 170 | return FrozenOrderedDict(((k, recursively_freeze(v)) for k, v in value.items())) 171 | elif isinstance(value, list) or isinstance(value, tuple): 172 | return tuple(recursively_freeze(v) for v in value) 173 | return value 174 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/metrics.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | import numpy as np 4 | from sklearn.metrics import ( 5 | roc_auc_score, 6 | mean_squared_error, 7 | mean_absolute_error, 8 | accuracy_score, 9 | log_loss, 10 | balanced_accuracy_score, 11 | ) 12 | 13 | 14 | @dataclass 15 | class MetricContainer: 16 | """Container holding a list of metrics. 17 | 18 | Parameters 19 | ---------- 20 | metric_names : list of str 21 | List of metric names. 22 | prefix : str 23 | Prefix of metric names. 24 | 25 | """ 26 | 27 | metric_names: List[str] 28 | prefix: str = "" 29 | 30 | def __post_init__(self): 31 | self.metrics = Metric.get_metrics_by_names(self.metric_names) 32 | self.names = [self.prefix + name for name in self.metric_names] 33 | 34 | def __call__(self, y_true, y_pred): 35 | """Compute all metrics and store into a dict. 36 | 37 | Parameters 38 | ---------- 39 | y_true : np.ndarray 40 | Target matrix or vector 41 | y_pred : np.ndarray 42 | Score matrix or vector 43 | 44 | Returns 45 | ------- 46 | dict 47 | Dict of metrics ({metric_name: metric_value}). 48 | 49 | """ 50 | logs = {} 51 | for metric in self.metrics: 52 | if isinstance(y_pred, list): 53 | res = np.mean( 54 | [metric(y_true[:, i], y_pred[i]) for i in range(len(y_pred))] 55 | ) 56 | else: 57 | res = metric(y_true, y_pred) 58 | logs[self.prefix + metric._name] = res 59 | return logs 60 | 61 | 62 | class Metric: 63 | def __call__(self, y_true, y_pred): 64 | raise NotImplementedError("Custom Metrics must implement this function") 65 | 66 | @classmethod 67 | def get_metrics_by_names(cls, names): 68 | """Get list of metric classes. 69 | 70 | Parameters 71 | ---------- 72 | cls : Metric 73 | Metric class. 74 | names : list 75 | List of metric names. 76 | 77 | Returns 78 | ------- 79 | metrics : list 80 | List of metric classes. 81 | 82 | """ 83 | available_metrics = cls.__subclasses__() 84 | available_names = [metric()._name for metric in available_metrics] 85 | metrics = [] 86 | for name in names: 87 | assert name in available_names, f"{name} is not available, choose in {available_names}" 88 | idx = available_names.index(name) 89 | metric = available_metrics[idx]() 90 | metrics.append(metric) 91 | return metrics 92 | 93 | 94 | class AUC(Metric): 95 | """ 96 | AUC. 97 | """ 98 | 99 | def __init__(self): 100 | self._name = "auc" 101 | self._maximize = True 102 | 103 | def __call__(self, y_true, y_score): 104 | """ 105 | Compute AUC of predictions. 106 | 107 | Parameters 108 | ---------- 109 | y_true : np.ndarray 110 | Target matrix or vector 111 | y_score : np.ndarray 112 | Score matrix or vector 113 | 114 | Returns 115 | ------- 116 | float 117 | AUC of predictions vs targets. 118 | """ 119 | return roc_auc_score(y_true, y_score[:, 1]) 120 | 121 | 122 | class Accuracy(Metric): 123 | """ 124 | Accuracy. 125 | """ 126 | 127 | def __init__(self): 128 | self._name = "accuracy" 129 | self._maximize = True 130 | 131 | def __call__(self, y_true, y_score): 132 | """ 133 | Compute Accuracy of predictions. 
134 | 135 | Parameters 136 | ---------- 137 | y_true: np.ndarray 138 | Target matrix or vector 139 | y_score: np.ndarray 140 | Score matrix or vector 141 | 142 | Returns 143 | ------- 144 | float 145 | Accuracy of predictions vs targets. 146 | """ 147 | y_pred = np.argmax(y_score, axis=1) 148 | return accuracy_score(y_true, y_pred) 149 | 150 | 151 | class BalancedAccuracy(Metric): 152 | """ 153 | Balanced Accuracy. 154 | """ 155 | 156 | def __init__(self): 157 | self._name = "balanced_accuracy" 158 | self._maximize = True 159 | 160 | def __call__(self, y_true, y_score): 161 | """ 162 | Compute Accuracy of predictions. 163 | 164 | Parameters 165 | ---------- 166 | y_true : np.ndarray 167 | Target matrix or vector 168 | y_score : np.ndarray 169 | Score matrix or vector 170 | 171 | Returns 172 | ------- 173 | float 174 | Accuracy of predictions vs targets. 175 | """ 176 | y_pred = np.argmax(y_score, axis=1) 177 | return balanced_accuracy_score(y_true, y_pred) 178 | 179 | 180 | class LogLoss(Metric): 181 | """ 182 | LogLoss. 183 | """ 184 | 185 | def __init__(self): 186 | self._name = "logloss" 187 | self._maximize = False 188 | 189 | def __call__(self, y_true, y_score): 190 | """ 191 | Compute LogLoss of predictions. 192 | 193 | Parameters 194 | ---------- 195 | y_true : np.ndarray 196 | Target matrix or vector 197 | y_score : np.ndarray 198 | Score matrix or vector 199 | 200 | Returns 201 | ------- 202 | float 203 | LogLoss of predictions vs targets. 204 | """ 205 | return log_loss(y_true, y_score) 206 | 207 | 208 | class MAE(Metric): 209 | """ 210 | Mean Absolute Error. 211 | """ 212 | 213 | def __init__(self): 214 | self._name = "mae" 215 | self._maximize = False 216 | 217 | def __call__(self, y_true, y_score): 218 | """ 219 | Compute MAE (Mean Absolute Error) of predictions. 220 | 221 | Parameters 222 | ---------- 223 | y_true : np.ndarray 224 | Target matrix or vector 225 | y_score : np.ndarray 226 | Score matrix or vector 227 | 228 | Returns 229 | ------- 230 | float 231 | MAE of predictions vs targets. 232 | """ 233 | return mean_absolute_error(y_true, y_score) 234 | 235 | 236 | class MSE(Metric): 237 | """ 238 | Mean Squared Error. 239 | """ 240 | 241 | def __init__(self): 242 | self._name = "mse" 243 | self._maximize = False 244 | 245 | def __call__(self, y_true, y_score): 246 | """ 247 | Compute MSE (Mean Squared Error) of predictions. 248 | 249 | Parameters 250 | ---------- 251 | y_true : np.ndarray 252 | Target matrix or vector 253 | y_score : np.ndarray 254 | Score matrix or vector 255 | 256 | Returns 257 | ------- 258 | float 259 | MSE of predictions vs targets. 260 | """ 261 | return mean_squared_error(y_true, y_score) 262 | 263 | 264 | class RMSE(Metric): 265 | """ 266 | Root Mean Squared Error. 267 | """ 268 | 269 | def __init__(self): 270 | self._name = "rmse" 271 | self._maximize = False 272 | 273 | def __call__(self, y_true, y_score): 274 | """ 275 | Compute RMSE (Root Mean Squared Error) of predictions. 276 | 277 | Parameters 278 | ---------- 279 | y_true : np.ndarray 280 | Target matrix or vector 281 | y_score : np.ndarray 282 | Score matrix or vector 283 | 284 | Returns 285 | ------- 286 | float 287 | RMSE of predictions vs targets. 288 | """ 289 | return np.sqrt(mean_squared_error(y_true, y_score)) 290 | 291 | 292 | def check_metrics(metrics): 293 | """Check if custom metrics are provided. 294 | 295 | Parameters 296 | ---------- 297 | metrics : list of str or classes 298 | List with built-in metrics (str) or custom metrics (classes). 
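Since `check_metrics` accepts custom metric classes, a new metric only needs to subclass `Metric` and define `_name` and `_maximize`; `LogitsLogLoss` in src/models/loss.py is the repository's own example, referenced by its name `"logits_ll"` in `eval_metric` from src/models/tabnet.py. A minimal additional sketch (hypothetical, not part of the package):

```python
import numpy as np

from src.models.pytorch_tabnet.metrics import Metric


class LogitsMAE(Metric):
    """Hypothetical custom metric: MAE on sigmoid-transformed logits."""

    def __init__(self):
        self._name = "logits_mae"   # name usable in eval_metric / early stopping
        self._maximize = False

    def __call__(self, y_true, y_pred):
        probs = 1 / (1 + np.exp(-y_pred))           # logits -> probabilities
        return float(np.mean(np.abs(y_true - probs)))
```

Because `Metric.get_metrics_by_names` discovers metrics through `Metric.__subclasses__()`, the class only has to be imported before fitting for its name to resolve.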
299 | 300 | Returns 301 | ------- 302 | val_metrics : list of str 303 | List of metric names. 304 | 305 | """ 306 | val_metrics = [] 307 | for metric in metrics: 308 | if isinstance(metric, str): 309 | val_metrics.append(metric) 310 | elif issubclass(metric, Metric): 311 | val_metrics.append(metric()._name) 312 | else: 313 | raise TypeError("You need to provide a valid metric format") 314 | return val_metrics 315 | -------------------------------------------------------------------------------- /src/models/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from src.utils.splitter import SplitFactory 3 | from typing import NamedTuple, Callable, List, Dict, Optional, Tuple, Any 4 | import numpy as np 5 | import pandas as pd 6 | from src.experiment.experiment import Experiment 7 | from src.utils.misc import LoggerFactory 8 | 9 | logger = LoggerFactory().getLogger(__name__) 10 | 11 | 12 | class ModelResult(NamedTuple): 13 | oof_preds: np.ndarray 14 | preds: Optional[np.ndarray] 15 | models: Dict[str, any] 16 | scores: Dict[str, float] 17 | folds: List[Tuple[np.ndarray, np.ndarray]] 18 | 19 | 20 | class BaseModel: 21 | def __init__(self, ignore_cols: List[str], target_cols: str, categorical_cols: List[str], metric: Callable, exp: Experiment): 22 | self.ignore_cols = ignore_cols 23 | self.metric = metric 24 | self.result = None 25 | 26 | @abstractmethod 27 | def _train(self, train: pd.DataFrame, targets: pd.DataFrame, train_idx, valid_idx): 28 | raise NotImplementedError 29 | 30 | def train(self, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, splitter: Optional[SplitFactory], 31 | folds: Optional[List[Tuple[np.ndarray, np.ndarray]]]): 32 | 33 | models = dict() 34 | scores = dict() 35 | oof_preds = np.zeros_like(y_train).astype(float) 36 | preds = np.zeros(shape=(X_test.shape[0], y_train.shape[1])) 37 | assert (folds is not None) or (splitter is not None), 'splitter or folds is must be specified' 38 | if folds is None: 39 | folds = splitter.split(X_train, y_train) 40 | 41 | for fold, (train_idx, valid_idx) in enumerate(folds): 42 | valid_preds, _preds, model = self._train(X_train, X_test, y_train, train_idx, valid_idx) 43 | oof_preds[valid_idx] += valid_preds 44 | preds += _preds / len(folds) 45 | 46 | score = self.metric(y_train[valid_idx].values, valid_preds) 47 | logger.info(f"fold {fold}: {score}") 48 | models[f'fold_{fold}'] = model 49 | scores[f'fold_{fold}'] = score 50 | oof_score = self.metric(y_train.values, oof_preds) 51 | logger.info(f"{len(folds)} folds cv mean: {np.mean(scores)}") 52 | logger.info(f"oof score: {oof_score}") 53 | 54 | self.result = ModelResult(oof_preds=oof_preds, models=models, preds=preds, folds=folds, scores={ 55 | 'oof_score': oof_score, 56 | 'KFoldsScores': scores, 57 | }) 58 | 59 | return True 60 | 61 | def predict(self, X_test): 62 | assert self.result is None, 'Model is not tained Error' 63 | pass 64 | 65 | 66 | class MoaBase: 67 | def __init__(self, target_cols: List[str], categorical_cols: List[str], ignore_cols: Optional[List[str]], num_seed_blends: int, metric: Callable, 68 | exp: Experiment): 69 | self.exp = exp 70 | self.ignore_cols = ignore_cols 71 | self.categorical_cols = categorical_cols 72 | self.metric = metric 73 | self.result = None 74 | self.num_seed_blends = num_seed_blends 75 | 76 | @abstractmethod 77 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 78 | raise 
NotImplementedError 79 | 80 | def train(self, X_train: pd.DataFrame, y_train: pd.DataFrame, folds: List[Tuple[np.ndarray, np.ndarray]]): 81 | 82 | models = dict() 83 | scores = dict() 84 | oof_preds = np.zeros_like(y_train).astype(float) 85 | self.predictors = [col for col in X_train.columns.tolist() if col not in self.ignore_cols] 86 | 87 | logger.info(f'{self.__class__.__name__} train start') 88 | logger.info(f'X shape: {X_train.shape}, y shape: {y_train.shape}') 89 | for fold, (train_idx, valid_idx) in enumerate(folds): 90 | logger.info(f'fold {fold}: #row of train: {len(train_idx)}, #row of valid: {len(valid_idx)}') 91 | for i in range(self.num_seed_blends): 92 | valid_preds, model = self._train(X=X_train, y=y_train, predictors=self.predictors, train_idx=train_idx, valid_idx=valid_idx, seed=i) 93 | 94 | oof_preds[valid_idx, :] += valid_preds / self.num_seed_blends 95 | models[f'fold_{fold}_{i}'] = model 96 | 97 | score = self.metric(y_train.iloc[valid_idx].values, oof_preds[valid_idx, :]) 98 | logger.info(f"fold {fold}: {score}") 99 | scores[f'fold_{fold}'] = score 100 | oof_score = self.metric(y_train.values, oof_preds) 101 | logger.info(f"{len(folds)} folds cv mean: {np.mean(list(scores.values()))}") 102 | logger.info(f"oof score: {oof_score}") 103 | 104 | self.result = ModelResult(oof_preds=oof_preds, models=models, preds=None, folds=folds, scores={ 105 | 'oof_score': oof_score, 106 | 'KFoldsScores': scores, 107 | }) 108 | 109 | return True 110 | 111 | @abstractmethod 112 | def _predict(self, model: Any, X_valid: pd.DataFrame, predictors: List[str]): 113 | pass 114 | 115 | def predict(self, X_test) -> np.ndarray: 116 | assert self.result is not None, 'Model is not trained Error' 117 | 118 | folds = self.result.folds 119 | 120 | n_targets = self.result.oof_preds.shape[1] 121 | preds = np.zeros(shape=(X_test.shape[0], n_targets)) 122 | 123 | for fold, (train_idx, valid_idx) in enumerate(folds): 124 | for i in range(self.num_seed_blends): 125 | model = self.result.models[f'fold_{fold}_{i}'] 126 | preds += self._predict(model=model, X_valid=X_test, predictors=self.predictors) / (len(folds) * self.num_seed_blends) 127 | 128 | return preds 129 | 130 | 131 | class MoaBaseOnline: 132 | def __init__(self, target_cols: List[str], categorical_cols: List[str], ignore_cols: Optional[List[str]], num_seed_blends: int, metric: Callable, 133 | exp: Experiment): 134 | self.exp = exp 135 | self.ignore_cols = ignore_cols 136 | self.categorical_cols = categorical_cols 137 | self.metric = metric 138 | self.result = None 139 | self.num_seed_blends = num_seed_blends 140 | 141 | @abstractmethod 142 | def _train_predict(self, X: pd.DataFrame, y: pd.DataFrame, X_test: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, 143 | seed: int): 144 | raise NotImplementedError 145 | 146 | def train_predict(self, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, folds: List[Tuple[np.ndarray, np.ndarray]]): 147 | 148 | scores = dict() 149 | oof_preds = np.zeros_like(y_train).astype(float) 150 | preds = np.zeros(shape=(X_test.shape[0], y_train.shape[1])) 151 | self.predictors = [col for col in X_train.columns.tolist() if col not in self.ignore_cols] 152 | 153 | logger.info(f'{self.__class__.__name__} train start') 154 | logger.info(f'X shape: {X_train.shape}, y shape: {y_train.shape}') 155 | for fold, (train_idx, valid_idx) in enumerate(folds): 156 | logger.info(f'fold {fold}: #row of train: {len(train_idx)}, #row of valid: {len(valid_idx)}') 157 | for i in 
range(self.num_seed_blends): 158 | _preds, valid_preds, = self._train_predict(X=X_train, 159 | y=y_train, 160 | X_test=X_test, 161 | predictors=self.predictors, 162 | train_idx=train_idx, 163 | valid_idx=valid_idx, 164 | seed=i) 165 | 166 | oof_preds[valid_idx, :] += valid_preds / self.num_seed_blends 167 | preds += _preds / (len(folds) * self.num_seed_blends) 168 | 169 | score = self.metric(y_train.iloc[valid_idx].values, oof_preds[valid_idx, :]) 170 | logger.info(f"fold {fold}: {score}") 171 | scores[f'fold_{fold}'] = score 172 | oof_score = self.metric(y_train.values, oof_preds) 173 | logger.info(f"{len(folds)} folds cv mean: {np.mean(list(scores.values()))}") 174 | logger.info(f"oof score: {oof_score}") 175 | 176 | self.result = ModelResult(oof_preds=oof_preds, models=None, preds=preds, folds=folds, scores={ 177 | 'oof_score': oof_score, 178 | 'KFoldsScores': scores, 179 | }) 180 | 181 | return True 182 | 183 | 184 | class AllZerosClassifier: 185 | def __init__(self, label=0): 186 | self.label = label 187 | 188 | def predict(self, X): 189 | return np.ones(X.shape[0]) * self.label 190 | 191 | def predict_proba(self, X): 192 | labels = np.ones(shape=(X.shape[0], 2)) 193 | labels[:, 1 - self.label] = 0 194 | return labels 195 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/callbacks.py: -------------------------------------------------------------------------------- 1 | import time 2 | import datetime 3 | import copy 4 | import numpy as np 5 | from dataclasses import dataclass, field 6 | from typing import List, Any 7 | 8 | 9 | class Callback: 10 | """ 11 | Abstract base class used to build new callbacks. 12 | """ 13 | 14 | def __init__(self): 15 | pass 16 | 17 | def set_params(self, params): 18 | self.params = params 19 | 20 | def set_trainer(self, model): 21 | self.trainer = model 22 | 23 | def on_epoch_begin(self, epoch, logs=None): 24 | pass 25 | 26 | def on_epoch_end(self, epoch, logs=None): 27 | pass 28 | 29 | def on_batch_begin(self, batch, logs=None): 30 | pass 31 | 32 | def on_batch_end(self, batch, logs=None): 33 | pass 34 | 35 | def on_train_begin(self, logs=None): 36 | pass 37 | 38 | def on_train_end(self, logs=None): 39 | pass 40 | 41 | 42 | @dataclass 43 | class CallbackContainer: 44 | """ 45 | Container holding a list of callbacks. 
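Callback above is a no-op interface; concrete callbacks override only the hooks they need, and the CallbackContainer defined next fans every event out to all of them. A small sketch, where PrintLoss is a hypothetical user-defined callback:

```python
from src.models.pytorch_tabnet.callbacks import Callback, CallbackContainer

class PrintLoss(Callback):
    """Hypothetical callback: report the running loss at each epoch end."""
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        print(f"epoch {epoch}: loss={logs.get('loss', float('nan')):.4f}")

container = CallbackContainer(callbacks=[PrintLoss()])
container.on_train_begin()                       # stamps logs["start_time"]
container.on_epoch_end(0, logs={"loss": 0.693})  # -> "epoch 0: loss=0.6930"
container.on_train_end()
```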
46 | """ 47 | 48 | callbacks: List[Callback] = field(default_factory=list) 49 | 50 | def append(self, callback): 51 | self.callbacks.append(callback) 52 | 53 | def set_params(self, params): 54 | for callback in self.callbacks: 55 | callback.set_params(params) 56 | 57 | def set_trainer(self, trainer): 58 | self.trainer = trainer 59 | for callback in self.callbacks: 60 | callback.set_trainer(trainer) 61 | 62 | def on_epoch_begin(self, epoch, logs=None): 63 | logs = logs or {} 64 | for callback in self.callbacks: 65 | callback.on_epoch_begin(epoch, logs) 66 | 67 | def on_epoch_end(self, epoch, logs=None): 68 | logs = logs or {} 69 | for callback in self.callbacks: 70 | callback.on_epoch_end(epoch, logs) 71 | 72 | def on_batch_begin(self, batch, logs=None): 73 | logs = logs or {} 74 | for callback in self.callbacks: 75 | callback.on_batch_begin(batch, logs) 76 | 77 | def on_batch_end(self, batch, logs=None): 78 | logs = logs or {} 79 | for callback in self.callbacks: 80 | callback.on_batch_end(batch, logs) 81 | 82 | def on_train_begin(self, logs=None): 83 | logs = logs or {} 84 | logs["start_time"] = time.time() 85 | for callback in self.callbacks: 86 | callback.on_train_begin(logs) 87 | 88 | def on_train_end(self, logs=None): 89 | logs = logs or {} 90 | for callback in self.callbacks: 91 | callback.on_train_end(logs) 92 | 93 | 94 | @dataclass 95 | class EarlyStopping(Callback): 96 | """EarlyStopping callback to exit the training loop if early_stopping_metric 97 | does not improve by a certain amount for a certain 98 | number of epochs. 99 | 100 | Parameters 101 | --------- 102 | early_stopping_metric : str 103 | Early stopping metric name 104 | is_maximize : bool 105 | Whether to maximize or not early_stopping_metric 106 | tol : float 107 | minimum change in monitored value to qualify as improvement. 108 | This number should be positive. 109 | patience : integer 110 | number of epochs to wait for improvment before terminating. 
111 | the counter be reset after each improvment 112 | 113 | """ 114 | 115 | early_stopping_metric: str 116 | is_maximize: bool 117 | tol: float = 0.0 118 | patience: int = 5 119 | 120 | def __post_init__(self): 121 | self.best_epoch = 0 122 | self.stopped_epoch = 0 123 | self.wait = 0 124 | self.best_weights = None 125 | self.best_loss = np.inf 126 | if self.is_maximize: 127 | self.best_loss = -self.best_loss 128 | super().__init__() 129 | 130 | def on_epoch_end(self, epoch, logs=None): 131 | current_loss = logs.get(self.early_stopping_metric) 132 | if current_loss is None: 133 | return 134 | 135 | loss_change = current_loss - self.best_loss 136 | max_improved = self.is_maximize and loss_change > self.tol 137 | min_improved = (not self.is_maximize) and (-loss_change > self.tol) 138 | if max_improved or min_improved: 139 | self.best_loss = current_loss 140 | self.best_epoch = epoch 141 | self.wait = 1 142 | self.best_weights = copy.deepcopy(self.trainer.network.state_dict()) 143 | else: 144 | if self.wait >= self.patience: 145 | self.stopped_epoch = epoch 146 | self.trainer._stop_training = True 147 | self.wait += 1 148 | 149 | def on_train_end(self, logs=None): 150 | self.trainer.best_epoch = self.best_epoch 151 | self.trainer.best_cost = self.best_loss 152 | 153 | if self.best_weights is not None: 154 | self.trainer.network.load_state_dict(self.best_weights) 155 | 156 | if self.stopped_epoch > 0: 157 | msg = f"\nEarly stopping occured at epoch {self.stopped_epoch}" 158 | msg += ( 159 | f" with best_epoch = {self.best_epoch} and " 160 | + f"best_{self.early_stopping_metric} = {round(self.best_loss, 5)}" 161 | ) 162 | print(msg) 163 | else: 164 | msg = (f"Stop training because you reached max_epochs = {self.trainer.max_epochs}" 165 | + f" with best_epoch = {self.best_epoch} and " 166 | + f"best_{self.early_stopping_metric} = {round(self.best_loss, 5)}") 167 | print(msg) 168 | print("Best weights from best epoch are automatically used!") 169 | 170 | 171 | @dataclass 172 | class History(Callback): 173 | """Callback that records events into a `History` object. 174 | This callback is automatically applied to 175 | every SuperModule. 176 | 177 | Parameters 178 | --------- 179 | trainer : DeepRecoModel 180 | Model class to train 181 | verbose : int 182 | Print results every verbose iteration 183 | 184 | """ 185 | 186 | trainer: Any 187 | verbose: int = 1 188 | 189 | def __post_init__(self): 190 | super().__init__() 191 | self.samples_seen = 0.0 192 | self.total_time = 0.0 193 | 194 | def on_train_begin(self, logs=None): 195 | self.history = {"loss": []} 196 | self.history.update({"lr": []}) 197 | self.history.update({name: [] for name in self.trainer._metrics_names}) 198 | self.start_time = logs["start_time"] 199 | self.epoch_loss = 0. 
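EarlyStopping only reads the monitored value from the logs dict and touches a handful of trainer attributes (network, _stop_training, max_epochs, plus the best_* fields it sets itself). A sketch of its bookkeeping against a stand-in trainer built from SimpleNamespace rather than the real TabNet trainer:

```python
import torch.nn as nn
from types import SimpleNamespace
from src.models.pytorch_tabnet.callbacks import EarlyStopping

# Stand-in trainer exposing only the attributes EarlyStopping uses.
trainer = SimpleNamespace(network=nn.Linear(4, 2), _stop_training=False, max_epochs=10)

es = EarlyStopping(early_stopping_metric="val_logloss", is_maximize=False, patience=2)
es.set_trainer(trainer)

# Validation loss improves twice, then degrades; with patience=2 training stops at epoch 3.
for epoch, loss in enumerate([0.70, 0.65, 0.66, 0.67, 0.68]):
    es.on_epoch_end(epoch, logs={"val_logloss": loss})
    if trainer._stop_training:
        break

es.on_train_end()                             # restores the best weights, prints a summary
print(es.best_epoch, round(es.best_loss, 5))  # 1 0.65
```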
200 | 201 | def on_epoch_begin(self, epoch, logs=None): 202 | self.epoch_metrics = {"loss": 0.0} 203 | self.samples_seen = 0.0 204 | 205 | def on_epoch_end(self, epoch, logs=None): 206 | self.epoch_metrics["loss"] = self.epoch_loss 207 | for metric_name, metric_value in self.epoch_metrics.items(): 208 | self.history[metric_name].append(metric_value) 209 | if self.verbose == 0: 210 | return 211 | if epoch % self.verbose != 0: 212 | return 213 | msg = f"epoch {epoch:<3}" 214 | for metric_name, metric_value in self.epoch_metrics.items(): 215 | if metric_name != "lr": 216 | msg += f"| {metric_name:<3}: {np.round(metric_value, 5):<8}" 217 | self.total_time = int(time.time() - self.start_time) 218 | msg += f"| {str(datetime.timedelta(seconds=self.total_time)) + 's':<6}" 219 | print(msg) 220 | 221 | def on_batch_end(self, batch, logs=None): 222 | batch_size = logs["batch_size"] 223 | self.epoch_loss = (self.samples_seen * self.epoch_loss + batch_size * logs["loss"] 224 | ) / (self.samples_seen + batch_size) 225 | self.samples_seen += batch_size 226 | 227 | def __getitem__(self, name): 228 | return self.history[name] 229 | 230 | def __repr__(self): 231 | return str(self.history) 232 | 233 | def __str__(self): 234 | return str(self.history) 235 | 236 | 237 | @dataclass 238 | class LRSchedulerCallback(Callback): 239 | """Wrapper for most torch scheduler functions. 240 | 241 | Parameters 242 | --------- 243 | scheduler_fn : torch.optim.lr_scheduler 244 | Torch scheduling class 245 | scheduler_params : dict 246 | Dictionnary containing all parameters for the scheduler_fn 247 | is_batch_level : bool (default = False) 248 | If set to False : lr updates will happen at every epoch 249 | If set to True : lr updates happen at every batch 250 | Set this to True for OneCycleLR for example 251 | """ 252 | 253 | scheduler_fn: Any 254 | optimizer: Any 255 | scheduler_params: dict 256 | early_stopping_metric: str 257 | is_batch_level: bool = False 258 | 259 | def __post_init__(self, ): 260 | self.is_metric_related = hasattr(self.scheduler_fn, 261 | "is_better") 262 | self.scheduler = self.scheduler_fn(self.optimizer, 263 | **self.scheduler_params) 264 | super().__init__() 265 | 266 | def on_batch_end(self, batch, logs=None): 267 | if self.is_batch_level: 268 | self.scheduler.step() 269 | else: 270 | pass 271 | 272 | def on_epoch_end(self, epoch, logs=None): 273 | current_loss = logs.get(self.early_stopping_metric) 274 | if current_loss is None: 275 | return 276 | if self.is_batch_level: 277 | pass 278 | else: 279 | if self.is_metric_related: 280 | self.scheduler.step(current_loss) 281 | else: 282 | self.scheduler.step() 283 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/sparsemax.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.autograd import Function 3 | import torch.nn.functional as F 4 | 5 | import torch 6 | 7 | """ 8 | Other possible implementations: 9 | https://github.com/KrisKorrel/sparsemax-pytorch/blob/master/sparsemax.py 10 | https://github.com/msobroza/SparsemaxPytorch/blob/master/mnist/sparsemax.py 11 | https://github.com/vene/sparse-structured-attention/blob/master/pytorch/torchsparseattn/sparsemax.py 12 | """ 13 | 14 | 15 | # credits to Yandex https://github.com/Qwicen/node/blob/master/lib/nn_utils.py 16 | def _make_ix_like(input, dim=0): 17 | d = input.size(dim) 18 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 19 | view = [1] * 
input.dim() 20 | view[0] = -1 21 | return rho.view(view).transpose(0, dim) 22 | 23 | 24 | class SparsemaxFunction(Function): 25 | """ 26 | An implementation of sparsemax (Martins & Astudillo, 2016). See 27 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 28 | By Ben Peters and Vlad Niculae 29 | """ 30 | 31 | @staticmethod 32 | def forward(ctx, input, dim=-1): 33 | """sparsemax: normalizing sparse transform (a la softmax) 34 | 35 | Parameters 36 | ---------- 37 | ctx : torch.autograd.function._ContextMethodMixin 38 | input : torch.Tensor 39 | any shape 40 | dim : int 41 | dimension along which to apply sparsemax 42 | 43 | Returns 44 | ------- 45 | output : torch.Tensor 46 | same shape as input 47 | 48 | """ 49 | ctx.dim = dim 50 | max_val, _ = input.max(dim=dim, keepdim=True) 51 | input -= max_val # same numerical stability trick as for softmax 52 | tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) 53 | output = torch.clamp(input - tau, min=0) 54 | ctx.save_for_backward(supp_size, output) 55 | return output 56 | 57 | @staticmethod 58 | def backward(ctx, grad_output): 59 | supp_size, output = ctx.saved_tensors 60 | dim = ctx.dim 61 | grad_input = grad_output.clone() 62 | grad_input[output == 0] = 0 63 | 64 | v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 65 | v_hat = v_hat.unsqueeze(dim) 66 | grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 67 | return grad_input, None 68 | 69 | @staticmethod 70 | def _threshold_and_support(input, dim=-1): 71 | """Sparsemax building block: compute the threshold 72 | 73 | Parameters 74 | ---------- 75 | input: torch.Tensor 76 | any dimension 77 | dim : int 78 | dimension along which to apply the sparsemax 79 | 80 | Returns 81 | ------- 82 | tau : torch.Tensor 83 | the threshold value 84 | support_size : torch.Tensor 85 | 86 | """ 87 | 88 | input_srt, _ = torch.sort(input, descending=True, dim=dim) 89 | input_cumsum = input_srt.cumsum(dim) - 1 90 | rhos = _make_ix_like(input, dim) 91 | support = rhos * input_srt > input_cumsum 92 | 93 | support_size = support.sum(dim=dim).unsqueeze(dim) 94 | tau = input_cumsum.gather(dim, support_size - 1) 95 | tau /= support_size.to(input.dtype) 96 | return tau, support_size 97 | 98 | 99 | sparsemax = SparsemaxFunction.apply 100 | 101 | 102 | class Sparsemax(nn.Module): 103 | 104 | def __init__(self, dim=-1): 105 | self.dim = dim 106 | super(Sparsemax, self).__init__() 107 | 108 | def forward(self, input): 109 | return sparsemax(input, self.dim) 110 | 111 | 112 | class Entmax15Function(Function): 113 | """ 114 | An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See 115 | :cite:`https://arxiv.org/abs/1905.05702 for detailed description. 
116 | Source: https://github.com/deep-spin/entmax 117 | """ 118 | 119 | @staticmethod 120 | def forward(ctx, input, dim=-1): 121 | ctx.dim = dim 122 | 123 | max_val, _ = input.max(dim=dim, keepdim=True) 124 | input = input - max_val # same numerical stability trick as for softmax 125 | input = input / 2 # divide by 2 to solve actual Entmax 126 | 127 | tau_star, _ = Entmax15Function._threshold_and_support(input, dim) 128 | output = torch.clamp(input - tau_star, min=0) ** 2 129 | ctx.save_for_backward(output) 130 | return output 131 | 132 | @staticmethod 133 | def backward(ctx, grad_output): 134 | Y, = ctx.saved_tensors 135 | gppr = Y.sqrt() # = 1 / g'' (Y) 136 | dX = grad_output * gppr 137 | q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) 138 | q = q.unsqueeze(ctx.dim) 139 | dX -= q * gppr 140 | return dX, None 141 | 142 | @staticmethod 143 | def _threshold_and_support(input, dim=-1): 144 | Xsrt, _ = torch.sort(input, descending=True, dim=dim) 145 | 146 | rho = _make_ix_like(input, dim) 147 | mean = Xsrt.cumsum(dim) / rho 148 | mean_sq = (Xsrt ** 2).cumsum(dim) / rho 149 | ss = rho * (mean_sq - mean ** 2) 150 | delta = (1 - ss) / rho 151 | 152 | # NOTE this is not exactly the same as in reference algo 153 | # Fortunately it seems the clamped values never wrongly 154 | # get selected by tau <= sorted_z. Prove this! 155 | delta_nz = torch.clamp(delta, 0) 156 | tau = mean - torch.sqrt(delta_nz) 157 | 158 | support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) 159 | tau_star = tau.gather(dim, support_size - 1) 160 | return tau_star, support_size 161 | 162 | 163 | class Entmoid15(Function): 164 | """ A highly optimized equivalent of labda x: Entmax15([x, 0]) """ 165 | 166 | @staticmethod 167 | def forward(ctx, input): 168 | output = Entmoid15._forward(input) 169 | ctx.save_for_backward(output) 170 | return output 171 | 172 | @staticmethod 173 | def _forward(input): 174 | input, is_pos = abs(input), input >= 0 175 | tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 176 | tau.masked_fill_(tau <= input, 2.0) 177 | y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 178 | return torch.where(is_pos, 1 - y_neg, y_neg) 179 | 180 | @staticmethod 181 | def backward(ctx, grad_output): 182 | return Entmoid15._backward(ctx.saved_tensors[0], grad_output) 183 | 184 | @staticmethod 185 | def _backward(output, grad_output): 186 | gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() 187 | grad_input = grad_output * gppr0 188 | q = grad_input / (gppr0 + gppr1) 189 | grad_input -= q * gppr0 190 | return grad_input 191 | 192 | 193 | entmax15 = Entmax15Function.apply 194 | entmoid15 = Entmoid15.apply 195 | 196 | 197 | class Entmax15(nn.Module): 198 | 199 | def __init__(self, dim=-1): 200 | self.dim = dim 201 | super(Entmax15, self).__init__() 202 | 203 | def forward(self, input): 204 | return entmax15(input, self.dim) 205 | 206 | 207 | # Credits were lost... 
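Both normalizers are exposed functionally (sparsemax, entmax15) and as modules (Sparsemax, Entmax15). A quick comparison against softmax on a toy logit row; the clone() calls are there because SparsemaxFunction.forward subtracts the max in place:

```python
import torch
from src.models.pytorch_tabnet.sparsemax import sparsemax, entmax15

logits = torch.tensor([[2.0, 1.0, 0.1, -1.0]])

print(torch.softmax(logits, dim=-1))  # dense: every entry strictly positive
print(sparsemax(logits.clone(), -1))  # tensor([[1., 0., 0., 0.]]): small logits hit exactly 0
print(entmax15(logits.clone(), -1))   # sparsity in between softmax and sparsemax
```

All three outputs sum to 1 along the last dimension; the sparse variants are the ones used for attention-style feature masks.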
208 | # def _make_ix_like(input, dim=0): 209 | # d = input.size(dim) 210 | # rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 211 | # view = [1] * input.dim() 212 | # view[0] = -1 213 | # return rho.view(view).transpose(0, dim) 214 | # 215 | # 216 | # def _threshold_and_support(input, dim=0): 217 | # """Sparsemax building block: compute the threshold 218 | # Args: 219 | # input: any dimension 220 | # dim: dimension along which to apply the sparsemax 221 | # Returns: 222 | # the threshold value 223 | # """ 224 | # 225 | # input_srt, _ = torch.sort(input, descending=True, dim=dim) 226 | # input_cumsum = input_srt.cumsum(dim) - 1 227 | # rhos = _make_ix_like(input, dim) 228 | # support = rhos * input_srt > input_cumsum 229 | # 230 | # support_size = support.sum(dim=dim).unsqueeze(dim) 231 | # tau = input_cumsum.gather(dim, support_size - 1) 232 | # tau /= support_size.to(input.dtype) 233 | # return tau, support_size 234 | # 235 | # 236 | # class SparsemaxFunction(Function): 237 | # 238 | # @staticmethod 239 | # def forward(ctx, input, dim=0): 240 | # """sparsemax: normalizing sparse transform (a la softmax) 241 | # Parameters: 242 | # input (Tensor): any shape 243 | # dim: dimension along which to apply sparsemax 244 | # Returns: 245 | # output (Tensor): same shape as input 246 | # """ 247 | # ctx.dim = dim 248 | # max_val, _ = input.max(dim=dim, keepdim=True) 249 | # input -= max_val # same numerical stability trick as for softmax 250 | # tau, supp_size = _threshold_and_support(input, dim=dim) 251 | # output = torch.clamp(input - tau, min=0) 252 | # ctx.save_for_backward(supp_size, output) 253 | # return output 254 | # 255 | # @staticmethod 256 | # def backward(ctx, grad_output): 257 | # supp_size, output = ctx.saved_tensors 258 | # dim = ctx.dim 259 | # grad_input = grad_output.clone() 260 | # grad_input[output == 0] = 0 261 | # 262 | # v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 263 | # v_hat = v_hat.unsqueeze(dim) 264 | # grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 265 | # return grad_input, None 266 | # 267 | # 268 | # sparsemax = SparsemaxFunction.apply 269 | # 270 | # 271 | # class Sparsemax(nn.Module): 272 | # 273 | # def __init__(self, dim=0): 274 | # self.dim = dim 275 | # super(Sparsemax, self).__init__() 276 | # 277 | # def forward(self, input): 278 | # return sparsemax(input, self.dim) 279 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/utils.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from torch.utils.data import DataLoader, WeightedRandomSampler 3 | import torch 4 | import numpy as np 5 | import scipy 6 | from sklearn.utils import check_array 7 | 8 | 9 | class TorchDataset(Dataset): 10 | """ 11 | Format for numpy array 12 | 13 | Parameters 14 | ---------- 15 | X : 2D array 16 | The input matrix 17 | y : 2D array 18 | The one-hot encoded target 19 | """ 20 | 21 | def __init__(self, x, y): 22 | self.x = x 23 | self.y = y 24 | 25 | def __len__(self): 26 | return len(self.x) 27 | 28 | def __getitem__(self, index): 29 | x, y = self.x[index], self.y[index] 30 | return x, y 31 | 32 | 33 | class PredictDataset(Dataset): 34 | """ 35 | Format for numpy array 36 | 37 | Parameters 38 | ---------- 39 | X : 2D array 40 | The input matrix 41 | """ 42 | 43 | def __init__(self, x): 44 | self.x = x 45 | 46 | def __len__(self): 47 | return len(self.x) 48 | 49 | def 
__getitem__(self, index): 50 | x = self.x[index] 51 | return x 52 | 53 | 54 | def create_dataloaders( 55 | X_train, y_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory 56 | ): 57 | """ 58 | Create dataloaders with or wihtout subsampling depending on weights and balanced. 59 | 60 | Parameters 61 | ---------- 62 | X_train : np.ndarray 63 | Training data 64 | y_train : np.array 65 | Mapped Training targets 66 | eval_set : list of tuple 67 | List of eval tuple set (X, y) 68 | weights : either 0, 1, dict or iterable 69 | if 0 (default) : no weights will be applied 70 | if 1 : classification only, will balanced class with inverse frequency 71 | if dict : keys are corresponding class values are sample weights 72 | if iterable : list or np array must be of length equal to nb elements 73 | in the training set 74 | batch_size : int 75 | how many samples per batch to load 76 | num_workers : int 77 | how many subprocesses to use for data loading. 0 means that the data 78 | will be loaded in the main process 79 | drop_last : bool 80 | set to True to drop the last incomplete batch, if the dataset size is not 81 | divisible by the batch size. If False and the size of dataset is not 82 | divisible by the batch size, then the last batch will be smaller 83 | pin_memory : bool 84 | Whether to pin GPU memory during training 85 | 86 | Returns 87 | ------- 88 | train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader 89 | Training and validation dataloaders 90 | """ 91 | 92 | if isinstance(weights, int): 93 | if weights == 0: 94 | need_shuffle = True 95 | sampler = None 96 | elif weights == 1: 97 | need_shuffle = False 98 | class_sample_count = np.array( 99 | [len(np.where(y_train == t)[0]) for t in np.unique(y_train)] 100 | ) 101 | 102 | weights = 1.0 / class_sample_count 103 | 104 | samples_weight = np.array([weights[t] for t in y_train]) 105 | 106 | samples_weight = torch.from_numpy(samples_weight) 107 | samples_weight = samples_weight.double() 108 | sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) 109 | else: 110 | raise ValueError("Weights should be either 0, 1, dictionnary or list.") 111 | elif isinstance(weights, dict): 112 | # custom weights per class 113 | need_shuffle = False 114 | samples_weight = np.array([weights[t] for t in y_train]) 115 | sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) 116 | else: 117 | # custom weights 118 | if len(weights) != len(y_train): 119 | raise ValueError("Custom weights should match number of train samples.") 120 | need_shuffle = False 121 | samples_weight = np.array(weights) 122 | sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) 123 | 124 | train_dataloader = DataLoader( 125 | TorchDataset(X_train.astype(np.float32), y_train), 126 | batch_size=batch_size, 127 | sampler=sampler, 128 | shuffle=need_shuffle, 129 | num_workers=num_workers, 130 | drop_last=drop_last, 131 | pin_memory=pin_memory 132 | ) 133 | 134 | valid_dataloaders = [] 135 | for X, y in eval_set: 136 | valid_dataloaders.append( 137 | DataLoader( 138 | TorchDataset(X.astype(np.float32), y), 139 | batch_size=batch_size, 140 | shuffle=False, 141 | num_workers=num_workers, 142 | pin_memory=pin_memory 143 | ) 144 | ) 145 | 146 | return train_dataloader, valid_dataloaders 147 | 148 | 149 | def create_explain_matrix(input_dim, cat_emb_dim, cat_idxs, post_embed_dim): 150 | """ 151 | This is a computational trick. 152 | In order to rapidly sum importances from same embeddings 153 | to the initial index. 
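The weights switch in create_dataloaders above has four modes (0, 1, dict, iterable); with weights=1 an imbalanced classification target is resampled by inverse class frequency through a WeightedRandomSampler. A small sketch with made-up shapes:

```python
import numpy as np
from src.models.pytorch_tabnet.utils import create_dataloaders

X = np.random.rand(100, 8).astype(np.float32)
y = np.array([0] * 90 + [1] * 10)   # heavily imbalanced binary target
X_val, y_val = X[:20], y[:20]

# weights=1 indexes the inverse-frequency array by label, so labels should be 0..K-1.
train_dl, valid_dls = create_dataloaders(
    X, y, eval_set=[(X_val, y_val)], weights=1,
    batch_size=16, num_workers=0, drop_last=False, pin_memory=False,
)
xb, yb = next(iter(train_dl))
print(xb.shape, yb.shape)  # torch.Size([16, 8]) torch.Size([16])
```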
154 | 155 | Parameters 156 | ---------- 157 | input_dim : int 158 | Initial input dim 159 | cat_emb_dim : int or list of int 160 | if int : size of embedding for all categorical feature 161 | if list of int : size of embedding for each categorical feature 162 | cat_idxs : list of int 163 | Initial position of categorical features 164 | post_embed_dim : int 165 | Post embedding inputs dimension 166 | 167 | Returns 168 | ------- 169 | reducing_matrix : np.array 170 | Matrix of dim (post_embed_dim, input_dim) to performe reduce 171 | """ 172 | 173 | if isinstance(cat_emb_dim, int): 174 | all_emb_impact = [cat_emb_dim - 1] * len(cat_idxs) 175 | else: 176 | all_emb_impact = [emb_dim - 1 for emb_dim in cat_emb_dim] 177 | 178 | acc_emb = 0 179 | nb_emb = 0 180 | indices_trick = [] 181 | for i in range(input_dim): 182 | if i not in cat_idxs: 183 | indices_trick.append([i + acc_emb]) 184 | else: 185 | indices_trick.append( 186 | range(i + acc_emb, i + acc_emb + all_emb_impact[nb_emb] + 1) 187 | ) 188 | acc_emb += all_emb_impact[nb_emb] 189 | nb_emb += 1 190 | 191 | reducing_matrix = np.zeros((post_embed_dim, input_dim)) 192 | for i, cols in enumerate(indices_trick): 193 | reducing_matrix[cols, i] = 1 194 | 195 | return scipy.sparse.csc_matrix(reducing_matrix) 196 | 197 | 198 | def filter_weights(weights): 199 | """ 200 | This function makes sure that weights are in correct format for 201 | regression and multitask TabNet 202 | 203 | Parameters 204 | ---------- 205 | weights : int, dict or list 206 | Initial weights parameters given by user 207 | 208 | Returns 209 | ------- 210 | None : This function will only throw an error if format is wrong 211 | """ 212 | err_msg = "Please provide a list of weights for regression or multitask : " 213 | if isinstance(weights, int): 214 | if weights == 1: 215 | raise ValueError(err_msg + "1 given.") 216 | if isinstance(weights, dict): 217 | raise ValueError(err_msg + "Dict given.") 218 | return 219 | 220 | 221 | def validate_eval_set(eval_set, eval_name, X_train, y_train): 222 | """Check if the shapes of eval_set are compatible with (X_train, y_train). 223 | 224 | Parameters 225 | ---------- 226 | eval_set : list of tuple 227 | List of eval tuple set (X, y). 228 | The last one is used for early stopping 229 | eval_name : list of str 230 | List of eval set names. 231 | X_train : np.ndarray 232 | Train owned products 233 | y_train : np.array 234 | Train targeted products 235 | 236 | Returns 237 | ------- 238 | eval_names : list of str 239 | Validated list of eval_names. 240 | eval_set : list of tuple 241 | Validated list of eval_set. 
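create_explain_matrix (defined just above) returns a sparse (post_embed_dim, input_dim) indicator so that importances computed in the post-embedding space can be summed back onto the original feature columns. A toy layout, assuming 3 raw features of which feature 1 is categorical with a 4-dimensional embedding:

```python
from src.models.pytorch_tabnet.utils import create_explain_matrix

# Post-embedding width: 3 features - 1 categorical + 4 embedding dims = 6.
reducing = create_explain_matrix(input_dim=3, cat_emb_dim=4, cat_idxs=[1], post_embed_dim=6)
print(reducing.toarray())
# Rows 1..4 (the embedding dimensions of feature 1) all point back to input column 1,
# so summing importances through this matrix folds them onto the 3 original features.
```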
242 | 243 | """ 244 | eval_name = eval_name or [f"val_{i}" for i in range(len(eval_set))] 245 | 246 | assert len(eval_set) == len( 247 | eval_name 248 | ), "eval_set and eval_name have not the same length" 249 | if len(eval_set) > 0: 250 | assert all( 251 | len(elem) == 2 for elem in eval_set 252 | ), "Each tuple of eval_set need to have two elements" 253 | for name, (X, y) in zip(eval_name, eval_set): 254 | check_array(X) 255 | msg = ( 256 | f"Number of columns is different between X_{name} " 257 | + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" 258 | ) 259 | assert X.shape[1] == X_train.shape[1], msg 260 | if len(y_train.shape) == 2: 261 | msg = ( 262 | f"Number of columns is different between y_{name} " 263 | + f"({y.shape[1]}) and y_train ({y_train.shape[1]})" 264 | ) 265 | assert y.shape[1] == y_train.shape[1], msg 266 | msg = ( 267 | f"You need the same number of rows between X_{name} " 268 | + f"({X.shape[0]}) and y_{name} ({y.shape[0]})" 269 | ) 270 | assert X.shape[0] == y.shape[0], msg 271 | 272 | return eval_name, eval_set 273 | 274 | 275 | def define_device(device_name): 276 | """ 277 | Define the device to use during training and inference. 278 | If auto it will detect automatically whether to use cuda or cpu 279 | 280 | Parameters 281 | ---------- 282 | device_name : str 283 | Either "auto", "cpu" or "cuda" 284 | 285 | Returns 286 | ------- 287 | str 288 | Either "cpu" or "cuda" 289 | """ 290 | if device_name == "auto": 291 | if torch.cuda.is_available(): 292 | return "cuda" 293 | else: 294 | return "cpu" 295 | else: 296 | return device_name 297 | -------------------------------------------------------------------------------- /src/models/optimizer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer, required 4 | 5 | 6 | class RAdam(Optimizer): 7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 8 | if not 0.0 <= lr: 9 | raise ValueError("Invalid learning rate: {}".format(lr)) 10 | if not 0.0 <= eps: 11 | raise ValueError("Invalid epsilon value: {}".format(eps)) 12 | if not 0.0 <= betas[0] < 1.0: 13 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 14 | if not 0.0 <= betas[1] < 1.0: 15 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 16 | 17 | self.degenerated_to_sgd = degenerated_to_sgd 18 | if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): 19 | for param in params: 20 | if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): 21 | param['buffer'] = [[None, None, None] for _ in range(10)] 22 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) 23 | super(RAdam, self).__init__(params, defaults) 24 | 25 | def __setstate__(self, state): 26 | super(RAdam, self).__setstate__(state) 27 | 28 | def step(self, closure=None): 29 | 30 | loss = None 31 | if closure is not None: 32 | loss = closure() 33 | 34 | for group in self.param_groups: 35 | 36 | for p in group['params']: 37 | if p.grad is None: 38 | continue 39 | grad = p.grad.data.float() 40 | if grad.is_sparse: 41 | raise RuntimeError('RAdam does not support sparse gradients') 42 | 43 | p_data_fp32 = p.data.float() 44 | 45 | state = self.state[p] 46 | 47 | if len(state) == 0: 48 | state['step'] = 0 49 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 50 | 
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 51 | else: 52 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 53 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 54 | 55 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 56 | beta1, beta2 = group['betas'] 57 | 58 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 59 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 60 | 61 | state['step'] += 1 62 | buffered = group['buffer'][int(state['step'] % 10)] 63 | if state['step'] == buffered[0]: 64 | N_sma, step_size = buffered[1], buffered[2] 65 | else: 66 | buffered[0] = state['step'] 67 | beta2_t = beta2**state['step'] 68 | N_sma_max = 2 / (1 - beta2) - 1 69 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 70 | buffered[1] = N_sma 71 | 72 | # more conservative since it's an approximated value 73 | if N_sma >= 5: 74 | step_size = math.sqrt( 75 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1**state['step']) 76 | elif self.degenerated_to_sgd: 77 | step_size = 1.0 / (1 - beta1**state['step']) 78 | else: 79 | step_size = -1 80 | buffered[2] = step_size 81 | 82 | # more conservative since it's an approximated value 83 | if N_sma >= 5: 84 | if group['weight_decay'] != 0: 85 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 86 | denom = exp_avg_sq.sqrt().add_(group['eps']) 87 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 88 | p.data.copy_(p_data_fp32) 89 | elif step_size > 0: 90 | if group['weight_decay'] != 0: 91 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 92 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 93 | p.data.copy_(p_data_fp32) 94 | 95 | return loss 96 | 97 | 98 | class PlainRAdam(Optimizer): 99 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 100 | if not 0.0 <= lr: 101 | raise ValueError("Invalid learning rate: {}".format(lr)) 102 | if not 0.0 <= eps: 103 | raise ValueError("Invalid epsilon value: {}".format(eps)) 104 | if not 0.0 <= betas[0] < 1.0: 105 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 106 | if not 0.0 <= betas[1] < 1.0: 107 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 108 | 109 | self.degenerated_to_sgd = degenerated_to_sgd 110 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 111 | 112 | super(PlainRAdam, self).__init__(params, defaults) 113 | 114 | def __setstate__(self, state): 115 | super(PlainRAdam, self).__setstate__(state) 116 | 117 | def step(self, closure=None): 118 | 119 | loss = None 120 | if closure is not None: 121 | loss = closure() 122 | 123 | for group in self.param_groups: 124 | 125 | for p in group['params']: 126 | if p.grad is None: 127 | continue 128 | grad = p.grad.data.float() 129 | if grad.is_sparse: 130 | raise RuntimeError('RAdam does not support sparse gradients') 131 | 132 | p_data_fp32 = p.data.float() 133 | 134 | state = self.state[p] 135 | 136 | if len(state) == 0: 137 | state['step'] = 0 138 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 139 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 140 | else: 141 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 142 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 143 | 144 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 145 | beta1, beta2 = group['betas'] 146 | 147 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, 
grad) 148 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 149 | 150 | state['step'] += 1 151 | beta2_t = beta2**state['step'] 152 | N_sma_max = 2 / (1 - beta2) - 1 153 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 154 | 155 | # more conservative since it's an approximated value 156 | if N_sma >= 5: 157 | if group['weight_decay'] != 0: 158 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 159 | step_size = group['lr'] * math.sqrt( 160 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1**state['step']) 161 | denom = exp_avg_sq.sqrt().add_(group['eps']) 162 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 163 | p.data.copy_(p_data_fp32) 164 | elif self.degenerated_to_sgd: 165 | if group['weight_decay'] != 0: 166 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 167 | step_size = group['lr'] / (1 - beta1**state['step']) 168 | p_data_fp32.add_(-step_size, exp_avg) 169 | p.data.copy_(p_data_fp32) 170 | 171 | return loss 172 | 173 | 174 | class AdamW(Optimizer): 175 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0): 176 | if not 0.0 <= lr: 177 | raise ValueError("Invalid learning rate: {}".format(lr)) 178 | if not 0.0 <= eps: 179 | raise ValueError("Invalid epsilon value: {}".format(eps)) 180 | if not 0.0 <= betas[0] < 1.0: 181 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 182 | if not 0.0 <= betas[1] < 1.0: 183 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 184 | 185 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, warmup=warmup) 186 | super(AdamW, self).__init__(params, defaults) 187 | 188 | def __setstate__(self, state): 189 | super(AdamW, self).__setstate__(state) 190 | 191 | def step(self, closure=None): 192 | loss = None 193 | if closure is not None: 194 | loss = closure() 195 | 196 | for group in self.param_groups: 197 | 198 | for p in group['params']: 199 | if p.grad is None: 200 | continue 201 | grad = p.grad.data.float() 202 | if grad.is_sparse: 203 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 204 | 205 | p_data_fp32 = p.data.float() 206 | 207 | state = self.state[p] 208 | 209 | if len(state) == 0: 210 | state['step'] = 0 211 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 212 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 213 | else: 214 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 215 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 216 | 217 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 218 | beta1, beta2 = group['betas'] 219 | 220 | state['step'] += 1 221 | 222 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 223 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 224 | 225 | denom = exp_avg_sq.sqrt().add_(group['eps']) 226 | bias_correction1 = 1 - beta1**state['step'] 227 | bias_correction2 = 1 - beta2**state['step'] 228 | 229 | if group['warmup'] > state['step']: 230 | scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] 231 | else: 232 | scheduled_lr = group['lr'] 233 | 234 | step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1 235 | 236 | if group['weight_decay'] != 0: 237 | p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) 238 | 239 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 240 | 241 | p.data.copy_(p_data_fp32) 242 | 243 | return loss 
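All three optimizers follow the standard torch.optim.Optimizer interface, so they drop into an ordinary training loop. A minimal sketch with made-up shapes; note that the in-place tensor calls above (e.g. add_(1 - beta1, grad)) use an older torch signature and may warn or fail on recent PyTorch releases:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from src.models.optimizer import RAdam

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
opt = RAdam(model.parameters(), lr=1e-3, weight_decay=1e-6)

x, y = torch.randn(64, 10), torch.randn(64, 1)
for _ in range(5):
    opt.zero_grad()
    loss = F.mse_loss(model(x), y)
    loss.backward()
    opt.step()  # rectified Adam update; early steps fall back to an SGD-style
                # update while the variance estimate is still unreliable
print(loss.item())
```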
-------------------------------------------------------------------------------- /src/models/pytorch_tabnet/multiclass_utils.py: -------------------------------------------------------------------------------- 1 | # Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi 2 | # 3 | # License: BSD 3 clause 4 | """ 5 | Multi-class / multi-label utility function 6 | ========================================== 7 | 8 | """ 9 | from collections.abc import Sequence 10 | from itertools import chain 11 | 12 | from scipy.sparse import issparse 13 | from scipy.sparse.base import spmatrix 14 | from scipy.sparse import dok_matrix 15 | from scipy.sparse import lil_matrix 16 | import scipy.sparse as sp 17 | 18 | import numpy as np 19 | 20 | 21 | def _assert_all_finite(X, allow_nan=False): 22 | """Like assert_all_finite, but only for ndarray.""" 23 | 24 | X = np.asanyarray(X) 25 | # First try an O(n) time, O(1) space solution for the common case that 26 | # everything is finite; fall back to O(n) space np.isfinite to prevent 27 | # false positives from overflow in sum method. The sum is also calculated 28 | # safely to reduce dtype induced overflows. 29 | is_float = X.dtype.kind in "fc" 30 | if is_float and (np.isfinite(np.sum(X))): 31 | pass 32 | elif is_float: 33 | msg_err = "Input contains {} or a value too large for {!r}." 34 | if ( 35 | allow_nan 36 | and np.isinf(X).any() 37 | or not allow_nan 38 | and not np.isfinite(X).all() 39 | ): 40 | type_err = "infinity" if allow_nan else "NaN, infinity" 41 | raise ValueError(msg_err.format(type_err, X.dtype)) 42 | # for object dtype data, we only check for NaNs (GH-13254) 43 | elif X.dtype == np.dtype("object") and not allow_nan: 44 | if np.isnan(X).any(): 45 | raise ValueError("Input contains NaN") 46 | 47 | 48 | def assert_all_finite(X, allow_nan=False): 49 | """Throw a ValueError if X contains NaN or infinity. 50 | 51 | Parameters 52 | ---------- 53 | X : array or sparse matrix 54 | allow_nan : bool 55 | """ 56 | _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) 57 | 58 | 59 | def _unique_multiclass(y): 60 | if hasattr(y, "__array__"): 61 | return np.unique(np.asarray(y)) 62 | else: 63 | return set(y) 64 | 65 | 66 | def _unique_indicator(y): 67 | """ 68 | Not implemented 69 | """ 70 | pass 71 | 72 | 73 | _FN_UNIQUE_LABELS = { 74 | "binary": _unique_multiclass, 75 | "multiclass": _unique_multiclass, 76 | "multilabel-indicator": _unique_indicator, 77 | } 78 | 79 | 80 | def unique_labels(*ys): 81 | """Extract an ordered array of unique labels 82 | 83 | We don't allow: 84 | - mix of multilabel and multiclass (single label) targets 85 | - mix of label indicator matrix and anything else, 86 | because there are no explicit labels) 87 | - mix of label indicator matrices of different sizes 88 | - mix of string and integer labels 89 | 90 | At the moment, we also don't allow "multiclass-multioutput" input type. 91 | 92 | Parameters 93 | ---------- 94 | *ys : array-likes 95 | 96 | Returns 97 | ------- 98 | out : numpy array of shape [n_unique_labels] 99 | An ordered array of unique labels. 
100 | 101 | Examples 102 | -------- 103 | >>> from sklearn.utils.multiclass import unique_labels 104 | >>> unique_labels([3, 5, 5, 5, 7, 7]) 105 | array([3, 5, 7]) 106 | >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]) 107 | array([1, 2, 3, 4]) 108 | >>> unique_labels([1, 2, 10], [5, 11]) 109 | array([ 1, 2, 5, 10, 11]) 110 | """ 111 | if not ys: 112 | raise ValueError("No argument has been passed.") 113 | # Check that we don't mix label format 114 | 115 | ys_types = set(type_of_target(x) for x in ys) 116 | if ys_types == {"binary", "multiclass"}: 117 | ys_types = {"multiclass"} 118 | 119 | if len(ys_types) > 1: 120 | raise ValueError("Mix type of y not allowed, got types %s" % ys_types) 121 | 122 | label_type = ys_types.pop() 123 | 124 | # Get the unique set of labels 125 | _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) 126 | if not _unique_labels: 127 | raise ValueError("Unknown label type: %s" % repr(ys)) 128 | 129 | ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys)) 130 | 131 | # Check that we don't mix string type with number type 132 | if len(set(isinstance(label, str) for label in ys_labels)) > 1: 133 | raise ValueError("Mix of label input types (string and number)") 134 | 135 | return np.array(sorted(ys_labels)) 136 | 137 | 138 | def _is_integral_float(y): 139 | return y.dtype.kind == "f" and np.all(y.astype(int) == y) 140 | 141 | 142 | def is_multilabel(y): 143 | """ Check if ``y`` is in a multilabel format. 144 | 145 | Parameters 146 | ---------- 147 | y : numpy array of shape [n_samples] 148 | Target values. 149 | 150 | Returns 151 | ------- 152 | out : bool 153 | Return ``True``, if ``y`` is in a multilabel format, else ```False``. 154 | 155 | Examples 156 | -------- 157 | >>> import numpy as np 158 | >>> from sklearn.utils.multiclass import is_multilabel 159 | >>> is_multilabel([0, 1, 0, 1]) 160 | False 161 | >>> is_multilabel([[1], [0, 2], []]) 162 | False 163 | >>> is_multilabel(np.array([[1, 0], [0, 0]])) 164 | True 165 | >>> is_multilabel(np.array([[1], [0], [0]])) 166 | False 167 | >>> is_multilabel(np.array([[1, 0, 0]])) 168 | True 169 | """ 170 | if hasattr(y, "__array__"): 171 | y = np.asarray(y) 172 | if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1): 173 | return False 174 | 175 | if issparse(y): 176 | if isinstance(y, (dok_matrix, lil_matrix)): 177 | y = y.tocsr() 178 | return ( 179 | len(y.data) == 0 180 | or np.unique(y.data).size == 1 181 | and ( 182 | y.dtype.kind in "biu" 183 | or _is_integral_float(np.unique(y.data)) # bool, int, uint 184 | ) 185 | ) 186 | else: 187 | labels = np.unique(y) 188 | 189 | return len(labels) < 3 and ( 190 | y.dtype.kind in "biu" or _is_integral_float(labels) # bool, int, uint 191 | ) 192 | 193 | 194 | def check_classification_targets(y): 195 | """Ensure that target y is of a non-regression type. 196 | 197 | Only the following target types (as defined in type_of_target) are allowed: 198 | 'binary', 'multiclass', 'multiclass-multioutput', 199 | 'multilabel-indicator', 'multilabel-sequences' 200 | 201 | Parameters 202 | ---------- 203 | y : array-like 204 | """ 205 | y_type = type_of_target(y) 206 | if y_type not in [ 207 | "binary", 208 | "multiclass", 209 | "multiclass-multioutput", 210 | "multilabel-indicator", 211 | "multilabel-sequences", 212 | ]: 213 | raise ValueError("Unknown label type: %r" % y_type) 214 | 215 | 216 | def type_of_target(y): 217 | """Determine the type of data indicated by the target. 218 | 219 | Note that this type is the most specific type that can be inferred. 
220 | For example: 221 | 222 | * ``binary`` is more specific but compatible with ``multiclass``. 223 | * ``multiclass`` of integers is more specific but compatible with 224 | ``continuous``. 225 | * ``multilabel-indicator`` is more specific but compatible with 226 | ``multiclass-multioutput``. 227 | 228 | Parameters 229 | ---------- 230 | y : array-like 231 | 232 | Returns 233 | ------- 234 | target_type : string 235 | One of: 236 | 237 | * 'continuous': `y` is an array-like of floats that are not all 238 | integers, and is 1d or a column vector. 239 | * 'continuous-multioutput': `y` is a 2d array of floats that are 240 | not all integers, and both dimensions are of size > 1. 241 | * 'binary': `y` contains <= 2 discrete values and is 1d or a column 242 | vector. 243 | * 'multiclass': `y` contains more than two discrete values, is not a 244 | sequence of sequences, and is 1d or a column vector. 245 | * 'multiclass-multioutput': `y` is a 2d array that contains more 246 | than two discrete values, is not a sequence of sequences, and both 247 | dimensions are of size > 1. 248 | * 'multilabel-indicator': `y` is a label indicator matrix, an array 249 | of two dimensions with at least two columns, and at most 2 unique 250 | values. 251 | * 'unknown': `y` is array-like but none of the above, such as a 3d 252 | array, sequence of sequences, or an array of non-sequence objects. 253 | 254 | Examples 255 | -------- 256 | >>> import numpy as np 257 | >>> type_of_target([0.1, 0.6]) 258 | 'continuous' 259 | >>> type_of_target([1, -1, -1, 1]) 260 | 'binary' 261 | >>> type_of_target(['a', 'b', 'a']) 262 | 'binary' 263 | >>> type_of_target([1.0, 2.0]) 264 | 'binary' 265 | >>> type_of_target([1, 0, 2]) 266 | 'multiclass' 267 | >>> type_of_target([1.0, 0.0, 3.0]) 268 | 'multiclass' 269 | >>> type_of_target(['a', 'b', 'c']) 270 | 'multiclass' 271 | >>> type_of_target(np.array([[1, 2], [3, 1]])) 272 | 'multiclass-multioutput' 273 | >>> type_of_target([[1, 2]]) 274 | 'multiclass-multioutput' 275 | >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]])) 276 | 'continuous-multioutput' 277 | >>> type_of_target(np.array([[0, 1], [1, 1]])) 278 | 'multilabel-indicator' 279 | """ 280 | valid = ( 281 | isinstance(y, (Sequence, spmatrix)) or hasattr(y, "__array__") 282 | ) and not isinstance(y, str) 283 | 284 | if not valid: 285 | raise ValueError( 286 | "Expected array-like (array or non-string sequence), " "got %r" % y 287 | ) 288 | 289 | sparseseries = y.__class__.__name__ == "SparseSeries" 290 | if sparseseries: 291 | raise ValueError("y cannot be class 'SparseSeries'.") 292 | 293 | if is_multilabel(y): 294 | return "multilabel-indicator" 295 | 296 | try: 297 | y = np.asarray(y) 298 | except ValueError: 299 | # Known to fail in numpy 1.3 for array of arrays 300 | return "unknown" 301 | 302 | # The old sequence of sequences format 303 | try: 304 | if ( 305 | not hasattr(y[0], "__array__") 306 | and isinstance(y[0], Sequence) 307 | and not isinstance(y[0], str) 308 | ): 309 | raise ValueError( 310 | "You appear to be using a legacy multi-label data" 311 | " representation. Sequence of sequences are no" 312 | " longer supported; use a binary array or sparse" 313 | " matrix instead - the MultiLabelBinarizer" 314 | " transformer can convert to this format." 
315 | ) 316 | except IndexError: 317 | pass 318 | 319 | # Invalid inputs 320 | if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], str)): 321 | return "unknown" # [[[1, 2]]] or [obj_1] and not ["label_1"] 322 | 323 | if y.ndim == 2 and y.shape[1] == 0: 324 | return "unknown" # [[]] 325 | 326 | if y.ndim == 2 and y.shape[1] > 1: 327 | suffix = "-multioutput" # [[1, 2], [1, 2]] 328 | else: 329 | suffix = "" # [1, 2, 3] or [[1], [2], [3]] 330 | 331 | # check float and contains non-integer float values 332 | if y.dtype.kind == "f" and np.any(y != y.astype(int)): 333 | # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] 334 | _assert_all_finite(y) 335 | return "continuous" + suffix 336 | 337 | if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): 338 | return "multiclass" + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] 339 | else: 340 | return "binary" # [1, 2] or [["a"], ["b"]] 341 | 342 | 343 | def infer_output_dim(y_train): 344 | """ 345 | Infer output_dim from targets 346 | 347 | Parameters 348 | ---------- 349 | y_train : np.array 350 | Training targets 351 | 352 | Returns 353 | ------- 354 | output_dim : int 355 | Number of classes for output 356 | train_labels : list 357 | Sorted list of initial classes 358 | """ 359 | train_labels = unique_labels(y_train) 360 | output_dim = len(train_labels) 361 | 362 | return output_dim, train_labels 363 | 364 | 365 | def check_output_dim(labels, y): 366 | if y is not None: 367 | valid_labels = unique_labels(y) 368 | if not set(valid_labels).issubset(set(labels)): 369 | raise ValueError( 370 | f"""Valid set -- {set(valid_labels)} -- 371 | contains unkown targets from training -- 372 | {set(labels)}""" 373 | ) 374 | return 375 | 376 | 377 | def infer_multitask_output(y_train): 378 | """ 379 | Infer output_dim from targets 380 | This is for multiple tasks. 
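A quick sketch of how these target-inspection helpers behave on a toy two-task label matrix (the body of infer_multitask_output continues below); the array values are illustrative only:

```python
import numpy as np
from src.models.pytorch_tabnet.multiclass_utils import infer_output_dim, infer_multitask_output

y = np.array([[0, 2],
              [1, 2],
              [0, 5],
              [1, 7]])

print(infer_output_dim(y[:, 0]))  # (2, array([0, 1]))  -> 2 classes for the first task
print(infer_multitask_output(y))  # ([2, 3], [array([0, 1]), array([2, 5, 7])])
```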
381 | 382 | Parameters 383 | ---------- 384 | y_train : np.ndarray 385 | Training targets 386 | 387 | Returns 388 | ------- 389 | tasks_dims : list 390 | Number of classes for output 391 | tasks_labels : list 392 | List of sorted list of initial classes 393 | """ 394 | 395 | if len(y_train.shape) < 2: 396 | raise ValueError( 397 | f"""y_train shoud be of shape (n_examples, n_tasks) """ 398 | + f"""but got {y_train.shape}""" 399 | ) 400 | nb_tasks = y_train.shape[1] 401 | tasks_dims = [] 402 | tasks_labels = [] 403 | for task_idx in range(nb_tasks): 404 | try: 405 | output_dim, train_labels = infer_output_dim( 406 | y_train[:, task_idx] 407 | ) 408 | tasks_dims.append(output_dim) 409 | tasks_labels.append(train_labels) 410 | except ValueError as err: 411 | raise ValueError(f"""Error for task {task_idx} : {err}""") 412 | return tasks_dims, tasks_labels 413 | -------------------------------------------------------------------------------- /src/models/tabular_nn.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional 2 | 3 | from src.models.optimizer import RAdam 4 | from src.utils.misc import LoggerFactory 5 | from src.models.loss import SmoothBCEwLogits 6 | from src.models.base import MoaBase 7 | from src.utils.environment import get_device 8 | import copy 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm.auto import trange 12 | 13 | import torch 14 | import torch.nn as nn 15 | from torch.utils.data import Dataset, DataLoader 16 | from torch.optim.lr_scheduler import MultiStepLR 17 | from torch import optim 18 | 19 | DEVICE = get_device() 20 | logger = LoggerFactory().getLogger(__name__) 21 | 22 | 23 | class TabularDataset(Dataset): 24 | def __init__(self, X: pd.DataFrame, y: Optional[pd.DataFrame], predictors): 25 | self.predictors = predictors 26 | self.X = X[predictors].values 27 | 28 | if y is not None: 29 | self.y = y.values 30 | else: 31 | self.y = y 32 | 33 | def __len__(self): 34 | return self.X.shape[0] 35 | 36 | def __getitem__(self, idx): 37 | if self.y is None: 38 | return torch.tensor(self.X[idx], dtype=torch.float).to(DEVICE) 39 | else: 40 | return ( 41 | torch.tensor(self.X[idx], dtype=torch.float).to(DEVICE), 42 | torch.tensor(self.y[idx], dtype=torch.float).to(DEVICE), 43 | ) 44 | 45 | 46 | class TabularMLP_1_1(nn.Module): 47 | def __init__(self, features, targets): 48 | super(TabularMLP_1_1, self).__init__() 49 | 50 | self.sq = nn.Sequential( 51 | nn.BatchNorm1d(len(features)), 52 | nn.utils.weight_norm(nn.Linear(len(features), 1024)), 53 | # nn.Dropout(0.8), 54 | nn.LeakyReLU(), 55 | nn.BatchNorm1d(1024), 56 | nn.utils.weight_norm(nn.Linear(1024, 500)), 57 | nn.Dropout(0.8), 58 | nn.LeakyReLU(), 59 | nn.Linear(500, len(targets)), 60 | ) 61 | 62 | def forward(self, x): 63 | x = self.sq(x) 64 | return x 65 | 66 | 67 | class TabularMLP_1_2(nn.Module): 68 | def __init__(self, n_features, n_targets, hidden_size=512, dropratio=0.2): 69 | super(TabularMLP_1_2, self).__init__() 70 | n_features = len(n_features) 71 | n_targets = len(n_targets) 72 | self.batch_norm1 = nn.BatchNorm1d(n_features) 73 | self.dropout1 = nn.Dropout(dropratio) 74 | self.dense1 = nn.utils.weight_norm(nn.Linear(n_features, hidden_size)) 75 | 76 | self.batch_norm2 = nn.BatchNorm1d(hidden_size) 77 | self.dropout2 = nn.Dropout(dropratio) 78 | self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size)) 79 | 80 | self.batch_norm3 = nn.BatchNorm1d(hidden_size) 81 | self.dropout3 = nn.Dropout(dropratio) 82 | self.dense3 
= nn.utils.weight_norm(nn.Linear(hidden_size, n_targets)) 83 | 84 | self.relu = nn.ReLU() 85 | 86 | def forward(self, x): 87 | x = self.batch_norm1(x) 88 | x = self.dropout1(x) 89 | x = self.relu(self.dense1(x)) 90 | 91 | x = self.batch_norm2(x) 92 | x = self.dropout2(x) 93 | x = self.relu(self.dense2(x)) 94 | 95 | x = self.batch_norm3(x) 96 | x = self.dropout3(x) 97 | x = self.dense3(x) 98 | 99 | return x 100 | 101 | 102 | class TabularMLP_2(nn.Module): 103 | def __init__(self, features, targets): 104 | super(TabularMLP_2, self).__init__() 105 | 106 | self.sq = nn.Sequential( 107 | nn.BatchNorm1d(len(features)), 108 | nn.Linear(len(features), 2048), 109 | # nn.Dropout(0.8), 110 | nn.LeakyReLU(), 111 | nn.BatchNorm1d(2048), 112 | nn.Linear(2048, 500), 113 | nn.Dropout(0.8), 114 | nn.LeakyReLU(), 115 | nn.Linear(500, len(targets)), 116 | ) 117 | 118 | def forward(self, x): 119 | x = self.sq(x) 120 | return x 121 | 122 | 123 | class NNTrainer(MoaBase): 124 | def __init__(self, params: Optional[dict] = None, **kwargs): 125 | if params is None: 126 | self.params = {} 127 | else: 128 | self.params = params 129 | super().__init__(**kwargs) 130 | 131 | def _get_default_params(self): 132 | return { 133 | 'lr': 1e-4, 134 | 'batch_size': 256, 135 | 'epoch': 20, 136 | 'model_class': TabularMLP_1_1, 137 | } 138 | 139 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 140 | X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx] 141 | y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] 142 | target_cols = y_valid.columns.tolist() 143 | 144 | _params = self._get_default_params() 145 | _params.update(self.params) 146 | 147 | # define model & schedulers 148 | num_epoch = _params['epoch'] 149 | batch_size = _params['batch_size'] 150 | net = _params['model_class'](predictors, target_cols) 151 | net.to(DEVICE) 152 | 153 | optimizer = optim.Adam(net.parameters(), lr=_params['lr'], weight_decay=1e-6) 154 | valid_criterion = nn.BCEWithLogitsLoss() 155 | criterion = SmoothBCEwLogits(smoothing=0.001) 156 | scheduler = MultiStepLR(optimizer, milestones=[10, 15], gamma=0.1) 157 | 158 | # 学習時はlength=1の破片などを回避するためdrop_last=1とする 159 | train_dataset = TabularDataset(X_train, y_train, predictors) 160 | train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) 161 | 162 | valid_dataset = TabularDataset(X_valid, y_valid, predictors) 163 | valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) 164 | 165 | bar = trange(num_epoch, desc=f"seed: {seed} train : {X_train.shape[0]} valid:{X_valid.shape[0]}====") 166 | train_loss = [] 167 | valid_loss = [] 168 | 169 | best_loss = np.inf 170 | best_preds = None 171 | best_loss_epoch = 1 172 | 173 | for epoch in bar: 174 | running_loss = [] 175 | valid_loss = [] 176 | 177 | # train 178 | net.train() 179 | for x, y in train_dataloader: 180 | x = x.to(DEVICE) 181 | y = y.to(DEVICE) 182 | optimizer.zero_grad() 183 | out = net(x) 184 | loss = criterion(out, y) 185 | loss.backward() 186 | running_loss.append(loss.item()) 187 | optimizer.step() 188 | scheduler.step() 189 | 190 | net.eval() 191 | 192 | preds_valid = [] 193 | _valid_loss = [] 194 | 195 | with torch.no_grad(): 196 | for x, y in valid_dataloader: 197 | x = x.to(DEVICE) 198 | y = y.to(DEVICE) 199 | out = net(x) 200 | loss = valid_criterion(out, y) 201 | preds_valid.append(out.sigmoid().detach().cpu().numpy()) 202 | _valid_loss.append(loss.item()) 203 | 204 | bar.set_postfix( 
205 | running_loss=f"{np.mean(running_loss):.5f}", 206 | valid_loss=f"{np.mean(_valid_loss):.5f}", 207 | best_loss=f"{best_loss:.5f}", 208 | best_loss_epoch=f"{best_loss_epoch}", 209 | ) 210 | 211 | train_loss.append(np.mean(running_loss)) 212 | valid_loss.append(np.mean(_valid_loss)) 213 | 214 | if best_loss > np.mean(_valid_loss): 215 | best_loss = np.mean(_valid_loss) 216 | best_loss_epoch = epoch + 1 217 | best_preds = np.concatenate(preds_valid) 218 | best_state = copy.deepcopy(net.state_dict()) 219 | 220 | logger.info(f"best loss : {best_loss}") 221 | model = _params['model_class'](predictors, target_cols) 222 | model.load_state_dict(best_state) 223 | model.to(DEVICE) 224 | return best_preds, model 225 | 226 | def _predict(self, model: Any, X_valid: pd.DataFrame, predictors: List[str]): 227 | _params = self._get_default_params() 228 | _params.update(self.params) 229 | batch_size = _params['batch_size'] 230 | valid_dataset = TabularDataset(X_valid, None, predictors) 231 | valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) 232 | tmp_pred = [] 233 | 234 | model.eval() 235 | with torch.no_grad(): 236 | for x in valid_dataloader: 237 | x = x.to(DEVICE) 238 | out = model(x) 239 | tmp_pred.append(out.sigmoid().detach().cpu().numpy()) 240 | return np.concatenate(tmp_pred) 241 | 242 | 243 | class CNNDataset(Dataset): 244 | def __init__(self, X: np.ndarray, y: Optional[pd.DataFrame]): 245 | 246 | self.X = X 247 | 248 | if y is not None: 249 | self.y = y.values 250 | else: 251 | self.y = y 252 | 253 | def __len__(self): 254 | return self.X.shape[0] 255 | 256 | def __getitem__(self, idx): 257 | if self.y is None: 258 | return torch.tensor(self.X[idx], dtype=torch.float).to(DEVICE) 259 | else: 260 | return ( 261 | torch.tensor(self.X[idx], dtype=torch.float).to(DEVICE), 262 | torch.tensor(self.y[idx], dtype=torch.float).to(DEVICE), 263 | ) 264 | 265 | 266 | class CNNStacking(nn.Module): 267 | def __init__(self, n_features, n_labels): 268 | super(CNNStacking, self).__init__() 269 | 270 | self.sq = nn.Sequential( 271 | nn.Conv2d(in_channels=1, out_channels=8, kernel_size=(2, 1), bias=False), 272 | nn.ReLU(), 273 | nn.Conv2d(in_channels=8, out_channels=16, kernel_size=(2, 1), bias=False), 274 | nn.ReLU(), 275 | # nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 1), bias=False), 276 | # nn.ReLU(inplace=True), 277 | # nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 1), bias=False), 278 | # nn.ReLU(), 279 | nn.Flatten(), 280 | nn.Linear(in_features=16 * n_labels, out_features=4 * n_labels), 281 | nn.ReLU(), 282 | nn.Linear(in_features=4 * n_labels, out_features=n_labels), 283 | ) 284 | 285 | def forward(self, x): 286 | return self.sq(x) 287 | 288 | 289 | class CNNTrainer(MoaBase): 290 | def __init__(self, params: Optional[dict] = None, **kwargs): 291 | if params is None: 292 | self.params = {} 293 | else: 294 | self.params = params 295 | super().__init__(**kwargs) 296 | 297 | def _get_default_params(self): 298 | return { 299 | 'lr': 1e-4, 300 | 'batch_size': 256, 301 | 'epoch': 20, 302 | } 303 | 304 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 305 | X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx] 306 | y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] 307 | target_cols = y_valid.columns.tolist() 308 | 309 | _params = self._get_default_params() 310 | _params.update(self.params) 311 | 312 | # define model & schedulers 313 | self.n_predictors = 
len(predictors) 314 | self.n_targets = len(target_cols) 315 | self.n_models = self.n_predictors // self.n_targets 316 | 317 | num_epoch = _params['epoch'] 318 | batch_size = _params['batch_size'] 319 | net = CNNStacking(n_features=self.n_predictors, n_labels=self.n_targets) 320 | net.to(DEVICE) 321 | 322 | # optimizer = optim.Adam(net.parameters(), lr=_params['lr'], weight_decay=1e-6) 323 | optimizer = RAdam(net.parameters(), lr=_params['lr']) 324 | valid_criterion = nn.BCEWithLogitsLoss() 325 | criterion = SmoothBCEwLogits(smoothing=0.001) 326 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0.0001) 327 | # scheduler = MultiStepLR(optimizer, milestones=[10, 15], gamma=0.1) 328 | 329 | # [N, Models, Labels, Channel] -> [N, Channel, Models, Labels] 330 | X_train = X_train[predictors].values.reshape(-1, self.n_models, self.n_targets, 1).transpose(0, 3, 1, 2) 331 | X_valid = X_valid[predictors].values.reshape(-1, self.n_models, self.n_targets, 1).transpose(0, 3, 1, 2) 332 | 333 | # 学習時はlength=1の破片などを回避するためdrop_last=1とする 334 | train_dataset = CNNDataset(X_train, y_train) 335 | train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) 336 | 337 | valid_dataset = CNNDataset(X_valid, y_valid) 338 | valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) 339 | 340 | bar = trange(num_epoch, desc=f"seed: {seed} train : {X_train.shape[0]} valid:{X_valid.shape[0]}====") 341 | train_loss = [] 342 | valid_loss = [] 343 | 344 | best_loss = np.inf 345 | best_preds = None 346 | best_loss_epoch = 1 347 | 348 | for epoch in bar: 349 | running_loss = [] 350 | valid_loss = [] 351 | 352 | # train 353 | net.train() 354 | for x, y in train_dataloader: 355 | x = x.to(DEVICE) 356 | y = y.to(DEVICE) 357 | 358 | optimizer.zero_grad() 359 | out = net(x) 360 | loss = criterion(out, y) 361 | loss.backward() 362 | running_loss.append(loss.item()) 363 | optimizer.step() 364 | scheduler.step() 365 | 366 | preds_valid = [] 367 | _valid_loss = [] 368 | 369 | net.eval() 370 | with torch.no_grad(): 371 | for x, y in valid_dataloader: 372 | x = x.to(DEVICE) 373 | y = y.to(DEVICE) 374 | 375 | out = net(x) 376 | loss = valid_criterion(out, y) 377 | preds_valid.append(out.sigmoid().detach().cpu().numpy()) 378 | _valid_loss.append(loss.item()) 379 | 380 | bar.set_postfix( 381 | running_loss=f"{np.mean(running_loss):.5f}", 382 | valid_loss=f"{np.mean(_valid_loss):.5f}", 383 | best_loss=f"{best_loss:.5f}", 384 | best_loss_epoch=f"{best_loss_epoch}", 385 | ) 386 | 387 | train_loss.append(np.mean(running_loss)) 388 | valid_loss.append(np.mean(_valid_loss)) 389 | 390 | if best_loss > np.mean(_valid_loss): 391 | best_loss = np.mean(_valid_loss) 392 | best_loss_epoch = epoch + 1 393 | best_preds = np.concatenate(preds_valid) 394 | best_state = copy.deepcopy(net.state_dict()) 395 | 396 | logger.info(f"best loss : {best_loss}") 397 | model = CNNStacking(n_features=self.n_predictors, n_labels=self.n_targets) 398 | model.load_state_dict(best_state) 399 | model.to(DEVICE) 400 | return best_preds, model 401 | 402 | def _predict(self, model: Any, X_valid: pd.DataFrame, predictors: List[str]): 403 | _params = self._get_default_params() 404 | _params.update(self.params) 405 | 406 | # [N, Models, Labels, Channel] -> [N, Channel, Models, Labels] 407 | X_valid = X_valid[predictors].values.reshape(-1, self.n_models, self.n_targets, 1).transpose(0, 3, 1, 2) 408 | 409 | batch_size = _params['batch_size'] 410 | valid_dataset = CNNDataset(X_valid, None) 411 | 
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) 412 | tmp_pred = [] 413 | 414 | model.eval() 415 | with torch.no_grad(): 416 | for x in valid_dataloader: 417 | x = x.to(DEVICE) 418 | 419 | out = model(x) 420 | tmp_pred.append(out.sigmoid().detach().cpu().numpy()) 421 | return np.concatenate(tmp_pred) -------------------------------------------------------------------------------- /src/experiment/experiment.py: -------------------------------------------------------------------------------- 1 | # https://github.com/nyanp/nyaggle/blob/master/nyaggle/experiment/experiment.py 2 | 3 | import json 4 | import numbers 5 | import os 6 | import shutil 7 | import uuid 8 | import warnings 9 | from logging import getLogger, FileHandler, DEBUG, Logger 10 | from typing import Dict, Optional 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | from src.utils.environment import requires_mlflow 16 | 17 | MLFLOW_KEY_LENGTH_LIMIT = 250 18 | MLFLOW_VALUE_LENGTH_LIMIT = 250 19 | 20 | 21 | def _sanitize_mlflow_param(param, limit): 22 | if len(str(param)) > limit: 23 | warnings.warn('Length of param exceeds limit {}. It will be truncated. value: {}'.format(limit, param)) 24 | param = str(param)[:limit] 25 | return param 26 | 27 | 28 | def _check_directory(directory: str, if_exists: str) -> str: 29 | if os.path.exists(directory): 30 | if if_exists == 'error': 31 | raise ValueError('directory {} already exists.'.format(directory)) 32 | elif if_exists == 'replace': 33 | warnings.warn('directory {} already exists. It will be replaced by the new result'.format(directory)) 34 | 35 | existing_run_id = _try_to_get_existing_mlflow_run_id(directory) 36 | if existing_run_id is not None: 37 | requires_mlflow() 38 | import mlflow 39 | mlflow.delete_run(existing_run_id) 40 | 41 | shutil.rmtree(directory, ignore_errors=True) 42 | elif if_exists == 'rename': 43 | postfix_index = 1 44 | 45 | while os.path.exists(directory + '_' + str(postfix_index)): 46 | postfix_index += 1 47 | 48 | directory += '_' + str(postfix_index) 49 | warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory)) 50 | return directory 51 | 52 | 53 | def _sanitize(v): 54 | return v if isinstance(v, numbers.Number) else str(v) 55 | 56 | 57 | def _try_to_get_existing_mlflow_run_id(logging_directory: str) -> Optional[str]: 58 | mlflow_path = os.path.join(logging_directory, 'mlflow.json') 59 | if os.path.exists(mlflow_path): 60 | with open(mlflow_path, 'r') as f: 61 | mlflow_metadata = json.load(f) 62 | return mlflow_metadata['run_id'] 63 | return None 64 | 65 | 66 | class Experiment(object): 67 | """Minimal experiment logger for Kaggle 68 | This module provides minimal functionality for tracking experiments. 69 | The output files are laid out as follows: 70 | .. code-block:: none 71 | / 72 | log.txt <== Output of log 73 | metrics.json <== Output of log_metric(s), format: name,score 74 | params.json <== Output of log_param(s), format: key,value 75 | mlflow.json <== mlflow's run_id, experiment_id and artifact_uri (logged if with_mlflow=True) 76 | You can add numpy array and pandas dataframe under the directory through ``log_numpy`` and ``log_dataframe``. 77 | Args: 78 | logging_directory: 79 | Path to directory where output is stored. 80 | custom_logger: 81 | A custom logger to be used instead of default logger. 82 | with_mlflow: 83 | If True, `mlflow tracking `_ is used. 84 | One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow. 
85 |         Note that all output files are located both ``logging_directory`` and
86 |         mlflow's directory (``mlruns`` by default).
87 |         if_exists:
88 |             How to behave if the logging directory already exists.
89 |             - error: Raise a ValueError.
90 |             - replace: Delete logging directory before logging.
91 |             - append: Append to existing experiment.
92 |             - rename: Rename current directory by adding "_1", "_2"... suffix
93 |     Example:
94 |         >>> import numpy as np
95 |         >>> import pandas as pd
96 |         >>> from nyaggle.experiment import Experiment
97 |         >>>
98 |         >>> with Experiment(logging_directory='./output/') as exp:
99 |         >>>     # log key-value pair as a parameter
100 |         >>>     exp.log_param('lr', 0.01)
101 |         >>>     exp.log_param('optimizer', 'adam')
102 |         >>>
103 |         >>>     # log text
104 |         >>>     exp.log('blah blah blah')
105 |         >>>
106 |         >>>     # log metric
107 |         >>>     exp.log_metric('CV', 0.85)
108 |         >>>
109 |         >>>     # log dictionary with flattening keys
110 |         >>>     exp.log_dict('params', {'X': 3, 'Y': {'Z': 'foobar'}})
111 |         >>>
112 |         >>>     # log numpy ndarray, pandas dataframe and any artifacts
113 |         >>>     exp.log_numpy('predicted', np.zeros(1))
114 |         >>>     exp.log_dataframe('submission', pd.DataFrame(), file_format='csv')
115 |         >>>     exp.log_artifact('path-to-your-file')
116 |     """
117 |     def __init__(self, logging_directory: str, custom_logger: Optional[Logger] = None, with_mlflow: bool = False, if_exists: str = 'error'):
118 |         logging_directory = _check_directory(logging_directory, if_exists)
119 |         os.makedirs(logging_directory, exist_ok=True)
120 |
121 |         self.logging_directory = logging_directory
122 |         self.with_mlflow = with_mlflow
123 |
124 |         if custom_logger is not None:
125 |             self.logger = custom_logger
126 |             self.is_custom = True
127 |         else:
128 |             self.logger = getLogger(str(uuid.uuid4()))
129 |             self.log_path = os.path.join(logging_directory, 'log.txt')
130 |             self.logger.addHandler(FileHandler(self.log_path))
131 |             self.logger.setLevel(DEBUG)
132 |             self.is_custom = False
133 |         self.metrics = self._load_dict('metrics.json')
134 |         self.params = self._load_dict('params.json')
135 |         self.inherit_existing_run = False
136 |
137 |         if self.with_mlflow:
138 |             requires_mlflow()
139 |             self.mlflow_run_id = _try_to_get_existing_mlflow_run_id(logging_directory)
140 |             if self.mlflow_run_id is not None:
141 |                 self.mlflow_run_name = None
142 |             else:
143 |                 self.mlflow_run_name = logging_directory
144 |
145 |     def __enter__(self):
146 |         self.start()
147 |         return self
148 |
149 |     def __exit__(self, ex_type, ex_value, trace):
150 |         self.stop()
151 |
152 |     @classmethod
153 |     def continue_from(cls, logging_directory: str, with_mlflow: bool = False):
154 |         return cls(logging_directory=logging_directory, if_exists='append', with_mlflow=with_mlflow)
155 |
156 |     def start(self):
157 |         """
158 |         Start a new experiment.
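        If ``with_mlflow`` is True, an already active mlflow run is reused when present;
        otherwise a new run is started (resuming the run recorded in ``mlflow.json`` if one
        exists) and its metadata is written to ``mlflow.json``.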
159 | """ 160 | if self.with_mlflow: 161 | import mlflow 162 | 163 | if mlflow.active_run() is not None: 164 | active_run = mlflow.active_run() 165 | self.inherit_existing_run = True 166 | else: 167 | active_run = mlflow.start_run(run_name=self.mlflow_run_name, run_id=self.mlflow_run_id) 168 | mlflow_metadata = {'artifact_uri': active_run.info.artifact_uri, 'experiment_id': active_run.info.experiment_id, 'run_id': active_run.info.run_id} 169 | self.mlflow_run_id = active_run.info.run_id 170 | with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f: 171 | json.dump(mlflow_metadata, f, indent=4) 172 | 173 | def _load_dict(self, filename: str) -> Dict: 174 | try: 175 | path = os.path.join(self.logging_directory, filename) 176 | with open(path, 'r') as f: 177 | return json.load(f) 178 | except IOError: 179 | self.logger.warning('failed to load file: {}'.format(filename)) 180 | return {} 181 | 182 | def _save_dict(self, obj: Dict, filename: str): 183 | try: 184 | path = os.path.join(self.logging_directory, filename) 185 | with open(path, 'w') as f: 186 | json.dump(obj, f, indent=2) 187 | except IOError: 188 | self.logger.warning('failed to save file: {}'.format(filename)) 189 | 190 | def stop(self): 191 | """ 192 | Stop current experiment. 193 | """ 194 | self._save_dict(self.metrics, 'metrics.json') 195 | self._save_dict(self.params, 'params.json') 196 | 197 | if not self.is_custom: 198 | for h in self.logger.handlers: 199 | h.close() 200 | 201 | if self.with_mlflow: 202 | import mlflow 203 | from mlflow.exceptions import MlflowException 204 | 205 | try: 206 | mlflow.log_artifact(self.log_path) 207 | mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json')) 208 | mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json')) 209 | except MlflowException as e: 210 | warnings.warn('Error in saving artifacts to mlflow. The result may not be saved.: {}'.format(e)) 211 | if not self.inherit_existing_run: 212 | mlflow.end_run() 213 | 214 | def get_logger(self) -> Logger: 215 | """ 216 | Get logger used in this experiment. 217 | Returns: 218 | logger object 219 | """ 220 | return self.logger 221 | 222 | def get_run(self): 223 | """ 224 | Get mlflow's currently active run, or None if ``with_mlflow = False``. 225 | Returns: 226 | active Run 227 | """ 228 | if not self.with_mlflow: 229 | return None 230 | 231 | import mlflow 232 | return mlflow.active_run() 233 | 234 | def log(self, text: str): 235 | """ 236 | Logs a message on the logger for the experiment. 237 | Args: 238 | text: 239 | The message to be written. 240 | """ 241 | self.logger.info(text) 242 | 243 | def log_param(self, key, value): 244 | """ 245 | Logs a key-value pair for the experiment. 246 | Args: 247 | key: parameter name 248 | value: parameter value 249 | """ 250 | key = _sanitize(key) 251 | value = _sanitize(value) 252 | self.params[key] = value 253 | 254 | if self.with_mlflow: 255 | import mlflow 256 | from mlflow.exceptions import MlflowException 257 | 258 | key_mlflow = _sanitize_mlflow_param(key, MLFLOW_KEY_LENGTH_LIMIT) 259 | value_mlflow = _sanitize_mlflow_param(value, MLFLOW_VALUE_LENGTH_LIMIT) 260 | 261 | try: 262 | mlflow.log_param(key_mlflow, value_mlflow) 263 | except MlflowException as e: 264 | warnings.warn('Error in logging parameter {} to mlflow. Skipped. {}'.format(key, e)) 265 | 266 | def log_params(self, params: Dict): 267 | """ 268 | Logs a batch of params for the experiments. 
269 | Args: 270 | params: dictionary of parameters 271 | """ 272 | for k, v in params.items(): 273 | self.log_param(k, v) 274 | 275 | def log_dict(self, name: str, value: Dict, separator: str = '.'): 276 | """ 277 | Logs a dictionary as parameter with flatten format. 278 | Args: 279 | name: Parameter name 280 | value: Parameter value 281 | separator: Separating character used to concatanate keys 282 | Examples: 283 | >>> with Experiment('./') as e: 284 | >>> e.log_dict('a', {'b': 1, 'c': 'd'}) 285 | >>> print(e.params) 286 | { 'a.b': 1, 'a.c': 'd' } 287 | """ 288 | 289 | if value is None: 290 | self.log_param(name, value) 291 | return 292 | 293 | def _flatten(d: Dict, prefix: str, separator: str) -> Dict: 294 | items = [] 295 | for k, v in d.items(): 296 | child_key = prefix + separator + str(k) if prefix else str(k) 297 | if isinstance(v, Dict) and v: 298 | items.extend(_flatten(v, child_key, separator).items()) 299 | else: 300 | items.append((child_key, v)) 301 | return dict(items) 302 | 303 | value = _flatten(value, name, separator) 304 | self.log_params(value) 305 | 306 | def log_metric(self, name: str, score: float): 307 | """ 308 | Log a metric under the logging directory. 309 | Args: 310 | name: 311 | Metric name. 312 | score: 313 | Metric value. 314 | """ 315 | name = _sanitize(name) 316 | score = _sanitize(score) 317 | self.metrics[name] = score 318 | 319 | if self.with_mlflow: 320 | import mlflow 321 | from mlflow.exceptions import MlflowException 322 | 323 | try: 324 | mlflow.log_metric(name, score) 325 | except MlflowException as e: 326 | warnings.warn('Error in logging metric {} to mlflow. Skipped. {}'.format(name, e)) 327 | 328 | def log_metrics(self, metrics: Dict): 329 | """ 330 | Log a batch of metrics under the logging directory. 331 | Args: 332 | metrics: dictionary of metrics. 333 | """ 334 | for k, v in metrics.items(): 335 | self.log_metric(k, v) 336 | 337 | def log_numpy(self, name: str, array: np.ndarray): 338 | """ 339 | Log a numpy ndarray under the logging directory. 340 | Args: 341 | name: 342 | Name of the file. A .npy extension will be appended to the file name if it does not already have one. 343 | array: 344 | Array data to be saved. 345 | """ 346 | path = os.path.join(self.logging_directory, name) 347 | np.save(path, array) 348 | 349 | if self.with_mlflow: 350 | import mlflow 351 | mlflow.log_artifact(path + '.npy') 352 | 353 | def log_dataframe(self, name: str, df: pd.DataFrame, file_format: str = 'feather'): 354 | """ 355 | Log a pandas dataframe under the logging directory. 356 | Args: 357 | name: 358 | Name of the file. A ``.f`` or ``.csv`` extension will be appended to the file name 359 | if it does not already have one. 360 | df: 361 | A dataframe to be saved. 362 | file_format: 363 | A format of output file. ``csv`` and ``feather`` are supported. 364 | """ 365 | path = os.path.join(self.logging_directory, name) 366 | if file_format == 'feather': 367 | if not path.endswith('.f'): 368 | path += '.f' 369 | df.to_feather(path) 370 | elif file_format == 'csv': 371 | if not path.endswith('.csv'): 372 | path += '.csv' 373 | df.to_csv(path, index=False) 374 | else: 375 | raise RuntimeError('format not supported') 376 | 377 | if self.with_mlflow: 378 | import mlflow 379 | mlflow.log_artifact(path) 380 | 381 | def log_artifact(self, src_file_path: str): 382 | """ 383 | Make a copy of the file under the logging directory. 384 | Args: 385 | src_file_path: 386 | Path of the file. If path is not a child of the logging directory, the file will be copied. 
387 | If ``with_mlflow`` is True, ``mlflow.log_artifact`` will be called (then another copy will be made). 388 | """ 389 | logging_path = os.path.abspath(self.logging_directory) 390 | src_file_path = os.path.abspath(src_file_path) 391 | 392 | if os.path.commonpath([logging_path]) != os.path.commonpath([logging_path, src_file_path]): 393 | src_file = os.path.basename(src_file_path) 394 | shutil.copy(src_file, self.logging_directory) 395 | 396 | if self.with_mlflow: 397 | import mlflow 398 | mlflow.log_artifact(src_file_path) 399 | 400 | 401 | def add_leaderboard_score(logging_directory: str, score: float): 402 | """ 403 | Record leaderboard score to the existing experiment directory. 404 | Args: 405 | logging_directory: 406 | The directory to be added 407 | score: 408 | Leaderboard score 409 | """ 410 | with Experiment.continue_from(logging_directory) as e: 411 | e.log_metric('LB', score) -------------------------------------------------------------------------------- /src/utils/splitter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.utils import check_random_state 4 | from sklearn.utils.validation import _num_samples, check_array 5 | from sklearn.utils.multiclass import type_of_target 6 | from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold 7 | 8 | from sklearn.model_selection._split import ( 9 | _BaseKFold, 10 | _RepeatedSplits, 11 | BaseShuffleSplit, 12 | _validate_shuffle_split, 13 | ) 14 | 15 | 16 | def splitter_for_moa(X, target_cols, n_splits=5, seed=42): 17 | folds = [] 18 | X = X.copy() 19 | # LOCATE DRUGS 20 | vc = X.drug_id.value_counts() 21 | 22 | vc1 = vc.loc[(vc == 6) | (vc == 12) | (vc == 18)].index.sort_values() 23 | vc2 = vc.loc[(vc != 6) & (vc != 12) & (vc != 18)].index.sort_values() 24 | 25 | # STRATIFY DRUGS 18X OR LESS 26 | dct1 = {} 27 | dct2 = {} 28 | skf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) 29 | tmp = X.groupby("drug_id")[target_cols].mean().loc[vc1] 30 | for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[target_cols])): 31 | dd = {k: fold for k in tmp.index[idxV].values} 32 | dct1.update(dd) 33 | 34 | # STRATIFY DRUGS MORE THAN 18X 35 | skf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) 36 | tmp = X.loc[X.drug_id.isin(vc2)].reset_index(drop=True) 37 | for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[target_cols])): 38 | dd = {k: fold for k in tmp.sig_id[idxV].values} 39 | dct2.update(dd) 40 | 41 | # ASSIGN FOLDS 42 | X["fold"] = X.drug_id.map(dct1) 43 | X.loc[X.fold.isna(), "fold"] = X.loc[X.fold.isna(), "sig_id"].map(dct2) 44 | X.fold = X.fold.astype("int8") 45 | folds.append(X.fold.values) 46 | 47 | _folds = np.stack(folds).flatten() 48 | folds = [] 49 | for idx in range(n_splits): 50 | train_idx = np.where(_folds != idx)[0] 51 | valid_idx = np.where(_folds == idx)[0] 52 | folds.append((train_idx, valid_idx)) 53 | return folds 54 | 55 | 56 | class SplitFactory: 57 | # split_type: necessary params 58 | split_pattern = { 59 | 'kfolds': [], 60 | 'stratified': ['target_col'], 61 | 'multilabel_stratified': ['target_col'], 62 | 'group': ['group_col'], 63 | } 64 | 65 | def __init__(self, n_splits: int = 5, split_type: str = "kfolds", random_state: int = 46, **kwargs): 66 | self.split_type = split_type 67 | self.n_splits = n_splits 68 | self.random_state = random_state 69 | self.params = kwargs 70 | 71 | if split_type not in self.split_pattern: 72 | raise ValueError(f"type: {split_type} is 
not in {list(self.split_pattern.keys())}")
73 |         for arg in self.split_pattern[split_type]:
74 |             if arg not in self.params:
75 |                 raise ValueError(f"split type {split_type} requires the {arg} param")
76 |
77 |         # set splitter
78 |         self._get_splitter()
79 |
80 |     def _get_splitter(self):
81 |         if self.split_type == 'kfolds':
82 |             self._split = lambda X, y: KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state).split(X, y)
83 |         elif self.split_type == 'stratified':
84 |             target_col = self.params['target_col']
85 |             self._split = lambda X, y: StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state).split(X, y, X[target_col])
86 |         elif self.split_type == 'group':
87 |             group_col = self.params['group_col']
88 |             self._split = lambda X, y: GroupKFold(n_splits=self.n_splits).split(X, y, groups=X[group_col])
89 |         elif self.split_type == 'multilabel_stratified':
90 |             target_col = self.params['target_col']
91 |             self._split = lambda X, y: MultilabelStratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state).split(X, y)
92 |         else:
93 |             raise AttributeError(f"split_type: {self.split_type} is not supported")
94 |
95 |     def split(self, X, y=None):
96 |         return self._split(X, y)
97 |
98 |
99 | # https://github.com/trent-b/iterative-stratification
100 | def IterativeStratification(labels, r, random_state):
101 |     """This function implements the Iterative Stratification algorithm described
102 |     in the following paper:
103 |     Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
104 |     Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M.
105 |     (eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD
106 |     2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin,
107 |     Heidelberg.
108 |     """
109 |
110 |     n_samples = labels.shape[0]
111 |     test_folds = np.zeros(n_samples, dtype=int)
112 |
113 |     # Calculate the desired number of examples at each subset
114 |     c_folds = r * n_samples
115 |
116 |     # Calculate the desired number of examples of each label at each subset
117 |     c_folds_labels = np.outer(r, labels.sum(axis=0))
118 |
119 |     labels_not_processed_mask = np.ones(n_samples, dtype=bool)
120 |
121 |     while np.any(labels_not_processed_mask):
122 |         # Find the label with the fewest (but at least one) remaining examples,
123 |         # breaking ties randomly
124 |         num_labels = labels[labels_not_processed_mask].sum(axis=0)
125 |
126 |         # Handle case where only all-zero labels are left by distributing
127 |         # across all folds as evenly as possible (not in original algorithm but
128 |         # mentioned in the text). (By handling this case separately, some
129 |         # code redundancy is introduced; however, this approach allows for
130 |         # decreased execution time when there are a relatively large number
131 |         # of all-zero labels.)
132 | if num_labels.sum() == 0: 133 | sample_idxs = np.where(labels_not_processed_mask)[0] 134 | 135 | for sample_idx in sample_idxs: 136 | fold_idx = np.where(c_folds == c_folds.max())[0] 137 | 138 | if fold_idx.shape[0] > 1: 139 | fold_idx = fold_idx[random_state.choice(fold_idx.shape[0])] 140 | 141 | test_folds[sample_idx] = fold_idx 142 | c_folds[fold_idx] -= 1 143 | 144 | break 145 | 146 | label_idx = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0] 147 | if label_idx.shape[0] > 1: 148 | label_idx = label_idx[random_state.choice(label_idx.shape[0])] 149 | 150 | sample_idxs = np.where(np.logical_and(labels[:, label_idx].flatten(), labels_not_processed_mask))[0] 151 | 152 | for sample_idx in sample_idxs: 153 | # Find the subset(s) with the largest number of desired examples 154 | # for this label, breaking ties by considering the largest number 155 | # of desired examples, breaking further ties randomly 156 | label_folds = c_folds_labels[:, label_idx] 157 | fold_idx = np.where(label_folds == label_folds.max())[0] 158 | 159 | if fold_idx.shape[0] > 1: 160 | temp_fold_idx = np.where(c_folds[fold_idx] == c_folds[fold_idx].max())[0] 161 | fold_idx = fold_idx[temp_fold_idx] 162 | 163 | if temp_fold_idx.shape[0] > 1: 164 | fold_idx = fold_idx[random_state.choice(temp_fold_idx.shape[0])] 165 | 166 | test_folds[sample_idx] = fold_idx 167 | labels_not_processed_mask[sample_idx] = False 168 | 169 | # Update desired number of examples 170 | c_folds_labels[fold_idx, labels[sample_idx]] -= 1 171 | c_folds[fold_idx] -= 1 172 | 173 | return test_folds 174 | 175 | 176 | class MultilabelStratifiedKFold(_BaseKFold): 177 | """Multilabel stratified K-Folds cross-validator 178 | Provides train/test indices to split multilabel data into train/test sets. 179 | This cross-validation object is a variation of KFold that returns 180 | stratified folds for multilabel data. The folds are made by preserving 181 | the percentage of samples for each label. 182 | Parameters 183 | ---------- 184 | n_splits : int, default=3 185 | Number of folds. Must be at least 2. 186 | shuffle : boolean, optional 187 | Whether to shuffle each stratification of the data before splitting 188 | into batches. 189 | random_state : int, RandomState instance or None, optional, default=None 190 | If int, random_state is the seed used by the random number generator; 191 | If RandomState instance, random_state is the random number generator; 192 | If None, the random number generator is the RandomState instance used 193 | by `np.random`. Unlike StratifiedKFold that only uses random_state 194 | when ``shuffle`` == True, this multilabel implementation 195 | always uses the random_state since the iterative stratification 196 | algorithm breaks ties randomly. 197 | Examples 198 | -------- 199 | >>> from iterstrat.ml_stratifiers import MultilabelStratifiedKFold 200 | >>> import numpy as np 201 | >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) 202 | >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) 203 | >>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0) 204 | >>> mskf.get_n_splits(X, y) 205 | 2 206 | >>> print(mskf) # doctest: +NORMALIZE_WHITESPACE 207 | MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False) 208 | >>> for train_index, test_index in mskf.split(X, y): 209 | ... print("TRAIN:", train_index, "TEST:", test_index) 210 | ... X_train, X_test = X[train_index], X[test_index] 211 | ... 
y_train, y_test = y[train_index], y[test_index] 212 | TRAIN: [0 3 4 6] TEST: [1 2 5 7] 213 | TRAIN: [1 2 5 7] TEST: [0 3 4 6] 214 | Notes 215 | ----- 216 | Train and test sizes may be slightly different in each fold. 217 | See also 218 | -------- 219 | RepeatedMultilabelStratifiedKFold: Repeats Multilabel Stratified K-Fold 220 | n times. 221 | """ 222 | def __init__(self, n_splits=3, shuffle=False, random_state=None): 223 | super(MultilabelStratifiedKFold, self).__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) 224 | 225 | def _make_test_folds(self, X, y): 226 | y = np.asarray(y, dtype=bool) 227 | type_of_target_y = type_of_target(y) 228 | 229 | if type_of_target_y != "multilabel-indicator": 230 | raise ValueError("Supported target type is: multilabel-indicator. Got {!r} instead.".format(type_of_target_y)) 231 | 232 | num_samples = y.shape[0] 233 | 234 | rng = check_random_state(self.random_state) 235 | indices = np.arange(num_samples) 236 | 237 | if self.shuffle: 238 | rng.shuffle(indices) 239 | y = y[indices] 240 | 241 | r = np.asarray([1 / self.n_splits] * self.n_splits) 242 | 243 | test_folds = IterativeStratification(labels=y, r=r, random_state=rng) 244 | 245 | return test_folds[np.argsort(indices)] 246 | 247 | def _iter_test_masks(self, X=None, y=None, groups=None): 248 | test_folds = self._make_test_folds(X, y) 249 | for i in range(self.n_splits): 250 | yield test_folds == i 251 | 252 | def split(self, X, y, groups=None): 253 | """Generate indices to split data into training and test set. 254 | Parameters 255 | ---------- 256 | X : array-like, shape (n_samples, n_features) 257 | Training data, where n_samples is the number of samples 258 | and n_features is the number of features. 259 | Note that providing ``y`` is sufficient to generate the splits and 260 | hence ``np.zeros(n_samples)`` may be used as a placeholder for 261 | ``X`` instead of actual training data. 262 | y : array-like, shape (n_samples, n_labels) 263 | The target variable for supervised learning problems. 264 | Multilabel stratification is done based on the y labels. 265 | groups : object 266 | Always ignored, exists for compatibility. 267 | Returns 268 | ------- 269 | train : ndarray 270 | The training set indices for that split. 271 | test : ndarray 272 | The testing set indices for that split. 273 | Notes 274 | ----- 275 | Randomized CV splitters may return different results for each call of 276 | split. You can make the results identical by setting ``random_state`` 277 | to an integer. 278 | """ 279 | y = check_array(y, ensure_2d=False, dtype=None) 280 | return super(MultilabelStratifiedKFold, self).split(X, y, groups) 281 | 282 | 283 | class RepeatedMultilabelStratifiedKFold(_RepeatedSplits): 284 | """Repeated Multilabel Stratified K-Fold cross validator. 285 | Repeats Mulilabel Stratified K-Fold n times with different randomization 286 | in each repetition. 287 | Parameters 288 | ---------- 289 | n_splits : int, default=5 290 | Number of folds. Must be at least 2. 291 | n_repeats : int, default=10 292 | Number of times cross-validator needs to be repeated. 293 | random_state : None, int or RandomState, default=None 294 | Random state to be used to generate random state for each 295 | repetition as well as randomly breaking ties within the iterative 296 | stratification algorithm. 
297 | Examples 298 | -------- 299 | >>> from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold 300 | >>> import numpy as np 301 | >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) 302 | >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) 303 | >>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2, 304 | ... random_state=0) 305 | >>> for train_index, test_index in rmskf.split(X, y): 306 | ... print("TRAIN:", train_index, "TEST:", test_index) 307 | ... X_train, X_test = X[train_index], X[test_index] 308 | ... y_train, y_test = y[train_index], y[test_index] 309 | ... 310 | TRAIN: [0 3 4 6] TEST: [1 2 5 7] 311 | TRAIN: [1 2 5 7] TEST: [0 3 4 6] 312 | TRAIN: [0 1 4 5] TEST: [2 3 6 7] 313 | TRAIN: [2 3 6 7] TEST: [0 1 4 5] 314 | See also 315 | -------- 316 | RepeatedStratifiedKFold: Repeats (Non-multilabel) Stratified K-Fold 317 | n times. 318 | """ 319 | def __init__(self, n_splits=5, n_repeats=10, random_state=None): 320 | super(RepeatedMultilabelStratifiedKFold, self).__init__( 321 | MultilabelStratifiedKFold, 322 | n_repeats=n_repeats, 323 | random_state=random_state, 324 | n_splits=n_splits, 325 | ) 326 | 327 | 328 | class MultilabelStratifiedShuffleSplit(BaseShuffleSplit): 329 | """Multilabel Stratified ShuffleSplit cross-validator 330 | Provides train/test indices to split data into train/test sets. 331 | This cross-validation object is a merge of MultilabelStratifiedKFold and 332 | ShuffleSplit, which returns stratified randomized folds for multilabel 333 | data. The folds are made by preserving the percentage of each label. 334 | Note: like the ShuffleSplit strategy, multilabel stratified random splits 335 | do not guarantee that all folds will be different, although this is 336 | still very likely for sizeable datasets. 337 | Parameters 338 | ---------- 339 | n_splits : int, default 10 340 | Number of re-shuffling & splitting iterations. 341 | test_size : float, int, None, optional 342 | If float, should be between 0.0 and 1.0 and represent the proportion 343 | of the dataset to include in the test split. If int, represents the 344 | absolute number of test samples. If None, the value is set to the 345 | complement of the train size. By default, the value is set to 0.1. 346 | The default will change in version 0.21. It will remain 0.1 only 347 | if ``train_size`` is unspecified, otherwise it will complement 348 | the specified ``train_size``. 349 | train_size : float, int, or None, default is None 350 | If float, should be between 0.0 and 1.0 and represent the 351 | proportion of the dataset to include in the train split. If 352 | int, represents the absolute number of train samples. If None, 353 | the value is automatically set to the complement of the test size. 354 | random_state : int, RandomState instance or None, optional (default=None) 355 | If int, random_state is the seed used by the random number generator; 356 | If RandomState instance, random_state is the random number generator; 357 | If None, the random number generator is the RandomState instance used 358 | by `np.random`. Unlike StratifiedShuffleSplit that only uses 359 | random_state when ``shuffle`` == True, this multilabel implementation 360 | always uses the random_state since the iterative stratification 361 | algorithm breaks ties randomly. 
362 | Examples 363 | -------- 364 | >>> from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit 365 | >>> import numpy as np 366 | >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) 367 | >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) 368 | >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5, 369 | ... random_state=0) 370 | >>> msss.get_n_splits(X, y) 371 | 3 372 | >>> print(mss) # doctest: +ELLIPSIS 373 | MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5, 374 | train_size=None) 375 | >>> for train_index, test_index in msss.split(X, y): 376 | ... print("TRAIN:", train_index, "TEST:", test_index) 377 | ... X_train, X_test = X[train_index], X[test_index] 378 | ... y_train, y_test = y[train_index], y[test_index] 379 | TRAIN: [1 2 5 7] TEST: [0 3 4 6] 380 | TRAIN: [2 3 6 7] TEST: [0 1 4 5] 381 | TRAIN: [1 2 5 6] TEST: [0 3 4 7] 382 | Notes 383 | ----- 384 | Train and test sizes may be slightly different from desired due to the 385 | preference of stratification over perfectly sized folds. 386 | """ 387 | def __init__(self, n_splits=10, test_size="default", train_size=None, random_state=None): 388 | super(MultilabelStratifiedShuffleSplit, self).__init__( 389 | n_splits=n_splits, 390 | test_size=test_size, 391 | train_size=train_size, 392 | random_state=random_state, 393 | ) 394 | 395 | def _iter_indices(self, X, y, groups=None): 396 | n_samples = _num_samples(X) 397 | y = check_array(y, ensure_2d=False, dtype=None) 398 | y = np.asarray(y, dtype=bool) 399 | type_of_target_y = type_of_target(y) 400 | 401 | if type_of_target_y != "multilabel-indicator": 402 | raise ValueError("Supported target type is: multilabel-indicator. Got {!r} instead.".format(type_of_target_y)) 403 | 404 | n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, self.train_size) 405 | 406 | n_samples = y.shape[0] 407 | rng = check_random_state(self.random_state) 408 | y_orig = y.copy() 409 | 410 | r = np.array([n_train, n_test]) / (n_train + n_test) 411 | 412 | for _ in range(self.n_splits): 413 | indices = np.arange(n_samples) 414 | rng.shuffle(indices) 415 | y = y_orig[indices] 416 | 417 | test_folds = IterativeStratification(labels=y, r=r, random_state=rng) 418 | 419 | test_idx = test_folds[np.argsort(indices)] == 1 420 | test = np.where(test_idx)[0] 421 | train = np.where(~test_idx)[0] 422 | 423 | yield train, test 424 | 425 | def split(self, X, y, groups=None): 426 | """Generate indices to split data into training and test set. 427 | Parameters 428 | ---------- 429 | X : array-like, shape (n_samples, n_features) 430 | Training data, where n_samples is the number of samples 431 | and n_features is the number of features. 432 | Note that providing ``y`` is sufficient to generate the splits and 433 | hence ``np.zeros(n_samples)`` may be used as a placeholder for 434 | ``X`` instead of actual training data. 435 | y : array-like, shape (n_samples, n_labels) 436 | The target variable for supervised learning problems. 437 | Multilabel stratification is done based on the y labels. 438 | groups : object 439 | Always ignored, exists for compatibility. 440 | Returns 441 | ------- 442 | train : ndarray 443 | The training set indices for that split. 444 | test : ndarray 445 | The testing set indices for that split. 446 | Notes 447 | ----- 448 | Randomized CV splitters may return different results for each call of 449 | split. You can make the results identical by setting ``random_state`` 450 | to an integer. 
451 | """ 452 | y = check_array(y, ensure_2d=False, dtype=None) 453 | return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups) 454 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/tab_network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Linear, BatchNorm1d, ReLU 3 | import numpy as np 4 | from . import sparsemax 5 | 6 | 7 | def initialize_non_glu(module, input_dim, output_dim): 8 | gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(4 * input_dim)) 9 | torch.nn.init.xavier_normal_(module.weight, gain=gain_value) 10 | # torch.nn.init.zeros_(module.bias) 11 | return 12 | 13 | 14 | def initialize_glu(module, input_dim, output_dim): 15 | gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(input_dim)) 16 | torch.nn.init.xavier_normal_(module.weight, gain=gain_value) 17 | # torch.nn.init.zeros_(module.bias) 18 | return 19 | 20 | 21 | class GBN(torch.nn.Module): 22 | """ 23 | Ghost Batch Normalization 24 | https://arxiv.org/abs/1705.08741 25 | """ 26 | def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01): 27 | super(GBN, self).__init__() 28 | 29 | self.input_dim = input_dim 30 | self.virtual_batch_size = virtual_batch_size 31 | self.bn = BatchNorm1d(self.input_dim, momentum=momentum) 32 | 33 | def forward(self, x): 34 | chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0) 35 | res = [self.bn(x_) for x_ in chunks] 36 | 37 | return torch.cat(res, dim=0) 38 | 39 | 40 | class TabNetNoEmbeddings(torch.nn.Module): 41 | def __init__(self, 42 | input_dim, 43 | output_dim, 44 | n_d=8, 45 | n_a=8, 46 | n_steps=3, 47 | gamma=1.3, 48 | n_independent=2, 49 | n_shared=2, 50 | epsilon=1e-15, 51 | virtual_batch_size=128, 52 | momentum=0.02, 53 | mask_type="sparsemax"): 54 | """ 55 | Defines main part of the TabNet network without the embedding layers. 56 | 57 | Parameters 58 | ---------- 59 | input_dim : int 60 | Number of features 61 | output_dim : int or list of int for multi task classification 62 | Dimension of network output 63 | examples : one for regression, 2 for binary classification etc... 
64 | n_d : int 65 | Dimension of the prediction layer (usually between 4 and 64) 66 | n_a : int 67 | Dimension of the attention layer (usually between 4 and 64) 68 | n_steps : int 69 | Number of sucessive steps in the newtork (usually betwenn 3 and 10) 70 | gamma : float 71 | Float above 1, scaling factor for attention updates (usually betwenn 1.0 to 2.0) 72 | n_independent : int 73 | Number of independent GLU layer in each GLU block (default 2) 74 | n_shared : int 75 | Number of independent GLU layer in each GLU block (default 2) 76 | epsilon : float 77 | Avoid log(0), this should be kept very low 78 | virtual_batch_size : int 79 | Batch size for Ghost Batch Normalization 80 | momentum : float 81 | Float value between 0 and 1 which will be used for momentum in all batch norm 82 | mask_type : str 83 | Either "sparsemax" or "entmax" : this is the masking function to use 84 | """ 85 | super(TabNetNoEmbeddings, self).__init__() 86 | self.input_dim = input_dim 87 | self.output_dim = output_dim 88 | self.is_multi_task = isinstance(output_dim, list) 89 | self.n_d = n_d 90 | self.n_a = n_a 91 | self.n_steps = n_steps 92 | self.gamma = gamma 93 | self.epsilon = epsilon 94 | self.n_independent = n_independent 95 | self.n_shared = n_shared 96 | self.virtual_batch_size = virtual_batch_size 97 | self.mask_type = mask_type 98 | self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01) 99 | 100 | if self.n_shared > 0: 101 | shared_feat_transform = torch.nn.ModuleList() 102 | for i in range(self.n_shared): 103 | if i == 0: 104 | shared_feat_transform.append(Linear(self.input_dim, 2 * (n_d + n_a), bias=False)) 105 | else: 106 | shared_feat_transform.append(Linear(n_d + n_a, 2 * (n_d + n_a), bias=False)) 107 | 108 | else: 109 | shared_feat_transform = None 110 | 111 | self.initial_splitter = FeatTransformer(self.input_dim, 112 | n_d + n_a, 113 | shared_feat_transform, 114 | n_glu_independent=self.n_independent, 115 | virtual_batch_size=self.virtual_batch_size, 116 | momentum=momentum) 117 | 118 | self.feat_transformers = torch.nn.ModuleList() 119 | self.att_transformers = torch.nn.ModuleList() 120 | 121 | for step in range(n_steps): 122 | transformer = FeatTransformer(self.input_dim, 123 | n_d + n_a, 124 | shared_feat_transform, 125 | n_glu_independent=self.n_independent, 126 | virtual_batch_size=self.virtual_batch_size, 127 | momentum=momentum) 128 | attention = AttentiveTransformer(n_a, self.input_dim, virtual_batch_size=self.virtual_batch_size, momentum=momentum, mask_type=self.mask_type) 129 | self.feat_transformers.append(transformer) 130 | self.att_transformers.append(attention) 131 | 132 | if self.is_multi_task: 133 | self.multi_task_mappings = torch.nn.ModuleList() 134 | for task_dim in output_dim: 135 | task_mapping = Linear(n_d, task_dim, bias=False) 136 | initialize_non_glu(task_mapping, n_d, task_dim) 137 | self.multi_task_mappings.append(task_mapping) 138 | else: 139 | self.final_mapping = Linear(n_d, output_dim, bias=False) 140 | initialize_non_glu(self.final_mapping, n_d, output_dim) 141 | 142 | def forward(self, x): 143 | res = 0 144 | x = self.initial_bn(x) 145 | 146 | prior = torch.ones(x.shape).to(x.device) 147 | M_loss = 0 148 | att = self.initial_splitter(x)[:, self.n_d:] 149 | 150 | for step in range(self.n_steps): 151 | M = self.att_transformers[step](prior, att) 152 | M_loss += torch.mean(torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1)) 153 | # update prior 154 | prior = torch.mul(self.gamma - M, prior) 155 | # output 156 | masked_x = torch.mul(M, x) 157 | out = 
self.feat_transformers[step](masked_x) 158 | d = ReLU()(out[:, :self.n_d]) 159 | res = torch.add(res, d) 160 | # update attention 161 | att = out[:, self.n_d:] 162 | 163 | M_loss /= self.n_steps 164 | 165 | if self.is_multi_task: 166 | # Result will be in list format 167 | out = [] 168 | for task_mapping in self.multi_task_mappings: 169 | out.append(task_mapping(res)) 170 | else: 171 | out = self.final_mapping(res) 172 | return out, M_loss 173 | 174 | def forward_masks(self, x): 175 | x = self.initial_bn(x) 176 | 177 | prior = torch.ones(x.shape).to(x.device) 178 | M_explain = torch.zeros(x.shape).to(x.device) 179 | att = self.initial_splitter(x)[:, self.n_d:] 180 | masks = {} 181 | 182 | for step in range(self.n_steps): 183 | M = self.att_transformers[step](prior, att) 184 | masks[step] = M 185 | # update prior 186 | prior = torch.mul(self.gamma - M, prior) 187 | # output 188 | masked_x = torch.mul(M, x) 189 | out = self.feat_transformers[step](masked_x) 190 | d = ReLU()(out[:, :self.n_d]) 191 | # explain 192 | step_importance = torch.sum(d, dim=1) 193 | M_explain += torch.mul(M, step_importance.unsqueeze(dim=1)) 194 | # update attention 195 | att = out[:, self.n_d:] 196 | 197 | return M_explain, masks 198 | 199 | 200 | class TabNet(torch.nn.Module): 201 | def __init__(self, 202 | input_dim, 203 | output_dim, 204 | n_d=8, 205 | n_a=8, 206 | n_steps=3, 207 | gamma=1.3, 208 | cat_idxs=[], 209 | cat_dims=[], 210 | cat_emb_dim=1, 211 | n_independent=2, 212 | n_shared=2, 213 | epsilon=1e-15, 214 | virtual_batch_size=128, 215 | momentum=0.02, 216 | device_name='auto', 217 | mask_type="sparsemax"): 218 | """ 219 | Defines TabNet network 220 | 221 | Parameters 222 | ---------- 223 | input_dim : int 224 | Initial number of features 225 | output_dim : int 226 | Dimension of network output 227 | examples : one for regression, 2 for binary classification etc... 
228 | n_d : int 229 | Dimension of the prediction layer (usually between 4 and 64) 230 | n_a : int 231 | Dimension of the attention layer (usually between 4 and 64) 232 | n_steps : int 233 | Number of sucessive steps in the newtork (usually betwenn 3 and 10) 234 | gamma : float 235 | Float above 1, scaling factor for attention updates (usually betwenn 1.0 to 2.0) 236 | cat_idxs : list of int 237 | Index of each categorical column in the dataset 238 | cat_dims : list of int 239 | Number of categories in each categorical column 240 | cat_emb_dim : int or list of int 241 | Size of the embedding of categorical features 242 | if int, all categorical features will have same embedding size 243 | if list of int, every corresponding feature will have specific size 244 | n_independent : int 245 | Number of independent GLU layer in each GLU block (default 2) 246 | n_shared : int 247 | Number of independent GLU layer in each GLU block (default 2) 248 | epsilon : float 249 | Avoid log(0), this should be kept very low 250 | virtual_batch_size : int 251 | Batch size for Ghost Batch Normalization 252 | momentum : float 253 | Float value between 0 and 1 which will be used for momentum in all batch norm 254 | device_name : {'auto', 'cuda', 'cpu'} 255 | mask_type : str 256 | Either "sparsemax" or "entmax" : this is the masking function to use 257 | """ 258 | super(TabNet, self).__init__() 259 | self.cat_idxs = cat_idxs or [] 260 | self.cat_dims = cat_dims or [] 261 | self.cat_emb_dim = cat_emb_dim 262 | 263 | self.input_dim = input_dim 264 | self.output_dim = output_dim 265 | self.n_d = n_d 266 | self.n_a = n_a 267 | self.n_steps = n_steps 268 | self.gamma = gamma 269 | self.epsilon = epsilon 270 | self.n_independent = n_independent 271 | self.n_shared = n_shared 272 | self.mask_type = mask_type 273 | 274 | if self.n_steps <= 0: 275 | raise ValueError("n_steps should be a positive integer.") 276 | if self.n_independent == 0 and self.n_shared == 0: 277 | raise ValueError("n_shared and n_independant can't be both zero.") 278 | 279 | self.virtual_batch_size = virtual_batch_size 280 | self.embedder = EmbeddingGenerator(input_dim, cat_dims, cat_idxs, cat_emb_dim) 281 | self.post_embed_dim = self.embedder.post_embed_dim 282 | self.tabnet = TabNetNoEmbeddings(self.post_embed_dim, output_dim, n_d, n_a, n_steps, gamma, n_independent, n_shared, epsilon, virtual_batch_size, 283 | momentum, mask_type) 284 | 285 | # Defining device 286 | if device_name == 'auto': 287 | if torch.cuda.is_available(): 288 | device_name = 'cuda' 289 | else: 290 | device_name = 'cpu' 291 | self.device = torch.device(device_name) 292 | self.to(self.device) 293 | 294 | def forward(self, x): 295 | x = self.embedder(x) 296 | return self.tabnet(x) 297 | 298 | def forward_masks(self, x): 299 | x = self.embedder(x) 300 | return self.tabnet.forward_masks(x) 301 | 302 | 303 | class AttentiveTransformer(torch.nn.Module): 304 | def __init__(self, input_dim, output_dim, virtual_batch_size=128, momentum=0.02, mask_type="sparsemax"): 305 | """ 306 | Initialize an attention transformer. 
307 |
308 |         Parameters
309 |         ----------
310 |         input_dim : int
311 |             Input size
312 |         output_dim : int
313 |             Output size
314 |         virtual_batch_size : int
315 |             Batch size for Ghost Batch Normalization
316 |         momentum : float
317 |             Float value between 0 and 1 which will be used for momentum in batch norm
318 |         mask_type : str
319 |             Either "sparsemax" or "entmax" : this is the masking function to use
320 |         """
321 |         super(AttentiveTransformer, self).__init__()
322 |         self.fc = Linear(input_dim, output_dim, bias=False)
323 |         initialize_non_glu(self.fc, input_dim, output_dim)
324 |         self.bn = GBN(output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum)
325 |
326 |         if mask_type == "sparsemax":
327 |             # Sparsemax
328 |             self.selector = sparsemax.Sparsemax(dim=-1)
329 |         elif mask_type == "entmax":
330 |             # Entmax
331 |             self.selector = sparsemax.Entmax15(dim=-1)
332 |         else:
333 |             raise NotImplementedError("Please choose either sparsemax or entmax as masktype")
334 |
335 |     def forward(self, priors, processed_feat):
336 |         x = self.fc(processed_feat)
337 |         x = self.bn(x)
338 |         x = torch.mul(x, priors)
339 |         x = self.selector(x)
340 |         return x
341 |
342 |
343 | class FeatTransformer(torch.nn.Module):
344 |     def __init__(self, input_dim, output_dim, shared_layers, n_glu_independent, virtual_batch_size=128, momentum=0.02):
345 |         super(FeatTransformer, self).__init__()
346 |         """
347 |         Initialize a feature transformer.
348 |
349 |         Parameters
350 |         ----------
351 |         input_dim : int
352 |             Input size
353 |         output_dim : int
354 |             Output size
355 |         shared_layers : torch.nn.ModuleList
356 |             The shared block that should be common to every step
357 |         n_glu_independent : int
358 |             Number of independent GLU layers
359 |         virtual_batch_size : int
360 |             Batch size for Ghost Batch Normalization within GLU block(s)
361 |         momentum : float
362 |             Float value between 0 and 1 which will be used for momentum in batch norm
363 |         """
364 |
365 |         params = {'n_glu': n_glu_independent, 'virtual_batch_size': virtual_batch_size, 'momentum': momentum}
366 |
367 |         if shared_layers is None:
368 |             # no shared layers
369 |             self.shared = torch.nn.Identity()
370 |             is_first = True
371 |         else:
372 |             self.shared = GLU_Block(input_dim,
373 |                                     output_dim,
374 |                                     first=True,
375 |                                     shared_layers=shared_layers,
376 |                                     n_glu=len(shared_layers),
377 |                                     virtual_batch_size=virtual_batch_size,
378 |                                     momentum=momentum)
379 |             is_first = False
380 |
381 |         if n_glu_independent == 0:
382 |             # no independent layers
383 |             self.specifics = torch.nn.Identity()
384 |         else:
385 |             spec_input_dim = input_dim if is_first else output_dim
386 |             self.specifics = GLU_Block(spec_input_dim, output_dim, first=is_first, **params)
387 |
388 |     def forward(self, x):
389 |         x = self.shared(x)
390 |         x = self.specifics(x)
391 |         return x
392 |
393 |
394 | class GLU_Block(torch.nn.Module):
395 |     """
396 |     Independent GLU block, specific to each step
397 |     """
398 |     def __init__(self, input_dim, output_dim, n_glu=2, first=False, shared_layers=None, virtual_batch_size=128, momentum=0.02):
399 |         super(GLU_Block, self).__init__()
400 |         self.first = first
401 |         self.shared_layers = shared_layers
402 |         self.n_glu = n_glu
403 |         self.glu_layers = torch.nn.ModuleList()
404 |
405 |         params = {'virtual_batch_size': virtual_batch_size, 'momentum': momentum}
406 |
407 |         fc = shared_layers[0] if shared_layers else None
408 |         self.glu_layers.append(GLU_Layer(input_dim, output_dim, fc=fc, **params))
409 |         for glu_id in range(1, self.n_glu):
410 |             fc = shared_layers[glu_id] if shared_layers else None
411 |             self.glu_layers.append(GLU_Layer(output_dim, output_dim, fc=fc, **params))
412 |
413 |     def forward(self, x):
414 |         scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device))  # keeps the variance of the residual sum stable
415 |         if self.first:  # the first layer of the block has no scale multiplication
416 |             x = self.glu_layers[0](x)
417 |             layers_left = range(1, self.n_glu)
418 |         else:
419 |             layers_left = range(self.n_glu)
420 |
421 |         for glu_id in layers_left:
422 |             x = torch.add(x, self.glu_layers[glu_id](x))
423 |             x = x * scale
424 |         return x
425 |
426 |
427 | class GLU_Layer(torch.nn.Module):
428 |     def __init__(self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02):
429 |         super(GLU_Layer, self).__init__()
430 |
431 |         self.output_dim = output_dim
432 |         if fc:
433 |             self.fc = fc  # reuse the shared Linear layer when one is provided
434 |         else:
435 |             self.fc = Linear(input_dim, 2 * output_dim, bias=False)
436 |         initialize_glu(self.fc, input_dim, 2 * output_dim)
437 |
438 |         self.bn = GBN(2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum)
439 |
440 |     def forward(self, x):
441 |         x = self.fc(x)
442 |         x = self.bn(x)
443 |         out = torch.mul(x[:, :self.output_dim], torch.sigmoid(x[:, self.output_dim:]))  # GLU: value * sigmoid gate
444 |         return out
445 |
446 |
447 | class EmbeddingGenerator(torch.nn.Module):
448 |     """
449 |     Classical embeddings generator
450 |     """
451 |     def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dim):
452 |         """ This is an embedding module for an entire set of features
453 |
454 |         Parameters
455 |         ----------
456 |         input_dim : int
457 |             Number of features coming as input (number of columns)
458 |         cat_dims : list of int
459 |             Number of modalities for each categorical feature
460 |             If the list is empty, no embeddings will be done
461 |         cat_idxs : list of int
462 |             Positional index of each categorical feature in the inputs
463 |         cat_emb_dim : int or list of int
464 |             Embedding dimension for each categorical feature
465 |             If int, the same embedding dimension will be used for all categorical features
466 |         """
467 |         super(EmbeddingGenerator, self).__init__()
468 |         if cat_dims == [] or cat_idxs == []:
469 |             self.skip_embedding = True
470 |             self.post_embed_dim = input_dim
471 |             return
472 |
473 |         self.skip_embedding = False
474 |         if isinstance(cat_emb_dim, int):
475 |             self.cat_emb_dims = [cat_emb_dim] * len(cat_idxs)
476 |         else:
477 |             self.cat_emb_dims = cat_emb_dim
478 |
479 |         # check that all embeddings are provided
480 |         if len(self.cat_emb_dims) != len(cat_dims):
481 |             msg = f"""cat_emb_dim and cat_dims must be lists of same length, got {len(self.cat_emb_dims)}
482 |                       and {len(cat_dims)}"""
483 |             raise ValueError(msg)
484 |         self.post_embed_dim = int(input_dim + np.sum(self.cat_emb_dims) - len(self.cat_emb_dims))
485 |
486 |         self.embeddings = torch.nn.ModuleList()
487 |
488 |         # Sort dims by cat_idx
489 |         sorted_idxs = np.argsort(cat_idxs)
490 |         cat_dims = [cat_dims[i] for i in sorted_idxs]
491 |         self.cat_emb_dims = [self.cat_emb_dims[i] for i in sorted_idxs]
492 |
493 |         for cat_dim, emb_dim in zip(cat_dims, self.cat_emb_dims):
494 |             self.embeddings.append(torch.nn.Embedding(cat_dim, emb_dim))
495 |
496 |         # record continuous indices
497 |         self.continuous_idx = torch.ones(input_dim, dtype=torch.bool)
498 |         self.continuous_idx[cat_idxs] = 0
499 |
500 |     def forward(self, x):
501 |         """
502 |         Apply embeddings to inputs
503 |         Inputs should be (batch_size, input_dim)
504 |         Outputs will be of size (batch_size, self.post_embed_dim)
505 |         """
506 |         if self.skip_embedding:
507 |             # no embeddings required
508 |             return x
509 |
510 |         cols = []
511 |         cat_feat_counter = 0
512 | for feat_init_idx, is_continuous in enumerate(self.continuous_idx): 513 | # Enumerate through continuous idx boolean mask to apply embeddings 514 | if is_continuous: 515 | cols.append(x[:, feat_init_idx].float().view(-1, 1)) 516 | else: 517 | cols.append(self.embeddings[cat_feat_counter](x[:, feat_init_idx].long())) 518 | cat_feat_counter += 1 519 | # concat 520 | post_embeddings = torch.cat(cols, dim=1) 521 | return post_embeddings 522 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/abstract_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Any, Dict 3 | import torch 4 | from torch.nn.utils import clip_grad_norm_ 5 | import numpy as np 6 | from scipy.sparse import csc_matrix 7 | from abc import abstractmethod 8 | from . import tab_network 9 | from .utils import ( 10 | PredictDataset, 11 | create_explain_matrix, 12 | validate_eval_set, 13 | create_dataloaders, 14 | define_device, 15 | ) 16 | from .callbacks import ( 17 | CallbackContainer, 18 | History, 19 | EarlyStopping, 20 | LRSchedulerCallback, 21 | ) 22 | from .metrics import MetricContainer, check_metrics 23 | from sklearn.base import BaseEstimator 24 | from sklearn.utils import check_array 25 | from torch.utils.data import DataLoader 26 | import io 27 | import json 28 | from pathlib import Path 29 | import shutil 30 | import zipfile 31 | 32 | 33 | @dataclass 34 | class TabModel(BaseEstimator): 35 | """ Class for TabNet model.""" 36 | 37 | n_d: int = 8 38 | n_a: int = 8 39 | n_steps: int = 3 40 | gamma: float = 1.3 41 | cat_idxs: List[int] = field(default_factory=list) 42 | cat_dims: List[int] = field(default_factory=list) 43 | cat_emb_dim: int = 1 44 | n_independent: int = 2 45 | n_shared: int = 2 46 | epsilon: float = 1e-15 47 | momentum: float = 0.02 48 | lambda_sparse: float = 1e-3 49 | seed: int = 0 50 | clip_value: int = 1 51 | verbose: int = 1 52 | optimizer_fn: Any = torch.optim.Adam 53 | optimizer_params: Dict = field(default_factory=lambda: dict(lr=2e-2)) 54 | scheduler_fn: Any = None 55 | scheduler_params: Dict = field(default_factory=dict) 56 | mask_type: str = "sparsemax" 57 | input_dim: int = None 58 | output_dim: int = None 59 | device_name: str = "auto" 60 | 61 | def __post_init__(self): 62 | self.batch_size = 1024 63 | self.virtual_batch_size = 1024 64 | torch.manual_seed(self.seed) 65 | # Defining device 66 | self.device = torch.device(define_device(self.device_name)) 67 | print(f"Device used : {self.device}") 68 | 69 | def fit(self, 70 | X_train, 71 | y_train, 72 | eval_set=None, 73 | eval_name=None, 74 | eval_metric=None, 75 | loss_fn=None, 76 | weights=0, 77 | max_epochs=100, 78 | patience=10, 79 | batch_size=1024, 80 | virtual_batch_size=128, 81 | num_workers=0, 82 | drop_last=False, 83 | callbacks=None, 84 | pin_memory=True): 85 | """Train a neural network stored in self.network 86 | Using train_dataloader for training data and 87 | valid_dataloader for validation. 88 | 89 | Parameters 90 | ---------- 91 | X_train : np.ndarray 92 | Train set 93 | y_train : np.array 94 | Train targets 95 | eval_set : list of tuple 96 | List of eval tuple set (X, y). 97 | The last one is used for early stopping 98 | eval_name : list of str 99 | List of eval set names. 100 | eval_metric : list of str 101 | List of evaluation metrics. 102 | The last metric is used for early stopping. 
103 | loss_fn : callable or None 104 | a PyTorch loss function 105 | weights : bool or dictionnary 106 | 0 for no balancing 107 | 1 for automated balancing 108 | dict for custom weights per class 109 | max_epochs : int 110 | Maximum number of epochs during training 111 | patience : int 112 | Number of consecutive non improving epoch before early stopping 113 | batch_size : int 114 | Training batch size 115 | virtual_batch_size : int 116 | Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size) 117 | num_workers : int 118 | Number of workers used in torch.utils.data.DataLoader 119 | drop_last : bool 120 | Whether to drop last batch during training 121 | callbacks : list of callback function 122 | List of custom callbacks 123 | pin_memory: bool 124 | Whether to set pin_memory to True or False during training 125 | """ 126 | # update model name 127 | 128 | self.max_epochs = max_epochs 129 | self.patience = patience 130 | self.batch_size = batch_size 131 | self.virtual_batch_size = virtual_batch_size 132 | self.num_workers = num_workers 133 | self.drop_last = drop_last 134 | self.input_dim = X_train.shape[1] 135 | self._stop_training = False 136 | self.pin_memory = pin_memory and (self.device.type != 'cpu') 137 | 138 | eval_set = eval_set if eval_set else [] 139 | 140 | if loss_fn is None: 141 | self.loss_fn = self._default_loss 142 | else: 143 | self.loss_fn = loss_fn 144 | 145 | check_array(X_train) 146 | 147 | self.update_fit_params( 148 | X_train, 149 | y_train, 150 | eval_set, 151 | weights, 152 | ) 153 | 154 | # Validate and reformat eval set depending on training data 155 | eval_names, eval_set = validate_eval_set(eval_set, eval_name, X_train, y_train) 156 | 157 | train_dataloader, valid_dataloaders = self._construct_loaders(X_train, y_train, eval_set) 158 | 159 | self._set_network() 160 | self._set_metrics(eval_metric, eval_names) 161 | self._set_optimizer() 162 | self._set_callbacks(callbacks) 163 | 164 | # Call method on_train_begin for all callbacks 165 | self._callback_container.on_train_begin() 166 | 167 | # Training loop over epochs 168 | for epoch_idx in range(self.max_epochs): 169 | 170 | # Call method on_epoch_begin for all callbacks 171 | self._callback_container.on_epoch_begin(epoch_idx) 172 | 173 | self._train_epoch(train_dataloader) 174 | 175 | # Apply predict epoch to all eval sets 176 | for eval_name, valid_dataloader in zip(eval_names, valid_dataloaders): 177 | self._predict_epoch(eval_name, valid_dataloader) 178 | 179 | # Call method on_epoch_end for all callbacks 180 | self._callback_container.on_epoch_end(epoch_idx, logs=self.history.epoch_metrics) 181 | 182 | if self._stop_training: 183 | break 184 | 185 | # Call method on_train_end for all callbacks 186 | self._callback_container.on_train_end() 187 | self.network.eval() 188 | 189 | # compute feature importance once the best model is defined 190 | self._compute_feature_importances(train_dataloader) 191 | 192 | def predict(self, X): 193 | """ 194 | Make predictions on a batch (valid) 195 | 196 | Parameters 197 | ---------- 198 | X : a :tensor: `torch.Tensor` 199 | Input data 200 | 201 | Returns 202 | ------- 203 | predictions : np.array 204 | Predictions of the regression problem 205 | """ 206 | self.network.eval() 207 | dataloader = DataLoader( 208 | PredictDataset(X), 209 | batch_size=self.batch_size, 210 | shuffle=False, 211 | ) 212 | 213 | results = [] 214 | for batch_nb, data in enumerate(dataloader): 215 | data = data.to(self.device).float() 216 | output, M_loss = self.network(data) 217 | 
predictions = output.cpu().detach().numpy() 218 | results.append(predictions) 219 | res = np.vstack(results) 220 | return self.predict_func(res) 221 | 222 | def explain(self, X): 223 | """ 224 | Return local explanation 225 | 226 | Parameters 227 | ---------- 228 | X : tensor: `torch.Tensor` 229 | Input data 230 | 231 | Returns 232 | ------- 233 | M_explain : matrix 234 | Importance per sample, per columns. 235 | masks : matrix 236 | Sparse matrix showing attention masks used by network. 237 | """ 238 | self.network.eval() 239 | 240 | dataloader = DataLoader( 241 | PredictDataset(X), 242 | batch_size=self.batch_size, 243 | shuffle=False, 244 | ) 245 | 246 | res_explain = [] 247 | 248 | for batch_nb, data in enumerate(dataloader): 249 | data = data.to(self.device).float() 250 | 251 | M_explain, masks = self.network.forward_masks(data) 252 | for key, value in masks.items(): 253 | masks[key] = csc_matrix.dot(value.cpu().detach().numpy(), self.reducing_matrix) 254 | 255 | res_explain.append(csc_matrix.dot(M_explain.cpu().detach().numpy(), self.reducing_matrix)) 256 | 257 | if batch_nb == 0: 258 | res_masks = masks 259 | else: 260 | for key, value in masks.items(): 261 | res_masks[key] = np.vstack([res_masks[key], value]) 262 | 263 | res_explain = np.vstack(res_explain) 264 | 265 | return res_explain, res_masks 266 | 267 | def save_model(self, path): 268 | """Saving TabNet model in two distinct files. 269 | 270 | Parameters 271 | ---------- 272 | path : str 273 | Path of the model. 274 | 275 | Returns 276 | ------- 277 | str 278 | input filepath with ".zip" appended 279 | 280 | """ 281 | saved_params = {} 282 | for key, val in self.get_params().items(): 283 | if isinstance(val, type): 284 | # Don't save torch specific params 285 | continue 286 | else: 287 | saved_params[key] = val 288 | 289 | # Create folder 290 | Path(path).mkdir(parents=True, exist_ok=True) 291 | 292 | # Save models params 293 | with open(Path(path).joinpath("model_params.json"), "w", encoding="utf8") as f: 294 | json.dump(saved_params, f) 295 | 296 | # Save state_dict 297 | torch.save(self.network.state_dict(), Path(path).joinpath("network.pt")) 298 | shutil.make_archive(path, "zip", path) 299 | shutil.rmtree(path) 300 | print(f"Successfully saved model at {path}.zip") 301 | return f"{path}.zip" 302 | 303 | def load_model(self, filepath): 304 | """Load TabNet model. 305 | 306 | Parameters 307 | ---------- 308 | filepath : str 309 | Path of the model. 
310 | """ 311 | try: 312 | with zipfile.ZipFile(filepath) as z: 313 | with z.open("model_params.json") as f: 314 | loaded_params = json.load(f) 315 | with z.open("network.pt") as f: 316 | try: 317 | saved_state_dict = torch.load(f, map_location=self.device) 318 | except io.UnsupportedOperation: 319 | # In Python <3.7, the returned file object is not seekable (which at least 320 | # some versions of PyTorch require) - so we'll try buffering it in to a 321 | # BytesIO instead: 322 | saved_state_dict = torch.load( 323 | io.BytesIO(f.read()), 324 | map_location=self.device, 325 | ) 326 | except KeyError: 327 | raise KeyError("Your zip file is missing at least one component") 328 | 329 | self.__init__(**loaded_params) 330 | 331 | self._set_network() 332 | self.network.load_state_dict(saved_state_dict) 333 | self.network.eval() 334 | return 335 | 336 | def _train_epoch(self, train_loader): 337 | """ 338 | Trains one epoch of the network in self.network 339 | 340 | Parameters 341 | ---------- 342 | train_loader : a :class: `torch.utils.data.Dataloader` 343 | DataLoader with train set 344 | """ 345 | self.network.train() 346 | 347 | for batch_idx, (X, y) in enumerate(train_loader): 348 | self._callback_container.on_batch_begin(batch_idx) 349 | 350 | batch_logs = self._train_batch(X, y) 351 | 352 | self._callback_container.on_batch_end(batch_idx, batch_logs) 353 | 354 | epoch_logs = {"lr": self._optimizer.param_groups[-1]["lr"]} 355 | self.history.epoch_metrics.update(epoch_logs) 356 | 357 | return 358 | 359 | def _train_batch(self, X, y): 360 | """ 361 | Trains one batch of data 362 | 363 | Parameters 364 | ---------- 365 | X : torch.Tensor 366 | Train matrix 367 | y : torch.Tensor 368 | Target matrix 369 | 370 | Returns 371 | ------- 372 | batch_outs : dict 373 | Dictionnary with "y": target and "score": prediction scores. 374 | batch_logs : dict 375 | Dictionnary with "batch_size" and "loss". 376 | """ 377 | batch_logs = {"batch_size": X.shape[0]} 378 | 379 | X = X.to(self.device).float() 380 | y = y.to(self.device).float() 381 | 382 | for param in self.network.parameters(): 383 | param.grad = None 384 | 385 | output, M_loss = self.network(X) 386 | 387 | loss = self.compute_loss(output, y) 388 | # Add the overall sparsity loss 389 | loss -= self.lambda_sparse * M_loss 390 | 391 | # Perform backward pass and optimization 392 | loss.backward() 393 | if self.clip_value: 394 | clip_grad_norm_(self.network.parameters(), self.clip_value) 395 | self._optimizer.step() 396 | 397 | batch_logs["loss"] = loss.cpu().detach().numpy().item() 398 | 399 | return batch_logs 400 | 401 | def _predict_epoch(self, name, loader): 402 | """ 403 | Predict an epoch and update metrics. 404 | 405 | Parameters 406 | ---------- 407 | name : str 408 | Name of the validation set 409 | loader : torch.utils.data.Dataloader 410 | DataLoader with validation set 411 | """ 412 | # Setting network on evaluation mode (no dropout etc...) 413 | self.network.eval() 414 | 415 | list_y_true = [] 416 | list_y_score = [] 417 | 418 | # Main loop 419 | for batch_idx, (X, y) in enumerate(loader): 420 | scores = self._predict_batch(X) 421 | list_y_true.append(y) 422 | list_y_score.append(scores) 423 | 424 | y_true, scores = self.stack_batches(list_y_true, list_y_score) 425 | 426 | metrics_logs = self._metric_container_dict[name](y_true, scores) 427 | self.network.train() 428 | self.history.epoch_metrics.update(metrics_logs) 429 | return 430 | 431 | def _predict_batch(self, X): 432 | """ 433 | Predict one batch of data. 
434 | 435 | Parameters 436 | ---------- 437 | X : torch.Tensor 438 | Owned products 439 | 440 | Returns 441 | ------- 442 | np.array 443 | model scores 444 | """ 445 | X = X.to(self.device).float() 446 | 447 | # compute model output 448 | scores, _ = self.network(X) 449 | 450 | if isinstance(scores, list): 451 | scores = [x.cpu().detach().numpy() for x in scores] 452 | else: 453 | scores = scores.cpu().detach().numpy() 454 | 455 | return scores 456 | 457 | def _set_network(self): 458 | """Setup the network and explain matrix.""" 459 | self.network = tab_network.TabNet( 460 | self.input_dim, 461 | self.output_dim, 462 | n_d=self.n_d, 463 | n_a=self.n_a, 464 | n_steps=self.n_steps, 465 | gamma=self.gamma, 466 | cat_idxs=self.cat_idxs, 467 | cat_dims=self.cat_dims, 468 | cat_emb_dim=self.cat_emb_dim, 469 | n_independent=self.n_independent, 470 | n_shared=self.n_shared, 471 | epsilon=self.epsilon, 472 | virtual_batch_size=self.virtual_batch_size, 473 | momentum=self.momentum, 474 | device_name=self.device_name, 475 | mask_type=self.mask_type, 476 | ).to(self.device) 477 | 478 | self.reducing_matrix = create_explain_matrix( 479 | self.network.input_dim, 480 | self.network.cat_emb_dim, 481 | self.network.cat_idxs, 482 | self.network.post_embed_dim, 483 | ) 484 | 485 | def _set_metrics(self, metrics, eval_names): 486 | """Set attributes relative to the metrics. 487 | 488 | Parameters 489 | ---------- 490 | metrics : list of str 491 | List of eval metric names. 492 | eval_names : list of str 493 | List of eval set names. 494 | 495 | """ 496 | metrics = metrics or [self._default_metric] 497 | 498 | metrics = check_metrics(metrics) 499 | # Set metric container for each sets 500 | self._metric_container_dict = {} 501 | for name in eval_names: 502 | self._metric_container_dict.update({name: MetricContainer(metrics, prefix=f"{name}_")}) 503 | 504 | self._metrics = [] 505 | self._metrics_names = [] 506 | for _, metric_container in self._metric_container_dict.items(): 507 | self._metrics.extend(metric_container.metrics) 508 | self._metrics_names.extend(metric_container.names) 509 | 510 | # Early stopping metric is the last eval metric 511 | self.early_stopping_metric = (self._metrics_names[-1] if len(self._metrics_names) > 0 else None) 512 | 513 | def _set_callbacks(self, custom_callbacks): 514 | """Setup the callbacks functions. 515 | 516 | Parameters 517 | ---------- 518 | custom_callbacks : list of func 519 | List of callback functions. 
520 | 521 | """ 522 | # Setup default callbacks history, early stopping and scheduler 523 | callbacks = [] 524 | self.history = History(self, verbose=self.verbose) 525 | callbacks.append(self.history) 526 | if (self.early_stopping_metric is not None) and (self.patience > 0): 527 | early_stopping = EarlyStopping( 528 | early_stopping_metric=self.early_stopping_metric, 529 | is_maximize=(self._metrics[-1]._maximize if len(self._metrics) > 0 else None), 530 | patience=self.patience, 531 | ) 532 | callbacks.append(early_stopping) 533 | else: 534 | print("No early stopping will be performed, last training weights will be used.") 535 | if self.scheduler_fn is not None: 536 | # Add LR Scheduler call_back 537 | is_batch_level = self.scheduler_params.pop("is_batch_level", False) 538 | scheduler = LRSchedulerCallback( 539 | scheduler_fn=self.scheduler_fn, 540 | scheduler_params=self.scheduler_params, 541 | optimizer=self._optimizer, 542 | early_stopping_metric=self.early_stopping_metric, 543 | is_batch_level=is_batch_level, 544 | ) 545 | callbacks.append(scheduler) 546 | 547 | if custom_callbacks: 548 | callbacks.extend(custom_callbacks) 549 | self._callback_container = CallbackContainer(callbacks) 550 | self._callback_container.set_trainer(self) 551 | 552 | def _set_optimizer(self): 553 | """Setup optimizer.""" 554 | self._optimizer = self.optimizer_fn(self.network.parameters(), **self.optimizer_params) 555 | 556 | def _construct_loaders(self, X_train, y_train, eval_set): 557 | """Generate dataloaders for train and eval set. 558 | 559 | Parameters 560 | ---------- 561 | X_train : np.array 562 | Train set. 563 | y_train : np.array 564 | Train targets. 565 | eval_set : list of tuple 566 | List of eval tuple set (X, y). 567 | 568 | Returns 569 | ------- 570 | train_dataloader : `torch.utils.data.Dataloader` 571 | Training dataloader. 572 | valid_dataloaders : list of `torch.utils.data.Dataloader` 573 | List of validation dataloaders. 574 | 575 | """ 576 | # all weights are not allowed for this type of model 577 | y_train_mapped = self.prepare_target(y_train) 578 | for i, (X, y) in enumerate(eval_set): 579 | y_mapped = self.prepare_target(y) 580 | eval_set[i] = (X, y_mapped) 581 | 582 | train_dataloader, valid_dataloaders = create_dataloaders( 583 | X_train, 584 | y_train_mapped, 585 | eval_set, 586 | self.updated_weights, 587 | self.batch_size, 588 | self.num_workers, 589 | self.drop_last, 590 | self.pin_memory, 591 | ) 592 | return train_dataloader, valid_dataloaders 593 | 594 | def _compute_feature_importances(self, loader): 595 | """Compute global feature importance. 596 | 597 | Parameters 598 | ---------- 599 | loader : `torch.utils.data.Dataloader` 600 | Pytorch dataloader. 601 | 602 | """ 603 | self.network.eval() 604 | feature_importances_ = np.zeros((self.network.post_embed_dim)) 605 | for data, targets in loader: 606 | data = data.to(self.device).float() 607 | M_explain, masks = self.network.forward_masks(data) 608 | feature_importances_ += M_explain.sum(dim=0).cpu().detach().numpy() 609 | 610 | feature_importances_ = csc_matrix.dot(feature_importances_, self.reducing_matrix) 611 | self.feature_importances_ = feature_importances_ / np.sum(feature_importances_) 612 | 613 | @abstractmethod 614 | def update_fit_params(self, X_train, y_train, eval_set, weights): 615 | """ 616 | Set attributes relative to fit function. 
617 | 618 | Parameters 619 | ---------- 620 | X_train : np.ndarray 621 | Train set 622 | y_train : np.array 623 | Train targets 624 | eval_set : list of tuple 625 | List of eval tuple set (X, y). 626 | weights : bool or dictionnary 627 | 0 for no balancing 628 | 1 for automated balancing 629 | """ 630 | raise NotImplementedError("users must define update_fit_params to use this base class") 631 | 632 | @abstractmethod 633 | def compute_loss(self, y_score, y_true): 634 | """ 635 | Compute the loss. 636 | 637 | Parameters 638 | ---------- 639 | y_score : a :tensor: `torch.Tensor` 640 | Score matrix 641 | y_true : a :tensor: `torch.Tensor` 642 | Target matrix 643 | 644 | Returns 645 | ------- 646 | float 647 | Loss value 648 | """ 649 | raise NotImplementedError("users must define compute_loss to use this base class") 650 | 651 | @abstractmethod 652 | def prepare_target(self, y): 653 | """ 654 | Prepare target before training. 655 | 656 | Parameters 657 | ---------- 658 | y : a :tensor: `torch.Tensor` 659 | Target matrix. 660 | 661 | Returns 662 | ------- 663 | `torch.Tensor` 664 | Converted target matrix. 665 | """ 666 | raise NotImplementedError("users must define prepare_target to use this base class") 667 | --------------------------------------------------------------------------------
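The GLU layers in `tab_network.py` do the representational work inside each feature transformer: a single `Linear` produces `2 * output_dim` features, the first half is taken as the value and the second half as a sigmoid gate, and successive layers are combined residually with a `sqrt(0.5)` rescaling. Below is a minimal sketch of that gating only, with a plain `BatchNorm1d` standing in for the repository's `GBN` (Ghost Batch Normalization); the name `TinyGLU` and the shapes are illustrative, not part of the repository.

```python
import torch
import torch.nn as nn


class TinyGLU(nn.Module):
    """Value/gate split used by GLU_Layer above (GBN replaced by BatchNorm1d)."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.output_dim = output_dim
        self.fc = nn.Linear(input_dim, 2 * output_dim, bias=False)
        self.bn = nn.BatchNorm1d(2 * output_dim)

    def forward(self, x):
        x = self.bn(self.fc(x))
        # first half = value, second half = sigmoid gate
        return x[:, :self.output_dim] * torch.sigmoid(x[:, self.output_dim:])


x = torch.randn(8, 16)
print(TinyGLU(16, 4)(x).shape)  # torch.Size([8, 4])
```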
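`EmbeddingGenerator.post_embed_dim` follows directly from replacing each categorical column by its embedding: `input_dim + sum(cat_emb_dims) - len(cat_emb_dims)`. A small worked example of that arithmetic; the column counts and embedding sizes here are made up for illustration.

```python
# Worked example of the output width computed in EmbeddingGenerator.__init__:
input_dim = 10        # 10 raw columns
cat_idxs = [2, 7]     # two of them are categorical
cat_emb_dims = [3, 5] # their embedding sizes

# each categorical column disappears and is replaced by its embedding vector
post_embed_dim = input_dim + sum(cat_emb_dims) - len(cat_emb_dims)
print(post_embed_dim)  # 16
```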
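`TabModel` only becomes usable once the abstract methods above, plus the hooks that `fit` and `predict` rely on (`updated_weights`, `_default_loss`, `_default_metric`, `predict_func`, `stack_batches`), are supplied by a concrete subclass. The sketch below is a minimal regression-style subclass written for illustration; it is not the repository's own concrete model, and it assumes an "mse" metric is registered in `metrics.py`.

```python
import numpy as np
import torch

from src.models.pytorch_tabnet.abstract_model import TabModel


class SketchTabNetRegressor(TabModel):
    """Illustrative only -- not the repository's concrete TabNet model."""

    def __post_init__(self):
        super().__post_init__()
        self._default_loss = torch.nn.functional.mse_loss
        self._default_metric = "mse"  # assumes an "mse" metric exists in metrics.py

    def update_fit_params(self, X_train, y_train, eval_set, weights):
        self.output_dim = y_train.shape[1]
        self.updated_weights = weights  # no class balancing for a regression target

    def prepare_target(self, y):
        return y  # regression targets are used as-is

    def compute_loss(self, y_pred, y_true):
        return self.loss_fn(y_pred, y_true)

    def predict_func(self, outputs):
        return outputs  # raw network outputs are the predictions

    def stack_batches(self, list_y_true, list_y_score):
        return np.vstack(list_y_true), np.vstack(list_y_score)
```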
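Given such a subclass, a hypothetical end-to-end call sequence against the `fit` / `predict` / `save_model` / `load_model` API documented above might look as follows; all data shapes and hyper-parameter values are placeholders, not the settings used in this repository.

```python
import numpy as np

# Placeholder data; shapes and hyper-parameters are illustrative only.
X_train = np.random.rand(256, 20).astype("float32")
y_train = np.random.rand(256, 1).astype("float32")
X_valid = np.random.rand(64, 20).astype("float32")
y_valid = np.random.rand(64, 1).astype("float32")

model = SketchTabNetRegressor(n_d=8, n_a=8, n_steps=3)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_name=["valid"],
    max_epochs=5,
    patience=3,
    batch_size=64,
    virtual_batch_size=32,
)

preds = model.predict(X_valid)                   # np.ndarray of shape (64, 1)
zip_path = model.save_model("./tabnet_sketch")   # writes ./tabnet_sketch.zip
model.load_model(zip_path)                       # re-creates the network and loads its weights
```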