├── test
│   ├── __init__.py
│   └── unit_test
│       ├── __init__.py
│       └── test_cache.py
├── src
│   ├── models
│   │   ├── pytorch_tabnet
│   │   │   ├── __init__.py
│   │   │   ├── tab_model.py
│   │   │   ├── multitask.py
│   │   │   ├── metrics.py
│   │   │   ├── callbacks.py
│   │   │   ├── sparsemax.py
│   │   │   ├── utils.py
│   │   │   ├── multiclass_utils.py
│   │   │   ├── tab_network.py
│   │   │   └── abstract_model.py
│   │   ├── blending.py
│   │   ├── loss.py
│   │   ├── svm.py
│   │   ├── tabnet.py
│   │   ├── boosting_tree.py
│   │   ├── base.py
│   │   ├── optimizer.py
│   │   └── tabular_nn.py
│   ├── utils
│   │   ├── environment.py
│   │   ├── misc.py
│   │   ├── transformers.py
│   │   ├── cache.py
│   │   └── splitter.py
│   ├── metrics.py
│   ├── preprocess.py
│   └── experiment
│       └── experiment.py
├── .gitignore
├── Makefile
├── pyproject.toml
├── docker-compose.yaml
├── README.md
└── encode.py

/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/unit_test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/models/pytorch_tabnet/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .venv
2 | .build
3 | .vscode
4 | __pycache__/
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: build
2 | build:
3 | 	poetry run python encode.py
4 | 	cat .build/script.py | pbcopy
5 | 	echo 'copied to clipboard'
--------------------------------------------------------------------------------
/src/utils/environment.py:
--------------------------------------------------------------------------------
1 | try:
2 |     import mlflow
3 |     _has_mlflow = True
4 | except ImportError:
5 |     _has_mlflow = False
6 |
7 |
8 | def requires_mlflow():
9 |     if not _has_mlflow:
10 |         raise ImportError('You need to install mlflow before using this API.')
11 |
12 |
13 | try:
14 |     import torch
15 |     _has_torch = True
16 | except ImportError:
17 |     _has_torch = False
18 |
19 |
20 | def get_device():
21 |     if _has_torch:
22 |         return torch.device("cuda" if torch.cuda.is_available() else "cpu")
23 |     else:
24 |         return 'cpu'
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "moa"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["e-mon "]
6 |
7 | [tool.poetry.dependencies]
8 | python = ">=3.6"
9 | tqdm = "*"
10 | pandas = "*"
11 | ipython = "*"
12 | sklearn = "*"
13 | lightgbm = "*"
14 | catboost = "*"
15 | xgboost = "*"
16 | numpy = "*"
17 | scipy = "*"
18 | seaborn = "*"
19 | gitpython = "^3.1.11"
20 | hydra = "^2.5"
21 |
22 |
23 | [tool.poetry.dev-dependencies]
24 | flake8 = "^3.8.4"
25 | black = {version = "^20.8b1", allow-prereleases = true}
26 | yapf = "^0.30.0"
27 | [build-system]
28 | requires = ["poetry>=0.12"]
29 | build-backend = "poetry.masonry.api"
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 |   jupyter:
5 |     entrypoint: ""
6 |     command: jupyter
notebook --ip=0.0.0.0 --allow-root --no-browser --port 8080 --NotebookApp.token=hogehoge123 7 | image: gcr.io/kaggle-gpu-images/python:latest 8 | runtime: nvidia 9 | environment: 10 | LD_LIBRARY_PATH: "/usr/local/cuda/lib64::/opt/conda/lib" 11 | user: root 12 | ports: 13 | - "8080:8080" 14 | volumes: 15 | - ./notebooks:/notebooks 16 | - /data:/input 17 | - /usr/local/cuda:/usr/local/cuda 18 | - ~/.jupyter:/root/.jupyter 19 | - ~/.local/share/jupyter:/root/.local/share/jupyter 20 | working_dir: / 21 | 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mechanisms of Action (MoA) Prediction 2 | 3 | 4th place solution for Mechanisms of Action (MoA) Prediction https://www.kaggle.com/c/lish-moa/ 4 | 5 | Solution summary: [here](https://www.kaggle.com/c/lish-moa/discussion/200808) 6 | 7 | Kernel: [here](https://www.kaggle.com/kento1993/nn-svm-tabnet-xgb-with-pca-cnn-stacking-without-pp) 8 | 9 | ## Setup 10 | 11 | Since these codes are designed to be executed on Kaggle Kernel, so first get the BASE64-encoded codes by running the following command. 12 | 13 | (refer to: https://github.com/lopuhin/kaggle-imet-2019) 14 | ```shell 15 | $ make build 16 | ``` 17 | 18 | Please see below kernel, if you want to know the actual training & inference process. 19 | 20 | https://www.kaggle.com/kento1993/nn-svm-tabnet-xgb-with-pca-cnn-stacking-without-pp 21 | -------------------------------------------------------------------------------- /src/models/blending.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from scipy.optimize import minimize 4 | from sklearn.model_selection import KFold 5 | 6 | from src.metrics import calc_competition_metric_torch 7 | from src.utils.misc import LoggerFactory 8 | 9 | logger = LoggerFactory().getLogger(__name__) 10 | 11 | 12 | def get_best_weights(oof_1, oof_2, train_features_df, targets, n_splits=10): 13 | weight_list = [] 14 | weights = np.array([0.5]) 15 | for i in range(2): 16 | kf = KFold(n_splits=n_splits, random_state=i, shuffle=True) 17 | for fold, (train_idx, valid_idx) in enumerate(kf.split(X=oof_1)): 18 | res = minimize( 19 | get_score, 20 | weights, 21 | args=(train_features_df, train_idx, oof_1, oof_2, targets), 22 | method="Nelder-Mead", 23 | tol=1e-6, 24 | ) 25 | logger.info(f"i: {i} fold: {fold} res.x: {res.x}") 26 | weight_list.append(res.x) 27 | mean_weight = np.mean(weight_list) 28 | logger.info(f"optimized weight: {mean_weight}") 29 | return mean_weight 30 | 31 | 32 | def get_score(weights, train_features_df, train_idx, oof_1, oof_2, targets): 33 | _oof_1 = oof_1[train_idx, :].copy() 34 | _oof_2 = oof_2[train_idx, :].copy() 35 | blend = (_oof_1 * weights[0]) + (_oof_2 * (1 - weights[0])) 36 | return calc_competition_metric_torch(train_features_df, targets, blend, train_idx) -------------------------------------------------------------------------------- /src/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from sklearn.metrics import log_loss 5 | 6 | from src.utils.misc import LoggerFactory 7 | 8 | logger = LoggerFactory().getLogger(__name__) 9 | 10 | 11 | def calc_competition_metric_torch(train_features_df, target_cols, oof_arr, train_idx): 12 | # competition_metric = [log_loss(train_features_df.loc[train_idx, target_cols[i]], oof_arr[:, i]) for i in 
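The blending helper above (src/models/blending.py) searches for a single scalar weight with Nelder-Mead, fold by fold, and returns the mean of the per-fold optima. A minimal usage sketch on synthetic data follows; the shapes, column names and seeds are illustrative assumptions, not taken from the repository:

```python
import numpy as np
import pandas as pd

from src.models.blending import get_best_weights

# hypothetical tiny setup: 100 samples, 3 binary targets
rng = np.random.default_rng(42)
target_cols = ["t0", "t1", "t2"]
train_features_df = pd.DataFrame(rng.integers(0, 2, size=(100, 3)), columns=target_cols)

oof_a = rng.random((100, 3))          # OOF predictions from model A
oof_b = rng.random((100, 3))          # OOF predictions from model B

w = get_best_weights(oof_a, oof_b, train_features_df, target_cols, n_splits=5)
blend = w * oof_a + (1 - w) * oof_b   # blended OOF predictions
```

Because `get_score` blends as `w * oof_1 + (1 - w) * oof_2`, the returned scalar can be applied in exactly the same way to the test-time predictions of the same two models.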
range(len(target_cols))] 13 | y = torch.tensor(train_features_df.loc[train_idx, target_cols].values, dtype=float) 14 | p = torch.tensor(oof_arr, dtype=float) 15 | p = torch.clamp(p, 1e-9, 1 - (1e-9)) 16 | competition_metric = nn.BCELoss()(p, y).item() 17 | return np.mean(competition_metric) 18 | 19 | 20 | def calc_competition_metric_np(train_features_df, target_cols, oof_arr): 21 | competition_metric = [] 22 | for i in range(len(target_cols)): 23 | competition_metric.append(log_loss(train_features_df[:, target_cols[i]], oof_arr[:, i])) 24 | logger.info(f"competition metric: {np.mean(competition_metric)}") 25 | 26 | return np.mean(competition_metric) 27 | 28 | 29 | def logloss_for_multilabel(actual, preds, ignore_all_zeros: bool = True): 30 | """ 31 | actual, preds: [n_samples, n_classes] 32 | log_loss(actual[:, c], preds[:, c]) 33 | """ 34 | 35 | actual = torch.tensor(actual, dtype=float) 36 | preds = torch.tensor(preds, dtype=float) 37 | preds = torch.clamp(preds, 1e-9, 1 - (1e-9)) 38 | 39 | return np.mean(nn.BCELoss()(preds, actual).item()) 40 | -------------------------------------------------------------------------------- /encode.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import gzip 3 | from pathlib import Path 4 | from src.utils.misc import get_current_commit_hash 5 | 6 | template = """ 7 | import gzip 8 | import base64 9 | import os 10 | from pathlib import Path 11 | from typing import Dict 12 | 13 | 14 | # this is base64 encoded source code 15 | file_data: Dict = {file_data} 16 | 17 | for path, encoded in file_data.items(): 18 | print(path) 19 | path = Path(path) 20 | path.parent.mkdir(exist_ok=True) 21 | path.write_bytes(gzip.decompress(base64.b64decode(encoded))) 22 | 23 | 24 | def run(command): 25 | os.system('echo "from setuptools import setup; setup(name=\\'src\\', packages=[\\'src\\'],)" > setup.py') 26 | os.system('export PYTHONPATH=${PYTHONPATH}:/kaggle/working && ' + command) 27 | 28 | 29 | run('python setup.py develop --install-dir /kaggle/working') 30 | 31 | # output current commit hash 32 | print('{commit_hash}') 33 | """ 34 | 35 | 36 | def encode_file(path: Path) -> str: 37 | compressed = gzip.compress(path.read_bytes(), compresslevel=9) 38 | return base64.b64encode(compressed).decode('utf-8') 39 | 40 | 41 | def build_script(): 42 | to_encode = list(Path('src').glob('**/*.py')) 43 | file_data = {str(path): encode_file(path) for path in to_encode} 44 | output_path = Path('.build/script.py') 45 | output_path.parent.mkdir(exist_ok=True) 46 | output_path.write_text(template.replace('{file_data}', str(file_data)).replace('{commit_hash}', get_current_commit_hash()), encoding='utf8') 47 | 48 | 49 | if __name__ == '__main__': 50 | build_script() 51 | -------------------------------------------------------------------------------- /src/models/loss.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.loss import _WeightedLoss 2 | from src.models.pytorch_tabnet.metrics import Metric 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | 8 | class SmoothBCEwLogits(_WeightedLoss): 9 | def __init__(self, weight=None, reduction="mean", smoothing=0.0): 10 | super().__init__(weight=weight, reduction=reduction) 11 | self.smoothing = smoothing 12 | self.weight = weight 13 | self.reduction = reduction 14 | 15 | @staticmethod 16 | def _smooth(targets: torch.Tensor, n_labels: int, smoothing=0.0): 17 | assert 0 <= smoothing < 1 18 | with 
torch.no_grad(): 19 | targets = targets * (1.0 - smoothing) + 0.5 * smoothing 20 | return targets 21 | 22 | def forward(self, inputs, targets): 23 | targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1), self.smoothing) 24 | loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight) 25 | 26 | if self.reduction == "sum": 27 | loss = loss.sum() 28 | elif self.reduction == "mean": 29 | loss = loss.mean() 30 | 31 | return loss 32 | 33 | 34 | class LogitsLogLoss(Metric): 35 | """ 36 | LogLoss with sigmoid applied 37 | """ 38 | def __init__(self): 39 | self._name = "logits_ll" 40 | self._maximize = False 41 | 42 | def __call__(self, y_true, y_pred): 43 | """ 44 | Compute LogLoss of predictions. 45 | 46 | Parameters 47 | ---------- 48 | y_true: np.ndarray 49 | Target matrix or vector 50 | y_score: np.ndarray 51 | Score matrix or vector 52 | 53 | Returns 54 | ------- 55 | float 56 | LogLoss of predictions vs targets. 57 | """ 58 | logits = 1 / (1 + np.exp(-y_pred)) 59 | aux = (1 - y_true) * np.log(1 - logits + 1e-15) + y_true * np.log(logits + 1e-15) 60 | return np.mean(-aux) 61 | -------------------------------------------------------------------------------- /src/models/svm.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import pandas as pd 3 | import numpy as np 4 | from cuml.svm import SVC, SVR 5 | from tqdm import tqdm 6 | 7 | from src.utils.misc import LoggerFactory 8 | from src.models.base import MoaBaseOnline, AllZerosClassifier 9 | 10 | logger = LoggerFactory().getLogger(__name__) 11 | 12 | 13 | class SVMTrainer(MoaBaseOnline): 14 | def __init__(self, params: Optional[dict] = None, **kwargs): 15 | if params is None: 16 | self.params = {} 17 | else: 18 | self.params = params 19 | super().__init__(**kwargs) 20 | 21 | def _get_default_params(self): 22 | return { 23 | 'cache_size': 5000, 24 | 'probability': True, 25 | } 26 | 27 | def _train_predict(self, X: pd.DataFrame, y: pd.DataFrame, X_test: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, 28 | seed: int): 29 | _params = self._get_default_params() 30 | _params.update(self.params) 31 | 32 | X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx] 33 | y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] 34 | target_cols = y_valid.columns.tolist() 35 | 36 | pred_valid = np.zeros_like(y_valid).astype(float) 37 | preds = np.zeros(shape=(X_test.shape[0], y_train.shape[1])) 38 | 39 | # multilabel分回す 40 | for idx, target_col in tqdm(enumerate(target_cols), total=len(target_cols)): 41 | # Since cuml SVC calls CalibratedClassifierCV(n_folds=5), more than 5 positive samples is required 42 | if y_train[target_col].sum() < 5: 43 | logger.info(f'{target_col} is all zeros') 44 | clf = AllZerosClassifier() 45 | else: 46 | clf = SVC(**_params) 47 | clf.fit(X_train[predictors].values, y_train[target_col].values.astype(int), convert_dtype=False) 48 | pred_valid[:, idx] = clf.predict_proba(X_valid[predictors].values)[:, 1] 49 | preds[:, idx] = clf.predict_proba(X_test[predictors].values)[:, 1] 50 | 51 | return preds, pred_valid 52 | -------------------------------------------------------------------------------- /src/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import random 4 | import torch 5 | import logging 6 | import logging.handlers 7 | from contextlib import contextmanager 8 | import time 9 | import git 10 | from pathlib import 
Path 11 | 12 | 13 | def seed_everything(seed=42): 14 | random.seed(seed) 15 | os.environ["PYTHONHASHSEED"] = str(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed(seed) 19 | torch.backends.cudnn.deterministic = True 20 | 21 | 22 | class Singleton(type): 23 | _instances = {} 24 | 25 | def __call__(cls, *args, **kwargs): 26 | if cls not in cls._instances: 27 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 28 | return cls._instances[cls] 29 | 30 | 31 | class LoggerFactory(metaclass=Singleton): 32 | def __init__(self, log_path: str = None, loglevel=logging.INFO): 33 | self.loglevel = loglevel 34 | if log_path is None: 35 | self.log_path = Path('./log') 36 | else: 37 | self.log_path = Path(log_path) 38 | self.log_path.parent.mkdir(parents=True, exist_ok=True) 39 | 40 | def getLogger(self, log_name): 41 | fmt = '%(asctime)s [%(name)s|%(levelname)s] %(message)s' 42 | formatter = logging.Formatter(fmt) 43 | logger = logging.getLogger(log_name) 44 | 45 | # add stream Handler 46 | handler = logging.StreamHandler() 47 | handler.setFormatter(formatter) 48 | logger.addHandler(handler) 49 | 50 | # add file Handler 51 | handler = logging.handlers.RotatingFileHandler(filename=self.log_path, maxBytes=2 * 1024 * 1024 * 1024, backupCount=10) 52 | handler.setFormatter(formatter) 53 | logger.addHandler(handler) 54 | 55 | logger.setLevel(self.loglevel) 56 | 57 | return logger 58 | 59 | 60 | @contextmanager 61 | def timer(name, logger): 62 | t0 = time.time() 63 | logger.debug(f'[{name}] start') 64 | yield 65 | logger.debug(f'[{name}] done in {time.time() - t0:.0f} s') 66 | 67 | 68 | def get_current_commit_hash(): 69 | repo = git.Repo(search_parent_directories=True) 70 | return repo.head.object.hexsha 71 | -------------------------------------------------------------------------------- /test/unit_test/test_cache.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | from src.utils.cache import Cache 4 | from typing import List, Any 5 | import pandas as pd 6 | 7 | 8 | class TestCache(unittest.TestCase): 9 | def test_hash(self): 10 | test_case: List[Any] = [1, 'a', {1, 2, 3}, [1, 2, 3], (1, 2, 3), {'key': 'value'}] 11 | expecteds = [ 12 | 'c4ca4238a0b923820dcc509a6f75849b', '0cc175b9c0f1b6a831c399e269772661', '4c24e01fa26fc915e3f057d6c6bfd560', '49a5a960c5714c2e29dd1a7e7b950741', 13 | '49a5a960c5714c2e29dd1a7e7b950741', '88bac95f31528d13a072c05f2a1cf371' 14 | ] 15 | 16 | for obj, expected in zip(test_case, expecteds): 17 | result = Cache._get_hash(obj) 18 | self.assertEqual(result, expected) 19 | 20 | def test_dataframe(self): 21 | df = pd.DataFrame(dict(col_1=[1, 2, 3], col_2=['a', 'b', 'c'])) 22 | expected = '6b7f6abb1cfff565fafb7be863d2c62b' 23 | result = Cache._get_hash(df) 24 | 25 | self.assertEqual(result, expected) 26 | 27 | df = pd.DataFrame(dict(col_1=[1, 2, 3], col_3=['a', 'b', 'c'])) 28 | result = Cache._get_hash(df) 29 | self.assertNotEqual(result, expected) 30 | 31 | def test_unique_id(self): 32 | params = {'param_a': 123, 'param_b': [1, 2, 3], 'param_c': {'key': 'value'}} 33 | 34 | expected = '1fd1c9224dc3180dea4d058e90e095df' 35 | result = Cache._get_unique_id(params) 36 | 37 | self.assertEqual(result, expected) 38 | 39 | def test_with_no_param(self): 40 | def read_cache(path, rerun): 41 | self.path = path 42 | return path 43 | 44 | with patch('src.utils.cache.Cache._read_cache', read_cache): 45 | 46 | @Cache('test') 47 | def func(): 48 | return '' 
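For orientation, here is a small usage sketch (an assumption, not part of the test suite) of the helpers defined in src/utils/misc.py above. Note that `LoggerFactory` is a singleton, so the log path and level passed on the first call apply to every later `getLogger` call in the process:

```python
import logging

from src.utils.misc import LoggerFactory, seed_everything, timer

seed_everything(42)  # fix python / numpy / torch seeds for reproducibility

# first instantiation fixes the log file and level for the whole process
logger = LoggerFactory(log_path='./log/run.log', loglevel=logging.DEBUG).getLogger(__name__)

with timer('feature engineering', logger):
    pass  # expensive work goes here; elapsed time is logged at DEBUG level
```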
49 | 50 | expected = 'test/func_with_no_param' 51 | _ = func() 52 | self.assertEqual(str(self.path), expected) 53 | 54 | def test_read_path(self): 55 | def read_cache(path, rerun): 56 | self.path = path 57 | return path 58 | 59 | with patch('src.utils.cache.Cache._read_cache', read_cache): 60 | 61 | @Cache('test') 62 | def func(param): 63 | return param 64 | 65 | expected = 'test/func_b4216b72b74587638f054cc8e5e9825c' 66 | ret_1 = func('abc') 67 | self.assertEqual(str(self.path), expected) 68 | 69 | ret_2 = func('def') 70 | self.assertNotEqual(str(self.path), expected) 71 | self.assertNotEqual(ret_1, ret_2) 72 | -------------------------------------------------------------------------------- /src/models/tabnet.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional 2 | import numpy as np 3 | import pandas as pd 4 | import torch.optim as optim 5 | from torch.nn import functional as F 6 | from torch.optim.lr_scheduler import ReduceLROnPlateau 7 | 8 | from src.utils.misc import LoggerFactory 9 | from src.models.loss import SmoothBCEwLogits, LogitsLogLoss 10 | from src.models.base import MoaBase 11 | from src.models.pytorch_tabnet.tab_model import TabNetRegressor 12 | from src.utils.environment import get_device 13 | 14 | DEVICE = get_device() 15 | logger = LoggerFactory().getLogger(__name__) 16 | 17 | 18 | class Tabnet(MoaBase): 19 | def __init__(self, params: Optional[dict] = None, **kwargs): 20 | if params is None: 21 | self.params = {} 22 | else: 23 | self.params = params 24 | super().__init__(**kwargs) 25 | 26 | def _get_default_params(self): 27 | return dict(loss_fn='logloss', 28 | max_epoch=200, 29 | batch_size=1024, 30 | initialize_params=dict(n_d=32, 31 | n_a=32, 32 | n_steps=1, 33 | gamma=1.3, 34 | lambda_sparse=0, 35 | optimizer_fn=optim.Adam, 36 | optimizer_params=dict(lr=2e-2, weight_decay=1e-5), 37 | mask_type="entmax", 38 | scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9), 39 | scheduler_fn=ReduceLROnPlateau, 40 | seed=42, 41 | verbose=10)) 42 | 43 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 44 | X_train, y_train = X.iloc[train_idx][predictors].values, y.iloc[train_idx].values 45 | X_valid, y_valid = X.iloc[valid_idx][predictors].values, y.iloc[valid_idx].values 46 | 47 | logger.info(f"train shape: {X_train.shape}, positive frac: {y_train.sum()/y_train.shape[0]}") 48 | logger.info(f"valid shape: {X_valid.shape}, positive frac: {y_valid.sum()/y_valid.shape[0]}") 49 | 50 | _params = self._get_default_params() 51 | _params.update(self.params) 52 | _params['initialize_params']['seed'] = seed 53 | 54 | model = TabNetRegressor(**_params['initialize_params']) 55 | loss_fn = F.binary_cross_entropy_with_logits if _params['loss_fn'] == 'logloss' else SmoothBCEwLogits(smoothing=0.001) 56 | logger.info(loss_fn) 57 | 58 | model.fit( 59 | X_train=X_train, 60 | y_train=y_train, 61 | eval_set=[(X_valid, y_valid)], 62 | eval_name=["val"], 63 | eval_metric=["logits_ll"], 64 | max_epochs=_params['max_epoch'], 65 | patience=20, 66 | batch_size=_params['batch_size'], 67 | virtual_batch_size=32, 68 | num_workers=0, 69 | drop_last=False, 70 | # To use binary cross entropy because this is not a regression problem 71 | loss_fn=loss_fn) 72 | 73 | preds = self._sigmoid(model.predict(X_valid)) 74 | return preds, model 75 | 76 | def _predict(self, model: Any, X_valid: pd.DataFrame, predictors: List[str]): 77 | preds = 
model.predict(X_valid[predictors].values) 78 | return self._sigmoid(preds) 79 | 80 | @staticmethod 81 | def _sigmoid(preds: np.ndarray): 82 | return 1 / (1 + np.exp(-preds)) 83 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/tab_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.special import softmax 4 | from .utils import PredictDataset, filter_weights 5 | from .abstract_model import TabModel 6 | from .multiclass_utils import infer_output_dim, check_output_dim 7 | from torch.utils.data import DataLoader 8 | 9 | 10 | class TabNetClassifier(TabModel): 11 | def __post_init__(self): 12 | super(TabNetClassifier, self).__post_init__() 13 | self._task = 'classification' 14 | self._default_loss = torch.nn.functional.cross_entropy 15 | self._default_metric = 'accuracy' 16 | 17 | def weight_updater(self, weights): 18 | """ 19 | Updates weights dictionnary according to target_mapper. 20 | 21 | Parameters 22 | ---------- 23 | weights : bool or dict 24 | Given weights for balancing training. 25 | 26 | Returns 27 | ------- 28 | bool or dict 29 | Same bool if weights are bool, updated dict otherwise. 30 | 31 | """ 32 | if isinstance(weights, int): 33 | return weights 34 | elif isinstance(weights, dict): 35 | return {self.target_mapper[key]: value for key, value in weights.items()} 36 | else: 37 | return weights 38 | 39 | def prepare_target(self, y): 40 | return np.vectorize(self.target_mapper.get)(y) 41 | 42 | def compute_loss(self, y_pred, y_true): 43 | return self.loss_fn(y_pred, y_true.long()) 44 | 45 | def update_fit_params( 46 | self, 47 | X_train, 48 | y_train, 49 | eval_set, 50 | weights, 51 | ): 52 | output_dim, train_labels = infer_output_dim(y_train) 53 | for X, y in eval_set: 54 | check_output_dim(train_labels, y) 55 | self.output_dim = output_dim 56 | self._default_metric = ('auc' if self.output_dim == 2 else 'accuracy') 57 | self.classes_ = train_labels 58 | self.target_mapper = {class_label: index for index, class_label in enumerate(self.classes_)} 59 | self.preds_mapper = {index: class_label for index, class_label in enumerate(self.classes_)} 60 | self.updated_weights = self.weight_updater(weights) 61 | 62 | def stack_batches(self, list_y_true, list_y_score): 63 | y_true = np.hstack(list_y_true) 64 | y_score = np.vstack(list_y_score) 65 | y_score = softmax(y_score, axis=1) 66 | return y_true, y_score 67 | 68 | def predict_func(self, outputs): 69 | outputs = np.argmax(outputs, axis=1) 70 | return np.vectorize(self.preds_mapper.get)(outputs) 71 | 72 | def predict_proba(self, X): 73 | """ 74 | Make predictions for classification on a batch (valid) 75 | 76 | Parameters 77 | ---------- 78 | X : a :tensor: `torch.Tensor` 79 | Input data 80 | 81 | Returns 82 | ------- 83 | res : np.ndarray 84 | 85 | """ 86 | self.network.eval() 87 | 88 | dataloader = DataLoader( 89 | PredictDataset(X), 90 | batch_size=self.batch_size, 91 | shuffle=False, 92 | ) 93 | 94 | results = [] 95 | for batch_nb, data in enumerate(dataloader): 96 | data = data.to(self.device).float() 97 | 98 | output, M_loss = self.network(data) 99 | predictions = torch.nn.Softmax(dim=1)(output).cpu().detach().numpy() 100 | results.append(predictions) 101 | res = np.vstack(results) 102 | return res 103 | 104 | 105 | class TabNetRegressor(TabModel): 106 | def __post_init__(self): 107 | super(TabNetRegressor, self).__post_init__() 108 | self._task = 'regression' 109 | self._default_loss 
= torch.nn.functional.mse_loss 110 | self._default_metric = 'mse' 111 | 112 | def prepare_target(self, y): 113 | return y 114 | 115 | def compute_loss(self, y_pred, y_true): 116 | return self.loss_fn(y_pred, y_true) 117 | 118 | def update_fit_params(self, X_train, y_train, eval_set, weights): 119 | self.output_dim = y_train.shape[1] 120 | 121 | self.updated_weights = weights 122 | filter_weights(self.updated_weights) 123 | 124 | def predict_func(self, outputs): 125 | return outputs 126 | 127 | def stack_batches(self, list_y_true, list_y_score): 128 | y_true = np.vstack(list_y_true) 129 | y_score = np.vstack(list_y_score) 130 | return y_true, y_score 131 | -------------------------------------------------------------------------------- /src/models/boosting_tree.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import xgboost as xgb 3 | import pandas as pd 4 | import numpy as np 5 | from src.models.base import BaseModel, MoaBase, AllZerosClassifier 6 | from logging import getLogger 7 | from tqdm import tqdm 8 | import lightgbm as lgb 9 | 10 | logger = getLogger(__name__) 11 | 12 | 13 | class LGBModel(BaseModel): 14 | def __init__(self, params: dict, **kwargs): 15 | self.params = params 16 | super().__init__(**kwargs) 17 | 18 | def _get_default_params(self): 19 | return { 20 | "n_estimators": 5000, 21 | "boosting_type": "gbdt", 22 | "objective": "binary", 23 | "metric": "None", 24 | "first_metric": True, 25 | "subsample": 0.8, 26 | "subsample_freq": 1, 27 | "learning_rate": 0.01, 28 | "feature_fraction": 0.7, 29 | "num_leaves": 12, 30 | "max_depth": -1, 31 | "early_stopping_rounds": 300, 32 | "seed": 42, 33 | } 34 | 35 | def _train(self, train, test, targets, train_idx, valid_idx): 36 | predictors = [col for col in train.columns if col not in self.ignore_cols] 37 | logger.info(predictors) 38 | X_train, y_train = train.iloc[train_idx][predictors], targets.iloc[train_idx] 39 | X_valid, y_valid = train.iloc[valid_idx][predictors], targets.iloc[valid_idx] 40 | 41 | logger.info(f"train shape: {X_train.shape}, positive frac: {y_train.sum()/y_train.shape[0]}") 42 | logger.info(f"valid shape: {X_valid.shape}, positive frac: {y_valid.sum()/y_valid.shape[0]}") 43 | 44 | train_set = lgb.Dataset(X_train, y_train, categorical_feature=self.categorical_cols) 45 | val_set = lgb.Dataset(X_valid, y_valid, categorical_feature=self.categorical_cols) 46 | 47 | _params = self._get_default_params() 48 | _params.update(self.params) 49 | 50 | clf = lgb.train( 51 | _params, 52 | train_set, 53 | valid_sets=[train_set, val_set], 54 | verbose_eval=100, 55 | fobj=None, 56 | ) 57 | 58 | return clf.predict(X_valid), clf.predict(test[predictors]), clf 59 | 60 | 61 | class XGBTrainer(MoaBase): 62 | def __init__(self, params: Optional[dict] = None, **kwargs): 63 | if params is None: 64 | self.params = {} 65 | else: 66 | self.params = params 67 | super().__init__(**kwargs) 68 | 69 | def _get_default_params(self): 70 | return { 71 | 'objective': 'binary:logistic', 72 | 'eval_metric': 'logloss', 73 | 'tree_method': 'gpu_hist', 74 | 'verbosity': 0, 75 | 'colsample_bytree': 0.1818593017814899, 76 | 'eta': 0.012887963193108452, 77 | 'gamma': 6.576022976359221, 78 | 'max_depth': 8, 79 | 'min_child_weight': 8.876744371188476, 80 | 'subsample': 0.7813380253086911, 81 | } 82 | 83 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 84 | X_train, y_train = 
X.iloc[train_idx][predictors], y.iloc[train_idx] 85 | X_valid, y_valid = X.iloc[valid_idx][predictors], y.iloc[valid_idx] 86 | 87 | logger.info(f"train shape: {X_train.shape}, positive frac: {y_train.sum()/y_train.shape[0]}") 88 | logger.info(f"valid shape: {X_valid.shape}, positive frac: {y_valid.sum()/y_valid.shape[0]}") 89 | 90 | _params = self._get_default_params() 91 | _params.update(self.params) 92 | _params['seed'] = seed 93 | 94 | target_cols = y_valid.columns.tolist() 95 | pred_valid = np.zeros_like(y_valid).astype(float) 96 | models = [] 97 | 98 | for idx, target_col in tqdm(enumerate(target_cols), total=len(target_cols)): 99 | xgb_train = xgb.DMatrix(X_train.values, label=y_train[target_col].values.astype(int), nthread=-1) 100 | xgb_valid = xgb.DMatrix(X_valid.values, label=y_valid[target_col].values.astype(int), nthread=-1) 101 | clf = xgb.train(_params, xgb_train, 1000, [(xgb_valid, "eval")], early_stopping_rounds=25, verbose_eval=0) 102 | pred_valid[:, idx] = clf.predict(xgb_valid) 103 | models.append(clf) 104 | 105 | return pred_valid, models 106 | 107 | def _predict(self, model: List, X_valid: pd.DataFrame, predictors: List[str]): 108 | assert type(model) is list, 'model is not list' 109 | 110 | preds = np.zeros(shape=(X_valid.shape[0], len(model))) 111 | for idx, clf in enumerate(model): 112 | xgb_valid = xgb.DMatrix(X_valid[predictors].values, nthread=-1) 113 | preds[:, idx] = clf.predict(xgb_valid) 114 | 115 | return preds 116 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/multitask.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.special import softmax 4 | from .utils import PredictDataset, filter_weights 5 | from .abstract_model import TabModel 6 | from .multiclass_utils import infer_multitask_output, check_output_dim 7 | from torch.utils.data import DataLoader 8 | 9 | 10 | class TabNetMultiTaskClassifier(TabModel): 11 | def __post_init__(self): 12 | super(TabNetMultiTaskClassifier, self).__post_init__() 13 | self._task = 'classification' 14 | self._default_loss = torch.nn.functional.cross_entropy 15 | self._default_metric = 'logloss' 16 | 17 | def prepare_target(self, y): 18 | y_mapped = y.copy() 19 | for task_idx in range(y.shape[1]): 20 | task_mapper = self.target_mapper[task_idx] 21 | y_mapped[:, task_idx] = np.vectorize(task_mapper.get)(y[:, task_idx]) 22 | return y_mapped 23 | 24 | def compute_loss(self, y_pred, y_true): 25 | """ 26 | Computes the loss according to network output and targets 27 | 28 | Parameters 29 | ---------- 30 | y_pred : list of tensors 31 | Output of network 32 | y_true : LongTensor 33 | Targets label encoded 34 | 35 | Returns 36 | ------- 37 | loss : torch.Tensor 38 | output of loss function(s) 39 | 40 | """ 41 | loss = 0 42 | y_true = y_true.long() 43 | if isinstance(self.loss_fn, list): 44 | # if you specify a different loss for each task 45 | for task_loss, task_output, task_id in zip(self.loss_fn, y_pred, range(len(self.loss_fn))): 46 | loss += task_loss(task_output, y_true[:, task_id]) 47 | else: 48 | # same loss function is applied to all tasks 49 | for task_id, task_output in enumerate(y_pred): 50 | loss += self.loss_fn(task_output, y_true[:, task_id]) 51 | 52 | loss /= len(y_pred) 53 | return loss 54 | 55 | def stack_batches(self, list_y_true, list_y_score): 56 | y_true = np.vstack(list_y_true) 57 | y_score = [] 58 | for i in range(len(self.output_dim)): 59 | score = 
np.vstack([x[i] for x in list_y_score]) 60 | score = softmax(score, axis=1) 61 | y_score.append(score) 62 | return y_true, y_score 63 | 64 | def update_fit_params(self, X_train, y_train, eval_set, weights): 65 | output_dim, train_labels = infer_multitask_output(y_train) 66 | for _, y in eval_set: 67 | for task_idx in range(y.shape[1]): 68 | check_output_dim(train_labels[task_idx], y[:, task_idx]) 69 | self.output_dim = output_dim 70 | self.classes_ = train_labels 71 | self.target_mapper = [{class_label: index for index, class_label in enumerate(classes)} for classes in self.classes_] 72 | self.preds_mapper = [{index: class_label for index, class_label in enumerate(classes)} for classes in self.classes_] 73 | self.updated_weights = weights 74 | filter_weights(self.updated_weights) 75 | 76 | def predict(self, X): 77 | """ 78 | Make predictions on a batch (valid) 79 | 80 | Parameters 81 | ---------- 82 | X : a :tensor: `torch.Tensor` 83 | Input data 84 | 85 | Returns 86 | ------- 87 | results : np.array 88 | Predictions of the most probable class 89 | """ 90 | self.network.eval() 91 | dataloader = DataLoader( 92 | PredictDataset(X), 93 | batch_size=self.batch_size, 94 | shuffle=False, 95 | ) 96 | 97 | results = {} 98 | for data in dataloader: 99 | data = data.to(self.device).float() 100 | output, _ = self.network(data) 101 | predictions = [torch.argmax(torch.nn.Softmax(dim=1)(task_output), dim=1).cpu().detach().numpy().reshape(-1) for task_output in output] 102 | 103 | for task_idx in range(len(self.output_dim)): 104 | results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] 105 | # stack all task individually 106 | results = [np.hstack(task_res) for task_res in results.values()] 107 | # map all task individually 108 | results = [np.vectorize(self.preds_mapper[task_idx].get)(task_res) for task_idx, task_res in enumerate(results)] 109 | return results 110 | 111 | def predict_proba(self, X): 112 | """ 113 | Make predictions for classification on a batch (valid) 114 | 115 | Parameters 116 | ---------- 117 | X : a :tensor: `torch.Tensor` 118 | Input data 119 | 120 | Returns 121 | ------- 122 | res : list of np.ndarray 123 | 124 | """ 125 | self.network.eval() 126 | 127 | dataloader = DataLoader( 128 | PredictDataset(X), 129 | batch_size=self.batch_size, 130 | shuffle=False, 131 | ) 132 | 133 | results = {} 134 | for data in dataloader: 135 | data = data.to(self.device).float() 136 | output, _ = self.network(data) 137 | predictions = [torch.nn.Softmax(dim=1)(task_output).cpu().detach().numpy() for task_output in output] 138 | for task_idx in range(len(self.output_dim)): 139 | results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] 140 | res = [np.vstack(task_res) for task_res in results.values()] 141 | return res 142 | -------------------------------------------------------------------------------- /src/preprocess.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import itertools 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from src.utils.cache import Cache 7 | from src.utils.misc import LoggerFactory 8 | 9 | logger = LoggerFactory().getLogger(__name__) 10 | 11 | 12 | def get_cp_time_feature(s): 13 | if s == 72: 14 | return 2 15 | elif s == 48: 16 | return 1 17 | else: 18 | return 0 19 | 20 | 21 | def get_cp_dose_feature(s): 22 | return 1 if s == "D1" else 0 23 | 24 | 25 | def get_feature(df): 26 | features_g = list([x for x in df.columns if x.startswith("g-")]) 27 | features_c = list([x for x 
in df.columns if x.startswith("c-")]) 28 | 29 | df["g_sum"] = df[features_g].sum(axis=1) 30 | df["g_mean"] = df[features_g].mean(axis=1) 31 | df["g_median"] = df[features_g].median(axis=1) 32 | df["g_std"] = df[features_g].std(axis=1) 33 | df["g_kurt"] = df[features_g].kurtosis(axis=1) 34 | df["g_skew"] = df[features_g].skew(axis=1) 35 | df["c_sum"] = df[features_c].sum(axis=1) 36 | df["c_mean"] = df[features_c].mean(axis=1) 37 | df["c_std"] = df[features_c].std(axis=1) 38 | df["c_median"] = df[features_c].median(axis=1) 39 | df["c_kurt"] = df[features_c].kurtosis(axis=1) 40 | df["c_skew"] = df[features_c].skew(axis=1) 41 | df["gc_sum"] = df[features_g + features_c].sum(axis=1) 42 | df["gc_mean"] = df[features_g + features_c].mean(axis=1) 43 | df["gc_std"] = df[features_g + features_c].std(axis=1) 44 | df["gc_kurt"] = df[features_g + features_c].kurtosis(axis=1) 45 | df["gc_skew"] = df[features_g + features_c].skew(axis=1) 46 | df["gc_median"] = df[features_g + features_c].median(axis=1) 47 | 48 | return df 49 | 50 | 51 | @Cache(dir_path='./cache/') 52 | def preprocess_train(input_dir='../input/lish-moa/', sub: bool = False): 53 | train_features_df = pd.read_csv(f"{input_dir}/train_features.csv") 54 | train_drug_df = pd.read_csv(f"{input_dir}/train_drug.csv") 55 | train_targets_scored_df = pd.read_csv(f"{input_dir}/train_targets_scored.csv") 56 | train_targets_nonscored_df = pd.read_csv(f"{input_dir}/train_targets_nonscored.csv") 57 | 58 | logger.info(f""" 59 | train_features_df: {train_features_df.shape} 60 | train_drug_df: {train_drug_df.shape} 61 | train_targets_scored_df: {train_targets_scored_df.shape} 62 | train_targets_nonscored_df: {train_targets_nonscored_df.shape} 63 | """) 64 | 65 | drop_cols = list(train_targets_nonscored_df.columns[train_targets_nonscored_df.sum() == 0]) 66 | use_cols = [x for x in train_targets_nonscored_df.columns if x not in drop_cols] 67 | train_targets_nonscored_df = train_targets_nonscored_df.loc[:, use_cols] 68 | logger.info(f""" 69 | train_targets_nonscored_df: {train_targets_nonscored_df.shape} 70 | """) 71 | 72 | train_features_df = train_features_df.merge(train_targets_scored_df) 73 | train_features_df = train_features_df.merge(train_drug_df) 74 | train_features_df = train_features_df.merge(train_targets_nonscored_df) 75 | logger.info(f""" 76 | train_features_df: {train_features_df.shape} 77 | """) 78 | 79 | train_features_df = train_features_df[train_features_df.cp_type == "trt_cp"].reset_index(drop=True) 80 | 81 | train_features_df["cp_time_feature"] = train_features_df["cp_time"].map(get_cp_time_feature) 82 | train_features_df["cp_dose_feature"] = train_features_df["cp_dose"].map(get_cp_dose_feature) 83 | 84 | train_features_df = train_features_df.drop(columns=["cp_type", "cp_time", "cp_dose"]) 85 | 86 | features_g = list([x for x in train_features_df.columns if x.startswith("g-")]) 87 | features_c = list([x for x in train_features_df.columns if x.startswith("c-")]) 88 | 89 | var_list = [] 90 | for c in tqdm(list(itertools.combinations(features_g + features_c, 2))): 91 | col_name = f"{c[0]}_{c[1]}_diff" 92 | d = train_features_df[c[0]] - train_features_df[c[1]] 93 | diff_val = np.var(d) 94 | if diff_val > 15: 95 | train_features_df[col_name] = d 96 | var_list.append(diff_val) 97 | 98 | stage_1_2_target_cols = [x for x in train_targets_scored_df.columns if x not in ["sig_id", "drug_id"]] 99 | stage_1_1_target_cols = [x for x in train_targets_nonscored_df.columns if x not in ["sig_id", "drug_id"]] + stage_1_2_target_cols 100 | 101 | 
stage_1_train_features = [x for x in train_features_df.columns if x not in ["sig_id", "drug_id"] + stage_1_1_target_cols] 102 | 103 | return ( 104 | train_features_df, 105 | stage_1_1_target_cols, 106 | stage_1_2_target_cols, 107 | stage_1_train_features, 108 | ) 109 | 110 | 111 | def preprocess_test(train_features_df, input_dir='../input/lish-moa/'): 112 | test_features_df = pd.read_csv(f"{input_dir}/test_features.csv") 113 | sample_submission_df = pd.read_csv(f"{input_dir}/sample_submission.csv") 114 | test_features_df["cp_time_feature"] = test_features_df["cp_time"].map(get_cp_time_feature) 115 | test_features_df["cp_dose_feature"] = test_features_df["cp_dose"].map(get_cp_dose_feature) 116 | 117 | features_g = list([x for x in train_features_df.columns if x.startswith("g-")]) 118 | features_c = list([x for x in train_features_df.columns if x.startswith("c-")]) 119 | 120 | for c in tqdm(list(itertools.combinations(features_g + features_c, 2))): 121 | col_name = f"{c[0]}_{c[1]}_diff" 122 | if col_name in train_features_df.columns: 123 | test_features_df[col_name] = test_features_df[c[0]] - test_features_df[c[1]] 124 | 125 | return test_features_df, sample_submission_df 126 | -------------------------------------------------------------------------------- /src/utils/transformers.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from sklearn.preprocessing import QuantileTransformer, RobustScaler 3 | import pandas as pd 4 | import numpy as np 5 | from joblib import Parallel, delayed 6 | from scipy.interpolate import interp1d 7 | from scipy.special import erf, erfinv 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted 10 | 11 | from src.utils.cache import Cache 12 | 13 | 14 | class GaussRankScaler(BaseEstimator, TransformerMixin): 15 | """Transform features by scaling each feature to a normal distribution. 16 | Parameters 17 | ---------- 18 | epsilon : float, optional, default 1e-4 19 | A small amount added to the lower bound or subtracted 20 | from the upper bound. This value prevents infinite number 21 | from occurring when applying the inverse error function. 22 | copy : boolean, optional, default True 23 | If False, try to avoid a copy and do inplace scaling instead. 24 | This is not guaranteed to always work inplace; e.g. if the data is 25 | not a NumPy array, a copy may still be returned. 26 | n_jobs : int or None, optional, default None 27 | Number of jobs to run in parallel. 28 | ``None`` means 1 and ``-1`` means using all processors. 29 | interp_kind : str or int, optional, default 'linear' 30 | Specifies the kind of interpolation as a string 31 | ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 32 | 'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic' 33 | refer to a spline interpolation of zeroth, first, second or third 34 | order; 'previous' and 'next' simply return the previous or next value 35 | of the point) or as an integer specifying the order of the spline 36 | interpolator to use. 37 | interp_copy : bool, optional, default False 38 | If True, the interpolation function makes internal copies of x and y. 39 | If False, references to `x` and `y` are used. 40 | Attributes 41 | ---------- 42 | interp_func_ : list 43 | The interpolation function for each feature in the training set. 
44 | """ 45 | def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False): 46 | self.epsilon = epsilon 47 | self.copy = copy 48 | self.interp_kind = interp_kind 49 | self.interp_copy = interp_copy 50 | self.fill_value = 'extrapolate' 51 | self.n_jobs = n_jobs 52 | 53 | def fit(self, X, y=None): 54 | """Fit interpolation function to link rank with original data for future scaling 55 | Parameters 56 | ---------- 57 | X : array-like, shape (n_samples, n_features) 58 | The data used to fit interpolation function for later scaling along the features axis. 59 | y 60 | Ignored 61 | """ 62 | X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True) 63 | 64 | self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T) 65 | return self 66 | 67 | def _fit(self, x): 68 | x = self.drop_duplicates(x) 69 | rank = np.argsort(np.argsort(x)) 70 | bound = 1.0 - self.epsilon 71 | factor = np.max(rank) / 2.0 * bound 72 | scaled_rank = np.clip(rank / factor - bound, -bound, bound) 73 | return interp1d(x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value) 74 | 75 | def transform(self, X, copy=None): 76 | """Scale the data with the Gauss Rank algorithm 77 | Parameters 78 | ---------- 79 | X : array-like, shape (n_samples, n_features) 80 | The data used to scale along the features axis. 81 | copy : bool, optional (default: None) 82 | Copy the input X or not. 83 | """ 84 | check_is_fitted(self, 'interp_func_') 85 | 86 | copy = copy if copy is not None else self.copy 87 | X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True) 88 | 89 | X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T 90 | return X 91 | 92 | def _transform(self, i, x): 93 | return erfinv(self.interp_func_[i](x)) 94 | 95 | def inverse_transform(self, X, copy=None): 96 | """Scale back the data to the original representation 97 | Parameters 98 | ---------- 99 | X : array-like, shape [n_samples, n_features] 100 | The data used to scale along the features axis. 101 | copy : bool, optional (default: None) 102 | Copy the input X or not. 
103 | """ 104 | check_is_fitted(self, 'interp_func_') 105 | 106 | copy = copy if copy is not None else self.copy 107 | X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True) 108 | 109 | X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T 110 | return X 111 | 112 | def _inverse_transform(self, i, x): 113 | inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value) 114 | return inv_interp_func(erf(x)) 115 | 116 | @staticmethod 117 | def drop_duplicates(x): 118 | is_unique = np.zeros_like(x, dtype=bool) 119 | is_unique[np.unique(x, return_index=True)[1]] = True 120 | return x[is_unique] 121 | 122 | 123 | TRANSFORMERS = { 124 | 'quantile': QuantileTransformer, 125 | 'robust': RobustScaler, 126 | 'gauss_rank': GaussRankScaler, 127 | } 128 | 129 | 130 | # return transformer 131 | @Cache('./cache') 132 | def normalizer(transformer: str, df: pd.DataFrame, params: Optional[dict]): 133 | if params is None: 134 | params = dict() 135 | trans = TRANSFORMERS[transformer](**params) 136 | trans.fit(df) 137 | 138 | return trans 139 | -------------------------------------------------------------------------------- /src/utils/cache.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Union, Dict, List, Callable, Optional, ByteString, Tuple 2 | import hashlib 3 | import pickle 4 | from pandas.util import hash_pandas_object 5 | import pandas as pd 6 | from pathlib import Path 7 | from inspect import signature 8 | import numpy as np 9 | from collections import OrderedDict 10 | import operator 11 | import functools 12 | import json 13 | from json import JSONEncoder 14 | try: 15 | from collections.abc import Mapping 16 | except ImportError: 17 | from collections import Mapping 18 | 19 | from src.utils.misc import LoggerFactory 20 | 21 | logger = LoggerFactory().getLogger(__name__) 22 | 23 | 24 | def _hash(obj: ByteString) -> str: 25 | return hashlib.md5(obj).hexdigest() 26 | 27 | 28 | class Cache: 29 | def __init__(self, dir_path: str, rerun: bool = False, with_param: bool = False): 30 | self.dir_path = Path(dir_path) 31 | self.dir_path.mkdir(exist_ok=True) 32 | self.with_param = with_param 33 | self.rerun = rerun 34 | 35 | def __call__(self, func: Callable): 36 | func_name = func.__name__ 37 | 38 | def wrapper(*args, **kwargs): 39 | sig = signature(func) 40 | # ignore default value 41 | bound_args = sig.bind(*args, **kwargs) 42 | unique_id: str = self._get_unique_id(bound_args.arguments) 43 | path: Path = self.dir_path.joinpath(f'{func_name}_{unique_id}') 44 | 45 | logger.info(f'{func_name}_{unique_id} has been called') 46 | ret = Cache._read_cache(path, rerun=self.rerun) 47 | if ret is None: 48 | logger.info(f'{func_name}_{unique_id} cache not found') 49 | ret = func(*args, **kwargs) 50 | Cache._write(path, ret) 51 | return ret 52 | 53 | return wrapper 54 | 55 | @staticmethod 56 | def _write(path, obj: Union[pd.DataFrame, Any]): 57 | # TODO: FileProcessor 58 | if isinstance(obj, pd.DataFrame): 59 | path = f'{path}.feather' 60 | obj.to_feather(str(path)) 61 | else: 62 | path = f'{path}.pickle' 63 | with open(str(path), 'wb') as f: 64 | pickle.dump(obj, f, protocol=4) 65 | 66 | @staticmethod 67 | def _read_cache(path: Path, rerun: bool) -> Optional[Any]: 68 | if rerun: 69 | return None 70 | if Path(f'{path}.pickle').exists(): 71 | logger.info(f'cache hit: {path}.pickle') 72 
| return pickle.load(open(f'{path}.pickle', 'rb')) 73 | if Path(f'{path}.feather').exists(): 74 | logger.info(f'cache hit: {path}.feather') 75 | return pd.read_feather(f'{path}.feather') 76 | return None 77 | 78 | @classmethod 79 | def _get_unique_id(cls, params: Dict) -> str: 80 | if not params: 81 | return 'with_no_param' 82 | dependencies = [f'{key}_{cls._get_hash(param)}' for key, param in sorted(params.items(), key=lambda item: str(item[0]))] 83 | return hashlib.md5(str(dependencies).encode()).hexdigest() 84 | 85 | @classmethod 86 | def _get_hash(cls, obj: Any) -> str: 87 | if isinstance(obj, (str, int, float)): 88 | return cls._literals(obj) 89 | elif isinstance(obj, pd.DataFrame): 90 | return cls._data_frame(obj) 91 | elif isinstance(obj, np.ndarray): 92 | return cls._ndarray(obj) 93 | elif isinstance(obj, (list, dict, tuple)): 94 | return cls._containers(obj) 95 | else: 96 | return _hash(pickle.dumps(obj)) 97 | 98 | return '-1' 99 | 100 | @staticmethod 101 | def _data_frame(obj: pd.DataFrame): 102 | string = str(obj.columns.tolist()) + str(obj.index) + str(obj.shape) 103 | return _hash(string.encode()) 104 | # return hash_pandas_object(obj).sum() 105 | 106 | @staticmethod 107 | def _ndarray(obj: np.ndarray): 108 | return _hash(bytes(obj)) 109 | 110 | @staticmethod 111 | def _containers(obj: Union[List[Any], Dict[Any, Any], Tuple[Any, ...]]): 112 | return _hash(json.dumps(obj, cls=_DictParamEncoder).encode()) 113 | 114 | @staticmethod 115 | def _literals(obj: Union[int, str, float]): 116 | return _hash(str(obj).encode()) 117 | 118 | 119 | # from https://github.com/spotify/luigi/blob/master/luigi/parameter.py#L940 120 | 121 | 122 | class _DictParamEncoder(JSONEncoder): 123 | """ 124 | JSON encoder for :py:class:`~DictParameter`, which makes :py:class:`~FrozenOrderedDict` JSON serializable. 125 | """ 126 | def default(self, obj): 127 | if isinstance(obj, FrozenOrderedDict): 128 | return obj.get_wrapped() 129 | return json.JSONEncoder.default(self, obj) 130 | 131 | 132 | class FrozenOrderedDict(Mapping): 133 | """ 134 | It is an immutable wrapper around ordered dictionaries that implements the complete :py:class:`collections.Mapping` 135 | interface. It can be used as a drop-in replacement for dictionaries where immutability and ordering are desired. 136 | """ 137 | def __init__(self, *args, **kwargs): 138 | self.__dict = OrderedDict(*args, **kwargs) 139 | self.__hash = None 140 | 141 | def __getitem__(self, key): 142 | return self.__dict[key] 143 | 144 | def __iter__(self): 145 | return iter(self.__dict) 146 | 147 | def __len__(self): 148 | return len(self.__dict) 149 | 150 | def __repr__(self): 151 | # We should use short representation for beautiful console output 152 | return repr(dict(self.__dict)) 153 | 154 | def __hash__(self): 155 | if self.__hash is None: 156 | hashes = map(hash, self.items()) 157 | self.__hash = functools.reduce(operator.xor, hashes, 0) 158 | 159 | return self.__hash 160 | 161 | def get_wrapped(self): 162 | return self.__dict 163 | 164 | 165 | def recursively_freeze(value): 166 | """ 167 | Recursively walks ``Mapping``s and ``list``s and converts them to ``FrozenOrderedDict`` and ``tuples``, respectively. 
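A usage sketch for the `Cache` decorator (assumed, not part of this module): the cache key is the function name plus an md5 digest of the bound arguments, results are written to `<dir_path>/<func>_<hash>.feather` for DataFrames and `.pickle` for everything else, and later calls with the same arguments are read back from disk:

```python
import pandas as pd

from src.utils.cache import Cache


@Cache(dir_path='./cache', rerun=False)
def load_features(path: str) -> pd.DataFrame:
    # illustrative cached loader; the input path mirrors the default in src/preprocess.py
    return pd.read_csv(path)


df = load_features('../input/lish-moa/train_features.csv')  # computed and written to cache
df = load_features('../input/lish-moa/train_features.csv')  # second call hits the cache
```

`preprocess_train` in src/preprocess.py and `normalizer` in src/utils/transformers.py are the in-repository users of this decorator.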
168 | """ 169 | if isinstance(value, Mapping): 170 | return FrozenOrderedDict(((k, recursively_freeze(v)) for k, v in value.items())) 171 | elif isinstance(value, list) or isinstance(value, tuple): 172 | return tuple(recursively_freeze(v) for v in value) 173 | return value 174 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/metrics.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | import numpy as np 4 | from sklearn.metrics import ( 5 | roc_auc_score, 6 | mean_squared_error, 7 | mean_absolute_error, 8 | accuracy_score, 9 | log_loss, 10 | balanced_accuracy_score, 11 | ) 12 | 13 | 14 | @dataclass 15 | class MetricContainer: 16 | """Container holding a list of metrics. 17 | 18 | Parameters 19 | ---------- 20 | metric_names : list of str 21 | List of metric names. 22 | prefix : str 23 | Prefix of metric names. 24 | 25 | """ 26 | 27 | metric_names: List[str] 28 | prefix: str = "" 29 | 30 | def __post_init__(self): 31 | self.metrics = Metric.get_metrics_by_names(self.metric_names) 32 | self.names = [self.prefix + name for name in self.metric_names] 33 | 34 | def __call__(self, y_true, y_pred): 35 | """Compute all metrics and store into a dict. 36 | 37 | Parameters 38 | ---------- 39 | y_true : np.ndarray 40 | Target matrix or vector 41 | y_pred : np.ndarray 42 | Score matrix or vector 43 | 44 | Returns 45 | ------- 46 | dict 47 | Dict of metrics ({metric_name: metric_value}). 48 | 49 | """ 50 | logs = {} 51 | for metric in self.metrics: 52 | if isinstance(y_pred, list): 53 | res = np.mean( 54 | [metric(y_true[:, i], y_pred[i]) for i in range(len(y_pred))] 55 | ) 56 | else: 57 | res = metric(y_true, y_pred) 58 | logs[self.prefix + metric._name] = res 59 | return logs 60 | 61 | 62 | class Metric: 63 | def __call__(self, y_true, y_pred): 64 | raise NotImplementedError("Custom Metrics must implement this function") 65 | 66 | @classmethod 67 | def get_metrics_by_names(cls, names): 68 | """Get list of metric classes. 69 | 70 | Parameters 71 | ---------- 72 | cls : Metric 73 | Metric class. 74 | names : list 75 | List of metric names. 76 | 77 | Returns 78 | ------- 79 | metrics : list 80 | List of metric classes. 81 | 82 | """ 83 | available_metrics = cls.__subclasses__() 84 | available_names = [metric()._name for metric in available_metrics] 85 | metrics = [] 86 | for name in names: 87 | assert name in available_names, f"{name} is not available, choose in {available_names}" 88 | idx = available_names.index(name) 89 | metric = available_metrics[idx]() 90 | metrics.append(metric) 91 | return metrics 92 | 93 | 94 | class AUC(Metric): 95 | """ 96 | AUC. 97 | """ 98 | 99 | def __init__(self): 100 | self._name = "auc" 101 | self._maximize = True 102 | 103 | def __call__(self, y_true, y_score): 104 | """ 105 | Compute AUC of predictions. 106 | 107 | Parameters 108 | ---------- 109 | y_true : np.ndarray 110 | Target matrix or vector 111 | y_score : np.ndarray 112 | Score matrix or vector 113 | 114 | Returns 115 | ------- 116 | float 117 | AUC of predictions vs targets. 118 | """ 119 | return roc_auc_score(y_true, y_score[:, 1]) 120 | 121 | 122 | class Accuracy(Metric): 123 | """ 124 | Accuracy. 125 | """ 126 | 127 | def __init__(self): 128 | self._name = "accuracy" 129 | self._maximize = True 130 | 131 | def __call__(self, y_true, y_score): 132 | """ 133 | Compute Accuracy of predictions. 
134 | 135 | Parameters 136 | ---------- 137 | y_true: np.ndarray 138 | Target matrix or vector 139 | y_score: np.ndarray 140 | Score matrix or vector 141 | 142 | Returns 143 | ------- 144 | float 145 | Accuracy of predictions vs targets. 146 | """ 147 | y_pred = np.argmax(y_score, axis=1) 148 | return accuracy_score(y_true, y_pred) 149 | 150 | 151 | class BalancedAccuracy(Metric): 152 | """ 153 | Balanced Accuracy. 154 | """ 155 | 156 | def __init__(self): 157 | self._name = "balanced_accuracy" 158 | self._maximize = True 159 | 160 | def __call__(self, y_true, y_score): 161 | """ 162 | Compute Accuracy of predictions. 163 | 164 | Parameters 165 | ---------- 166 | y_true : np.ndarray 167 | Target matrix or vector 168 | y_score : np.ndarray 169 | Score matrix or vector 170 | 171 | Returns 172 | ------- 173 | float 174 | Accuracy of predictions vs targets. 175 | """ 176 | y_pred = np.argmax(y_score, axis=1) 177 | return balanced_accuracy_score(y_true, y_pred) 178 | 179 | 180 | class LogLoss(Metric): 181 | """ 182 | LogLoss. 183 | """ 184 | 185 | def __init__(self): 186 | self._name = "logloss" 187 | self._maximize = False 188 | 189 | def __call__(self, y_true, y_score): 190 | """ 191 | Compute LogLoss of predictions. 192 | 193 | Parameters 194 | ---------- 195 | y_true : np.ndarray 196 | Target matrix or vector 197 | y_score : np.ndarray 198 | Score matrix or vector 199 | 200 | Returns 201 | ------- 202 | float 203 | LogLoss of predictions vs targets. 204 | """ 205 | return log_loss(y_true, y_score) 206 | 207 | 208 | class MAE(Metric): 209 | """ 210 | Mean Absolute Error. 211 | """ 212 | 213 | def __init__(self): 214 | self._name = "mae" 215 | self._maximize = False 216 | 217 | def __call__(self, y_true, y_score): 218 | """ 219 | Compute MAE (Mean Absolute Error) of predictions. 220 | 221 | Parameters 222 | ---------- 223 | y_true : np.ndarray 224 | Target matrix or vector 225 | y_score : np.ndarray 226 | Score matrix or vector 227 | 228 | Returns 229 | ------- 230 | float 231 | MAE of predictions vs targets. 232 | """ 233 | return mean_absolute_error(y_true, y_score) 234 | 235 | 236 | class MSE(Metric): 237 | """ 238 | Mean Squared Error. 239 | """ 240 | 241 | def __init__(self): 242 | self._name = "mse" 243 | self._maximize = False 244 | 245 | def __call__(self, y_true, y_score): 246 | """ 247 | Compute MSE (Mean Squared Error) of predictions. 248 | 249 | Parameters 250 | ---------- 251 | y_true : np.ndarray 252 | Target matrix or vector 253 | y_score : np.ndarray 254 | Score matrix or vector 255 | 256 | Returns 257 | ------- 258 | float 259 | MSE of predictions vs targets. 260 | """ 261 | return mean_squared_error(y_true, y_score) 262 | 263 | 264 | class RMSE(Metric): 265 | """ 266 | Root Mean Squared Error. 267 | """ 268 | 269 | def __init__(self): 270 | self._name = "rmse" 271 | self._maximize = False 272 | 273 | def __call__(self, y_true, y_score): 274 | """ 275 | Compute RMSE (Root Mean Squared Error) of predictions. 276 | 277 | Parameters 278 | ---------- 279 | y_true : np.ndarray 280 | Target matrix or vector 281 | y_score : np.ndarray 282 | Score matrix or vector 283 | 284 | Returns 285 | ------- 286 | float 287 | RMSE of predictions vs targets. 288 | """ 289 | return np.sqrt(mean_squared_error(y_true, y_score)) 290 | 291 | 292 | def check_metrics(metrics): 293 | """Check if custom metrics are provided. 294 | 295 | Parameters 296 | ---------- 297 | metrics : list of str or classes 298 | List with built-in metrics (str) or custom metrics (classes). 
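Since `check_metrics` accepts custom metric classes, a new metric only needs to subclass `Metric` and define `_name` and `_maximize`; `LogitsLogLoss` in src/models/loss.py is the repository's own example, referenced by its name `"logits_ll"` in `eval_metric` from src/models/tabnet.py. A minimal additional sketch (hypothetical, not part of the package):

```python
import numpy as np

from src.models.pytorch_tabnet.metrics import Metric


class LogitsMAE(Metric):
    """Hypothetical custom metric: MAE on sigmoid-transformed logits."""

    def __init__(self):
        self._name = "logits_mae"   # name usable in eval_metric / early stopping
        self._maximize = False

    def __call__(self, y_true, y_pred):
        probs = 1 / (1 + np.exp(-y_pred))           # logits -> probabilities
        return float(np.mean(np.abs(y_true - probs)))
```

Because `Metric.get_metrics_by_names` discovers metrics through `Metric.__subclasses__()`, the class only has to be imported before fitting for its name to resolve.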
299 | 300 | Returns 301 | ------- 302 | val_metrics : list of str 303 | List of metric names. 304 | 305 | """ 306 | val_metrics = [] 307 | for metric in metrics: 308 | if isinstance(metric, str): 309 | val_metrics.append(metric) 310 | elif issubclass(metric, Metric): 311 | val_metrics.append(metric()._name) 312 | else: 313 | raise TypeError("You need to provide a valid metric format") 314 | return val_metrics 315 | -------------------------------------------------------------------------------- /src/models/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from src.utils.splitter import SplitFactory 3 | from typing import NamedTuple, Callable, List, Dict, Optional, Tuple, Any 4 | import numpy as np 5 | import pandas as pd 6 | from src.experiment.experiment import Experiment 7 | from src.utils.misc import LoggerFactory 8 | 9 | logger = LoggerFactory().getLogger(__name__) 10 | 11 | 12 | class ModelResult(NamedTuple): 13 | oof_preds: np.ndarray 14 | preds: Optional[np.ndarray] 15 | models: Dict[str, any] 16 | scores: Dict[str, float] 17 | folds: List[Tuple[np.ndarray, np.ndarray]] 18 | 19 | 20 | class BaseModel: 21 | def __init__(self, ignore_cols: List[str], target_cols: str, categorical_cols: List[str], metric: Callable, exp: Experiment): 22 | self.ignore_cols = ignore_cols 23 | self.metric = metric 24 | self.result = None 25 | 26 | @abstractmethod 27 | def _train(self, train: pd.DataFrame, targets: pd.DataFrame, train_idx, valid_idx): 28 | raise NotImplementedError 29 | 30 | def train(self, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, splitter: Optional[SplitFactory], 31 | folds: Optional[List[Tuple[np.ndarray, np.ndarray]]]): 32 | 33 | models = dict() 34 | scores = dict() 35 | oof_preds = np.zeros_like(y_train).astype(float) 36 | preds = np.zeros(shape=(X_test.shape[0], y_train.shape[1])) 37 | assert (folds is not None) or (splitter is not None), 'splitter or folds is must be specified' 38 | if folds is None: 39 | folds = splitter.split(X_train, y_train) 40 | 41 | for fold, (train_idx, valid_idx) in enumerate(folds): 42 | valid_preds, _preds, model = self._train(X_train, X_test, y_train, train_idx, valid_idx) 43 | oof_preds[valid_idx] += valid_preds 44 | preds += _preds / len(folds) 45 | 46 | score = self.metric(y_train[valid_idx].values, valid_preds) 47 | logger.info(f"fold {fold}: {score}") 48 | models[f'fold_{fold}'] = model 49 | scores[f'fold_{fold}'] = score 50 | oof_score = self.metric(y_train.values, oof_preds) 51 | logger.info(f"{len(folds)} folds cv mean: {np.mean(scores)}") 52 | logger.info(f"oof score: {oof_score}") 53 | 54 | self.result = ModelResult(oof_preds=oof_preds, models=models, preds=preds, folds=folds, scores={ 55 | 'oof_score': oof_score, 56 | 'KFoldsScores': scores, 57 | }) 58 | 59 | return True 60 | 61 | def predict(self, X_test): 62 | assert self.result is None, 'Model is not tained Error' 63 | pass 64 | 65 | 66 | class MoaBase: 67 | def __init__(self, target_cols: List[str], categorical_cols: List[str], ignore_cols: Optional[List[str]], num_seed_blends: int, metric: Callable, 68 | exp: Experiment): 69 | self.exp = exp 70 | self.ignore_cols = ignore_cols 71 | self.categorical_cols = categorical_cols 72 | self.metric = metric 73 | self.result = None 74 | self.num_seed_blends = num_seed_blends 75 | 76 | @abstractmethod 77 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 78 | raise 
NotImplementedError 79 | 80 | def train(self, X_train: pd.DataFrame, y_train: pd.DataFrame, folds: List[Tuple[np.ndarray, np.ndarray]]): 81 | 82 | models = dict() 83 | scores = dict() 84 | oof_preds = np.zeros_like(y_train).astype(float) 85 | self.predictors = [col for col in X_train.columns.tolist() if col not in self.ignore_cols] 86 | 87 | logger.info(f'{self.__class__.__name__} train start') 88 | logger.info(f'X shape: {X_train.shape}, y shape: {y_train.shape}') 89 | for fold, (train_idx, valid_idx) in enumerate(folds): 90 | logger.info(f'fold {fold}: #row of train: {len(train_idx)}, #row of valid: {len(valid_idx)}') 91 | for i in range(self.num_seed_blends): 92 | valid_preds, model = self._train(X=X_train, y=y_train, predictors=self.predictors, train_idx=train_idx, valid_idx=valid_idx, seed=i) 93 | 94 | oof_preds[valid_idx, :] += valid_preds / self.num_seed_blends 95 | models[f'fold_{fold}_{i}'] = model 96 | 97 | score = self.metric(y_train.iloc[valid_idx].values, oof_preds[valid_idx, :]) 98 | logger.info(f"fold {fold}: {score}") 99 | scores[f'fold_{fold}'] = score 100 | oof_score = self.metric(y_train.values, oof_preds) 101 | logger.info(f"{len(folds)} folds cv mean: {np.mean(list(scores.values()))}") 102 | logger.info(f"oof score: {oof_score}") 103 | 104 | self.result = ModelResult(oof_preds=oof_preds, models=models, preds=None, folds=folds, scores={ 105 | 'oof_score': oof_score, 106 | 'KFoldsScores': scores, 107 | }) 108 | 109 | return True 110 | 111 | @abstractmethod 112 | def _predict(self, model: Any, X_valid: pd.DataFrame, predictors: List[str]): 113 | pass 114 | 115 | def predict(self, X_test) -> np.ndarray: 116 | assert self.result is not None, 'Model is not trained Error' 117 | 118 | folds = self.result.folds 119 | 120 | n_targets = self.result.oof_preds.shape[1] 121 | preds = np.zeros(shape=(X_test.shape[0], n_targets)) 122 | 123 | for fold, (train_idx, valid_idx) in enumerate(folds): 124 | for i in range(self.num_seed_blends): 125 | model = self.result.models[f'fold_{fold}_{i}'] 126 | preds += self._predict(model=model, X_valid=X_test, predictors=self.predictors) / (len(folds) * self.num_seed_blends) 127 | 128 | return preds 129 | 130 | 131 | class MoaBaseOnline: 132 | def __init__(self, target_cols: List[str], categorical_cols: List[str], ignore_cols: Optional[List[str]], num_seed_blends: int, metric: Callable, 133 | exp: Experiment): 134 | self.exp = exp 135 | self.ignore_cols = ignore_cols 136 | self.categorical_cols = categorical_cols 137 | self.metric = metric 138 | self.result = None 139 | self.num_seed_blends = num_seed_blends 140 | 141 | @abstractmethod 142 | def _train_predict(self, X: pd.DataFrame, y: pd.DataFrame, X_test: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, 143 | seed: int): 144 | raise NotImplementedError 145 | 146 | def train_predict(self, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, folds: List[Tuple[np.ndarray, np.ndarray]]): 147 | 148 | scores = dict() 149 | oof_preds = np.zeros_like(y_train).astype(float) 150 | preds = np.zeros(shape=(X_test.shape[0], y_train.shape[1])) 151 | self.predictors = [col for col in X_train.columns.tolist() if col not in self.ignore_cols] 152 | 153 | logger.info(f'{self.__class__.__name__} train start') 154 | logger.info(f'X shape: {X_train.shape}, y shape: {y_train.shape}') 155 | for fold, (train_idx, valid_idx) in enumerate(folds): 156 | logger.info(f'fold {fold}: #row of train: {len(train_idx)}, #row of valid: {len(valid_idx)}') 157 | for i in 
range(self.num_seed_blends): 158 | _preds, valid_preds, = self._train_predict(X=X_train, 159 | y=y_train, 160 | X_test=X_test, 161 | predictors=self.predictors, 162 | train_idx=train_idx, 163 | valid_idx=valid_idx, 164 | seed=i) 165 | 166 | oof_preds[valid_idx, :] += valid_preds / self.num_seed_blends 167 | preds += _preds / (len(folds) * self.num_seed_blends) 168 | 169 | score = self.metric(y_train.iloc[valid_idx].values, oof_preds[valid_idx, :]) 170 | logger.info(f"fold {fold}: {score}") 171 | scores[f'fold_{fold}'] = score 172 | oof_score = self.metric(y_train.values, oof_preds) 173 | logger.info(f"{len(folds)} folds cv mean: {np.mean(list(scores.values()))}") 174 | logger.info(f"oof score: {oof_score}") 175 | 176 | self.result = ModelResult(oof_preds=oof_preds, models=None, preds=preds, folds=folds, scores={ 177 | 'oof_score': oof_score, 178 | 'KFoldsScores': scores, 179 | }) 180 | 181 | return True 182 | 183 | 184 | class AllZerosClassifier: 185 | def __init__(self, label=0): 186 | self.label = label 187 | 188 | def predict(self, X): 189 | return np.ones(X.shape[0]) * self.label 190 | 191 | def predict_proba(self, X): 192 | labels = np.ones(shape=(X.shape[0], 2)) 193 | labels[:, 1 - self.label] = 0 194 | return labels 195 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/callbacks.py: -------------------------------------------------------------------------------- 1 | import time 2 | import datetime 3 | import copy 4 | import numpy as np 5 | from dataclasses import dataclass, field 6 | from typing import List, Any 7 | 8 | 9 | class Callback: 10 | """ 11 | Abstract base class used to build new callbacks. 12 | """ 13 | 14 | def __init__(self): 15 | pass 16 | 17 | def set_params(self, params): 18 | self.params = params 19 | 20 | def set_trainer(self, model): 21 | self.trainer = model 22 | 23 | def on_epoch_begin(self, epoch, logs=None): 24 | pass 25 | 26 | def on_epoch_end(self, epoch, logs=None): 27 | pass 28 | 29 | def on_batch_begin(self, batch, logs=None): 30 | pass 31 | 32 | def on_batch_end(self, batch, logs=None): 33 | pass 34 | 35 | def on_train_begin(self, logs=None): 36 | pass 37 | 38 | def on_train_end(self, logs=None): 39 | pass 40 | 41 | 42 | @dataclass 43 | class CallbackContainer: 44 | """ 45 | Container holding a list of callbacks. 
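Callback above is a no-op interface; concrete callbacks override only the hooks they need, and the CallbackContainer defined next fans every event out to all of them. A small sketch, where PrintLoss is a hypothetical user-defined callback:

```python
from src.models.pytorch_tabnet.callbacks import Callback, CallbackContainer

class PrintLoss(Callback):
    """Hypothetical callback: report the running loss at each epoch end."""
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        print(f"epoch {epoch}: loss={logs.get('loss', float('nan')):.4f}")

container = CallbackContainer(callbacks=[PrintLoss()])
container.on_train_begin()                       # stamps logs["start_time"]
container.on_epoch_end(0, logs={"loss": 0.693})  # -> "epoch 0: loss=0.6930"
container.on_train_end()
```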
46 | """ 47 | 48 | callbacks: List[Callback] = field(default_factory=list) 49 | 50 | def append(self, callback): 51 | self.callbacks.append(callback) 52 | 53 | def set_params(self, params): 54 | for callback in self.callbacks: 55 | callback.set_params(params) 56 | 57 | def set_trainer(self, trainer): 58 | self.trainer = trainer 59 | for callback in self.callbacks: 60 | callback.set_trainer(trainer) 61 | 62 | def on_epoch_begin(self, epoch, logs=None): 63 | logs = logs or {} 64 | for callback in self.callbacks: 65 | callback.on_epoch_begin(epoch, logs) 66 | 67 | def on_epoch_end(self, epoch, logs=None): 68 | logs = logs or {} 69 | for callback in self.callbacks: 70 | callback.on_epoch_end(epoch, logs) 71 | 72 | def on_batch_begin(self, batch, logs=None): 73 | logs = logs or {} 74 | for callback in self.callbacks: 75 | callback.on_batch_begin(batch, logs) 76 | 77 | def on_batch_end(self, batch, logs=None): 78 | logs = logs or {} 79 | for callback in self.callbacks: 80 | callback.on_batch_end(batch, logs) 81 | 82 | def on_train_begin(self, logs=None): 83 | logs = logs or {} 84 | logs["start_time"] = time.time() 85 | for callback in self.callbacks: 86 | callback.on_train_begin(logs) 87 | 88 | def on_train_end(self, logs=None): 89 | logs = logs or {} 90 | for callback in self.callbacks: 91 | callback.on_train_end(logs) 92 | 93 | 94 | @dataclass 95 | class EarlyStopping(Callback): 96 | """EarlyStopping callback to exit the training loop if early_stopping_metric 97 | does not improve by a certain amount for a certain 98 | number of epochs. 99 | 100 | Parameters 101 | --------- 102 | early_stopping_metric : str 103 | Early stopping metric name 104 | is_maximize : bool 105 | Whether to maximize or not early_stopping_metric 106 | tol : float 107 | minimum change in monitored value to qualify as improvement. 108 | This number should be positive. 109 | patience : integer 110 | number of epochs to wait for improvment before terminating. 
111 | the counter be reset after each improvment 112 | 113 | """ 114 | 115 | early_stopping_metric: str 116 | is_maximize: bool 117 | tol: float = 0.0 118 | patience: int = 5 119 | 120 | def __post_init__(self): 121 | self.best_epoch = 0 122 | self.stopped_epoch = 0 123 | self.wait = 0 124 | self.best_weights = None 125 | self.best_loss = np.inf 126 | if self.is_maximize: 127 | self.best_loss = -self.best_loss 128 | super().__init__() 129 | 130 | def on_epoch_end(self, epoch, logs=None): 131 | current_loss = logs.get(self.early_stopping_metric) 132 | if current_loss is None: 133 | return 134 | 135 | loss_change = current_loss - self.best_loss 136 | max_improved = self.is_maximize and loss_change > self.tol 137 | min_improved = (not self.is_maximize) and (-loss_change > self.tol) 138 | if max_improved or min_improved: 139 | self.best_loss = current_loss 140 | self.best_epoch = epoch 141 | self.wait = 1 142 | self.best_weights = copy.deepcopy(self.trainer.network.state_dict()) 143 | else: 144 | if self.wait >= self.patience: 145 | self.stopped_epoch = epoch 146 | self.trainer._stop_training = True 147 | self.wait += 1 148 | 149 | def on_train_end(self, logs=None): 150 | self.trainer.best_epoch = self.best_epoch 151 | self.trainer.best_cost = self.best_loss 152 | 153 | if self.best_weights is not None: 154 | self.trainer.network.load_state_dict(self.best_weights) 155 | 156 | if self.stopped_epoch > 0: 157 | msg = f"\nEarly stopping occured at epoch {self.stopped_epoch}" 158 | msg += ( 159 | f" with best_epoch = {self.best_epoch} and " 160 | + f"best_{self.early_stopping_metric} = {round(self.best_loss, 5)}" 161 | ) 162 | print(msg) 163 | else: 164 | msg = (f"Stop training because you reached max_epochs = {self.trainer.max_epochs}" 165 | + f" with best_epoch = {self.best_epoch} and " 166 | + f"best_{self.early_stopping_metric} = {round(self.best_loss, 5)}") 167 | print(msg) 168 | print("Best weights from best epoch are automatically used!") 169 | 170 | 171 | @dataclass 172 | class History(Callback): 173 | """Callback that records events into a `History` object. 174 | This callback is automatically applied to 175 | every SuperModule. 176 | 177 | Parameters 178 | --------- 179 | trainer : DeepRecoModel 180 | Model class to train 181 | verbose : int 182 | Print results every verbose iteration 183 | 184 | """ 185 | 186 | trainer: Any 187 | verbose: int = 1 188 | 189 | def __post_init__(self): 190 | super().__init__() 191 | self.samples_seen = 0.0 192 | self.total_time = 0.0 193 | 194 | def on_train_begin(self, logs=None): 195 | self.history = {"loss": []} 196 | self.history.update({"lr": []}) 197 | self.history.update({name: [] for name in self.trainer._metrics_names}) 198 | self.start_time = logs["start_time"] 199 | self.epoch_loss = 0. 
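EarlyStopping only reads the monitored value from the logs dict and touches a handful of trainer attributes (network, _stop_training, max_epochs, plus the best_* fields it sets itself). A sketch of its bookkeeping against a stand-in trainer built from SimpleNamespace rather than the real TabNet trainer:

```python
import torch.nn as nn
from types import SimpleNamespace
from src.models.pytorch_tabnet.callbacks import EarlyStopping

# Stand-in trainer exposing only the attributes EarlyStopping uses.
trainer = SimpleNamespace(network=nn.Linear(4, 2), _stop_training=False, max_epochs=10)

es = EarlyStopping(early_stopping_metric="val_logloss", is_maximize=False, patience=2)
es.set_trainer(trainer)

# Validation loss improves twice, then degrades; with patience=2 training stops at epoch 3.
for epoch, loss in enumerate([0.70, 0.65, 0.66, 0.67, 0.68]):
    es.on_epoch_end(epoch, logs={"val_logloss": loss})
    if trainer._stop_training:
        break

es.on_train_end()                             # restores the best weights, prints a summary
print(es.best_epoch, round(es.best_loss, 5))  # 1 0.65
```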
200 | 201 | def on_epoch_begin(self, epoch, logs=None): 202 | self.epoch_metrics = {"loss": 0.0} 203 | self.samples_seen = 0.0 204 | 205 | def on_epoch_end(self, epoch, logs=None): 206 | self.epoch_metrics["loss"] = self.epoch_loss 207 | for metric_name, metric_value in self.epoch_metrics.items(): 208 | self.history[metric_name].append(metric_value) 209 | if self.verbose == 0: 210 | return 211 | if epoch % self.verbose != 0: 212 | return 213 | msg = f"epoch {epoch:<3}" 214 | for metric_name, metric_value in self.epoch_metrics.items(): 215 | if metric_name != "lr": 216 | msg += f"| {metric_name:<3}: {np.round(metric_value, 5):<8}" 217 | self.total_time = int(time.time() - self.start_time) 218 | msg += f"| {str(datetime.timedelta(seconds=self.total_time)) + 's':<6}" 219 | print(msg) 220 | 221 | def on_batch_end(self, batch, logs=None): 222 | batch_size = logs["batch_size"] 223 | self.epoch_loss = (self.samples_seen * self.epoch_loss + batch_size * logs["loss"] 224 | ) / (self.samples_seen + batch_size) 225 | self.samples_seen += batch_size 226 | 227 | def __getitem__(self, name): 228 | return self.history[name] 229 | 230 | def __repr__(self): 231 | return str(self.history) 232 | 233 | def __str__(self): 234 | return str(self.history) 235 | 236 | 237 | @dataclass 238 | class LRSchedulerCallback(Callback): 239 | """Wrapper for most torch scheduler functions. 240 | 241 | Parameters 242 | --------- 243 | scheduler_fn : torch.optim.lr_scheduler 244 | Torch scheduling class 245 | scheduler_params : dict 246 | Dictionnary containing all parameters for the scheduler_fn 247 | is_batch_level : bool (default = False) 248 | If set to False : lr updates will happen at every epoch 249 | If set to True : lr updates happen at every batch 250 | Set this to True for OneCycleLR for example 251 | """ 252 | 253 | scheduler_fn: Any 254 | optimizer: Any 255 | scheduler_params: dict 256 | early_stopping_metric: str 257 | is_batch_level: bool = False 258 | 259 | def __post_init__(self, ): 260 | self.is_metric_related = hasattr(self.scheduler_fn, 261 | "is_better") 262 | self.scheduler = self.scheduler_fn(self.optimizer, 263 | **self.scheduler_params) 264 | super().__init__() 265 | 266 | def on_batch_end(self, batch, logs=None): 267 | if self.is_batch_level: 268 | self.scheduler.step() 269 | else: 270 | pass 271 | 272 | def on_epoch_end(self, epoch, logs=None): 273 | current_loss = logs.get(self.early_stopping_metric) 274 | if current_loss is None: 275 | return 276 | if self.is_batch_level: 277 | pass 278 | else: 279 | if self.is_metric_related: 280 | self.scheduler.step(current_loss) 281 | else: 282 | self.scheduler.step() 283 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/sparsemax.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.autograd import Function 3 | import torch.nn.functional as F 4 | 5 | import torch 6 | 7 | """ 8 | Other possible implementations: 9 | https://github.com/KrisKorrel/sparsemax-pytorch/blob/master/sparsemax.py 10 | https://github.com/msobroza/SparsemaxPytorch/blob/master/mnist/sparsemax.py 11 | https://github.com/vene/sparse-structured-attention/blob/master/pytorch/torchsparseattn/sparsemax.py 12 | """ 13 | 14 | 15 | # credits to Yandex https://github.com/Qwicen/node/blob/master/lib/nn_utils.py 16 | def _make_ix_like(input, dim=0): 17 | d = input.size(dim) 18 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 19 | view = [1] * 
input.dim() 20 | view[0] = -1 21 | return rho.view(view).transpose(0, dim) 22 | 23 | 24 | class SparsemaxFunction(Function): 25 | """ 26 | An implementation of sparsemax (Martins & Astudillo, 2016). See 27 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 28 | By Ben Peters and Vlad Niculae 29 | """ 30 | 31 | @staticmethod 32 | def forward(ctx, input, dim=-1): 33 | """sparsemax: normalizing sparse transform (a la softmax) 34 | 35 | Parameters 36 | ---------- 37 | ctx : torch.autograd.function._ContextMethodMixin 38 | input : torch.Tensor 39 | any shape 40 | dim : int 41 | dimension along which to apply sparsemax 42 | 43 | Returns 44 | ------- 45 | output : torch.Tensor 46 | same shape as input 47 | 48 | """ 49 | ctx.dim = dim 50 | max_val, _ = input.max(dim=dim, keepdim=True) 51 | input -= max_val # same numerical stability trick as for softmax 52 | tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) 53 | output = torch.clamp(input - tau, min=0) 54 | ctx.save_for_backward(supp_size, output) 55 | return output 56 | 57 | @staticmethod 58 | def backward(ctx, grad_output): 59 | supp_size, output = ctx.saved_tensors 60 | dim = ctx.dim 61 | grad_input = grad_output.clone() 62 | grad_input[output == 0] = 0 63 | 64 | v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 65 | v_hat = v_hat.unsqueeze(dim) 66 | grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 67 | return grad_input, None 68 | 69 | @staticmethod 70 | def _threshold_and_support(input, dim=-1): 71 | """Sparsemax building block: compute the threshold 72 | 73 | Parameters 74 | ---------- 75 | input: torch.Tensor 76 | any dimension 77 | dim : int 78 | dimension along which to apply the sparsemax 79 | 80 | Returns 81 | ------- 82 | tau : torch.Tensor 83 | the threshold value 84 | support_size : torch.Tensor 85 | 86 | """ 87 | 88 | input_srt, _ = torch.sort(input, descending=True, dim=dim) 89 | input_cumsum = input_srt.cumsum(dim) - 1 90 | rhos = _make_ix_like(input, dim) 91 | support = rhos * input_srt > input_cumsum 92 | 93 | support_size = support.sum(dim=dim).unsqueeze(dim) 94 | tau = input_cumsum.gather(dim, support_size - 1) 95 | tau /= support_size.to(input.dtype) 96 | return tau, support_size 97 | 98 | 99 | sparsemax = SparsemaxFunction.apply 100 | 101 | 102 | class Sparsemax(nn.Module): 103 | 104 | def __init__(self, dim=-1): 105 | self.dim = dim 106 | super(Sparsemax, self).__init__() 107 | 108 | def forward(self, input): 109 | return sparsemax(input, self.dim) 110 | 111 | 112 | class Entmax15Function(Function): 113 | """ 114 | An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See 115 | :cite:`https://arxiv.org/abs/1905.05702 for detailed description. 
116 | Source: https://github.com/deep-spin/entmax 117 | """ 118 | 119 | @staticmethod 120 | def forward(ctx, input, dim=-1): 121 | ctx.dim = dim 122 | 123 | max_val, _ = input.max(dim=dim, keepdim=True) 124 | input = input - max_val # same numerical stability trick as for softmax 125 | input = input / 2 # divide by 2 to solve actual Entmax 126 | 127 | tau_star, _ = Entmax15Function._threshold_and_support(input, dim) 128 | output = torch.clamp(input - tau_star, min=0) ** 2 129 | ctx.save_for_backward(output) 130 | return output 131 | 132 | @staticmethod 133 | def backward(ctx, grad_output): 134 | Y, = ctx.saved_tensors 135 | gppr = Y.sqrt() # = 1 / g'' (Y) 136 | dX = grad_output * gppr 137 | q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) 138 | q = q.unsqueeze(ctx.dim) 139 | dX -= q * gppr 140 | return dX, None 141 | 142 | @staticmethod 143 | def _threshold_and_support(input, dim=-1): 144 | Xsrt, _ = torch.sort(input, descending=True, dim=dim) 145 | 146 | rho = _make_ix_like(input, dim) 147 | mean = Xsrt.cumsum(dim) / rho 148 | mean_sq = (Xsrt ** 2).cumsum(dim) / rho 149 | ss = rho * (mean_sq - mean ** 2) 150 | delta = (1 - ss) / rho 151 | 152 | # NOTE this is not exactly the same as in reference algo 153 | # Fortunately it seems the clamped values never wrongly 154 | # get selected by tau <= sorted_z. Prove this! 155 | delta_nz = torch.clamp(delta, 0) 156 | tau = mean - torch.sqrt(delta_nz) 157 | 158 | support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) 159 | tau_star = tau.gather(dim, support_size - 1) 160 | return tau_star, support_size 161 | 162 | 163 | class Entmoid15(Function): 164 | """ A highly optimized equivalent of labda x: Entmax15([x, 0]) """ 165 | 166 | @staticmethod 167 | def forward(ctx, input): 168 | output = Entmoid15._forward(input) 169 | ctx.save_for_backward(output) 170 | return output 171 | 172 | @staticmethod 173 | def _forward(input): 174 | input, is_pos = abs(input), input >= 0 175 | tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 176 | tau.masked_fill_(tau <= input, 2.0) 177 | y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 178 | return torch.where(is_pos, 1 - y_neg, y_neg) 179 | 180 | @staticmethod 181 | def backward(ctx, grad_output): 182 | return Entmoid15._backward(ctx.saved_tensors[0], grad_output) 183 | 184 | @staticmethod 185 | def _backward(output, grad_output): 186 | gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() 187 | grad_input = grad_output * gppr0 188 | q = grad_input / (gppr0 + gppr1) 189 | grad_input -= q * gppr0 190 | return grad_input 191 | 192 | 193 | entmax15 = Entmax15Function.apply 194 | entmoid15 = Entmoid15.apply 195 | 196 | 197 | class Entmax15(nn.Module): 198 | 199 | def __init__(self, dim=-1): 200 | self.dim = dim 201 | super(Entmax15, self).__init__() 202 | 203 | def forward(self, input): 204 | return entmax15(input, self.dim) 205 | 206 | 207 | # Credits were lost... 
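Both normalizers are exposed functionally (sparsemax, entmax15) and as modules (Sparsemax, Entmax15). A quick comparison against softmax on a toy logit row; the clone() calls are there because SparsemaxFunction.forward subtracts the max in place:

```python
import torch
from src.models.pytorch_tabnet.sparsemax import sparsemax, entmax15

logits = torch.tensor([[2.0, 1.0, 0.1, -1.0]])

print(torch.softmax(logits, dim=-1))  # dense: every entry strictly positive
print(sparsemax(logits.clone(), -1))  # tensor([[1., 0., 0., 0.]]): small logits hit exactly 0
print(entmax15(logits.clone(), -1))   # sparsity in between softmax and sparsemax
```

All three outputs sum to 1 along the last dimension; the sparse variants are the ones used for attention-style feature masks.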
208 | # def _make_ix_like(input, dim=0): 209 | # d = input.size(dim) 210 | # rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 211 | # view = [1] * input.dim() 212 | # view[0] = -1 213 | # return rho.view(view).transpose(0, dim) 214 | # 215 | # 216 | # def _threshold_and_support(input, dim=0): 217 | # """Sparsemax building block: compute the threshold 218 | # Args: 219 | # input: any dimension 220 | # dim: dimension along which to apply the sparsemax 221 | # Returns: 222 | # the threshold value 223 | # """ 224 | # 225 | # input_srt, _ = torch.sort(input, descending=True, dim=dim) 226 | # input_cumsum = input_srt.cumsum(dim) - 1 227 | # rhos = _make_ix_like(input, dim) 228 | # support = rhos * input_srt > input_cumsum 229 | # 230 | # support_size = support.sum(dim=dim).unsqueeze(dim) 231 | # tau = input_cumsum.gather(dim, support_size - 1) 232 | # tau /= support_size.to(input.dtype) 233 | # return tau, support_size 234 | # 235 | # 236 | # class SparsemaxFunction(Function): 237 | # 238 | # @staticmethod 239 | # def forward(ctx, input, dim=0): 240 | # """sparsemax: normalizing sparse transform (a la softmax) 241 | # Parameters: 242 | # input (Tensor): any shape 243 | # dim: dimension along which to apply sparsemax 244 | # Returns: 245 | # output (Tensor): same shape as input 246 | # """ 247 | # ctx.dim = dim 248 | # max_val, _ = input.max(dim=dim, keepdim=True) 249 | # input -= max_val # same numerical stability trick as for softmax 250 | # tau, supp_size = _threshold_and_support(input, dim=dim) 251 | # output = torch.clamp(input - tau, min=0) 252 | # ctx.save_for_backward(supp_size, output) 253 | # return output 254 | # 255 | # @staticmethod 256 | # def backward(ctx, grad_output): 257 | # supp_size, output = ctx.saved_tensors 258 | # dim = ctx.dim 259 | # grad_input = grad_output.clone() 260 | # grad_input[output == 0] = 0 261 | # 262 | # v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 263 | # v_hat = v_hat.unsqueeze(dim) 264 | # grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 265 | # return grad_input, None 266 | # 267 | # 268 | # sparsemax = SparsemaxFunction.apply 269 | # 270 | # 271 | # class Sparsemax(nn.Module): 272 | # 273 | # def __init__(self, dim=0): 274 | # self.dim = dim 275 | # super(Sparsemax, self).__init__() 276 | # 277 | # def forward(self, input): 278 | # return sparsemax(input, self.dim) 279 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/utils.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from torch.utils.data import DataLoader, WeightedRandomSampler 3 | import torch 4 | import numpy as np 5 | import scipy 6 | from sklearn.utils import check_array 7 | 8 | 9 | class TorchDataset(Dataset): 10 | """ 11 | Format for numpy array 12 | 13 | Parameters 14 | ---------- 15 | X : 2D array 16 | The input matrix 17 | y : 2D array 18 | The one-hot encoded target 19 | """ 20 | 21 | def __init__(self, x, y): 22 | self.x = x 23 | self.y = y 24 | 25 | def __len__(self): 26 | return len(self.x) 27 | 28 | def __getitem__(self, index): 29 | x, y = self.x[index], self.y[index] 30 | return x, y 31 | 32 | 33 | class PredictDataset(Dataset): 34 | """ 35 | Format for numpy array 36 | 37 | Parameters 38 | ---------- 39 | X : 2D array 40 | The input matrix 41 | """ 42 | 43 | def __init__(self, x): 44 | self.x = x 45 | 46 | def __len__(self): 47 | return len(self.x) 48 | 49 | def 
__getitem__(self, index): 50 | x = self.x[index] 51 | return x 52 | 53 | 54 | def create_dataloaders( 55 | X_train, y_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory 56 | ): 57 | """ 58 | Create dataloaders with or wihtout subsampling depending on weights and balanced. 59 | 60 | Parameters 61 | ---------- 62 | X_train : np.ndarray 63 | Training data 64 | y_train : np.array 65 | Mapped Training targets 66 | eval_set : list of tuple 67 | List of eval tuple set (X, y) 68 | weights : either 0, 1, dict or iterable 69 | if 0 (default) : no weights will be applied 70 | if 1 : classification only, will balanced class with inverse frequency 71 | if dict : keys are corresponding class values are sample weights 72 | if iterable : list or np array must be of length equal to nb elements 73 | in the training set 74 | batch_size : int 75 | how many samples per batch to load 76 | num_workers : int 77 | how many subprocesses to use for data loading. 0 means that the data 78 | will be loaded in the main process 79 | drop_last : bool 80 | set to True to drop the last incomplete batch, if the dataset size is not 81 | divisible by the batch size. If False and the size of dataset is not 82 | divisible by the batch size, then the last batch will be smaller 83 | pin_memory : bool 84 | Whether to pin GPU memory during training 85 | 86 | Returns 87 | ------- 88 | train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader 89 | Training and validation dataloaders 90 | """ 91 | 92 | if isinstance(weights, int): 93 | if weights == 0: 94 | need_shuffle = True 95 | sampler = None 96 | elif weights == 1: 97 | need_shuffle = False 98 | class_sample_count = np.array( 99 | [len(np.where(y_train == t)[0]) for t in np.unique(y_train)] 100 | ) 101 | 102 | weights = 1.0 / class_sample_count 103 | 104 | samples_weight = np.array([weights[t] for t in y_train]) 105 | 106 | samples_weight = torch.from_numpy(samples_weight) 107 | samples_weight = samples_weight.double() 108 | sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) 109 | else: 110 | raise ValueError("Weights should be either 0, 1, dictionnary or list.") 111 | elif isinstance(weights, dict): 112 | # custom weights per class 113 | need_shuffle = False 114 | samples_weight = np.array([weights[t] for t in y_train]) 115 | sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) 116 | else: 117 | # custom weights 118 | if len(weights) != len(y_train): 119 | raise ValueError("Custom weights should match number of train samples.") 120 | need_shuffle = False 121 | samples_weight = np.array(weights) 122 | sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) 123 | 124 | train_dataloader = DataLoader( 125 | TorchDataset(X_train.astype(np.float32), y_train), 126 | batch_size=batch_size, 127 | sampler=sampler, 128 | shuffle=need_shuffle, 129 | num_workers=num_workers, 130 | drop_last=drop_last, 131 | pin_memory=pin_memory 132 | ) 133 | 134 | valid_dataloaders = [] 135 | for X, y in eval_set: 136 | valid_dataloaders.append( 137 | DataLoader( 138 | TorchDataset(X.astype(np.float32), y), 139 | batch_size=batch_size, 140 | shuffle=False, 141 | num_workers=num_workers, 142 | pin_memory=pin_memory 143 | ) 144 | ) 145 | 146 | return train_dataloader, valid_dataloaders 147 | 148 | 149 | def create_explain_matrix(input_dim, cat_emb_dim, cat_idxs, post_embed_dim): 150 | """ 151 | This is a computational trick. 152 | In order to rapidly sum importances from same embeddings 153 | to the initial index. 
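The weights switch in create_dataloaders above has four modes (0, 1, dict, iterable); with weights=1 an imbalanced classification target is resampled by inverse class frequency through a WeightedRandomSampler. A small sketch with made-up shapes:

```python
import numpy as np
from src.models.pytorch_tabnet.utils import create_dataloaders

X = np.random.rand(100, 8).astype(np.float32)
y = np.array([0] * 90 + [1] * 10)   # heavily imbalanced binary target
X_val, y_val = X[:20], y[:20]

# weights=1 indexes the inverse-frequency array by label, so labels should be 0..K-1.
train_dl, valid_dls = create_dataloaders(
    X, y, eval_set=[(X_val, y_val)], weights=1,
    batch_size=16, num_workers=0, drop_last=False, pin_memory=False,
)
xb, yb = next(iter(train_dl))
print(xb.shape, yb.shape)  # torch.Size([16, 8]) torch.Size([16])
```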
154 | 155 | Parameters 156 | ---------- 157 | input_dim : int 158 | Initial input dim 159 | cat_emb_dim : int or list of int 160 | if int : size of embedding for all categorical feature 161 | if list of int : size of embedding for each categorical feature 162 | cat_idxs : list of int 163 | Initial position of categorical features 164 | post_embed_dim : int 165 | Post embedding inputs dimension 166 | 167 | Returns 168 | ------- 169 | reducing_matrix : np.array 170 | Matrix of dim (post_embed_dim, input_dim) to performe reduce 171 | """ 172 | 173 | if isinstance(cat_emb_dim, int): 174 | all_emb_impact = [cat_emb_dim - 1] * len(cat_idxs) 175 | else: 176 | all_emb_impact = [emb_dim - 1 for emb_dim in cat_emb_dim] 177 | 178 | acc_emb = 0 179 | nb_emb = 0 180 | indices_trick = [] 181 | for i in range(input_dim): 182 | if i not in cat_idxs: 183 | indices_trick.append([i + acc_emb]) 184 | else: 185 | indices_trick.append( 186 | range(i + acc_emb, i + acc_emb + all_emb_impact[nb_emb] + 1) 187 | ) 188 | acc_emb += all_emb_impact[nb_emb] 189 | nb_emb += 1 190 | 191 | reducing_matrix = np.zeros((post_embed_dim, input_dim)) 192 | for i, cols in enumerate(indices_trick): 193 | reducing_matrix[cols, i] = 1 194 | 195 | return scipy.sparse.csc_matrix(reducing_matrix) 196 | 197 | 198 | def filter_weights(weights): 199 | """ 200 | This function makes sure that weights are in correct format for 201 | regression and multitask TabNet 202 | 203 | Parameters 204 | ---------- 205 | weights : int, dict or list 206 | Initial weights parameters given by user 207 | 208 | Returns 209 | ------- 210 | None : This function will only throw an error if format is wrong 211 | """ 212 | err_msg = "Please provide a list of weights for regression or multitask : " 213 | if isinstance(weights, int): 214 | if weights == 1: 215 | raise ValueError(err_msg + "1 given.") 216 | if isinstance(weights, dict): 217 | raise ValueError(err_msg + "Dict given.") 218 | return 219 | 220 | 221 | def validate_eval_set(eval_set, eval_name, X_train, y_train): 222 | """Check if the shapes of eval_set are compatible with (X_train, y_train). 223 | 224 | Parameters 225 | ---------- 226 | eval_set : list of tuple 227 | List of eval tuple set (X, y). 228 | The last one is used for early stopping 229 | eval_name : list of str 230 | List of eval set names. 231 | X_train : np.ndarray 232 | Train owned products 233 | y_train : np.array 234 | Train targeted products 235 | 236 | Returns 237 | ------- 238 | eval_names : list of str 239 | Validated list of eval_names. 240 | eval_set : list of tuple 241 | Validated list of eval_set. 
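create_explain_matrix (defined just above) returns a sparse (post_embed_dim, input_dim) indicator so that importances computed in the post-embedding space can be summed back onto the original feature columns. A toy layout, assuming 3 raw features of which feature 1 is categorical with a 4-dimensional embedding:

```python
from src.models.pytorch_tabnet.utils import create_explain_matrix

# Post-embedding width: 3 features - 1 categorical + 4 embedding dims = 6.
reducing = create_explain_matrix(input_dim=3, cat_emb_dim=4, cat_idxs=[1], post_embed_dim=6)
print(reducing.toarray())
# Rows 1..4 (the embedding dimensions of feature 1) all point back to input column 1,
# so summing importances through this matrix folds them onto the 3 original features.
```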
242 | 243 | """ 244 | eval_name = eval_name or [f"val_{i}" for i in range(len(eval_set))] 245 | 246 | assert len(eval_set) == len( 247 | eval_name 248 | ), "eval_set and eval_name have not the same length" 249 | if len(eval_set) > 0: 250 | assert all( 251 | len(elem) == 2 for elem in eval_set 252 | ), "Each tuple of eval_set need to have two elements" 253 | for name, (X, y) in zip(eval_name, eval_set): 254 | check_array(X) 255 | msg = ( 256 | f"Number of columns is different between X_{name} " 257 | + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" 258 | ) 259 | assert X.shape[1] == X_train.shape[1], msg 260 | if len(y_train.shape) == 2: 261 | msg = ( 262 | f"Number of columns is different between y_{name} " 263 | + f"({y.shape[1]}) and y_train ({y_train.shape[1]})" 264 | ) 265 | assert y.shape[1] == y_train.shape[1], msg 266 | msg = ( 267 | f"You need the same number of rows between X_{name} " 268 | + f"({X.shape[0]}) and y_{name} ({y.shape[0]})" 269 | ) 270 | assert X.shape[0] == y.shape[0], msg 271 | 272 | return eval_name, eval_set 273 | 274 | 275 | def define_device(device_name): 276 | """ 277 | Define the device to use during training and inference. 278 | If auto it will detect automatically whether to use cuda or cpu 279 | 280 | Parameters 281 | ---------- 282 | device_name : str 283 | Either "auto", "cpu" or "cuda" 284 | 285 | Returns 286 | ------- 287 | str 288 | Either "cpu" or "cuda" 289 | """ 290 | if device_name == "auto": 291 | if torch.cuda.is_available(): 292 | return "cuda" 293 | else: 294 | return "cpu" 295 | else: 296 | return device_name 297 | -------------------------------------------------------------------------------- /src/models/optimizer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer, required 4 | 5 | 6 | class RAdam(Optimizer): 7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 8 | if not 0.0 <= lr: 9 | raise ValueError("Invalid learning rate: {}".format(lr)) 10 | if not 0.0 <= eps: 11 | raise ValueError("Invalid epsilon value: {}".format(eps)) 12 | if not 0.0 <= betas[0] < 1.0: 13 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 14 | if not 0.0 <= betas[1] < 1.0: 15 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 16 | 17 | self.degenerated_to_sgd = degenerated_to_sgd 18 | if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): 19 | for param in params: 20 | if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): 21 | param['buffer'] = [[None, None, None] for _ in range(10)] 22 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) 23 | super(RAdam, self).__init__(params, defaults) 24 | 25 | def __setstate__(self, state): 26 | super(RAdam, self).__setstate__(state) 27 | 28 | def step(self, closure=None): 29 | 30 | loss = None 31 | if closure is not None: 32 | loss = closure() 33 | 34 | for group in self.param_groups: 35 | 36 | for p in group['params']: 37 | if p.grad is None: 38 | continue 39 | grad = p.grad.data.float() 40 | if grad.is_sparse: 41 | raise RuntimeError('RAdam does not support sparse gradients') 42 | 43 | p_data_fp32 = p.data.float() 44 | 45 | state = self.state[p] 46 | 47 | if len(state) == 0: 48 | state['step'] = 0 49 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 50 | 
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 51 | else: 52 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 53 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 54 | 55 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 56 | beta1, beta2 = group['betas'] 57 | 58 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 59 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 60 | 61 | state['step'] += 1 62 | buffered = group['buffer'][int(state['step'] % 10)] 63 | if state['step'] == buffered[0]: 64 | N_sma, step_size = buffered[1], buffered[2] 65 | else: 66 | buffered[0] = state['step'] 67 | beta2_t = beta2**state['step'] 68 | N_sma_max = 2 / (1 - beta2) - 1 69 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 70 | buffered[1] = N_sma 71 | 72 | # more conservative since it's an approximated value 73 | if N_sma >= 5: 74 | step_size = math.sqrt( 75 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1**state['step']) 76 | elif self.degenerated_to_sgd: 77 | step_size = 1.0 / (1 - beta1**state['step']) 78 | else: 79 | step_size = -1 80 | buffered[2] = step_size 81 | 82 | # more conservative since it's an approximated value 83 | if N_sma >= 5: 84 | if group['weight_decay'] != 0: 85 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 86 | denom = exp_avg_sq.sqrt().add_(group['eps']) 87 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 88 | p.data.copy_(p_data_fp32) 89 | elif step_size > 0: 90 | if group['weight_decay'] != 0: 91 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 92 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 93 | p.data.copy_(p_data_fp32) 94 | 95 | return loss 96 | 97 | 98 | class PlainRAdam(Optimizer): 99 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 100 | if not 0.0 <= lr: 101 | raise ValueError("Invalid learning rate: {}".format(lr)) 102 | if not 0.0 <= eps: 103 | raise ValueError("Invalid epsilon value: {}".format(eps)) 104 | if not 0.0 <= betas[0] < 1.0: 105 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 106 | if not 0.0 <= betas[1] < 1.0: 107 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 108 | 109 | self.degenerated_to_sgd = degenerated_to_sgd 110 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 111 | 112 | super(PlainRAdam, self).__init__(params, defaults) 113 | 114 | def __setstate__(self, state): 115 | super(PlainRAdam, self).__setstate__(state) 116 | 117 | def step(self, closure=None): 118 | 119 | loss = None 120 | if closure is not None: 121 | loss = closure() 122 | 123 | for group in self.param_groups: 124 | 125 | for p in group['params']: 126 | if p.grad is None: 127 | continue 128 | grad = p.grad.data.float() 129 | if grad.is_sparse: 130 | raise RuntimeError('RAdam does not support sparse gradients') 131 | 132 | p_data_fp32 = p.data.float() 133 | 134 | state = self.state[p] 135 | 136 | if len(state) == 0: 137 | state['step'] = 0 138 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 139 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 140 | else: 141 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 142 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 143 | 144 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 145 | beta1, beta2 = group['betas'] 146 | 147 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, 
grad) 148 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 149 | 150 | state['step'] += 1 151 | beta2_t = beta2**state['step'] 152 | N_sma_max = 2 / (1 - beta2) - 1 153 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 154 | 155 | # more conservative since it's an approximated value 156 | if N_sma >= 5: 157 | if group['weight_decay'] != 0: 158 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 159 | step_size = group['lr'] * math.sqrt( 160 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1**state['step']) 161 | denom = exp_avg_sq.sqrt().add_(group['eps']) 162 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 163 | p.data.copy_(p_data_fp32) 164 | elif self.degenerated_to_sgd: 165 | if group['weight_decay'] != 0: 166 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 167 | step_size = group['lr'] / (1 - beta1**state['step']) 168 | p_data_fp32.add_(-step_size, exp_avg) 169 | p.data.copy_(p_data_fp32) 170 | 171 | return loss 172 | 173 | 174 | class AdamW(Optimizer): 175 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0): 176 | if not 0.0 <= lr: 177 | raise ValueError("Invalid learning rate: {}".format(lr)) 178 | if not 0.0 <= eps: 179 | raise ValueError("Invalid epsilon value: {}".format(eps)) 180 | if not 0.0 <= betas[0] < 1.0: 181 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 182 | if not 0.0 <= betas[1] < 1.0: 183 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 184 | 185 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, warmup=warmup) 186 | super(AdamW, self).__init__(params, defaults) 187 | 188 | def __setstate__(self, state): 189 | super(AdamW, self).__setstate__(state) 190 | 191 | def step(self, closure=None): 192 | loss = None 193 | if closure is not None: 194 | loss = closure() 195 | 196 | for group in self.param_groups: 197 | 198 | for p in group['params']: 199 | if p.grad is None: 200 | continue 201 | grad = p.grad.data.float() 202 | if grad.is_sparse: 203 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 204 | 205 | p_data_fp32 = p.data.float() 206 | 207 | state = self.state[p] 208 | 209 | if len(state) == 0: 210 | state['step'] = 0 211 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 212 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 213 | else: 214 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 215 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 216 | 217 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 218 | beta1, beta2 = group['betas'] 219 | 220 | state['step'] += 1 221 | 222 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 223 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 224 | 225 | denom = exp_avg_sq.sqrt().add_(group['eps']) 226 | bias_correction1 = 1 - beta1**state['step'] 227 | bias_correction2 = 1 - beta2**state['step'] 228 | 229 | if group['warmup'] > state['step']: 230 | scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] 231 | else: 232 | scheduled_lr = group['lr'] 233 | 234 | step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1 235 | 236 | if group['weight_decay'] != 0: 237 | p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) 238 | 239 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 240 | 241 | p.data.copy_(p_data_fp32) 242 | 243 | return loss 
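All three optimizers follow the standard torch.optim.Optimizer interface, so they drop into an ordinary training loop. A minimal sketch with made-up shapes; note that the in-place tensor calls above (e.g. add_(1 - beta1, grad)) use an older torch signature and may warn or fail on recent PyTorch releases:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from src.models.optimizer import RAdam

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
opt = RAdam(model.parameters(), lr=1e-3, weight_decay=1e-6)

x, y = torch.randn(64, 10), torch.randn(64, 1)
for _ in range(5):
    opt.zero_grad()
    loss = F.mse_loss(model(x), y)
    loss.backward()
    opt.step()  # rectified Adam update; early steps fall back to an SGD-style
                # update while the variance estimate is still unreliable
print(loss.item())
```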
-------------------------------------------------------------------------------- /src/models/pytorch_tabnet/multiclass_utils.py: -------------------------------------------------------------------------------- 1 | # Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi 2 | # 3 | # License: BSD 3 clause 4 | """ 5 | Multi-class / multi-label utility function 6 | ========================================== 7 | 8 | """ 9 | from collections.abc import Sequence 10 | from itertools import chain 11 | 12 | from scipy.sparse import issparse 13 | from scipy.sparse.base import spmatrix 14 | from scipy.sparse import dok_matrix 15 | from scipy.sparse import lil_matrix 16 | import scipy.sparse as sp 17 | 18 | import numpy as np 19 | 20 | 21 | def _assert_all_finite(X, allow_nan=False): 22 | """Like assert_all_finite, but only for ndarray.""" 23 | 24 | X = np.asanyarray(X) 25 | # First try an O(n) time, O(1) space solution for the common case that 26 | # everything is finite; fall back to O(n) space np.isfinite to prevent 27 | # false positives from overflow in sum method. The sum is also calculated 28 | # safely to reduce dtype induced overflows. 29 | is_float = X.dtype.kind in "fc" 30 | if is_float and (np.isfinite(np.sum(X))): 31 | pass 32 | elif is_float: 33 | msg_err = "Input contains {} or a value too large for {!r}." 34 | if ( 35 | allow_nan 36 | and np.isinf(X).any() 37 | or not allow_nan 38 | and not np.isfinite(X).all() 39 | ): 40 | type_err = "infinity" if allow_nan else "NaN, infinity" 41 | raise ValueError(msg_err.format(type_err, X.dtype)) 42 | # for object dtype data, we only check for NaNs (GH-13254) 43 | elif X.dtype == np.dtype("object") and not allow_nan: 44 | if np.isnan(X).any(): 45 | raise ValueError("Input contains NaN") 46 | 47 | 48 | def assert_all_finite(X, allow_nan=False): 49 | """Throw a ValueError if X contains NaN or infinity. 50 | 51 | Parameters 52 | ---------- 53 | X : array or sparse matrix 54 | allow_nan : bool 55 | """ 56 | _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) 57 | 58 | 59 | def _unique_multiclass(y): 60 | if hasattr(y, "__array__"): 61 | return np.unique(np.asarray(y)) 62 | else: 63 | return set(y) 64 | 65 | 66 | def _unique_indicator(y): 67 | """ 68 | Not implemented 69 | """ 70 | pass 71 | 72 | 73 | _FN_UNIQUE_LABELS = { 74 | "binary": _unique_multiclass, 75 | "multiclass": _unique_multiclass, 76 | "multilabel-indicator": _unique_indicator, 77 | } 78 | 79 | 80 | def unique_labels(*ys): 81 | """Extract an ordered array of unique labels 82 | 83 | We don't allow: 84 | - mix of multilabel and multiclass (single label) targets 85 | - mix of label indicator matrix and anything else, 86 | because there are no explicit labels) 87 | - mix of label indicator matrices of different sizes 88 | - mix of string and integer labels 89 | 90 | At the moment, we also don't allow "multiclass-multioutput" input type. 91 | 92 | Parameters 93 | ---------- 94 | *ys : array-likes 95 | 96 | Returns 97 | ------- 98 | out : numpy array of shape [n_unique_labels] 99 | An ordered array of unique labels. 
100 | 101 | Examples 102 | -------- 103 | >>> from sklearn.utils.multiclass import unique_labels 104 | >>> unique_labels([3, 5, 5, 5, 7, 7]) 105 | array([3, 5, 7]) 106 | >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]) 107 | array([1, 2, 3, 4]) 108 | >>> unique_labels([1, 2, 10], [5, 11]) 109 | array([ 1, 2, 5, 10, 11]) 110 | """ 111 | if not ys: 112 | raise ValueError("No argument has been passed.") 113 | # Check that we don't mix label format 114 | 115 | ys_types = set(type_of_target(x) for x in ys) 116 | if ys_types == {"binary", "multiclass"}: 117 | ys_types = {"multiclass"} 118 | 119 | if len(ys_types) > 1: 120 | raise ValueError("Mix type of y not allowed, got types %s" % ys_types) 121 | 122 | label_type = ys_types.pop() 123 | 124 | # Get the unique set of labels 125 | _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) 126 | if not _unique_labels: 127 | raise ValueError("Unknown label type: %s" % repr(ys)) 128 | 129 | ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys)) 130 | 131 | # Check that we don't mix string type with number type 132 | if len(set(isinstance(label, str) for label in ys_labels)) > 1: 133 | raise ValueError("Mix of label input types (string and number)") 134 | 135 | return np.array(sorted(ys_labels)) 136 | 137 | 138 | def _is_integral_float(y): 139 | return y.dtype.kind == "f" and np.all(y.astype(int) == y) 140 | 141 | 142 | def is_multilabel(y): 143 | """ Check if ``y`` is in a multilabel format. 144 | 145 | Parameters 146 | ---------- 147 | y : numpy array of shape [n_samples] 148 | Target values. 149 | 150 | Returns 151 | ------- 152 | out : bool 153 | Return ``True``, if ``y`` is in a multilabel format, else ```False``. 154 | 155 | Examples 156 | -------- 157 | >>> import numpy as np 158 | >>> from sklearn.utils.multiclass import is_multilabel 159 | >>> is_multilabel([0, 1, 0, 1]) 160 | False 161 | >>> is_multilabel([[1], [0, 2], []]) 162 | False 163 | >>> is_multilabel(np.array([[1, 0], [0, 0]])) 164 | True 165 | >>> is_multilabel(np.array([[1], [0], [0]])) 166 | False 167 | >>> is_multilabel(np.array([[1, 0, 0]])) 168 | True 169 | """ 170 | if hasattr(y, "__array__"): 171 | y = np.asarray(y) 172 | if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1): 173 | return False 174 | 175 | if issparse(y): 176 | if isinstance(y, (dok_matrix, lil_matrix)): 177 | y = y.tocsr() 178 | return ( 179 | len(y.data) == 0 180 | or np.unique(y.data).size == 1 181 | and ( 182 | y.dtype.kind in "biu" 183 | or _is_integral_float(np.unique(y.data)) # bool, int, uint 184 | ) 185 | ) 186 | else: 187 | labels = np.unique(y) 188 | 189 | return len(labels) < 3 and ( 190 | y.dtype.kind in "biu" or _is_integral_float(labels) # bool, int, uint 191 | ) 192 | 193 | 194 | def check_classification_targets(y): 195 | """Ensure that target y is of a non-regression type. 196 | 197 | Only the following target types (as defined in type_of_target) are allowed: 198 | 'binary', 'multiclass', 'multiclass-multioutput', 199 | 'multilabel-indicator', 'multilabel-sequences' 200 | 201 | Parameters 202 | ---------- 203 | y : array-like 204 | """ 205 | y_type = type_of_target(y) 206 | if y_type not in [ 207 | "binary", 208 | "multiclass", 209 | "multiclass-multioutput", 210 | "multilabel-indicator", 211 | "multilabel-sequences", 212 | ]: 213 | raise ValueError("Unknown label type: %r" % y_type) 214 | 215 | 216 | def type_of_target(y): 217 | """Determine the type of data indicated by the target. 218 | 219 | Note that this type is the most specific type that can be inferred. 
220 | For example: 221 | 222 | * ``binary`` is more specific but compatible with ``multiclass``. 223 | * ``multiclass`` of integers is more specific but compatible with 224 | ``continuous``. 225 | * ``multilabel-indicator`` is more specific but compatible with 226 | ``multiclass-multioutput``. 227 | 228 | Parameters 229 | ---------- 230 | y : array-like 231 | 232 | Returns 233 | ------- 234 | target_type : string 235 | One of: 236 | 237 | * 'continuous': `y` is an array-like of floats that are not all 238 | integers, and is 1d or a column vector. 239 | * 'continuous-multioutput': `y` is a 2d array of floats that are 240 | not all integers, and both dimensions are of size > 1. 241 | * 'binary': `y` contains <= 2 discrete values and is 1d or a column 242 | vector. 243 | * 'multiclass': `y` contains more than two discrete values, is not a 244 | sequence of sequences, and is 1d or a column vector. 245 | * 'multiclass-multioutput': `y` is a 2d array that contains more 246 | than two discrete values, is not a sequence of sequences, and both 247 | dimensions are of size > 1. 248 | * 'multilabel-indicator': `y` is a label indicator matrix, an array 249 | of two dimensions with at least two columns, and at most 2 unique 250 | values. 251 | * 'unknown': `y` is array-like but none of the above, such as a 3d 252 | array, sequence of sequences, or an array of non-sequence objects. 253 | 254 | Examples 255 | -------- 256 | >>> import numpy as np 257 | >>> type_of_target([0.1, 0.6]) 258 | 'continuous' 259 | >>> type_of_target([1, -1, -1, 1]) 260 | 'binary' 261 | >>> type_of_target(['a', 'b', 'a']) 262 | 'binary' 263 | >>> type_of_target([1.0, 2.0]) 264 | 'binary' 265 | >>> type_of_target([1, 0, 2]) 266 | 'multiclass' 267 | >>> type_of_target([1.0, 0.0, 3.0]) 268 | 'multiclass' 269 | >>> type_of_target(['a', 'b', 'c']) 270 | 'multiclass' 271 | >>> type_of_target(np.array([[1, 2], [3, 1]])) 272 | 'multiclass-multioutput' 273 | >>> type_of_target([[1, 2]]) 274 | 'multiclass-multioutput' 275 | >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]])) 276 | 'continuous-multioutput' 277 | >>> type_of_target(np.array([[0, 1], [1, 1]])) 278 | 'multilabel-indicator' 279 | """ 280 | valid = ( 281 | isinstance(y, (Sequence, spmatrix)) or hasattr(y, "__array__") 282 | ) and not isinstance(y, str) 283 | 284 | if not valid: 285 | raise ValueError( 286 | "Expected array-like (array or non-string sequence), " "got %r" % y 287 | ) 288 | 289 | sparseseries = y.__class__.__name__ == "SparseSeries" 290 | if sparseseries: 291 | raise ValueError("y cannot be class 'SparseSeries'.") 292 | 293 | if is_multilabel(y): 294 | return "multilabel-indicator" 295 | 296 | try: 297 | y = np.asarray(y) 298 | except ValueError: 299 | # Known to fail in numpy 1.3 for array of arrays 300 | return "unknown" 301 | 302 | # The old sequence of sequences format 303 | try: 304 | if ( 305 | not hasattr(y[0], "__array__") 306 | and isinstance(y[0], Sequence) 307 | and not isinstance(y[0], str) 308 | ): 309 | raise ValueError( 310 | "You appear to be using a legacy multi-label data" 311 | " representation. Sequence of sequences are no" 312 | " longer supported; use a binary array or sparse" 313 | " matrix instead - the MultiLabelBinarizer" 314 | " transformer can convert to this format." 
315 | ) 316 | except IndexError: 317 | pass 318 | 319 | # Invalid inputs 320 | if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], str)): 321 | return "unknown" # [[[1, 2]]] or [obj_1] and not ["label_1"] 322 | 323 | if y.ndim == 2 and y.shape[1] == 0: 324 | return "unknown" # [[]] 325 | 326 | if y.ndim == 2 and y.shape[1] > 1: 327 | suffix = "-multioutput" # [[1, 2], [1, 2]] 328 | else: 329 | suffix = "" # [1, 2, 3] or [[1], [2], [3]] 330 | 331 | # check float and contains non-integer float values 332 | if y.dtype.kind == "f" and np.any(y != y.astype(int)): 333 | # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] 334 | _assert_all_finite(y) 335 | return "continuous" + suffix 336 | 337 | if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): 338 | return "multiclass" + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] 339 | else: 340 | return "binary" # [1, 2] or [["a"], ["b"]] 341 | 342 | 343 | def infer_output_dim(y_train): 344 | """ 345 | Infer output_dim from targets 346 | 347 | Parameters 348 | ---------- 349 | y_train : np.array 350 | Training targets 351 | 352 | Returns 353 | ------- 354 | output_dim : int 355 | Number of classes for output 356 | train_labels : list 357 | Sorted list of initial classes 358 | """ 359 | train_labels = unique_labels(y_train) 360 | output_dim = len(train_labels) 361 | 362 | return output_dim, train_labels 363 | 364 | 365 | def check_output_dim(labels, y): 366 | if y is not None: 367 | valid_labels = unique_labels(y) 368 | if not set(valid_labels).issubset(set(labels)): 369 | raise ValueError( 370 | f"""Valid set -- {set(valid_labels)} -- 371 | contains unkown targets from training -- 372 | {set(labels)}""" 373 | ) 374 | return 375 | 376 | 377 | def infer_multitask_output(y_train): 378 | """ 379 | Infer output_dim from targets 380 | This is for multiple tasks. 
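A quick sketch of how these target-inspection helpers behave on a toy two-task label matrix (the body of infer_multitask_output continues below); the array values are illustrative only:

```python
import numpy as np
from src.models.pytorch_tabnet.multiclass_utils import infer_output_dim, infer_multitask_output

y = np.array([[0, 2],
              [1, 2],
              [0, 5],
              [1, 7]])

print(infer_output_dim(y[:, 0]))  # (2, array([0, 1]))  -> 2 classes for the first task
print(infer_multitask_output(y))  # ([2, 3], [array([0, 1]), array([2, 5, 7])])
```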
381 | 382 | Parameters 383 | ---------- 384 | y_train : np.ndarray 385 | Training targets 386 | 387 | Returns 388 | ------- 389 | tasks_dims : list 390 | Number of classes for output 391 | tasks_labels : list 392 | List of sorted list of initial classes 393 | """ 394 | 395 | if len(y_train.shape) < 2: 396 | raise ValueError( 397 | f"""y_train shoud be of shape (n_examples, n_tasks) """ 398 | + f"""but got {y_train.shape}""" 399 | ) 400 | nb_tasks = y_train.shape[1] 401 | tasks_dims = [] 402 | tasks_labels = [] 403 | for task_idx in range(nb_tasks): 404 | try: 405 | output_dim, train_labels = infer_output_dim( 406 | y_train[:, task_idx] 407 | ) 408 | tasks_dims.append(output_dim) 409 | tasks_labels.append(train_labels) 410 | except ValueError as err: 411 | raise ValueError(f"""Error for task {task_idx} : {err}""") 412 | return tasks_dims, tasks_labels 413 | -------------------------------------------------------------------------------- /src/models/tabular_nn.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional 2 | 3 | from src.models.optimizer import RAdam 4 | from src.utils.misc import LoggerFactory 5 | from src.models.loss import SmoothBCEwLogits 6 | from src.models.base import MoaBase 7 | from src.utils.environment import get_device 8 | import copy 9 | import pandas as pd 10 | import numpy as np 11 | from tqdm.auto import trange 12 | 13 | import torch 14 | import torch.nn as nn 15 | from torch.utils.data import Dataset, DataLoader 16 | from torch.optim.lr_scheduler import MultiStepLR 17 | from torch import optim 18 | 19 | DEVICE = get_device() 20 | logger = LoggerFactory().getLogger(__name__) 21 | 22 | 23 | class TabularDataset(Dataset): 24 | def __init__(self, X: pd.DataFrame, y: Optional[pd.DataFrame], predictors): 25 | self.predictors = predictors 26 | self.X = X[predictors].values 27 | 28 | if y is not None: 29 | self.y = y.values 30 | else: 31 | self.y = y 32 | 33 | def __len__(self): 34 | return self.X.shape[0] 35 | 36 | def __getitem__(self, idx): 37 | if self.y is None: 38 | return torch.tensor(self.X[idx], dtype=torch.float).to(DEVICE) 39 | else: 40 | return ( 41 | torch.tensor(self.X[idx], dtype=torch.float).to(DEVICE), 42 | torch.tensor(self.y[idx], dtype=torch.float).to(DEVICE), 43 | ) 44 | 45 | 46 | class TabularMLP_1_1(nn.Module): 47 | def __init__(self, features, targets): 48 | super(TabularMLP_1_1, self).__init__() 49 | 50 | self.sq = nn.Sequential( 51 | nn.BatchNorm1d(len(features)), 52 | nn.utils.weight_norm(nn.Linear(len(features), 1024)), 53 | # nn.Dropout(0.8), 54 | nn.LeakyReLU(), 55 | nn.BatchNorm1d(1024), 56 | nn.utils.weight_norm(nn.Linear(1024, 500)), 57 | nn.Dropout(0.8), 58 | nn.LeakyReLU(), 59 | nn.Linear(500, len(targets)), 60 | ) 61 | 62 | def forward(self, x): 63 | x = self.sq(x) 64 | return x 65 | 66 | 67 | class TabularMLP_1_2(nn.Module): 68 | def __init__(self, n_features, n_targets, hidden_size=512, dropratio=0.2): 69 | super(TabularMLP_1_2, self).__init__() 70 | n_features = len(n_features) 71 | n_targets = len(n_targets) 72 | self.batch_norm1 = nn.BatchNorm1d(n_features) 73 | self.dropout1 = nn.Dropout(dropratio) 74 | self.dense1 = nn.utils.weight_norm(nn.Linear(n_features, hidden_size)) 75 | 76 | self.batch_norm2 = nn.BatchNorm1d(hidden_size) 77 | self.dropout2 = nn.Dropout(dropratio) 78 | self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size)) 79 | 80 | self.batch_norm3 = nn.BatchNorm1d(hidden_size) 81 | self.dropout3 = nn.Dropout(dropratio) 82 | self.dense3 
= nn.utils.weight_norm(nn.Linear(hidden_size, n_targets)) 83 | 84 | self.relu = nn.ReLU() 85 | 86 | def forward(self, x): 87 | x = self.batch_norm1(x) 88 | x = self.dropout1(x) 89 | x = self.relu(self.dense1(x)) 90 | 91 | x = self.batch_norm2(x) 92 | x = self.dropout2(x) 93 | x = self.relu(self.dense2(x)) 94 | 95 | x = self.batch_norm3(x) 96 | x = self.dropout3(x) 97 | x = self.dense3(x) 98 | 99 | return x 100 | 101 | 102 | class TabularMLP_2(nn.Module): 103 | def __init__(self, features, targets): 104 | super(TabularMLP_2, self).__init__() 105 | 106 | self.sq = nn.Sequential( 107 | nn.BatchNorm1d(len(features)), 108 | nn.Linear(len(features), 2048), 109 | # nn.Dropout(0.8), 110 | nn.LeakyReLU(), 111 | nn.BatchNorm1d(2048), 112 | nn.Linear(2048, 500), 113 | nn.Dropout(0.8), 114 | nn.LeakyReLU(), 115 | nn.Linear(500, len(targets)), 116 | ) 117 | 118 | def forward(self, x): 119 | x = self.sq(x) 120 | return x 121 | 122 | 123 | class NNTrainer(MoaBase): 124 | def __init__(self, params: Optional[dict] = None, **kwargs): 125 | if params is None: 126 | self.params = {} 127 | else: 128 | self.params = params 129 | super().__init__(**kwargs) 130 | 131 | def _get_default_params(self): 132 | return { 133 | 'lr': 1e-4, 134 | 'batch_size': 256, 135 | 'epoch': 20, 136 | 'model_class': TabularMLP_1_1, 137 | } 138 | 139 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 140 | X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx] 141 | y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] 142 | target_cols = y_valid.columns.tolist() 143 | 144 | _params = self._get_default_params() 145 | _params.update(self.params) 146 | 147 | # define model & schedulers 148 | num_epoch = _params['epoch'] 149 | batch_size = _params['batch_size'] 150 | net = _params['model_class'](predictors, target_cols) 151 | net.to(DEVICE) 152 | 153 | optimizer = optim.Adam(net.parameters(), lr=_params['lr'], weight_decay=1e-6) 154 | valid_criterion = nn.BCEWithLogitsLoss() 155 | criterion = SmoothBCEwLogits(smoothing=0.001) 156 | scheduler = MultiStepLR(optimizer, milestones=[10, 15], gamma=0.1) 157 | 158 | # 学習時はlength=1の破片などを回避するためdrop_last=1とする 159 | train_dataset = TabularDataset(X_train, y_train, predictors) 160 | train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) 161 | 162 | valid_dataset = TabularDataset(X_valid, y_valid, predictors) 163 | valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) 164 | 165 | bar = trange(num_epoch, desc=f"seed: {seed} train : {X_train.shape[0]} valid:{X_valid.shape[0]}====") 166 | train_loss = [] 167 | valid_loss = [] 168 | 169 | best_loss = np.inf 170 | best_preds = None 171 | best_loss_epoch = 1 172 | 173 | for epoch in bar: 174 | running_loss = [] 175 | valid_loss = [] 176 | 177 | # train 178 | net.train() 179 | for x, y in train_dataloader: 180 | x = x.to(DEVICE) 181 | y = y.to(DEVICE) 182 | optimizer.zero_grad() 183 | out = net(x) 184 | loss = criterion(out, y) 185 | loss.backward() 186 | running_loss.append(loss.item()) 187 | optimizer.step() 188 | scheduler.step() 189 | 190 | net.eval() 191 | 192 | preds_valid = [] 193 | _valid_loss = [] 194 | 195 | with torch.no_grad(): 196 | for x, y in valid_dataloader: 197 | x = x.to(DEVICE) 198 | y = y.to(DEVICE) 199 | out = net(x) 200 | loss = valid_criterion(out, y) 201 | preds_valid.append(out.sigmoid().detach().cpu().numpy()) 202 | _valid_loss.append(loss.item()) 203 | 204 | bar.set_postfix( 
205 | running_loss=f"{np.mean(running_loss):.5f}", 206 | valid_loss=f"{np.mean(_valid_loss):.5f}", 207 | best_loss=f"{best_loss:.5f}", 208 | best_loss_epoch=f"{best_loss_epoch}", 209 | ) 210 | 211 | train_loss.append(np.mean(running_loss)) 212 | valid_loss.append(np.mean(_valid_loss)) 213 | 214 | if best_loss > np.mean(_valid_loss): 215 | best_loss = np.mean(_valid_loss) 216 | best_loss_epoch = epoch + 1 217 | best_preds = np.concatenate(preds_valid) 218 | best_state = copy.deepcopy(net.state_dict()) 219 | 220 | logger.info(f"best loss : {best_loss}") 221 | model = _params['model_class'](predictors, target_cols) 222 | model.load_state_dict(best_state) 223 | model.to(DEVICE) 224 | return best_preds, model 225 | 226 | def _predict(self, model: Any, X_valid: pd.DataFrame, predictors: List[str]): 227 | _params = self._get_default_params() 228 | _params.update(self.params) 229 | batch_size = _params['batch_size'] 230 | valid_dataset = TabularDataset(X_valid, None, predictors) 231 | valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) 232 | tmp_pred = [] 233 | 234 | model.eval() 235 | with torch.no_grad(): 236 | for x in valid_dataloader: 237 | x = x.to(DEVICE) 238 | out = model(x) 239 | tmp_pred.append(out.sigmoid().detach().cpu().numpy()) 240 | return np.concatenate(tmp_pred) 241 | 242 | 243 | class CNNDataset(Dataset): 244 | def __init__(self, X: np.ndarray, y: Optional[pd.DataFrame]): 245 | 246 | self.X = X 247 | 248 | if y is not None: 249 | self.y = y.values 250 | else: 251 | self.y = y 252 | 253 | def __len__(self): 254 | return self.X.shape[0] 255 | 256 | def __getitem__(self, idx): 257 | if self.y is None: 258 | return torch.tensor(self.X[idx], dtype=torch.float).to(DEVICE) 259 | else: 260 | return ( 261 | torch.tensor(self.X[idx], dtype=torch.float).to(DEVICE), 262 | torch.tensor(self.y[idx], dtype=torch.float).to(DEVICE), 263 | ) 264 | 265 | 266 | class CNNStacking(nn.Module): 267 | def __init__(self, n_features, n_labels): 268 | super(CNNStacking, self).__init__() 269 | 270 | self.sq = nn.Sequential( 271 | nn.Conv2d(in_channels=1, out_channels=8, kernel_size=(2, 1), bias=False), 272 | nn.ReLU(), 273 | nn.Conv2d(in_channels=8, out_channels=16, kernel_size=(2, 1), bias=False), 274 | nn.ReLU(), 275 | # nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 1), bias=False), 276 | # nn.ReLU(inplace=True), 277 | # nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 1), bias=False), 278 | # nn.ReLU(), 279 | nn.Flatten(), 280 | nn.Linear(in_features=16 * n_labels, out_features=4 * n_labels), 281 | nn.ReLU(), 282 | nn.Linear(in_features=4 * n_labels, out_features=n_labels), 283 | ) 284 | 285 | def forward(self, x): 286 | return self.sq(x) 287 | 288 | 289 | class CNNTrainer(MoaBase): 290 | def __init__(self, params: Optional[dict] = None, **kwargs): 291 | if params is None: 292 | self.params = {} 293 | else: 294 | self.params = params 295 | super().__init__(**kwargs) 296 | 297 | def _get_default_params(self): 298 | return { 299 | 'lr': 1e-4, 300 | 'batch_size': 256, 301 | 'epoch': 20, 302 | } 303 | 304 | def _train(self, X: pd.DataFrame, y: pd.DataFrame, predictors: List[str], train_idx: np.ndarray, valid_idx: np.ndarray, seed: int): 305 | X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx] 306 | y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] 307 | target_cols = y_valid.columns.tolist() 308 | 309 | _params = self._get_default_params() 310 | _params.update(self.params) 311 | 312 | # define model & schedulers 313 | self.n_predictors = 
len(predictors) 314 | self.n_targets = len(target_cols) 315 | self.n_models = self.n_predictors // self.n_targets 316 | 317 | num_epoch = _params['epoch'] 318 | batch_size = _params['batch_size'] 319 | net = CNNStacking(n_features=self.n_predictors, n_labels=self.n_targets) 320 | net.to(DEVICE) 321 | 322 | # optimizer = optim.Adam(net.parameters(), lr=_params['lr'], weight_decay=1e-6) 323 | optimizer = RAdam(net.parameters(), lr=_params['lr']) 324 | valid_criterion = nn.BCEWithLogitsLoss() 325 | criterion = SmoothBCEwLogits(smoothing=0.001) 326 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0.0001) 327 | # scheduler = MultiStepLR(optimizer, milestones=[10, 15], gamma=0.1) 328 | 329 | # [N, Models, Labels, Channel] -> [N, Channel, Models, Labels] 330 | X_train = X_train[predictors].values.reshape(-1, self.n_models, self.n_targets, 1).transpose(0, 3, 1, 2) 331 | X_valid = X_valid[predictors].values.reshape(-1, self.n_models, self.n_targets, 1).transpose(0, 3, 1, 2) 332 | 333 | # 学習時はlength=1の破片などを回避するためdrop_last=1とする 334 | train_dataset = CNNDataset(X_train, y_train) 335 | train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) 336 | 337 | valid_dataset = CNNDataset(X_valid, y_valid) 338 | valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) 339 | 340 | bar = trange(num_epoch, desc=f"seed: {seed} train : {X_train.shape[0]} valid:{X_valid.shape[0]}====") 341 | train_loss = [] 342 | valid_loss = [] 343 | 344 | best_loss = np.inf 345 | best_preds = None 346 | best_loss_epoch = 1 347 | 348 | for epoch in bar: 349 | running_loss = [] 350 | valid_loss = [] 351 | 352 | # train 353 | net.train() 354 | for x, y in train_dataloader: 355 | x = x.to(DEVICE) 356 | y = y.to(DEVICE) 357 | 358 | optimizer.zero_grad() 359 | out = net(x) 360 | loss = criterion(out, y) 361 | loss.backward() 362 | running_loss.append(loss.item()) 363 | optimizer.step() 364 | scheduler.step() 365 | 366 | preds_valid = [] 367 | _valid_loss = [] 368 | 369 | net.eval() 370 | with torch.no_grad(): 371 | for x, y in valid_dataloader: 372 | x = x.to(DEVICE) 373 | y = y.to(DEVICE) 374 | 375 | out = net(x) 376 | loss = valid_criterion(out, y) 377 | preds_valid.append(out.sigmoid().detach().cpu().numpy()) 378 | _valid_loss.append(loss.item()) 379 | 380 | bar.set_postfix( 381 | running_loss=f"{np.mean(running_loss):.5f}", 382 | valid_loss=f"{np.mean(_valid_loss):.5f}", 383 | best_loss=f"{best_loss:.5f}", 384 | best_loss_epoch=f"{best_loss_epoch}", 385 | ) 386 | 387 | train_loss.append(np.mean(running_loss)) 388 | valid_loss.append(np.mean(_valid_loss)) 389 | 390 | if best_loss > np.mean(_valid_loss): 391 | best_loss = np.mean(_valid_loss) 392 | best_loss_epoch = epoch + 1 393 | best_preds = np.concatenate(preds_valid) 394 | best_state = copy.deepcopy(net.state_dict()) 395 | 396 | logger.info(f"best loss : {best_loss}") 397 | model = CNNStacking(n_features=self.n_predictors, n_labels=self.n_targets) 398 | model.load_state_dict(best_state) 399 | model.to(DEVICE) 400 | return best_preds, model 401 | 402 | def _predict(self, model: Any, X_valid: pd.DataFrame, predictors: List[str]): 403 | _params = self._get_default_params() 404 | _params.update(self.params) 405 | 406 | # [N, Models, Labels, Channel] -> [N, Channel, Models, Labels] 407 | X_valid = X_valid[predictors].values.reshape(-1, self.n_models, self.n_targets, 1).transpose(0, 3, 1, 2) 408 | 409 | batch_size = _params['batch_size'] 410 | valid_dataset = CNNDataset(X_valid, None) 411 | 
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) 412 | tmp_pred = [] 413 | 414 | model.eval() 415 | with torch.no_grad(): 416 | for x in valid_dataloader: 417 | x = x.to(DEVICE) 418 | 419 | out = model(x) 420 | tmp_pred.append(out.sigmoid().detach().cpu().numpy()) 421 | return np.concatenate(tmp_pred) -------------------------------------------------------------------------------- /src/experiment/experiment.py: -------------------------------------------------------------------------------- 1 | # https://github.com/nyanp/nyaggle/blob/master/nyaggle/experiment/experiment.py 2 | 3 | import json 4 | import numbers 5 | import os 6 | import shutil 7 | import uuid 8 | import warnings 9 | from logging import getLogger, FileHandler, DEBUG, Logger 10 | from typing import Dict, Optional 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | from src.utils.environment import requires_mlflow 16 | 17 | MLFLOW_KEY_LENGTH_LIMIT = 250 18 | MLFLOW_VALUE_LENGTH_LIMIT = 250 19 | 20 | 21 | def _sanitize_mlflow_param(param, limit): 22 | if len(str(param)) > limit: 23 | warnings.warn('Length of param exceeds limit {}. It will be truncated. value: {}'.format(limit, param)) 24 | param = str(param)[:limit] 25 | return param 26 | 27 | 28 | def _check_directory(directory: str, if_exists: str) -> str: 29 | if os.path.exists(directory): 30 | if if_exists == 'error': 31 | raise ValueError('directory {} already exists.'.format(directory)) 32 | elif if_exists == 'replace': 33 | warnings.warn('directory {} already exists. It will be replaced by the new result'.format(directory)) 34 | 35 | existing_run_id = _try_to_get_existing_mlflow_run_id(directory) 36 | if existing_run_id is not None: 37 | requires_mlflow() 38 | import mlflow 39 | mlflow.delete_run(existing_run_id) 40 | 41 | shutil.rmtree(directory, ignore_errors=True) 42 | elif if_exists == 'rename': 43 | postfix_index = 1 44 | 45 | while os.path.exists(directory + '_' + str(postfix_index)): 46 | postfix_index += 1 47 | 48 | directory += '_' + str(postfix_index) 49 | warnings.warn('directory is renamed to {} because the original directory already exists.'.format(directory)) 50 | return directory 51 | 52 | 53 | def _sanitize(v): 54 | return v if isinstance(v, numbers.Number) else str(v) 55 | 56 | 57 | def _try_to_get_existing_mlflow_run_id(logging_directory: str) -> Optional[str]: 58 | mlflow_path = os.path.join(logging_directory, 'mlflow.json') 59 | if os.path.exists(mlflow_path): 60 | with open(mlflow_path, 'r') as f: 61 | mlflow_metadata = json.load(f) 62 | return mlflow_metadata['run_id'] 63 | return None 64 | 65 | 66 | class Experiment(object): 67 | """Minimal experiment logger for Kaggle 68 | This module provides minimal functionality for tracking experiments. 69 | The output files are laid out as follows: 70 | .. code-block:: none 71 | / 72 | log.txt <== Output of log 73 | metrics.json <== Output of log_metric(s), format: name,score 74 | params.json <== Output of log_param(s), format: key,value 75 | mlflow.json <== mlflow's run_id, experiment_id and artifact_uri (logged if with_mlflow=True) 76 | You can add numpy array and pandas dataframe under the directory through ``log_numpy`` and ``log_dataframe``. 77 | Args: 78 | logging_directory: 79 | Path to directory where output is stored. 80 | custom_logger: 81 | A custom logger to be used instead of default logger. 82 | with_mlflow: 83 | If True, `mlflow tracking `_ is used. 84 | One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow. 
85 |         Note that all output files are located both ``logging_directory`` and
86 |         mlflow's directory (``mlruns`` by default).
87 |         if_exists:
88 |             How to behave if the logging directory already exists.
89 |             - error: Raise a ValueError.
90 |             - replace: Delete logging directory before logging.
91 |             - append: Append to existing experiment.
92 |             - rename: Rename current directory by adding "_1", "_2"... suffix
93 |     Example:
94 |         >>> import numpy as np
95 |         >>> import pandas as pd
96 |         >>> from nyaggle.experiment import Experiment
97 |         >>>
98 |         >>> with Experiment(logging_directory='./output/') as exp:
99 |         >>>     # log key-value pair as a parameter
100 |         >>>     exp.log_param('lr', 0.01)
101 |         >>>     exp.log_param('optimizer', 'adam')
102 |         >>>
103 |         >>>     # log text
104 |         >>>     exp.log('blah blah blah')
105 |         >>>
106 |         >>>     # log metric
107 |         >>>     exp.log_metric('CV', 0.85)
108 |         >>>
109 |         >>>     # log dictionary with flattening keys
110 |         >>>     exp.log_dict('params', {'X': 3, 'Y': {'Z': 'foobar'}})
111 |         >>>
112 |         >>>     # log numpy ndarray, pandas dataframe and any artifacts
113 |         >>>     exp.log_numpy('predicted', np.zeros(1))
114 |         >>>     exp.log_dataframe('submission', pd.DataFrame(), file_format='csv')
115 |         >>>     exp.log_artifact('path-to-your-file')
116 |     """
117 |     def __init__(self, logging_directory: str, custom_logger: Optional[Logger] = None, with_mlflow: bool = False, if_exists: str = 'error'):
118 |         logging_directory = _check_directory(logging_directory, if_exists)
119 |         os.makedirs(logging_directory, exist_ok=True)
120 |
121 |         self.logging_directory = logging_directory
122 |         self.with_mlflow = with_mlflow
123 |
124 |         if custom_logger is not None:
125 |             self.logger = custom_logger
126 |             self.is_custom = True
127 |         else:
128 |             self.logger = getLogger(str(uuid.uuid4()))
129 |             self.log_path = os.path.join(logging_directory, 'log.txt')
130 |             self.logger.addHandler(FileHandler(self.log_path))
131 |             self.logger.setLevel(DEBUG)
132 |             self.is_custom = False
133 |         self.metrics = self._load_dict('metrics.json')
134 |         self.params = self._load_dict('params.json')
135 |         self.inherit_existing_run = False
136 |
137 |         if self.with_mlflow:
138 |             requires_mlflow()
139 |             self.mlflow_run_id = _try_to_get_existing_mlflow_run_id(logging_directory)
140 |             if self.mlflow_run_id is not None:
141 |                 self.mlflow_run_name = None
142 |             else:
143 |                 self.mlflow_run_name = logging_directory
144 |
145 |     def __enter__(self):
146 |         self.start()
147 |         return self
148 |
149 |     def __exit__(self, ex_type, ex_value, trace):
150 |         self.stop()
151 |
152 |     @classmethod
153 |     def continue_from(cls, logging_directory: str, with_mlflow: bool = False):
154 |         return cls(logging_directory=logging_directory, if_exists='append', with_mlflow=with_mlflow)
155 |
156 |     def start(self):
157 |         """
158 |         Start a new experiment.
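        If ``with_mlflow`` is True, an already active mlflow run is reused when present;
        otherwise a new run is started (resuming the run recorded in ``mlflow.json`` if one
        exists) and its metadata is written to ``mlflow.json``.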
159 | """ 160 | if self.with_mlflow: 161 | import mlflow 162 | 163 | if mlflow.active_run() is not None: 164 | active_run = mlflow.active_run() 165 | self.inherit_existing_run = True 166 | else: 167 | active_run = mlflow.start_run(run_name=self.mlflow_run_name, run_id=self.mlflow_run_id) 168 | mlflow_metadata = {'artifact_uri': active_run.info.artifact_uri, 'experiment_id': active_run.info.experiment_id, 'run_id': active_run.info.run_id} 169 | self.mlflow_run_id = active_run.info.run_id 170 | with open(os.path.join(self.logging_directory, 'mlflow.json'), 'w') as f: 171 | json.dump(mlflow_metadata, f, indent=4) 172 | 173 | def _load_dict(self, filename: str) -> Dict: 174 | try: 175 | path = os.path.join(self.logging_directory, filename) 176 | with open(path, 'r') as f: 177 | return json.load(f) 178 | except IOError: 179 | self.logger.warning('failed to load file: {}'.format(filename)) 180 | return {} 181 | 182 | def _save_dict(self, obj: Dict, filename: str): 183 | try: 184 | path = os.path.join(self.logging_directory, filename) 185 | with open(path, 'w') as f: 186 | json.dump(obj, f, indent=2) 187 | except IOError: 188 | self.logger.warning('failed to save file: {}'.format(filename)) 189 | 190 | def stop(self): 191 | """ 192 | Stop current experiment. 193 | """ 194 | self._save_dict(self.metrics, 'metrics.json') 195 | self._save_dict(self.params, 'params.json') 196 | 197 | if not self.is_custom: 198 | for h in self.logger.handlers: 199 | h.close() 200 | 201 | if self.with_mlflow: 202 | import mlflow 203 | from mlflow.exceptions import MlflowException 204 | 205 | try: 206 | mlflow.log_artifact(self.log_path) 207 | mlflow.log_artifact(os.path.join(self.logging_directory, 'metrics.json')) 208 | mlflow.log_artifact(os.path.join(self.logging_directory, 'params.json')) 209 | except MlflowException as e: 210 | warnings.warn('Error in saving artifacts to mlflow. The result may not be saved.: {}'.format(e)) 211 | if not self.inherit_existing_run: 212 | mlflow.end_run() 213 | 214 | def get_logger(self) -> Logger: 215 | """ 216 | Get logger used in this experiment. 217 | Returns: 218 | logger object 219 | """ 220 | return self.logger 221 | 222 | def get_run(self): 223 | """ 224 | Get mlflow's currently active run, or None if ``with_mlflow = False``. 225 | Returns: 226 | active Run 227 | """ 228 | if not self.with_mlflow: 229 | return None 230 | 231 | import mlflow 232 | return mlflow.active_run() 233 | 234 | def log(self, text: str): 235 | """ 236 | Logs a message on the logger for the experiment. 237 | Args: 238 | text: 239 | The message to be written. 240 | """ 241 | self.logger.info(text) 242 | 243 | def log_param(self, key, value): 244 | """ 245 | Logs a key-value pair for the experiment. 246 | Args: 247 | key: parameter name 248 | value: parameter value 249 | """ 250 | key = _sanitize(key) 251 | value = _sanitize(value) 252 | self.params[key] = value 253 | 254 | if self.with_mlflow: 255 | import mlflow 256 | from mlflow.exceptions import MlflowException 257 | 258 | key_mlflow = _sanitize_mlflow_param(key, MLFLOW_KEY_LENGTH_LIMIT) 259 | value_mlflow = _sanitize_mlflow_param(value, MLFLOW_VALUE_LENGTH_LIMIT) 260 | 261 | try: 262 | mlflow.log_param(key_mlflow, value_mlflow) 263 | except MlflowException as e: 264 | warnings.warn('Error in logging parameter {} to mlflow. Skipped. {}'.format(key, e)) 265 | 266 | def log_params(self, params: Dict): 267 | """ 268 | Logs a batch of params for the experiments. 
269 | Args: 270 | params: dictionary of parameters 271 | """ 272 | for k, v in params.items(): 273 | self.log_param(k, v) 274 | 275 | def log_dict(self, name: str, value: Dict, separator: str = '.'): 276 | """ 277 | Logs a dictionary as parameter with flatten format. 278 | Args: 279 | name: Parameter name 280 | value: Parameter value 281 | separator: Separating character used to concatanate keys 282 | Examples: 283 | >>> with Experiment('./') as e: 284 | >>> e.log_dict('a', {'b': 1, 'c': 'd'}) 285 | >>> print(e.params) 286 | { 'a.b': 1, 'a.c': 'd' } 287 | """ 288 | 289 | if value is None: 290 | self.log_param(name, value) 291 | return 292 | 293 | def _flatten(d: Dict, prefix: str, separator: str) -> Dict: 294 | items = [] 295 | for k, v in d.items(): 296 | child_key = prefix + separator + str(k) if prefix else str(k) 297 | if isinstance(v, Dict) and v: 298 | items.extend(_flatten(v, child_key, separator).items()) 299 | else: 300 | items.append((child_key, v)) 301 | return dict(items) 302 | 303 | value = _flatten(value, name, separator) 304 | self.log_params(value) 305 | 306 | def log_metric(self, name: str, score: float): 307 | """ 308 | Log a metric under the logging directory. 309 | Args: 310 | name: 311 | Metric name. 312 | score: 313 | Metric value. 314 | """ 315 | name = _sanitize(name) 316 | score = _sanitize(score) 317 | self.metrics[name] = score 318 | 319 | if self.with_mlflow: 320 | import mlflow 321 | from mlflow.exceptions import MlflowException 322 | 323 | try: 324 | mlflow.log_metric(name, score) 325 | except MlflowException as e: 326 | warnings.warn('Error in logging metric {} to mlflow. Skipped. {}'.format(name, e)) 327 | 328 | def log_metrics(self, metrics: Dict): 329 | """ 330 | Log a batch of metrics under the logging directory. 331 | Args: 332 | metrics: dictionary of metrics. 333 | """ 334 | for k, v in metrics.items(): 335 | self.log_metric(k, v) 336 | 337 | def log_numpy(self, name: str, array: np.ndarray): 338 | """ 339 | Log a numpy ndarray under the logging directory. 340 | Args: 341 | name: 342 | Name of the file. A .npy extension will be appended to the file name if it does not already have one. 343 | array: 344 | Array data to be saved. 345 | """ 346 | path = os.path.join(self.logging_directory, name) 347 | np.save(path, array) 348 | 349 | if self.with_mlflow: 350 | import mlflow 351 | mlflow.log_artifact(path + '.npy') 352 | 353 | def log_dataframe(self, name: str, df: pd.DataFrame, file_format: str = 'feather'): 354 | """ 355 | Log a pandas dataframe under the logging directory. 356 | Args: 357 | name: 358 | Name of the file. A ``.f`` or ``.csv`` extension will be appended to the file name 359 | if it does not already have one. 360 | df: 361 | A dataframe to be saved. 362 | file_format: 363 | A format of output file. ``csv`` and ``feather`` are supported. 364 | """ 365 | path = os.path.join(self.logging_directory, name) 366 | if file_format == 'feather': 367 | if not path.endswith('.f'): 368 | path += '.f' 369 | df.to_feather(path) 370 | elif file_format == 'csv': 371 | if not path.endswith('.csv'): 372 | path += '.csv' 373 | df.to_csv(path, index=False) 374 | else: 375 | raise RuntimeError('format not supported') 376 | 377 | if self.with_mlflow: 378 | import mlflow 379 | mlflow.log_artifact(path) 380 | 381 | def log_artifact(self, src_file_path: str): 382 | """ 383 | Make a copy of the file under the logging directory. 384 | Args: 385 | src_file_path: 386 | Path of the file. If path is not a child of the logging directory, the file will be copied. 
387 | If ``with_mlflow`` is True, ``mlflow.log_artifact`` will be called (then another copy will be made). 388 | """ 389 | logging_path = os.path.abspath(self.logging_directory) 390 | src_file_path = os.path.abspath(src_file_path) 391 | 392 | if os.path.commonpath([logging_path]) != os.path.commonpath([logging_path, src_file_path]): 393 | src_file = os.path.basename(src_file_path) 394 | shutil.copy(src_file, self.logging_directory) 395 | 396 | if self.with_mlflow: 397 | import mlflow 398 | mlflow.log_artifact(src_file_path) 399 | 400 | 401 | def add_leaderboard_score(logging_directory: str, score: float): 402 | """ 403 | Record leaderboard score to the existing experiment directory. 404 | Args: 405 | logging_directory: 406 | The directory to be added 407 | score: 408 | Leaderboard score 409 | """ 410 | with Experiment.continue_from(logging_directory) as e: 411 | e.log_metric('LB', score) -------------------------------------------------------------------------------- /src/utils/splitter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.utils import check_random_state 4 | from sklearn.utils.validation import _num_samples, check_array 5 | from sklearn.utils.multiclass import type_of_target 6 | from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold 7 | 8 | from sklearn.model_selection._split import ( 9 | _BaseKFold, 10 | _RepeatedSplits, 11 | BaseShuffleSplit, 12 | _validate_shuffle_split, 13 | ) 14 | 15 | 16 | def splitter_for_moa(X, target_cols, n_splits=5, seed=42): 17 | folds = [] 18 | X = X.copy() 19 | # LOCATE DRUGS 20 | vc = X.drug_id.value_counts() 21 | 22 | vc1 = vc.loc[(vc == 6) | (vc == 12) | (vc == 18)].index.sort_values() 23 | vc2 = vc.loc[(vc != 6) & (vc != 12) & (vc != 18)].index.sort_values() 24 | 25 | # STRATIFY DRUGS 18X OR LESS 26 | dct1 = {} 27 | dct2 = {} 28 | skf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) 29 | tmp = X.groupby("drug_id")[target_cols].mean().loc[vc1] 30 | for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[target_cols])): 31 | dd = {k: fold for k in tmp.index[idxV].values} 32 | dct1.update(dd) 33 | 34 | # STRATIFY DRUGS MORE THAN 18X 35 | skf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) 36 | tmp = X.loc[X.drug_id.isin(vc2)].reset_index(drop=True) 37 | for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[target_cols])): 38 | dd = {k: fold for k in tmp.sig_id[idxV].values} 39 | dct2.update(dd) 40 | 41 | # ASSIGN FOLDS 42 | X["fold"] = X.drug_id.map(dct1) 43 | X.loc[X.fold.isna(), "fold"] = X.loc[X.fold.isna(), "sig_id"].map(dct2) 44 | X.fold = X.fold.astype("int8") 45 | folds.append(X.fold.values) 46 | 47 | _folds = np.stack(folds).flatten() 48 | folds = [] 49 | for idx in range(n_splits): 50 | train_idx = np.where(_folds != idx)[0] 51 | valid_idx = np.where(_folds == idx)[0] 52 | folds.append((train_idx, valid_idx)) 53 | return folds 54 | 55 | 56 | class SplitFactory: 57 | # split_type: necessary params 58 | split_pattern = { 59 | 'kfolds': [], 60 | 'stratified': ['target_col'], 61 | 'multilabel_stratified': ['target_col'], 62 | 'group': ['group_col'], 63 | } 64 | 65 | def __init__(self, n_splits: int = 5, split_type: str = "kfolds", random_state: int = 46, **kwargs): 66 | self.split_type = split_type 67 | self.n_splits = n_splits 68 | self.random_state = random_state 69 | self.params = kwargs 70 | 71 | if split_type not in self.split_pattern: 72 | raise ValueError(f"type: {split_type} is 
not in {list(self.split_pattern.keys())}")
73 |         for arg in self.split_pattern[split_type]:
74 |             if arg not in self.params:
75 |                 raise ValueError(f"split type {split_type} requires the {arg} param")
76 |
77 |         # set splitter
78 |         self._get_splitter()
79 |
80 |     def _get_splitter(self):
81 |         if self.split_type == 'kfolds':
82 |             self._split = lambda X, y: KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state).split(X, y)
83 |         elif self.split_type == 'stratified':
84 |             target_col = self.params['target_col']
85 |             self._split = lambda X, y: StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state).split(X, y, X[target_col])
86 |         elif self.split_type == 'group':
87 |             group_col = self.params['group_col']
88 |             self._split = lambda X, y: GroupKFold(n_splits=self.n_splits).split(X, y, groups=X[group_col])
89 |         elif self.split_type == 'multilabel_stratified':
90 |             target_col = self.params['target_col']
91 |             self._split = lambda X, y: MultilabelStratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state).split(X, y)
92 |         else:
93 |             raise AttributeError(f"split_type: {self.split_type} is not supported")
94 |
95 |     def split(self, X, y=None):
96 |         return self._split(X, y)
97 |
98 |
99 | # https://github.com/trent-b/iterative-stratification
100 | def IterativeStratification(labels, r, random_state):
101 |     """This function implements the Iterative Stratification algorithm described
102 |     in the following paper:
103 |     Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
104 |     Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M.
105 |     (eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD
106 |     2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin,
107 |     Heidelberg.
108 |     """
109 |
110 |     n_samples = labels.shape[0]
111 |     test_folds = np.zeros(n_samples, dtype=int)
112 |
113 |     # Calculate the desired number of examples at each subset
114 |     c_folds = r * n_samples
115 |
116 |     # Calculate the desired number of examples of each label at each subset
117 |     c_folds_labels = np.outer(r, labels.sum(axis=0))
118 |
119 |     labels_not_processed_mask = np.ones(n_samples, dtype=bool)
120 |
121 |     while np.any(labels_not_processed_mask):
122 |         # Find the label with the fewest (but at least one) remaining examples,
123 |         # breaking ties randomly
124 |         num_labels = labels[labels_not_processed_mask].sum(axis=0)
125 |
126 |         # Handle case where only all-zero labels are left by distributing
127 |         # across all folds as evenly as possible (not in original algorithm but
128 |         # mentioned in the text). (By handling this case separately, some
129 |         # code redundancy is introduced; however, this approach allows for
130 |         # decreased execution time when there are a relatively large number
131 |         # of all-zero labels.)
132 | if num_labels.sum() == 0: 133 | sample_idxs = np.where(labels_not_processed_mask)[0] 134 | 135 | for sample_idx in sample_idxs: 136 | fold_idx = np.where(c_folds == c_folds.max())[0] 137 | 138 | if fold_idx.shape[0] > 1: 139 | fold_idx = fold_idx[random_state.choice(fold_idx.shape[0])] 140 | 141 | test_folds[sample_idx] = fold_idx 142 | c_folds[fold_idx] -= 1 143 | 144 | break 145 | 146 | label_idx = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0] 147 | if label_idx.shape[0] > 1: 148 | label_idx = label_idx[random_state.choice(label_idx.shape[0])] 149 | 150 | sample_idxs = np.where(np.logical_and(labels[:, label_idx].flatten(), labels_not_processed_mask))[0] 151 | 152 | for sample_idx in sample_idxs: 153 | # Find the subset(s) with the largest number of desired examples 154 | # for this label, breaking ties by considering the largest number 155 | # of desired examples, breaking further ties randomly 156 | label_folds = c_folds_labels[:, label_idx] 157 | fold_idx = np.where(label_folds == label_folds.max())[0] 158 | 159 | if fold_idx.shape[0] > 1: 160 | temp_fold_idx = np.where(c_folds[fold_idx] == c_folds[fold_idx].max())[0] 161 | fold_idx = fold_idx[temp_fold_idx] 162 | 163 | if temp_fold_idx.shape[0] > 1: 164 | fold_idx = fold_idx[random_state.choice(temp_fold_idx.shape[0])] 165 | 166 | test_folds[sample_idx] = fold_idx 167 | labels_not_processed_mask[sample_idx] = False 168 | 169 | # Update desired number of examples 170 | c_folds_labels[fold_idx, labels[sample_idx]] -= 1 171 | c_folds[fold_idx] -= 1 172 | 173 | return test_folds 174 | 175 | 176 | class MultilabelStratifiedKFold(_BaseKFold): 177 | """Multilabel stratified K-Folds cross-validator 178 | Provides train/test indices to split multilabel data into train/test sets. 179 | This cross-validation object is a variation of KFold that returns 180 | stratified folds for multilabel data. The folds are made by preserving 181 | the percentage of samples for each label. 182 | Parameters 183 | ---------- 184 | n_splits : int, default=3 185 | Number of folds. Must be at least 2. 186 | shuffle : boolean, optional 187 | Whether to shuffle each stratification of the data before splitting 188 | into batches. 189 | random_state : int, RandomState instance or None, optional, default=None 190 | If int, random_state is the seed used by the random number generator; 191 | If RandomState instance, random_state is the random number generator; 192 | If None, the random number generator is the RandomState instance used 193 | by `np.random`. Unlike StratifiedKFold that only uses random_state 194 | when ``shuffle`` == True, this multilabel implementation 195 | always uses the random_state since the iterative stratification 196 | algorithm breaks ties randomly. 197 | Examples 198 | -------- 199 | >>> from iterstrat.ml_stratifiers import MultilabelStratifiedKFold 200 | >>> import numpy as np 201 | >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) 202 | >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) 203 | >>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0) 204 | >>> mskf.get_n_splits(X, y) 205 | 2 206 | >>> print(mskf) # doctest: +NORMALIZE_WHITESPACE 207 | MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False) 208 | >>> for train_index, test_index in mskf.split(X, y): 209 | ... print("TRAIN:", train_index, "TEST:", test_index) 210 | ... X_train, X_test = X[train_index], X[test_index] 211 | ... 
y_train, y_test = y[train_index], y[test_index] 212 | TRAIN: [0 3 4 6] TEST: [1 2 5 7] 213 | TRAIN: [1 2 5 7] TEST: [0 3 4 6] 214 | Notes 215 | ----- 216 | Train and test sizes may be slightly different in each fold. 217 | See also 218 | -------- 219 | RepeatedMultilabelStratifiedKFold: Repeats Multilabel Stratified K-Fold 220 | n times. 221 | """ 222 | def __init__(self, n_splits=3, shuffle=False, random_state=None): 223 | super(MultilabelStratifiedKFold, self).__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) 224 | 225 | def _make_test_folds(self, X, y): 226 | y = np.asarray(y, dtype=bool) 227 | type_of_target_y = type_of_target(y) 228 | 229 | if type_of_target_y != "multilabel-indicator": 230 | raise ValueError("Supported target type is: multilabel-indicator. Got {!r} instead.".format(type_of_target_y)) 231 | 232 | num_samples = y.shape[0] 233 | 234 | rng = check_random_state(self.random_state) 235 | indices = np.arange(num_samples) 236 | 237 | if self.shuffle: 238 | rng.shuffle(indices) 239 | y = y[indices] 240 | 241 | r = np.asarray([1 / self.n_splits] * self.n_splits) 242 | 243 | test_folds = IterativeStratification(labels=y, r=r, random_state=rng) 244 | 245 | return test_folds[np.argsort(indices)] 246 | 247 | def _iter_test_masks(self, X=None, y=None, groups=None): 248 | test_folds = self._make_test_folds(X, y) 249 | for i in range(self.n_splits): 250 | yield test_folds == i 251 | 252 | def split(self, X, y, groups=None): 253 | """Generate indices to split data into training and test set. 254 | Parameters 255 | ---------- 256 | X : array-like, shape (n_samples, n_features) 257 | Training data, where n_samples is the number of samples 258 | and n_features is the number of features. 259 | Note that providing ``y`` is sufficient to generate the splits and 260 | hence ``np.zeros(n_samples)`` may be used as a placeholder for 261 | ``X`` instead of actual training data. 262 | y : array-like, shape (n_samples, n_labels) 263 | The target variable for supervised learning problems. 264 | Multilabel stratification is done based on the y labels. 265 | groups : object 266 | Always ignored, exists for compatibility. 267 | Returns 268 | ------- 269 | train : ndarray 270 | The training set indices for that split. 271 | test : ndarray 272 | The testing set indices for that split. 273 | Notes 274 | ----- 275 | Randomized CV splitters may return different results for each call of 276 | split. You can make the results identical by setting ``random_state`` 277 | to an integer. 278 | """ 279 | y = check_array(y, ensure_2d=False, dtype=None) 280 | return super(MultilabelStratifiedKFold, self).split(X, y, groups) 281 | 282 | 283 | class RepeatedMultilabelStratifiedKFold(_RepeatedSplits): 284 | """Repeated Multilabel Stratified K-Fold cross validator. 285 | Repeats Mulilabel Stratified K-Fold n times with different randomization 286 | in each repetition. 287 | Parameters 288 | ---------- 289 | n_splits : int, default=5 290 | Number of folds. Must be at least 2. 291 | n_repeats : int, default=10 292 | Number of times cross-validator needs to be repeated. 293 | random_state : None, int or RandomState, default=None 294 | Random state to be used to generate random state for each 295 | repetition as well as randomly breaking ties within the iterative 296 | stratification algorithm. 
297 | Examples 298 | -------- 299 | >>> from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold 300 | >>> import numpy as np 301 | >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) 302 | >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) 303 | >>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2, 304 | ... random_state=0) 305 | >>> for train_index, test_index in rmskf.split(X, y): 306 | ... print("TRAIN:", train_index, "TEST:", test_index) 307 | ... X_train, X_test = X[train_index], X[test_index] 308 | ... y_train, y_test = y[train_index], y[test_index] 309 | ... 310 | TRAIN: [0 3 4 6] TEST: [1 2 5 7] 311 | TRAIN: [1 2 5 7] TEST: [0 3 4 6] 312 | TRAIN: [0 1 4 5] TEST: [2 3 6 7] 313 | TRAIN: [2 3 6 7] TEST: [0 1 4 5] 314 | See also 315 | -------- 316 | RepeatedStratifiedKFold: Repeats (Non-multilabel) Stratified K-Fold 317 | n times. 318 | """ 319 | def __init__(self, n_splits=5, n_repeats=10, random_state=None): 320 | super(RepeatedMultilabelStratifiedKFold, self).__init__( 321 | MultilabelStratifiedKFold, 322 | n_repeats=n_repeats, 323 | random_state=random_state, 324 | n_splits=n_splits, 325 | ) 326 | 327 | 328 | class MultilabelStratifiedShuffleSplit(BaseShuffleSplit): 329 | """Multilabel Stratified ShuffleSplit cross-validator 330 | Provides train/test indices to split data into train/test sets. 331 | This cross-validation object is a merge of MultilabelStratifiedKFold and 332 | ShuffleSplit, which returns stratified randomized folds for multilabel 333 | data. The folds are made by preserving the percentage of each label. 334 | Note: like the ShuffleSplit strategy, multilabel stratified random splits 335 | do not guarantee that all folds will be different, although this is 336 | still very likely for sizeable datasets. 337 | Parameters 338 | ---------- 339 | n_splits : int, default 10 340 | Number of re-shuffling & splitting iterations. 341 | test_size : float, int, None, optional 342 | If float, should be between 0.0 and 1.0 and represent the proportion 343 | of the dataset to include in the test split. If int, represents the 344 | absolute number of test samples. If None, the value is set to the 345 | complement of the train size. By default, the value is set to 0.1. 346 | The default will change in version 0.21. It will remain 0.1 only 347 | if ``train_size`` is unspecified, otherwise it will complement 348 | the specified ``train_size``. 349 | train_size : float, int, or None, default is None 350 | If float, should be between 0.0 and 1.0 and represent the 351 | proportion of the dataset to include in the train split. If 352 | int, represents the absolute number of train samples. If None, 353 | the value is automatically set to the complement of the test size. 354 | random_state : int, RandomState instance or None, optional (default=None) 355 | If int, random_state is the seed used by the random number generator; 356 | If RandomState instance, random_state is the random number generator; 357 | If None, the random number generator is the RandomState instance used 358 | by `np.random`. Unlike StratifiedShuffleSplit that only uses 359 | random_state when ``shuffle`` == True, this multilabel implementation 360 | always uses the random_state since the iterative stratification 361 | algorithm breaks ties randomly. 
362 | Examples 363 | -------- 364 | >>> from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit 365 | >>> import numpy as np 366 | >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) 367 | >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) 368 | >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5, 369 | ... random_state=0) 370 | >>> msss.get_n_splits(X, y) 371 | 3 372 | >>> print(mss) # doctest: +ELLIPSIS 373 | MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5, 374 | train_size=None) 375 | >>> for train_index, test_index in msss.split(X, y): 376 | ... print("TRAIN:", train_index, "TEST:", test_index) 377 | ... X_train, X_test = X[train_index], X[test_index] 378 | ... y_train, y_test = y[train_index], y[test_index] 379 | TRAIN: [1 2 5 7] TEST: [0 3 4 6] 380 | TRAIN: [2 3 6 7] TEST: [0 1 4 5] 381 | TRAIN: [1 2 5 6] TEST: [0 3 4 7] 382 | Notes 383 | ----- 384 | Train and test sizes may be slightly different from desired due to the 385 | preference of stratification over perfectly sized folds. 386 | """ 387 | def __init__(self, n_splits=10, test_size="default", train_size=None, random_state=None): 388 | super(MultilabelStratifiedShuffleSplit, self).__init__( 389 | n_splits=n_splits, 390 | test_size=test_size, 391 | train_size=train_size, 392 | random_state=random_state, 393 | ) 394 | 395 | def _iter_indices(self, X, y, groups=None): 396 | n_samples = _num_samples(X) 397 | y = check_array(y, ensure_2d=False, dtype=None) 398 | y = np.asarray(y, dtype=bool) 399 | type_of_target_y = type_of_target(y) 400 | 401 | if type_of_target_y != "multilabel-indicator": 402 | raise ValueError("Supported target type is: multilabel-indicator. Got {!r} instead.".format(type_of_target_y)) 403 | 404 | n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, self.train_size) 405 | 406 | n_samples = y.shape[0] 407 | rng = check_random_state(self.random_state) 408 | y_orig = y.copy() 409 | 410 | r = np.array([n_train, n_test]) / (n_train + n_test) 411 | 412 | for _ in range(self.n_splits): 413 | indices = np.arange(n_samples) 414 | rng.shuffle(indices) 415 | y = y_orig[indices] 416 | 417 | test_folds = IterativeStratification(labels=y, r=r, random_state=rng) 418 | 419 | test_idx = test_folds[np.argsort(indices)] == 1 420 | test = np.where(test_idx)[0] 421 | train = np.where(~test_idx)[0] 422 | 423 | yield train, test 424 | 425 | def split(self, X, y, groups=None): 426 | """Generate indices to split data into training and test set. 427 | Parameters 428 | ---------- 429 | X : array-like, shape (n_samples, n_features) 430 | Training data, where n_samples is the number of samples 431 | and n_features is the number of features. 432 | Note that providing ``y`` is sufficient to generate the splits and 433 | hence ``np.zeros(n_samples)`` may be used as a placeholder for 434 | ``X`` instead of actual training data. 435 | y : array-like, shape (n_samples, n_labels) 436 | The target variable for supervised learning problems. 437 | Multilabel stratification is done based on the y labels. 438 | groups : object 439 | Always ignored, exists for compatibility. 440 | Returns 441 | ------- 442 | train : ndarray 443 | The training set indices for that split. 444 | test : ndarray 445 | The testing set indices for that split. 446 | Notes 447 | ----- 448 | Randomized CV splitters may return different results for each call of 449 | split. You can make the results identical by setting ``random_state`` 450 | to an integer. 
451 | """ 452 | y = check_array(y, ensure_2d=False, dtype=None) 453 | return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups) 454 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/tab_network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Linear, BatchNorm1d, ReLU 3 | import numpy as np 4 | from . import sparsemax 5 | 6 | 7 | def initialize_non_glu(module, input_dim, output_dim): 8 | gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(4 * input_dim)) 9 | torch.nn.init.xavier_normal_(module.weight, gain=gain_value) 10 | # torch.nn.init.zeros_(module.bias) 11 | return 12 | 13 | 14 | def initialize_glu(module, input_dim, output_dim): 15 | gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(input_dim)) 16 | torch.nn.init.xavier_normal_(module.weight, gain=gain_value) 17 | # torch.nn.init.zeros_(module.bias) 18 | return 19 | 20 | 21 | class GBN(torch.nn.Module): 22 | """ 23 | Ghost Batch Normalization 24 | https://arxiv.org/abs/1705.08741 25 | """ 26 | def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01): 27 | super(GBN, self).__init__() 28 | 29 | self.input_dim = input_dim 30 | self.virtual_batch_size = virtual_batch_size 31 | self.bn = BatchNorm1d(self.input_dim, momentum=momentum) 32 | 33 | def forward(self, x): 34 | chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0) 35 | res = [self.bn(x_) for x_ in chunks] 36 | 37 | return torch.cat(res, dim=0) 38 | 39 | 40 | class TabNetNoEmbeddings(torch.nn.Module): 41 | def __init__(self, 42 | input_dim, 43 | output_dim, 44 | n_d=8, 45 | n_a=8, 46 | n_steps=3, 47 | gamma=1.3, 48 | n_independent=2, 49 | n_shared=2, 50 | epsilon=1e-15, 51 | virtual_batch_size=128, 52 | momentum=0.02, 53 | mask_type="sparsemax"): 54 | """ 55 | Defines main part of the TabNet network without the embedding layers. 56 | 57 | Parameters 58 | ---------- 59 | input_dim : int 60 | Number of features 61 | output_dim : int or list of int for multi task classification 62 | Dimension of network output 63 | examples : one for regression, 2 for binary classification etc... 
64 | n_d : int 65 | Dimension of the prediction layer (usually between 4 and 64) 66 | n_a : int 67 | Dimension of the attention layer (usually between 4 and 64) 68 | n_steps : int 69 | Number of sucessive steps in the newtork (usually betwenn 3 and 10) 70 | gamma : float 71 | Float above 1, scaling factor for attention updates (usually betwenn 1.0 to 2.0) 72 | n_independent : int 73 | Number of independent GLU layer in each GLU block (default 2) 74 | n_shared : int 75 | Number of independent GLU layer in each GLU block (default 2) 76 | epsilon : float 77 | Avoid log(0), this should be kept very low 78 | virtual_batch_size : int 79 | Batch size for Ghost Batch Normalization 80 | momentum : float 81 | Float value between 0 and 1 which will be used for momentum in all batch norm 82 | mask_type : str 83 | Either "sparsemax" or "entmax" : this is the masking function to use 84 | """ 85 | super(TabNetNoEmbeddings, self).__init__() 86 | self.input_dim = input_dim 87 | self.output_dim = output_dim 88 | self.is_multi_task = isinstance(output_dim, list) 89 | self.n_d = n_d 90 | self.n_a = n_a 91 | self.n_steps = n_steps 92 | self.gamma = gamma 93 | self.epsilon = epsilon 94 | self.n_independent = n_independent 95 | self.n_shared = n_shared 96 | self.virtual_batch_size = virtual_batch_size 97 | self.mask_type = mask_type 98 | self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01) 99 | 100 | if self.n_shared > 0: 101 | shared_feat_transform = torch.nn.ModuleList() 102 | for i in range(self.n_shared): 103 | if i == 0: 104 | shared_feat_transform.append(Linear(self.input_dim, 2 * (n_d + n_a), bias=False)) 105 | else: 106 | shared_feat_transform.append(Linear(n_d + n_a, 2 * (n_d + n_a), bias=False)) 107 | 108 | else: 109 | shared_feat_transform = None 110 | 111 | self.initial_splitter = FeatTransformer(self.input_dim, 112 | n_d + n_a, 113 | shared_feat_transform, 114 | n_glu_independent=self.n_independent, 115 | virtual_batch_size=self.virtual_batch_size, 116 | momentum=momentum) 117 | 118 | self.feat_transformers = torch.nn.ModuleList() 119 | self.att_transformers = torch.nn.ModuleList() 120 | 121 | for step in range(n_steps): 122 | transformer = FeatTransformer(self.input_dim, 123 | n_d + n_a, 124 | shared_feat_transform, 125 | n_glu_independent=self.n_independent, 126 | virtual_batch_size=self.virtual_batch_size, 127 | momentum=momentum) 128 | attention = AttentiveTransformer(n_a, self.input_dim, virtual_batch_size=self.virtual_batch_size, momentum=momentum, mask_type=self.mask_type) 129 | self.feat_transformers.append(transformer) 130 | self.att_transformers.append(attention) 131 | 132 | if self.is_multi_task: 133 | self.multi_task_mappings = torch.nn.ModuleList() 134 | for task_dim in output_dim: 135 | task_mapping = Linear(n_d, task_dim, bias=False) 136 | initialize_non_glu(task_mapping, n_d, task_dim) 137 | self.multi_task_mappings.append(task_mapping) 138 | else: 139 | self.final_mapping = Linear(n_d, output_dim, bias=False) 140 | initialize_non_glu(self.final_mapping, n_d, output_dim) 141 | 142 | def forward(self, x): 143 | res = 0 144 | x = self.initial_bn(x) 145 | 146 | prior = torch.ones(x.shape).to(x.device) 147 | M_loss = 0 148 | att = self.initial_splitter(x)[:, self.n_d:] 149 | 150 | for step in range(self.n_steps): 151 | M = self.att_transformers[step](prior, att) 152 | M_loss += torch.mean(torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1)) 153 | # update prior 154 | prior = torch.mul(self.gamma - M, prior) 155 | # output 156 | masked_x = torch.mul(M, x) 157 | out = 
self.feat_transformers[step](masked_x) 158 | d = ReLU()(out[:, :self.n_d]) 159 | res = torch.add(res, d) 160 | # update attention 161 | att = out[:, self.n_d:] 162 | 163 | M_loss /= self.n_steps 164 | 165 | if self.is_multi_task: 166 | # Result will be in list format 167 | out = [] 168 | for task_mapping in self.multi_task_mappings: 169 | out.append(task_mapping(res)) 170 | else: 171 | out = self.final_mapping(res) 172 | return out, M_loss 173 | 174 | def forward_masks(self, x): 175 | x = self.initial_bn(x) 176 | 177 | prior = torch.ones(x.shape).to(x.device) 178 | M_explain = torch.zeros(x.shape).to(x.device) 179 | att = self.initial_splitter(x)[:, self.n_d:] 180 | masks = {} 181 | 182 | for step in range(self.n_steps): 183 | M = self.att_transformers[step](prior, att) 184 | masks[step] = M 185 | # update prior 186 | prior = torch.mul(self.gamma - M, prior) 187 | # output 188 | masked_x = torch.mul(M, x) 189 | out = self.feat_transformers[step](masked_x) 190 | d = ReLU()(out[:, :self.n_d]) 191 | # explain 192 | step_importance = torch.sum(d, dim=1) 193 | M_explain += torch.mul(M, step_importance.unsqueeze(dim=1)) 194 | # update attention 195 | att = out[:, self.n_d:] 196 | 197 | return M_explain, masks 198 | 199 | 200 | class TabNet(torch.nn.Module): 201 | def __init__(self, 202 | input_dim, 203 | output_dim, 204 | n_d=8, 205 | n_a=8, 206 | n_steps=3, 207 | gamma=1.3, 208 | cat_idxs=[], 209 | cat_dims=[], 210 | cat_emb_dim=1, 211 | n_independent=2, 212 | n_shared=2, 213 | epsilon=1e-15, 214 | virtual_batch_size=128, 215 | momentum=0.02, 216 | device_name='auto', 217 | mask_type="sparsemax"): 218 | """ 219 | Defines TabNet network 220 | 221 | Parameters 222 | ---------- 223 | input_dim : int 224 | Initial number of features 225 | output_dim : int 226 | Dimension of network output 227 | examples : one for regression, 2 for binary classification etc... 
228 | n_d : int 229 | Dimension of the prediction layer (usually between 4 and 64) 230 | n_a : int 231 | Dimension of the attention layer (usually between 4 and 64) 232 | n_steps : int 233 | Number of sucessive steps in the newtork (usually betwenn 3 and 10) 234 | gamma : float 235 | Float above 1, scaling factor for attention updates (usually betwenn 1.0 to 2.0) 236 | cat_idxs : list of int 237 | Index of each categorical column in the dataset 238 | cat_dims : list of int 239 | Number of categories in each categorical column 240 | cat_emb_dim : int or list of int 241 | Size of the embedding of categorical features 242 | if int, all categorical features will have same embedding size 243 | if list of int, every corresponding feature will have specific size 244 | n_independent : int 245 | Number of independent GLU layer in each GLU block (default 2) 246 | n_shared : int 247 | Number of independent GLU layer in each GLU block (default 2) 248 | epsilon : float 249 | Avoid log(0), this should be kept very low 250 | virtual_batch_size : int 251 | Batch size for Ghost Batch Normalization 252 | momentum : float 253 | Float value between 0 and 1 which will be used for momentum in all batch norm 254 | device_name : {'auto', 'cuda', 'cpu'} 255 | mask_type : str 256 | Either "sparsemax" or "entmax" : this is the masking function to use 257 | """ 258 | super(TabNet, self).__init__() 259 | self.cat_idxs = cat_idxs or [] 260 | self.cat_dims = cat_dims or [] 261 | self.cat_emb_dim = cat_emb_dim 262 | 263 | self.input_dim = input_dim 264 | self.output_dim = output_dim 265 | self.n_d = n_d 266 | self.n_a = n_a 267 | self.n_steps = n_steps 268 | self.gamma = gamma 269 | self.epsilon = epsilon 270 | self.n_independent = n_independent 271 | self.n_shared = n_shared 272 | self.mask_type = mask_type 273 | 274 | if self.n_steps <= 0: 275 | raise ValueError("n_steps should be a positive integer.") 276 | if self.n_independent == 0 and self.n_shared == 0: 277 | raise ValueError("n_shared and n_independant can't be both zero.") 278 | 279 | self.virtual_batch_size = virtual_batch_size 280 | self.embedder = EmbeddingGenerator(input_dim, cat_dims, cat_idxs, cat_emb_dim) 281 | self.post_embed_dim = self.embedder.post_embed_dim 282 | self.tabnet = TabNetNoEmbeddings(self.post_embed_dim, output_dim, n_d, n_a, n_steps, gamma, n_independent, n_shared, epsilon, virtual_batch_size, 283 | momentum, mask_type) 284 | 285 | # Defining device 286 | if device_name == 'auto': 287 | if torch.cuda.is_available(): 288 | device_name = 'cuda' 289 | else: 290 | device_name = 'cpu' 291 | self.device = torch.device(device_name) 292 | self.to(self.device) 293 | 294 | def forward(self, x): 295 | x = self.embedder(x) 296 | return self.tabnet(x) 297 | 298 | def forward_masks(self, x): 299 | x = self.embedder(x) 300 | return self.tabnet.forward_masks(x) 301 | 302 | 303 | class AttentiveTransformer(torch.nn.Module): 304 | def __init__(self, input_dim, output_dim, virtual_batch_size=128, momentum=0.02, mask_type="sparsemax"): 305 | """ 306 | Initialize an attention transformer. 
307 |
308 |         Parameters
309 |         ----------
310 |         input_dim : int
311 |             Input size
312 |         output_dim : int
313 |             Output size
314 |         virtual_batch_size : int
315 |             Batch size for Ghost Batch Normalization
316 |         momentum : float
317 |             Float value between 0 and 1 which will be used for momentum in batch norm
318 |         mask_type : str
319 |             Either "sparsemax" or "entmax" : this is the masking function to use
320 |         """
321 |         super(AttentiveTransformer, self).__init__()
322 |         self.fc = Linear(input_dim, output_dim, bias=False)
323 |         initialize_non_glu(self.fc, input_dim, output_dim)
324 |         self.bn = GBN(output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum)
325 |
326 |         if mask_type == "sparsemax":
327 |             # Sparsemax
328 |             self.selector = sparsemax.Sparsemax(dim=-1)
329 |         elif mask_type == "entmax":
330 |             # Entmax
331 |             self.selector = sparsemax.Entmax15(dim=-1)
332 |         else:
333 |             raise NotImplementedError("Please choose either sparsemax or entmax as masktype")
334 |
335 |     def forward(self, priors, processed_feat):
336 |         x = self.fc(processed_feat)
337 |         x = self.bn(x)
338 |         x = torch.mul(x, priors)
339 |         x = self.selector(x)
340 |         return x
341 |
342 |
343 | class FeatTransformer(torch.nn.Module):
344 |     def __init__(self, input_dim, output_dim, shared_layers, n_glu_independent, virtual_batch_size=128, momentum=0.02):
345 |         super(FeatTransformer, self).__init__()
346 |         """
347 |         Initialize a feature transformer.
348 |
349 |         Parameters
350 |         ----------
351 |         input_dim : int
352 |             Input size
353 |         output_dim : int
354 |             Output size
355 |         shared_layers : torch.nn.ModuleList
356 |             The shared block that should be common to every step
357 |         n_glu_independent : int
358 |             Number of independent GLU layers
359 |         virtual_batch_size : int
360 |             Batch size for Ghost Batch Normalization within GLU block(s)
361 |         momentum : float
362 |             Float value between 0 and 1 which will be used for momentum in batch norm
363 |         """
364 |
365 |         params = {'n_glu': n_glu_independent, 'virtual_batch_size': virtual_batch_size, 'momentum': momentum}
366 |
367 |         if shared_layers is None:
368 |             # no shared layers
369 |             self.shared = torch.nn.Identity()
370 |             is_first = True
371 |         else:
372 |             self.shared = GLU_Block(input_dim,
373 |                                     output_dim,
374 |                                     first=True,
375 |                                     shared_layers=shared_layers,
376 |                                     n_glu=len(shared_layers),
377 |                                     virtual_batch_size=virtual_batch_size,
378 |                                     momentum=momentum)
379 |             is_first = False
380 |
381 |         if n_glu_independent == 0:
382 |             # no independent layers
383 |             self.specifics = torch.nn.Identity()
384 |         else:
385 |             spec_input_dim = input_dim if is_first else output_dim
386 |             self.specifics = GLU_Block(spec_input_dim, output_dim, first=is_first, **params)
387 |
388 |     def forward(self, x):
389 |         x = self.shared(x)
390 |         x = self.specifics(x)
391 |         return x
392 |
393 |
394 | class GLU_Block(torch.nn.Module):
395 |     """
396 |     Independent GLU block, specific to each step
397 |     """
398 |     def __init__(self, input_dim, output_dim, n_glu=2, first=False, shared_layers=None, virtual_batch_size=128, momentum=0.02):
399 |         super(GLU_Block, self).__init__()
400 |         self.first = first
401 |         self.shared_layers = shared_layers
402 |         self.n_glu = n_glu
403 |         self.glu_layers = torch.nn.ModuleList()
404 |
405 |         params = {'virtual_batch_size': virtual_batch_size, 'momentum': momentum}
406 |
407 |         fc = shared_layers[0] if shared_layers else None
408 |         self.glu_layers.append(GLU_Layer(input_dim, output_dim, fc=fc, **params))
409 |         for glu_id in range(1, self.n_glu):
410 |             fc = shared_layers[glu_id] if shared_layers else None
411 |             self.glu_layers.append(GLU_Layer(output_dim, output_dim, fc=fc, **params))
412 |
413 |     def forward(self, x):
414 |         scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device))  # keeps the variance of the residual sum stable
415 |         if self.first:  # the first layer of the block has no scale multiplication
416 |             x = self.glu_layers[0](x)
417 |             layers_left = range(1, self.n_glu)
418 |         else:
419 |             layers_left = range(self.n_glu)
420 |
421 |         for glu_id in layers_left:
422 |             x = torch.add(x, self.glu_layers[glu_id](x))
423 |             x = x * scale
424 |         return x
425 |
426 |
427 | class GLU_Layer(torch.nn.Module):
428 |     def __init__(self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02):
429 |         super(GLU_Layer, self).__init__()
430 |
431 |         self.output_dim = output_dim
432 |         if fc:
433 |             self.fc = fc  # reuse the shared Linear layer when one is provided
434 |         else:
435 |             self.fc = Linear(input_dim, 2 * output_dim, bias=False)
436 |         initialize_glu(self.fc, input_dim, 2 * output_dim)
437 |
438 |         self.bn = GBN(2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum)
439 |
440 |     def forward(self, x):
441 |         x = self.fc(x)
442 |         x = self.bn(x)
443 |         out = torch.mul(x[:, :self.output_dim], torch.sigmoid(x[:, self.output_dim:]))  # GLU: value * sigmoid gate
444 |         return out
445 |
446 |
447 | class EmbeddingGenerator(torch.nn.Module):
448 |     """
449 |     Classical embeddings generator
450 |     """
451 |     def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dim):
452 |         """ This is an embedding module for an entire set of features
453 |
454 |         Parameters
455 |         ----------
456 |         input_dim : int
457 |             Number of features coming as input (number of columns)
458 |         cat_dims : list of int
459 |             Number of modalities for each categorical feature
460 |             If the list is empty, no embeddings will be done
461 |         cat_idxs : list of int
462 |             Positional index of each categorical feature in the inputs
463 |         cat_emb_dim : int or list of int
464 |             Embedding dimension for each categorical feature
465 |             If int, the same embedding dimension will be used for all categorical features
466 |         """
467 |         super(EmbeddingGenerator, self).__init__()
468 |         if cat_dims == [] or cat_idxs == []:
469 |             self.skip_embedding = True
470 |             self.post_embed_dim = input_dim
471 |             return
472 |
473 |         self.skip_embedding = False
474 |         if isinstance(cat_emb_dim, int):
475 |             self.cat_emb_dims = [cat_emb_dim] * len(cat_idxs)
476 |         else:
477 |             self.cat_emb_dims = cat_emb_dim
478 |
479 |         # check that all embeddings are provided
480 |         if len(self.cat_emb_dims) != len(cat_dims):
481 |             msg = f"""cat_emb_dim and cat_dims must be lists of same length, got {len(self.cat_emb_dims)}
482 |                       and {len(cat_dims)}"""
483 |             raise ValueError(msg)
484 |         self.post_embed_dim = int(input_dim + np.sum(self.cat_emb_dims) - len(self.cat_emb_dims))
485 |
486 |         self.embeddings = torch.nn.ModuleList()
487 |
488 |         # Sort dims by cat_idx
489 |         sorted_idxs = np.argsort(cat_idxs)
490 |         cat_dims = [cat_dims[i] for i in sorted_idxs]
491 |         self.cat_emb_dims = [self.cat_emb_dims[i] for i in sorted_idxs]
492 |
493 |         for cat_dim, emb_dim in zip(cat_dims, self.cat_emb_dims):
494 |             self.embeddings.append(torch.nn.Embedding(cat_dim, emb_dim))
495 |
496 |         # record continuous indices
497 |         self.continuous_idx = torch.ones(input_dim, dtype=torch.bool)
498 |         self.continuous_idx[cat_idxs] = 0
499 |
500 |     def forward(self, x):
501 |         """
502 |         Apply embeddings to inputs
503 |         Inputs should be (batch_size, input_dim)
504 |         Outputs will be of size (batch_size, self.post_embed_dim)
505 |         """
506 |         if self.skip_embedding:
507 |             # no embeddings required
508 |             return x
509 |
510 |         cols = []
511 |         cat_feat_counter = 0
512 | for feat_init_idx, is_continuous in enumerate(self.continuous_idx): 513 | # Enumerate through continuous idx boolean mask to apply embeddings 514 | if is_continuous: 515 | cols.append(x[:, feat_init_idx].float().view(-1, 1)) 516 | else: 517 | cols.append(self.embeddings[cat_feat_counter](x[:, feat_init_idx].long())) 518 | cat_feat_counter += 1 519 | # concat 520 | post_embeddings = torch.cat(cols, dim=1) 521 | return post_embeddings 522 | -------------------------------------------------------------------------------- /src/models/pytorch_tabnet/abstract_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Any, Dict 3 | import torch 4 | from torch.nn.utils import clip_grad_norm_ 5 | import numpy as np 6 | from scipy.sparse import csc_matrix 7 | from abc import abstractmethod 8 | from . import tab_network 9 | from .utils import ( 10 | PredictDataset, 11 | create_explain_matrix, 12 | validate_eval_set, 13 | create_dataloaders, 14 | define_device, 15 | ) 16 | from .callbacks import ( 17 | CallbackContainer, 18 | History, 19 | EarlyStopping, 20 | LRSchedulerCallback, 21 | ) 22 | from .metrics import MetricContainer, check_metrics 23 | from sklearn.base import BaseEstimator 24 | from sklearn.utils import check_array 25 | from torch.utils.data import DataLoader 26 | import io 27 | import json 28 | from pathlib import Path 29 | import shutil 30 | import zipfile 31 | 32 | 33 | @dataclass 34 | class TabModel(BaseEstimator): 35 | """ Class for TabNet model.""" 36 | 37 | n_d: int = 8 38 | n_a: int = 8 39 | n_steps: int = 3 40 | gamma: float = 1.3 41 | cat_idxs: List[int] = field(default_factory=list) 42 | cat_dims: List[int] = field(default_factory=list) 43 | cat_emb_dim: int = 1 44 | n_independent: int = 2 45 | n_shared: int = 2 46 | epsilon: float = 1e-15 47 | momentum: float = 0.02 48 | lambda_sparse: float = 1e-3 49 | seed: int = 0 50 | clip_value: int = 1 51 | verbose: int = 1 52 | optimizer_fn: Any = torch.optim.Adam 53 | optimizer_params: Dict = field(default_factory=lambda: dict(lr=2e-2)) 54 | scheduler_fn: Any = None 55 | scheduler_params: Dict = field(default_factory=dict) 56 | mask_type: str = "sparsemax" 57 | input_dim: int = None 58 | output_dim: int = None 59 | device_name: str = "auto" 60 | 61 | def __post_init__(self): 62 | self.batch_size = 1024 63 | self.virtual_batch_size = 1024 64 | torch.manual_seed(self.seed) 65 | # Defining device 66 | self.device = torch.device(define_device(self.device_name)) 67 | print(f"Device used : {self.device}") 68 | 69 | def fit(self, 70 | X_train, 71 | y_train, 72 | eval_set=None, 73 | eval_name=None, 74 | eval_metric=None, 75 | loss_fn=None, 76 | weights=0, 77 | max_epochs=100, 78 | patience=10, 79 | batch_size=1024, 80 | virtual_batch_size=128, 81 | num_workers=0, 82 | drop_last=False, 83 | callbacks=None, 84 | pin_memory=True): 85 | """Train a neural network stored in self.network 86 | Using train_dataloader for training data and 87 | valid_dataloader for validation. 88 | 89 | Parameters 90 | ---------- 91 | X_train : np.ndarray 92 | Train set 93 | y_train : np.array 94 | Train targets 95 | eval_set : list of tuple 96 | List of eval tuple set (X, y). 97 | The last one is used for early stopping 98 | eval_name : list of str 99 | List of eval set names. 100 | eval_metric : list of str 101 | List of evaluation metrics. 102 | The last metric is used for early stopping. 
103 | loss_fn : callable or None 104 | a PyTorch loss function 105 | weights : bool or dictionnary 106 | 0 for no balancing 107 | 1 for automated balancing 108 | dict for custom weights per class 109 | max_epochs : int 110 | Maximum number of epochs during training 111 | patience : int 112 | Number of consecutive non improving epoch before early stopping 113 | batch_size : int 114 | Training batch size 115 | virtual_batch_size : int 116 | Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size) 117 | num_workers : int 118 | Number of workers used in torch.utils.data.DataLoader 119 | drop_last : bool 120 | Whether to drop last batch during training 121 | callbacks : list of callback function 122 | List of custom callbacks 123 | pin_memory: bool 124 | Whether to set pin_memory to True or False during training 125 | """ 126 | # update model name 127 | 128 | self.max_epochs = max_epochs 129 | self.patience = patience 130 | self.batch_size = batch_size 131 | self.virtual_batch_size = virtual_batch_size 132 | self.num_workers = num_workers 133 | self.drop_last = drop_last 134 | self.input_dim = X_train.shape[1] 135 | self._stop_training = False 136 | self.pin_memory = pin_memory and (self.device.type != 'cpu') 137 | 138 | eval_set = eval_set if eval_set else [] 139 | 140 | if loss_fn is None: 141 | self.loss_fn = self._default_loss 142 | else: 143 | self.loss_fn = loss_fn 144 | 145 | check_array(X_train) 146 | 147 | self.update_fit_params( 148 | X_train, 149 | y_train, 150 | eval_set, 151 | weights, 152 | ) 153 | 154 | # Validate and reformat eval set depending on training data 155 | eval_names, eval_set = validate_eval_set(eval_set, eval_name, X_train, y_train) 156 | 157 | train_dataloader, valid_dataloaders = self._construct_loaders(X_train, y_train, eval_set) 158 | 159 | self._set_network() 160 | self._set_metrics(eval_metric, eval_names) 161 | self._set_optimizer() 162 | self._set_callbacks(callbacks) 163 | 164 | # Call method on_train_begin for all callbacks 165 | self._callback_container.on_train_begin() 166 | 167 | # Training loop over epochs 168 | for epoch_idx in range(self.max_epochs): 169 | 170 | # Call method on_epoch_begin for all callbacks 171 | self._callback_container.on_epoch_begin(epoch_idx) 172 | 173 | self._train_epoch(train_dataloader) 174 | 175 | # Apply predict epoch to all eval sets 176 | for eval_name, valid_dataloader in zip(eval_names, valid_dataloaders): 177 | self._predict_epoch(eval_name, valid_dataloader) 178 | 179 | # Call method on_epoch_end for all callbacks 180 | self._callback_container.on_epoch_end(epoch_idx, logs=self.history.epoch_metrics) 181 | 182 | if self._stop_training: 183 | break 184 | 185 | # Call method on_train_end for all callbacks 186 | self._callback_container.on_train_end() 187 | self.network.eval() 188 | 189 | # compute feature importance once the best model is defined 190 | self._compute_feature_importances(train_dataloader) 191 | 192 | def predict(self, X): 193 | """ 194 | Make predictions on a batch (valid) 195 | 196 | Parameters 197 | ---------- 198 | X : a :tensor: `torch.Tensor` 199 | Input data 200 | 201 | Returns 202 | ------- 203 | predictions : np.array 204 | Predictions of the regression problem 205 | """ 206 | self.network.eval() 207 | dataloader = DataLoader( 208 | PredictDataset(X), 209 | batch_size=self.batch_size, 210 | shuffle=False, 211 | ) 212 | 213 | results = [] 214 | for batch_nb, data in enumerate(dataloader): 215 | data = data.to(self.device).float() 216 | output, M_loss = self.network(data) 217 | 
predictions = output.cpu().detach().numpy() 218 | results.append(predictions) 219 | res = np.vstack(results) 220 | return self.predict_func(res) 221 | 222 | def explain(self, X): 223 | """ 224 | Return local explanation 225 | 226 | Parameters 227 | ---------- 228 | X : tensor: `torch.Tensor` 229 | Input data 230 | 231 | Returns 232 | ------- 233 | M_explain : matrix 234 | Importance per sample, per columns. 235 | masks : matrix 236 | Sparse matrix showing attention masks used by network. 237 | """ 238 | self.network.eval() 239 | 240 | dataloader = DataLoader( 241 | PredictDataset(X), 242 | batch_size=self.batch_size, 243 | shuffle=False, 244 | ) 245 | 246 | res_explain = [] 247 | 248 | for batch_nb, data in enumerate(dataloader): 249 | data = data.to(self.device).float() 250 | 251 | M_explain, masks = self.network.forward_masks(data) 252 | for key, value in masks.items(): 253 | masks[key] = csc_matrix.dot(value.cpu().detach().numpy(), self.reducing_matrix) 254 | 255 | res_explain.append(csc_matrix.dot(M_explain.cpu().detach().numpy(), self.reducing_matrix)) 256 | 257 | if batch_nb == 0: 258 | res_masks = masks 259 | else: 260 | for key, value in masks.items(): 261 | res_masks[key] = np.vstack([res_masks[key], value]) 262 | 263 | res_explain = np.vstack(res_explain) 264 | 265 | return res_explain, res_masks 266 | 267 | def save_model(self, path): 268 | """Saving TabNet model in two distinct files. 269 | 270 | Parameters 271 | ---------- 272 | path : str 273 | Path of the model. 274 | 275 | Returns 276 | ------- 277 | str 278 | input filepath with ".zip" appended 279 | 280 | """ 281 | saved_params = {} 282 | for key, val in self.get_params().items(): 283 | if isinstance(val, type): 284 | # Don't save torch specific params 285 | continue 286 | else: 287 | saved_params[key] = val 288 | 289 | # Create folder 290 | Path(path).mkdir(parents=True, exist_ok=True) 291 | 292 | # Save models params 293 | with open(Path(path).joinpath("model_params.json"), "w", encoding="utf8") as f: 294 | json.dump(saved_params, f) 295 | 296 | # Save state_dict 297 | torch.save(self.network.state_dict(), Path(path).joinpath("network.pt")) 298 | shutil.make_archive(path, "zip", path) 299 | shutil.rmtree(path) 300 | print(f"Successfully saved model at {path}.zip") 301 | return f"{path}.zip" 302 | 303 | def load_model(self, filepath): 304 | """Load TabNet model. 305 | 306 | Parameters 307 | ---------- 308 | filepath : str 309 | Path of the model. 
310 | """ 311 | try: 312 | with zipfile.ZipFile(filepath) as z: 313 | with z.open("model_params.json") as f: 314 | loaded_params = json.load(f) 315 | with z.open("network.pt") as f: 316 | try: 317 | saved_state_dict = torch.load(f, map_location=self.device) 318 | except io.UnsupportedOperation: 319 | # In Python <3.7, the returned file object is not seekable (which at least 320 | # some versions of PyTorch require) - so we'll try buffering it in to a 321 | # BytesIO instead: 322 | saved_state_dict = torch.load( 323 | io.BytesIO(f.read()), 324 | map_location=self.device, 325 | ) 326 | except KeyError: 327 | raise KeyError("Your zip file is missing at least one component") 328 | 329 | self.__init__(**loaded_params) 330 | 331 | self._set_network() 332 | self.network.load_state_dict(saved_state_dict) 333 | self.network.eval() 334 | return 335 | 336 | def _train_epoch(self, train_loader): 337 | """ 338 | Trains one epoch of the network in self.network 339 | 340 | Parameters 341 | ---------- 342 | train_loader : a :class: `torch.utils.data.Dataloader` 343 | DataLoader with train set 344 | """ 345 | self.network.train() 346 | 347 | for batch_idx, (X, y) in enumerate(train_loader): 348 | self._callback_container.on_batch_begin(batch_idx) 349 | 350 | batch_logs = self._train_batch(X, y) 351 | 352 | self._callback_container.on_batch_end(batch_idx, batch_logs) 353 | 354 | epoch_logs = {"lr": self._optimizer.param_groups[-1]["lr"]} 355 | self.history.epoch_metrics.update(epoch_logs) 356 | 357 | return 358 | 359 | def _train_batch(self, X, y): 360 | """ 361 | Trains one batch of data 362 | 363 | Parameters 364 | ---------- 365 | X : torch.Tensor 366 | Train matrix 367 | y : torch.Tensor 368 | Target matrix 369 | 370 | Returns 371 | ------- 372 | batch_outs : dict 373 | Dictionnary with "y": target and "score": prediction scores. 374 | batch_logs : dict 375 | Dictionnary with "batch_size" and "loss". 376 | """ 377 | batch_logs = {"batch_size": X.shape[0]} 378 | 379 | X = X.to(self.device).float() 380 | y = y.to(self.device).float() 381 | 382 | for param in self.network.parameters(): 383 | param.grad = None 384 | 385 | output, M_loss = self.network(X) 386 | 387 | loss = self.compute_loss(output, y) 388 | # Add the overall sparsity loss 389 | loss -= self.lambda_sparse * M_loss 390 | 391 | # Perform backward pass and optimization 392 | loss.backward() 393 | if self.clip_value: 394 | clip_grad_norm_(self.network.parameters(), self.clip_value) 395 | self._optimizer.step() 396 | 397 | batch_logs["loss"] = loss.cpu().detach().numpy().item() 398 | 399 | return batch_logs 400 | 401 | def _predict_epoch(self, name, loader): 402 | """ 403 | Predict an epoch and update metrics. 404 | 405 | Parameters 406 | ---------- 407 | name : str 408 | Name of the validation set 409 | loader : torch.utils.data.Dataloader 410 | DataLoader with validation set 411 | """ 412 | # Setting network on evaluation mode (no dropout etc...) 413 | self.network.eval() 414 | 415 | list_y_true = [] 416 | list_y_score = [] 417 | 418 | # Main loop 419 | for batch_idx, (X, y) in enumerate(loader): 420 | scores = self._predict_batch(X) 421 | list_y_true.append(y) 422 | list_y_score.append(scores) 423 | 424 | y_true, scores = self.stack_batches(list_y_true, list_y_score) 425 | 426 | metrics_logs = self._metric_container_dict[name](y_true, scores) 427 | self.network.train() 428 | self.history.epoch_metrics.update(metrics_logs) 429 | return 430 | 431 | def _predict_batch(self, X): 432 | """ 433 | Predict one batch of data. 
434 | 435 | Parameters 436 | ---------- 437 | X : torch.Tensor 438 | Owned products 439 | 440 | Returns 441 | ------- 442 | np.array 443 | model scores 444 | """ 445 | X = X.to(self.device).float() 446 | 447 | # compute model output 448 | scores, _ = self.network(X) 449 | 450 | if isinstance(scores, list): 451 | scores = [x.cpu().detach().numpy() for x in scores] 452 | else: 453 | scores = scores.cpu().detach().numpy() 454 | 455 | return scores 456 | 457 | def _set_network(self): 458 | """Setup the network and explain matrix.""" 459 | self.network = tab_network.TabNet( 460 | self.input_dim, 461 | self.output_dim, 462 | n_d=self.n_d, 463 | n_a=self.n_a, 464 | n_steps=self.n_steps, 465 | gamma=self.gamma, 466 | cat_idxs=self.cat_idxs, 467 | cat_dims=self.cat_dims, 468 | cat_emb_dim=self.cat_emb_dim, 469 | n_independent=self.n_independent, 470 | n_shared=self.n_shared, 471 | epsilon=self.epsilon, 472 | virtual_batch_size=self.virtual_batch_size, 473 | momentum=self.momentum, 474 | device_name=self.device_name, 475 | mask_type=self.mask_type, 476 | ).to(self.device) 477 | 478 | self.reducing_matrix = create_explain_matrix( 479 | self.network.input_dim, 480 | self.network.cat_emb_dim, 481 | self.network.cat_idxs, 482 | self.network.post_embed_dim, 483 | ) 484 | 485 | def _set_metrics(self, metrics, eval_names): 486 | """Set attributes relative to the metrics. 487 | 488 | Parameters 489 | ---------- 490 | metrics : list of str 491 | List of eval metric names. 492 | eval_names : list of str 493 | List of eval set names. 494 | 495 | """ 496 | metrics = metrics or [self._default_metric] 497 | 498 | metrics = check_metrics(metrics) 499 | # Set metric container for each sets 500 | self._metric_container_dict = {} 501 | for name in eval_names: 502 | self._metric_container_dict.update({name: MetricContainer(metrics, prefix=f"{name}_")}) 503 | 504 | self._metrics = [] 505 | self._metrics_names = [] 506 | for _, metric_container in self._metric_container_dict.items(): 507 | self._metrics.extend(metric_container.metrics) 508 | self._metrics_names.extend(metric_container.names) 509 | 510 | # Early stopping metric is the last eval metric 511 | self.early_stopping_metric = (self._metrics_names[-1] if len(self._metrics_names) > 0 else None) 512 | 513 | def _set_callbacks(self, custom_callbacks): 514 | """Setup the callbacks functions. 515 | 516 | Parameters 517 | ---------- 518 | custom_callbacks : list of func 519 | List of callback functions. 
520 | 521 | """ 522 | # Setup default callbacks history, early stopping and scheduler 523 | callbacks = [] 524 | self.history = History(self, verbose=self.verbose) 525 | callbacks.append(self.history) 526 | if (self.early_stopping_metric is not None) and (self.patience > 0): 527 | early_stopping = EarlyStopping( 528 | early_stopping_metric=self.early_stopping_metric, 529 | is_maximize=(self._metrics[-1]._maximize if len(self._metrics) > 0 else None), 530 | patience=self.patience, 531 | ) 532 | callbacks.append(early_stopping) 533 | else: 534 | print("No early stopping will be performed, last training weights will be used.") 535 | if self.scheduler_fn is not None: 536 | # Add LR Scheduler call_back 537 | is_batch_level = self.scheduler_params.pop("is_batch_level", False) 538 | scheduler = LRSchedulerCallback( 539 | scheduler_fn=self.scheduler_fn, 540 | scheduler_params=self.scheduler_params, 541 | optimizer=self._optimizer, 542 | early_stopping_metric=self.early_stopping_metric, 543 | is_batch_level=is_batch_level, 544 | ) 545 | callbacks.append(scheduler) 546 | 547 | if custom_callbacks: 548 | callbacks.extend(custom_callbacks) 549 | self._callback_container = CallbackContainer(callbacks) 550 | self._callback_container.set_trainer(self) 551 | 552 | def _set_optimizer(self): 553 | """Setup optimizer.""" 554 | self._optimizer = self.optimizer_fn(self.network.parameters(), **self.optimizer_params) 555 | 556 | def _construct_loaders(self, X_train, y_train, eval_set): 557 | """Generate dataloaders for train and eval set. 558 | 559 | Parameters 560 | ---------- 561 | X_train : np.array 562 | Train set. 563 | y_train : np.array 564 | Train targets. 565 | eval_set : list of tuple 566 | List of eval tuple set (X, y). 567 | 568 | Returns 569 | ------- 570 | train_dataloader : `torch.utils.data.Dataloader` 571 | Training dataloader. 572 | valid_dataloaders : list of `torch.utils.data.Dataloader` 573 | List of validation dataloaders. 574 | 575 | """ 576 | # all weights are not allowed for this type of model 577 | y_train_mapped = self.prepare_target(y_train) 578 | for i, (X, y) in enumerate(eval_set): 579 | y_mapped = self.prepare_target(y) 580 | eval_set[i] = (X, y_mapped) 581 | 582 | train_dataloader, valid_dataloaders = create_dataloaders( 583 | X_train, 584 | y_train_mapped, 585 | eval_set, 586 | self.updated_weights, 587 | self.batch_size, 588 | self.num_workers, 589 | self.drop_last, 590 | self.pin_memory, 591 | ) 592 | return train_dataloader, valid_dataloaders 593 | 594 | def _compute_feature_importances(self, loader): 595 | """Compute global feature importance. 596 | 597 | Parameters 598 | ---------- 599 | loader : `torch.utils.data.Dataloader` 600 | Pytorch dataloader. 601 | 602 | """ 603 | self.network.eval() 604 | feature_importances_ = np.zeros((self.network.post_embed_dim)) 605 | for data, targets in loader: 606 | data = data.to(self.device).float() 607 | M_explain, masks = self.network.forward_masks(data) 608 | feature_importances_ += M_explain.sum(dim=0).cpu().detach().numpy() 609 | 610 | feature_importances_ = csc_matrix.dot(feature_importances_, self.reducing_matrix) 611 | self.feature_importances_ = feature_importances_ / np.sum(feature_importances_) 612 | 613 | @abstractmethod 614 | def update_fit_params(self, X_train, y_train, eval_set, weights): 615 | """ 616 | Set attributes relative to fit function. 
617 | 618 | Parameters 619 | ---------- 620 | X_train : np.ndarray 621 | Train set 622 | y_train : np.array 623 | Train targets 624 | eval_set : list of tuple 625 | List of eval tuple set (X, y). 626 | weights : bool or dictionnary 627 | 0 for no balancing 628 | 1 for automated balancing 629 | """ 630 | raise NotImplementedError("users must define update_fit_params to use this base class") 631 | 632 | @abstractmethod 633 | def compute_loss(self, y_score, y_true): 634 | """ 635 | Compute the loss. 636 | 637 | Parameters 638 | ---------- 639 | y_score : a :tensor: `torch.Tensor` 640 | Score matrix 641 | y_true : a :tensor: `torch.Tensor` 642 | Target matrix 643 | 644 | Returns 645 | ------- 646 | float 647 | Loss value 648 | """ 649 | raise NotImplementedError("users must define compute_loss to use this base class") 650 | 651 | @abstractmethod 652 | def prepare_target(self, y): 653 | """ 654 | Prepare target before training. 655 | 656 | Parameters 657 | ---------- 658 | y : a :tensor: `torch.Tensor` 659 | Target matrix. 660 | 661 | Returns 662 | ------- 663 | `torch.Tensor` 664 | Converted target matrix. 665 | """ 666 | raise NotImplementedError("users must define prepare_target to use this base class") 667 | --------------------------------------------------------------------------------
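The GLU layers in `tab_network.py` do the representational work inside each feature transformer: a single `Linear` produces `2 * output_dim` features, the first half is taken as the value and the second half as a sigmoid gate, and successive layers are combined residually with a `sqrt(0.5)` rescaling. Below is a minimal sketch of that gating only, with a plain `BatchNorm1d` standing in for the repository's `GBN` (Ghost Batch Normalization); the name `TinyGLU` and the shapes are illustrative, not part of the repository.

```python
import torch
import torch.nn as nn


class TinyGLU(nn.Module):
    """Value/gate split used by GLU_Layer above (GBN replaced by BatchNorm1d)."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.output_dim = output_dim
        self.fc = nn.Linear(input_dim, 2 * output_dim, bias=False)
        self.bn = nn.BatchNorm1d(2 * output_dim)

    def forward(self, x):
        x = self.bn(self.fc(x))
        # first half = value, second half = sigmoid gate
        return x[:, :self.output_dim] * torch.sigmoid(x[:, self.output_dim:])


x = torch.randn(8, 16)
print(TinyGLU(16, 4)(x).shape)  # torch.Size([8, 4])
```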
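`EmbeddingGenerator.post_embed_dim` follows directly from replacing each categorical column by its embedding: `input_dim + sum(cat_emb_dims) - len(cat_emb_dims)`. A small worked example of that arithmetic; the column counts and embedding sizes here are made up for illustration.

```python
# Worked example of the output width computed in EmbeddingGenerator.__init__:
input_dim = 10        # 10 raw columns
cat_idxs = [2, 7]     # two of them are categorical
cat_emb_dims = [3, 5] # their embedding sizes

# each categorical column disappears and is replaced by its embedding vector
post_embed_dim = input_dim + sum(cat_emb_dims) - len(cat_emb_dims)
print(post_embed_dim)  # 16
```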
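`TabModel` only becomes usable once the abstract methods above, plus the hooks that `fit` and `predict` rely on (`updated_weights`, `_default_loss`, `_default_metric`, `predict_func`, `stack_batches`), are supplied by a concrete subclass. The sketch below is a minimal regression-style subclass written for illustration; it is not the repository's own concrete model, and it assumes an "mse" metric is registered in `metrics.py`.

```python
import numpy as np
import torch

from src.models.pytorch_tabnet.abstract_model import TabModel


class SketchTabNetRegressor(TabModel):
    """Illustrative only -- not the repository's concrete TabNet model."""

    def __post_init__(self):
        super().__post_init__()
        self._default_loss = torch.nn.functional.mse_loss
        self._default_metric = "mse"  # assumes an "mse" metric exists in metrics.py

    def update_fit_params(self, X_train, y_train, eval_set, weights):
        self.output_dim = y_train.shape[1]
        self.updated_weights = weights  # no class balancing for a regression target

    def prepare_target(self, y):
        return y  # regression targets are used as-is

    def compute_loss(self, y_pred, y_true):
        return self.loss_fn(y_pred, y_true)

    def predict_func(self, outputs):
        return outputs  # raw network outputs are the predictions

    def stack_batches(self, list_y_true, list_y_score):
        return np.vstack(list_y_true), np.vstack(list_y_score)
```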
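Given such a subclass, a hypothetical end-to-end call sequence against the `fit` / `predict` / `save_model` / `load_model` API documented above might look as follows; all data shapes and hyper-parameter values are placeholders, not the settings used in this repository.

```python
import numpy as np

# Placeholder data; shapes and hyper-parameters are illustrative only.
X_train = np.random.rand(256, 20).astype("float32")
y_train = np.random.rand(256, 1).astype("float32")
X_valid = np.random.rand(64, 20).astype("float32")
y_valid = np.random.rand(64, 1).astype("float32")

model = SketchTabNetRegressor(n_d=8, n_a=8, n_steps=3)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_name=["valid"],
    max_epochs=5,
    patience=3,
    batch_size=64,
    virtual_batch_size=32,
)

preds = model.predict(X_valid)                   # np.ndarray of shape (64, 1)
zip_path = model.save_model("./tabnet_sketch")   # writes ./tabnet_sketch.zip
model.load_model(zip_path)                       # re-creates the network and loads its weights
```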