├── Catboost
    ├── evaluate_30_trials.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── run_catboost.py
    └── utils.py
├── FT_Transformer
    ├── .gitignore
    ├── LICENSE
    ├── README.md
    ├── bin
    │   ├── evaluate.py
    │   ├── evaluate_30_trials.py
    │   ├── ft_transformer.py
    │   ├── openmlcc18_tasks.txt
    │   └── run_ft.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── node
    │   │   ├── __init__.py
    │   │   ├── arch.py
    │   │   ├── nn_utils.py
    │   │   ├── odst.py
    │   │   └── utils.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── pyproject.toml
    ├── requirements.txt
    └── setup.cfg
├── README.md
├── ResNet
    ├── evaluate_30_trials.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── node
    │   │   ├── __init__.py
    │   │   ├── arch.py
    │   │   ├── nn_utils.py
    │   │   ├── odst.py
    │   │   └── utils.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── resnet_ft.py
    ├── resnext.py
    └── run_resnetFt.py
├── TabNet
    ├── abstract_model.py
    ├── augmentations.py
    ├── callbacks.py
    ├── evaluate_30_trials.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── metrics.py
    ├── multiclass_utils.py
    ├── multitask.py
    ├── pretraining.py
    ├── pretraining_utils.py
    ├── run_tabnet.py
    ├── sparsemax.py
    ├── tab_model.py
    ├── tab_network.py
    ├── tabnet_utils.py
    └── utils.py
├── TabPFN
    ├── run_tabpfn.py
    └── utils.py
├── XGBoost
    ├── evaluate_30_trials.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── run_xgboost.py
    └── utils.py
└── saint
    ├── .gitignore
    ├── LICENSE
    ├── README.md
    ├── augmentations.py
    ├── data_openml.py
    ├── evaluate_30_trials.py
    ├── models
        ├── __init__.py
        ├── model.py
        ├── pretrainmodel.py
        └── pretrainmodel_vision.py
    ├── pipeline.png
    ├── pretraining.py
    ├── run_saint.py
    ├── run_saint_test.py
    ├── run_saint_traditional.py
    ├── saint_environment.yml
    └── utils.py
--------------------------------------------------------------------------------
/Catboost/evaluate_30_trials.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import math
3 | import time
4 | 
5 | import numpy as np
6 | import optuna
7 | import pandas as pd
8 | import zero
9 | import torch.nn as nn
10 | import torch
11 | import torch.nn.functional as F
12 | from catboost import CatBoostClassifier
13 | from sklearn.metrics import roc_auc_score, accuracy_score
14 | from xgboost import XGBClassifier
15 | 
16 | import lib
17 | import wandb
18 | 
19 | from sklearn.model_selection import StratifiedKFold
20 | from utils import set_random_seed
21 | 
22 | # Create the parser
23 | parser = argparse.ArgumentParser(description="Train a model with specified parameters.")
24 | 
25 | # Add the arguments
26 | parser.add_argument('--experiment_name', type=str, default='test',
27 |                     help='The name of the experiment. Default is "test".')
28 | parser.add_argument('--dataset', type=int, default=23,
29 |                     help='The OpenML dataset ID to use. Default is 23.')
30 | parser.add_argument('--seed', type=int, default=0,
31 |                     help='The random seed for reproducibility. Default is 0.')
32 | parser.add_argument('--normalization', type=str, default='quantile', choices=['quantile', 'standard'],
33 |                     help='The normalization to use for the numerical features. Default is "quantile".')
34 | parser.add_argument('--cat_nan_policy', type=str, default='new', choices=['new', 'most_frequent'],
35 |                     help='The policy to use for handling nan values in categorical features. Default is "new".')
36 | parser.add_argument('--cat_policy', type=str, default='indices', choices=['indices', 'ohe'],
37 |                     help='The policy to use for handling categorical features. Default is "indices".')
38 | parser.add_argument('--outer_fold', type=int, default=0, help='The outer fold to use. Default is 0.')
39 | parser.add_argument('--n_trials', type=int, default=100,
40 |                     help='The number of trials to use for HPO. Default is 100.')
41 | parser.add_argument('--tune', action='store_true', help='Whether to tune the hyperparameters using Optuna')
42 | args = parser.parse_args()
43 | 
44 | 
45 | def load_best_config(project_name, dataset_name, outer_fold, num_trials=30):
46 |     api = wandb.Api()
47 |     target_run_name = f"{dataset_name}_outerFold_{outer_fold}"
48 |     runs = api.runs(project_name)
49 | 
50 |     target_run = None
51 |     for run in runs:
52 |         if run.name == target_run_name:
53 |             target_run = run
54 |             break
55 | 
56 |     if not target_run:
57 |         raise ValueError(f"No run found with name: {target_run_name}")
58 | 
59 |     # First scan for the best average_test_rocauc
60 |     best_rocauc = 0  # Looking for the highest rocauc
61 |     best_step = None
62 |     history = target_run.scan_history(keys=['average_test_rocauc'])
63 |     for i, row in enumerate(history):
64 |         if i >= num_trials:
65 |             break
66 |         if 'average_test_rocauc' in row and row['average_test_rocauc'] > best_rocauc:
67 |             best_rocauc = row['average_test_rocauc']
68 |             best_step = i
69 | 
70 |     if best_step is None:
71 |         raise ValueError(f"Best rocauc not found within the first {num_trials} trials")
72 | 
73 |     # Second scan for the HPs at the best step
74 |     hp_keys = ['max_depth', 'learning_rate', 'bagging_temperature', 'l2_leaf_reg', 'leaf_estimation_iterations']
75 |     best_config = None
76 |     history = target_run.scan_history(keys=hp_keys)
77 |     for i, row in enumerate(history):
78 |         if i == best_step:
79 |             best_config = {key: row[key] for key in hp_keys if key in row}
80 |             break
81 | 
82 |     if best_config:
83 |         return best_config
84 |     else:
85 |         raise ValueError("HPs not found for the best rocauc step")
86 | 
87 | 
88 | def run_single_outer_fold(outer_fold, D, outer_folds):
89 |     outer_train_idx, outer_test_idx = outer_folds[outer_fold]
90 | 
91 |     best_configuration = load_best_config('t4tab/CatboostFT_optuna_CPU', D.info['dataset_name'], args.outer_fold)
92 | 
93 |     X_outer_preprocessed = D.build_X(
94 |         normalization='quantile',
95 |         num_nan_policy='mean',
96 |         cat_nan_policy='new',
97 |         cat_policy='indices',
98 |         seed=args.seed,
99 |         train_idx=outer_train_idx,
100 |         test_idx=outer_test_idx,
101 |     )
102 |     set_random_seed(args.seed)
103 |     Y, y_info = D.build_y(train_idx=outer_train_idx, test_idx=outer_test_idx)
104 | 
105 |     N, C = X_outer_preprocessed
106 |     n_num_features = 0 if N is None else N[outer_train_idx].shape[1]
107 |     n_cat_features = 0 if C is None else C[outer_train_idx].shape[1]
108 |     n_features = n_num_features + n_cat_features
109 |     if N is None:
110 |         assert C is not None
111 |         X_outer_preprocessed = pd.DataFrame(C, columns=range(n_features))
112 |     elif C is None:
113 |         assert N is not None
114 |         X_outer_preprocessed = pd.DataFrame(N, columns=range(n_features))
115 |     else:
116 |         X_outer_preprocessed = pd.concat(
117 |             [
118 |                 pd.DataFrame(N, columns=range(n_num_features)),
119 |                 pd.DataFrame(C, columns=range(n_num_features, n_features)),
120 |             ],
121 |             axis=1
122 |         )
123 |     cat_features = list(range(n_num_features, n_features))
124 |     unique_classes, class_counts = np.unique(Y[outer_train_idx], axis=0, return_counts=True)
125 |     nr_classes = len(unique_classes)
126 |     model = CatBoostClassifier(
127 |         task_type='CPU',
128 |         loss_function='MultiClass' if nr_classes > 2 else 'Logloss',
129 |         eval_metric='AUC',
130 |         random_seed=args.seed,
131 |         early_stopping_rounds=50,
132 |         od_pval=0.001,
133 |         iterations=2000,
134 |         max_depth=best_configuration['max_depth'],
135 |         learning_rate=best_configuration['learning_rate'],
136 |         bagging_temperature=best_configuration['bagging_temperature'],
137 |         l2_leaf_reg=best_configuration['l2_leaf_reg'],
138 |         leaf_estimation_iterations=best_configuration['leaf_estimation_iterations'],
139 |     )
140 | 
141 |     model.fit(X_outer_preprocessed.iloc[outer_train_idx], Y[outer_train_idx],
142 |               eval_set=(X_outer_preprocessed.iloc[outer_test_idx], Y[outer_test_idx]),
143 |               cat_features=cat_features,
144 |               verbose=False)
145 | 
146 |     train_predictions_labels = model.predict(X_outer_preprocessed.iloc[outer_train_idx])
147 |     test_predictions_labels = model.predict(X_outer_preprocessed.iloc[outer_test_idx])
148 |     if D.is_multiclass:
149 |         train_predictions_probabilities = model.predict_proba(X_outer_preprocessed.iloc[outer_train_idx])
150 |         test_predictions_probabilities = model.predict_proba(X_outer_preprocessed.iloc[outer_test_idx])
151 |     else:
152 |         train_predictions_probabilities = model.predict_proba(X_outer_preprocessed.iloc[outer_train_idx])[:, 1]
153 |         test_predictions_probabilities = model.predict_proba(X_outer_preprocessed.iloc[outer_test_idx])[:, 1]
154 | 
155 |     # calculate ROC AUC and accuracy on the outer train/test splits
156 |     train_rocauc = roc_auc_score(Y[outer_train_idx], train_predictions_probabilities,
157 |                                  multi_class='raise' if nr_classes == 2 else 'ovo')
158 |     train_accuracy = accuracy_score(Y[outer_train_idx], train_predictions_labels)
159 |     test_rocauc = roc_auc_score(Y[outer_test_idx], test_predictions_probabilities,
160 |                                 multi_class='raise' if nr_classes == 2 else 'ovo')
161 |     test_accuracy = accuracy_score(Y[outer_test_idx], test_predictions_labels)
162 |     print(f"Finished outer fold {outer_fold}")
163 | 
164 |     output_info = {
165 |         'train_rocauc': train_rocauc,
166 |         'train_accuracy': train_accuracy,
167 |         'test_accuracy': test_accuracy,
168 |         f'best_test_rocauc_outer_fold_{outer_fold}': test_rocauc,
169 |     }
170 |     wandb.log(output_info)
171 |     wandb.finish()
172 | 
173 | 
174 | if __name__ == "__main__":
175 |     # %%
176 |     set_random_seed(args.seed)
177 |     D = lib.Dataset.from_openml(args.dataset)
178 |     run_name = f"{D.info['dataset_name']}_outerFold_{args.outer_fold}"
179 |     wandb.init(project=args.experiment_name,
180 |                name=run_name,
181 |                config=args)
182 |     outer_kfold = StratifiedKFold(n_splits=10, shuffle=True)
183 |     outer_folds = list(outer_kfold.split(D.X, D.y))
184 |     run_single_outer_fold(args.outer_fold, D, outer_folds)
185 | 
--------------------------------------------------------------------------------
/Catboost/lib/__init__.py:
--------------------------------------------------------------------------------
1 | from icecream import install
2 | 
3 | install()
4 | 
5 | from . 
import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .metrics import * # noqa 9 | from .util import * # noqa 10 | -------------------------------------------------------------------------------- /Catboost/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | -------------------------------------------------------------------------------- /Catboost/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | 21 | return {'rmse': rmse, 'score': -rmse} 22 | else: 23 | assert task_type in (util.BINCLASS, util.MULTICLASS) 24 | labels = None 25 | if classification_mode == 'probs': 26 | probs = prediction 27 | elif classification_mode == 'logits': 28 | probs = ( 29 | scipy.special.expit(prediction) 30 | if task_type == util.BINCLASS 31 | else scipy.special.softmax(prediction, axis=1) 32 | ) 33 | else: 34 | assert classification_mode == 'labels' 35 | probs = None 36 | labels = prediction 37 | if labels is None: 38 | labels = ( 39 | np.round(probs).astype('int64') 40 | if task_type == util.BINCLASS 41 | else probs.argmax(axis=1) # type: ignore[code] 42 | ) 43 | 44 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 45 | if task_type == util.BINCLASS: 46 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 47 | else: 48 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 49 | result['score'] = result['roc_auc'] # type: ignore[code] 50 | return result # type: ignore[code] 51 | 52 | 53 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 54 | precision = 3 55 | summary = {} 56 | for k, v in metrics[1].items(): 57 | if k.isdigit(): 58 | continue 59 | k = { 60 | 'score': 'SCORE', 61 | 'accuracy': 'acc', 62 | 'roc_auc': 'roc_auc', 63 | 'macro avg': 'm', 64 | 'weighted avg': 'w', 65 | }.get(k, k) 66 | if isinstance(v, float): 67 | v = round(v, precision) 68 | summary[k] = v 69 | else: 70 | v = { 71 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 72 | x, x 73 | ): round(v[x], precision) 74 | for x in v 75 | } 76 | for item in v.items(): 77 | summary[k + item[0]] = item[1] 78 | 79 | s = [f'score = {summary.pop("SCORE"):.3f}'] 80 | for k, v in summary.items(): 81 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 82 | s.append(f'{k} = {v}') 83 | return ' | '.join(s) 84 | -------------------------------------------------------------------------------- /Catboost/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: 
int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 | ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /Catboost/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | 
import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | 
output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /Catboost/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def set_random_seed(seed): 7 | """ 8 | Set the seed for random number generation in Python, NumPy, and PyTorch. 9 | 10 | Args: 11 | seed (int): The seed value to use for all random number generators. 
12 | """ 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | 17 | if torch.cuda.is_available(): 18 | torch.cuda.manual_seed(seed) 19 | torch.cuda.manual_seed_all(seed) 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /FT_Transformer/.gitignore: -------------------------------------------------------------------------------- 1 | # >>> GITHUB DEFAULT PYTHON .GIGIGNORE 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | # lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # <<< GITHUB DEFAULT PYTHON .GIGIGNORE 142 | 143 | # Data, checkpoints, etc. 
144 | data 145 | **/catboost_cached_datasets/** 146 | *.bin 147 | *.csv 148 | *.cbm 149 | *.npy 150 | *.pickle 151 | *.pt 152 | *.pth 153 | *.rar 154 | *.tar* 155 | *.tmp 156 | *.zip 157 | events.out.tfevents.* 158 | 159 | # Experiments 160 | output/**/*.* 161 | !output/**/stats.json 162 | !output/**/*.toml 163 | 164 | # Other 165 | .DS_Store 166 | .vscode/ 167 | .ruff_cache 168 | -------------------------------------------------------------------------------- /FT_Transformer/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Authors of "Revisiting Deep Learning Models for Tabular Data" 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /FT_Transformer/bin/openmlcc18_tasks.txt: -------------------------------------------------------------------------------- 1 | 3 6 11 12 14 15 16 18 22 23 28 29 31 32 37 44 46 50 54 151 182 188 38 307 300 458 469 1049 1050 1053 1063 1067 1068 1590 4134 1510 1489 1494 1497 1501 1480 1485 1486 1487 1468 1475 1462 1464 4534 6332 1461 4538 1478 23381 40499 40668 40966 40982 40994 40983 40975 40984 40979 41027 23517 40978 40670 40701 -------------------------------------------------------------------------------- /FT_Transformer/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from icecream import install 2 | 3 | install() 4 | 5 | from . 
import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .env import get_path # noqa 9 | from .metrics import * # noqa 10 | from .util import * # noqa 11 | -------------------------------------------------------------------------------- /FT_Transformer/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | PROJECT_DIR = Path(os.environ['PROJECT_DIR']).absolute().resolve() 5 | DATA_DIR = PROJECT_DIR / 'data' 6 | OUTPUT_DIR = PROJECT_DIR / 'output' 7 | 8 | 9 | def get_path(relative_path: str) -> Path: 10 | return ( 11 | Path(relative_path) 12 | if relative_path.startswith('/') 13 | else PROJECT_DIR / relative_path 14 | ) 15 | -------------------------------------------------------------------------------- /FT_Transformer/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | 21 | return {'rmse': rmse, 'score': -rmse} 22 | else: 23 | assert task_type in (util.BINCLASS, util.MULTICLASS) 24 | labels = None 25 | if classification_mode == 'probs': 26 | probs = prediction 27 | elif classification_mode == 'logits': 28 | probs = ( 29 | scipy.special.expit(prediction) 30 | if task_type == util.BINCLASS 31 | else scipy.special.softmax(prediction, axis=1) 32 | ) 33 | else: 34 | assert classification_mode == 'labels' 35 | probs = None 36 | labels = prediction 37 | if labels is None: 38 | labels = ( 39 | np.round(probs).astype('int64') 40 | if task_type == util.BINCLASS 41 | else probs.argmax(axis=1) # type: ignore[code] 42 | ) 43 | 44 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 45 | if task_type == util.BINCLASS: 46 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 47 | else: 48 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 49 | result['score'] = result['roc_auc'] # type: ignore[code] 50 | return result # type: ignore[code] 51 | 52 | 53 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 54 | precision = 3 55 | summary = {} 56 | for k, v in metrics[1].items(): 57 | if k.isdigit(): 58 | continue 59 | k = { 60 | 'score': 'SCORE', 61 | 'accuracy': 'acc', 62 | 'roc_auc': 'roc_auc', 63 | 'macro avg': 'm', 64 | 'weighted avg': 'w', 65 | }.get(k, k) 66 | if isinstance(v, float): 67 | v = round(v, precision) 68 | summary[k] = v 69 | else: 70 | v = { 71 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 72 | x, x 73 | ): round(v[x], precision) 74 | for x in v 75 | } 76 | for item in v.items(): 77 | summary[k + item[0]] = item[1] 78 | 79 | s = [f'score = {summary.pop("SCORE"):.3f}'] 80 | for k, v in summary.items(): 81 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 82 | s.append(f'{k} = {v}') 83 | return ' | '.join(s) 84 | -------------------------------------------------------------------------------- /FT_Transformer/lib/node/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | from .arch import * # noqa 3 | from .nn_utils import * # noqa 4 | from .odst import * # noqa 5 | from .utils import * # noqa 6 | -------------------------------------------------------------------------------- /FT_Transformer/lib/node/arch.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.utils.checkpoint import checkpoint as torch_checkpoint 6 | 7 | from .odst import ODST 8 | 9 | 10 | class DenseBlock(nn.Sequential): 11 | def __init__(self, input_dim, layer_dim, num_layers, tree_dim=1, max_features=None, 12 | input_dropout=0.0, flatten_output=True, Module=ODST, **kwargs): 13 | layers = [] 14 | for i in range(num_layers): 15 | oddt = Module(input_dim, layer_dim, tree_dim=tree_dim, flatten_output=True, **kwargs) 16 | input_dim = min(input_dim + layer_dim * tree_dim, max_features or float('inf')) 17 | layers.append(oddt) 18 | 19 | super().__init__(*layers) 20 | self.num_layers, self.layer_dim, self.tree_dim = num_layers, layer_dim, tree_dim 21 | self.max_features, self.flatten_output = max_features, flatten_output 22 | self.input_dropout = input_dropout 23 | 24 | def forward(self, x): 25 | initial_features = x.shape[-1] 26 | for layer in self: 27 | layer_inp = x 28 | if self.max_features is not None: 29 | tail_features = min(self.max_features, layer_inp.shape[-1]) - initial_features 30 | if tail_features != 0: 31 | layer_inp = torch.cat([layer_inp[..., :initial_features], layer_inp[..., -tail_features:]], dim=-1) 32 | if self.training and self.input_dropout: 33 | layer_inp = F.dropout(layer_inp, self.input_dropout) 34 | h = layer(layer_inp) 35 | x = torch.cat([x, h], dim=-1) 36 | 37 | outputs = x[..., initial_features:] 38 | if not self.flatten_output: 39 | outputs = outputs.view(*outputs.shape[:-1], self.num_layers * self.layer_dim, self.tree_dim) 40 | return outputs 41 | -------------------------------------------------------------------------------- /FT_Transformer/lib/node/nn_utils.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import contextlib 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.autograd import Function 10 | from torch.jit import script 11 | 12 | 13 | def to_one_hot(y, depth=None): 14 | r""" 15 | Takes integer with n dims and converts it to 1-hot representation with n + 1 dims. 16 | The n+1'st dimension will have zeros everywhere but at y'th index, where it will be equal to 1. 
17 | Args: 18 | y: input integer (IntTensor, LongTensor or Variable) of any shape 19 | depth (int): the size of the one hot dimension 20 | """ 21 | y_flat = y.to(torch.int64).view(-1, 1) 22 | depth = depth if depth is not None else int(torch.max(y_flat)) + 1 23 | y_one_hot = torch.zeros(y_flat.size()[0], depth, device=y.device).scatter_(1, y_flat, 1) 24 | y_one_hot = y_one_hot.view(*(tuple(y.shape) + (-1,))) 25 | return y_one_hot 26 | 27 | 28 | def _make_ix_like(input, dim=0): 29 | d = input.size(dim) 30 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 31 | view = [1] * input.dim() 32 | view[0] = -1 33 | return rho.view(view).transpose(0, dim) 34 | 35 | 36 | class SparsemaxFunction(Function): 37 | """ 38 | An implementation of sparsemax (Martins & Astudillo, 2016). See 39 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 40 | 41 | By Ben Peters and Vlad Niculae 42 | """ 43 | 44 | @staticmethod 45 | def forward(ctx, input, dim=-1): 46 | """sparsemax: normalizing sparse transform (a la softmax) 47 | 48 | Parameters: 49 | input (Tensor): any shape 50 | dim: dimension along which to apply sparsemax 51 | 52 | Returns: 53 | output (Tensor): same shape as input 54 | """ 55 | ctx.dim = dim 56 | max_val, _ = input.max(dim=dim, keepdim=True) 57 | input -= max_val # same numerical stability trick as for softmax 58 | tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) 59 | output = torch.clamp(input - tau, min=0) 60 | ctx.save_for_backward(supp_size, output) 61 | return output 62 | 63 | @staticmethod 64 | def backward(ctx, grad_output): 65 | supp_size, output = ctx.saved_tensors 66 | dim = ctx.dim 67 | grad_input = grad_output.clone() 68 | grad_input[output == 0] = 0 69 | 70 | v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 71 | v_hat = v_hat.unsqueeze(dim) 72 | grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 73 | return grad_input, None 74 | 75 | 76 | @staticmethod 77 | def _threshold_and_support(input, dim=-1): 78 | """Sparsemax building block: compute the threshold 79 | 80 | Args: 81 | input: any dimension 82 | dim: dimension along which to apply the sparsemax 83 | 84 | Returns: 85 | the threshold value 86 | """ 87 | 88 | input_srt, _ = torch.sort(input, descending=True, dim=dim) 89 | input_cumsum = input_srt.cumsum(dim) - 1 90 | rhos = _make_ix_like(input, dim) 91 | support = rhos * input_srt > input_cumsum 92 | 93 | support_size = support.sum(dim=dim).unsqueeze(dim) 94 | tau = input_cumsum.gather(dim, support_size - 1) 95 | tau /= support_size.to(input.dtype) 96 | return tau, support_size 97 | 98 | 99 | sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) 100 | sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) 101 | 102 | 103 | class Entmax15Function(Function): 104 | """ 105 | An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See 106 | :cite:`https://arxiv.org/abs/1905.05702 for detailed description. 
107 | Source: https://github.com/deep-spin/entmax 108 | """ 109 | 110 | @staticmethod 111 | def forward(ctx, input, dim=-1): 112 | ctx.dim = dim 113 | 114 | max_val, _ = input.max(dim=dim, keepdim=True) 115 | input = input - max_val # same numerical stability trick as for softmax 116 | input = input / 2 # divide by 2 to solve actual Entmax 117 | 118 | tau_star, _ = Entmax15Function._threshold_and_support(input, dim) 119 | output = torch.clamp(input - tau_star, min=0) ** 2 120 | ctx.save_for_backward(output) 121 | return output 122 | 123 | @staticmethod 124 | def backward(ctx, grad_output): 125 | Y, = ctx.saved_tensors 126 | gppr = Y.sqrt() # = 1 / g'' (Y) 127 | dX = grad_output * gppr 128 | q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) 129 | q = q.unsqueeze(ctx.dim) 130 | dX -= q * gppr 131 | return dX, None 132 | 133 | @staticmethod 134 | def _threshold_and_support(input, dim=-1): 135 | Xsrt, _ = torch.sort(input, descending=True, dim=dim) 136 | 137 | rho = _make_ix_like(input, dim) 138 | mean = Xsrt.cumsum(dim) / rho 139 | mean_sq = (Xsrt ** 2).cumsum(dim) / rho 140 | ss = rho * (mean_sq - mean ** 2) 141 | delta = (1 - ss) / rho 142 | 143 | # NOTE this is not exactly the same as in reference algo 144 | # Fortunately it seems the clamped values never wrongly 145 | # get selected by tau <= sorted_z. Prove this! 146 | delta_nz = torch.clamp(delta, 0) 147 | tau = mean - torch.sqrt(delta_nz) 148 | 149 | support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) 150 | tau_star = tau.gather(dim, support_size - 1) 151 | return tau_star, support_size 152 | 153 | 154 | class Entmoid15(Function): 155 | """ A highly optimized equivalent of labda x: Entmax15([x, 0]) """ 156 | 157 | @staticmethod 158 | def forward(ctx, input): 159 | output = Entmoid15._forward(input) 160 | ctx.save_for_backward(output) 161 | return output 162 | 163 | @staticmethod 164 | @script 165 | def _forward(input): 166 | input, is_pos = abs(input), input >= 0 167 | tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 168 | tau.masked_fill_(tau <= input, 2.0) 169 | y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 170 | return torch.where(is_pos, 1 - y_neg, y_neg) 171 | 172 | @staticmethod 173 | def backward(ctx, grad_output): 174 | return Entmoid15._backward(ctx.saved_tensors[0], grad_output) 175 | 176 | @staticmethod 177 | @script 178 | def _backward(output, grad_output): 179 | gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() 180 | grad_input = grad_output * gppr0 181 | q = grad_input / (gppr0 + gppr1) 182 | grad_input -= q * gppr0 183 | return grad_input 184 | 185 | 186 | entmax15 = lambda input, dim=-1: Entmax15Function.apply(input, dim) 187 | entmoid15 = Entmoid15.apply 188 | 189 | 190 | class Lambda(nn.Module): 191 | def __init__(self, func): 192 | super().__init__() 193 | self.func = func 194 | 195 | def forward(self, *args, **kwargs): 196 | return self.func(*args, **kwargs) 197 | 198 | 199 | class ModuleWithInit(nn.Module): 200 | """ Base class for pytorch module with data-aware initializer on first batch """ 201 | def __init__(self): 202 | super().__init__() 203 | self._is_initialized_tensor = nn.Parameter(torch.tensor(0, dtype=torch.uint8), requires_grad=False) 204 | self._is_initialized_bool = None 205 | # Note: this module uses a separate flag self._is_initialized so as to achieve both 206 | # * persistence: is_initialized is saved alongside model in state_dict 207 | # * speed: model doesn't need to cache 208 | # please DO NOT use these flags in child modules 209 | 210 | def initialize(self, *args, **kwargs): 211 | """ 
initialize module tensors using first batch of data """ 212 | raise NotImplementedError("Please implement ") 213 | 214 | def __call__(self, *args, **kwargs): 215 | if self._is_initialized_bool is None: 216 | self._is_initialized_bool = bool(self._is_initialized_tensor.item()) 217 | if not self._is_initialized_bool: 218 | self.initialize(*args, **kwargs) 219 | self._is_initialized_tensor.data[...] = 1 220 | self._is_initialized_bool = True 221 | return super().__call__(*args, **kwargs) 222 | -------------------------------------------------------------------------------- /FT_Transformer/lib/node/utils.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import contextlib 3 | import gc 4 | import glob 5 | import hashlib 6 | import os 7 | import time 8 | 9 | import numpy as np 10 | import requests 11 | import torch 12 | from tqdm import tqdm 13 | 14 | 15 | def download(url, filename, delete_if_interrupted=True, chunk_size=4096): 16 | """ saves file from url to filename with a fancy progressbar """ 17 | try: 18 | with open(filename, "wb") as f: 19 | print("Downloading {} > {}".format(url, filename)) 20 | response = requests.get(url, stream=True) 21 | total_length = response.headers.get('content-length') 22 | 23 | if total_length is None: # no content length header 24 | f.write(response.content) 25 | else: 26 | total_length = int(total_length) 27 | with tqdm(total=total_length) as progressbar: 28 | for data in response.iter_content(chunk_size=chunk_size): 29 | if data: # filter-out keep-alive chunks 30 | f.write(data) 31 | progressbar.update(len(data)) 32 | except Exception as e: 33 | if delete_if_interrupted: 34 | print("Removing incomplete download {}.".format(filename)) 35 | os.remove(filename) 36 | raise e 37 | return filename 38 | 39 | 40 | def iterate_minibatches(*tensors, batch_size, shuffle=True, epochs=1, 41 | allow_incomplete=True, callback=lambda x:x): 42 | indices = np.arange(len(tensors[0])) 43 | upper_bound = int((np.ceil if allow_incomplete else np.floor) (len(indices) / batch_size)) * batch_size 44 | epoch = 0 45 | while True: 46 | if shuffle: 47 | np.random.shuffle(indices) 48 | for batch_start in callback(range(0, upper_bound, batch_size)): 49 | batch_ix = indices[batch_start: batch_start + batch_size] 50 | batch = [tensor[batch_ix] for tensor in tensors] 51 | yield batch if len(tensors) > 1 else batch[0] 52 | epoch += 1 53 | if epoch >= epochs: 54 | break 55 | 56 | 57 | def process_in_chunks(function, *args, batch_size, out=None, **kwargs): 58 | """ 59 | Computes output by applying batch-parallel function to large data tensor in chunks 60 | :param function: a function(*[x[indices, ...] for x in args]) -> out[indices, ...] 61 | :param args: one or many tensors, each [num_instances, ...] 
62 | :param batch_size: maximum chunk size processed in one go 63 | :param out: memory buffer for out, defaults to torch.zeros of appropriate size and type 64 | :returns: function(data), computed in a memory-efficient way 65 | """ 66 | total_size = args[0].shape[0] 67 | first_output = function(*[x[0: batch_size] for x in args]) 68 | output_shape = (total_size,) + tuple(first_output.shape[1:]) 69 | if out is None: 70 | out = torch.zeros(*output_shape, dtype=first_output.dtype, device=first_output.device, 71 | layout=first_output.layout, **kwargs) 72 | 73 | out[0: batch_size] = first_output 74 | for i in range(batch_size, total_size, batch_size): 75 | batch_ix = slice(i, min(i + batch_size, total_size)) 76 | out[batch_ix] = function(*[x[batch_ix] for x in args]) 77 | return out 78 | 79 | 80 | def check_numpy(x): 81 | """ Makes sure x is a numpy array """ 82 | if isinstance(x, torch.Tensor): 83 | x = x.detach().cpu().numpy() 84 | x = np.asarray(x) 85 | assert isinstance(x, np.ndarray) 86 | return x 87 | 88 | 89 | @contextlib.contextmanager 90 | def nop_ctx(): 91 | yield None 92 | 93 | 94 | def get_latest_file(pattern): 95 | list_of_files = glob.glob(pattern) # * means all if need specific format then *.csv 96 | assert len(list_of_files) > 0, "No files found: " + pattern 97 | return max(list_of_files, key=os.path.getctime) 98 | 99 | 100 | def md5sum(fname): 101 | """ Computes mdp checksum of a file """ 102 | hash_md5 = hashlib.md5() 103 | with open(fname, "rb") as f: 104 | for chunk in iter(lambda: f.read(4096), b""): 105 | hash_md5.update(chunk) 106 | return hash_md5.hexdigest() 107 | 108 | 109 | def free_memory(sleep_time=0.1): 110 | """ Black magic function to free torch memory and some jupyter whims """ 111 | gc.collect() 112 | torch.cuda.synchronize() 113 | gc.collect() 114 | torch.cuda.empty_cache() 115 | time.sleep(sleep_time) 116 | 117 | def to_float_str(element): 118 | try: 119 | return str(float(element)) 120 | except ValueError: 121 | return element 122 | -------------------------------------------------------------------------------- /FT_Transformer/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 
| ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /FT_Transformer/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . 
import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = 
Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /FT_Transformer/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | skip_string_normalization = true 3 | 4 | [tool.isort] 5 | profile = "black" 6 | multi_line_output = 3 7 | known_first_party = ["lib"] 8 | -------------------------------------------------------------------------------- /FT_Transformer/requirements.txt: -------------------------------------------------------------------------------- 1 | catboost==0.24.4 2 | category-encoders==2.2.2 3 | lightgbm==3.2.1 4 | libzero==0.0.3.dev7 5 | numba==0.53.1 6 | optuna==2.6.0 7 | pandas==1.2.3 8 | pynvml==8.0.4 9 | pytomlpp==0.3.5 10 | scikit-learn==0.24.1 11 | scipy==1.6.1 12 | tensorboard==2.4.1 13 | tqdm==4.59.0 14 | xgboost==1.3.3 15 | 16 | # Tools 17 | black 18 | flake8 19 | icecream 20 | isort 21 | 22 | # Jupyter 23 | ipywidgets 24 
| jupyterlab 25 | jupyterlab-nvdashboard 26 | voila 27 | 28 | # Visualization 29 | bokeh 30 | colorcet 31 | holoviews 32 | matplotlib 33 | panel 34 | seaborn 35 | -------------------------------------------------------------------------------- /FT_Transformer/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | # E501 is about line length; it can be violated by Black, so ignore it 4 | ignore = E203, E501, W503 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Revisiting-MLPs 2 | 3 | To be updated... 4 | -------------------------------------------------------------------------------- /ResNet/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from icecream import install 2 | 3 | install() 4 | 5 | from . import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .env import get_path # noqa 9 | from .metrics import * # noqa 10 | from .util import * # noqa 11 | -------------------------------------------------------------------------------- /ResNet/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | PROJECT_DIR = Path(os.environ['PROJECT_DIR']).absolute().resolve() 5 | DATA_DIR = PROJECT_DIR / 'data' 6 | OUTPUT_DIR = PROJECT_DIR / 'output' 7 | 8 | 9 | def get_path(relative_path: str) -> Path: 10 | return ( 11 | Path(relative_path) 12 | if relative_path.startswith('/') 13 | else PROJECT_DIR / relative_path 14 | ) 15 | -------------------------------------------------------------------------------- /ResNet/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . 
import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | 21 | return {'rmse': rmse, 'score': -rmse} 22 | else: 23 | assert task_type in (util.BINCLASS, util.MULTICLASS) 24 | labels = None 25 | if classification_mode == 'probs': 26 | probs = prediction 27 | elif classification_mode == 'logits': 28 | probs = ( 29 | scipy.special.expit(prediction) 30 | if task_type == util.BINCLASS 31 | else scipy.special.softmax(prediction, axis=1) 32 | ) 33 | else: 34 | assert classification_mode == 'labels' 35 | probs = None 36 | labels = prediction 37 | if labels is None: 38 | labels = ( 39 | np.round(probs).astype('int64') 40 | if task_type == util.BINCLASS 41 | else probs.argmax(axis=1) # type: ignore[code] 42 | ) 43 | 44 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 45 | if task_type == util.BINCLASS: 46 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 47 | else: 48 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 49 | result['score'] = result['roc_auc'] # type: ignore[code] 50 | return result # type: ignore[code] 51 | 52 | 53 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 54 | precision = 3 55 | summary = {} 56 | for k, v in metrics[1].items(): 57 | if k.isdigit(): 58 | continue 59 | k = { 60 | 'score': 'SCORE', 61 | 'accuracy': 'acc', 62 | 'roc_auc': 'roc_auc', 63 | 'macro avg': 'm', 64 | 'weighted avg': 'w', 65 | }.get(k, k) 66 | if isinstance(v, float): 67 | v = round(v, precision) 68 | summary[k] = v 69 | else: 70 | v = { 71 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 72 | x, x 73 | ): round(v[x], precision) 74 | for x in v 75 | } 76 | for item in v.items(): 77 | summary[k + item[0]] = item[1] 78 | 79 | s = [f'score = {summary.pop("SCORE"):.3f}'] 80 | for k, v in summary.items(): 81 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 82 | s.append(f'{k} = {v}') 83 | return ' | '.join(s) 84 | -------------------------------------------------------------------------------- /ResNet/lib/node/__init__.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | from .arch import * # noqa 3 | from .nn_utils import * # noqa 4 | from .odst import * # noqa 5 | from .utils import * # noqa 6 | -------------------------------------------------------------------------------- /ResNet/lib/node/arch.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.utils.checkpoint import checkpoint as torch_checkpoint 6 | 7 | from .odst import ODST 8 | 9 | 10 | class DenseBlock(nn.Sequential): 11 | def __init__(self, input_dim, layer_dim, num_layers, tree_dim=1, max_features=None, 12 | input_dropout=0.0, flatten_output=True, Module=ODST, **kwargs): 13 | layers = [] 14 | for i in range(num_layers): 15 | oddt = Module(input_dim, layer_dim, tree_dim=tree_dim, flatten_output=True, **kwargs) 16 | input_dim = min(input_dim + layer_dim * tree_dim, max_features or float('inf')) 17 | layers.append(oddt) 18 | 19 | 
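        # Note: each ODST layer receives the original features concatenated with all
        # previous layers' outputs, so its input width grows by layer_dim * tree_dim
        # per layer (optionally capped by max_features).
        # Illustrative usage sketch (values are arbitrary, flatten_output=True):
        #   block = DenseBlock(input_dim=54, layer_dim=128, num_layers=2, tree_dim=3)
        #   out = block(torch.randn(256, 54))  # -> shape [256, 2 * 128 * 3]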
super().__init__(*layers) 20 | self.num_layers, self.layer_dim, self.tree_dim = num_layers, layer_dim, tree_dim 21 | self.max_features, self.flatten_output = max_features, flatten_output 22 | self.input_dropout = input_dropout 23 | 24 | def forward(self, x): 25 | initial_features = x.shape[-1] 26 | for layer in self: 27 | layer_inp = x 28 | if self.max_features is not None: 29 | tail_features = min(self.max_features, layer_inp.shape[-1]) - initial_features 30 | if tail_features != 0: 31 | layer_inp = torch.cat([layer_inp[..., :initial_features], layer_inp[..., -tail_features:]], dim=-1) 32 | if self.training and self.input_dropout: 33 | layer_inp = F.dropout(layer_inp, self.input_dropout) 34 | h = layer(layer_inp) 35 | x = torch.cat([x, h], dim=-1) 36 | 37 | outputs = x[..., initial_features:] 38 | if not self.flatten_output: 39 | outputs = outputs.view(*outputs.shape[:-1], self.num_layers * self.layer_dim, self.tree_dim) 40 | return outputs 41 | -------------------------------------------------------------------------------- /ResNet/lib/node/nn_utils.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import contextlib 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.autograd import Function 10 | from torch.jit import script 11 | 12 | 13 | def to_one_hot(y, depth=None): 14 | r""" 15 | Takes integer with n dims and converts it to 1-hot representation with n + 1 dims. 16 | The n+1'st dimension will have zeros everywhere but at y'th index, where it will be equal to 1. 17 | Args: 18 | y: input integer (IntTensor, LongTensor or Variable) of any shape 19 | depth (int): the size of the one hot dimension 20 | """ 21 | y_flat = y.to(torch.int64).view(-1, 1) 22 | depth = depth if depth is not None else int(torch.max(y_flat)) + 1 23 | y_one_hot = torch.zeros(y_flat.size()[0], depth, device=y.device).scatter_(1, y_flat, 1) 24 | y_one_hot = y_one_hot.view(*(tuple(y.shape) + (-1,))) 25 | return y_one_hot 26 | 27 | 28 | def _make_ix_like(input, dim=0): 29 | d = input.size(dim) 30 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 31 | view = [1] * input.dim() 32 | view[0] = -1 33 | return rho.view(view).transpose(0, dim) 34 | 35 | 36 | class SparsemaxFunction(Function): 37 | """ 38 | An implementation of sparsemax (Martins & Astudillo, 2016). See 39 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 
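    Illustrative usage sketch: the ``sparsemax`` helper defined at module level below
    can be called as ``probs = sparsemax(logits, dim=-1)``; the result sums to 1 along
    ``dim`` and typically contains exact zeros.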
40 | 41 | By Ben Peters and Vlad Niculae 42 | """ 43 | 44 | @staticmethod 45 | def forward(ctx, input, dim=-1): 46 | """sparsemax: normalizing sparse transform (a la softmax) 47 | 48 | Parameters: 49 | input (Tensor): any shape 50 | dim: dimension along which to apply sparsemax 51 | 52 | Returns: 53 | output (Tensor): same shape as input 54 | """ 55 | ctx.dim = dim 56 | max_val, _ = input.max(dim=dim, keepdim=True) 57 | input -= max_val # same numerical stability trick as for softmax 58 | tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) 59 | output = torch.clamp(input - tau, min=0) 60 | ctx.save_for_backward(supp_size, output) 61 | return output 62 | 63 | @staticmethod 64 | def backward(ctx, grad_output): 65 | supp_size, output = ctx.saved_tensors 66 | dim = ctx.dim 67 | grad_input = grad_output.clone() 68 | grad_input[output == 0] = 0 69 | 70 | v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 71 | v_hat = v_hat.unsqueeze(dim) 72 | grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 73 | return grad_input, None 74 | 75 | 76 | @staticmethod 77 | def _threshold_and_support(input, dim=-1): 78 | """Sparsemax building block: compute the threshold 79 | 80 | Args: 81 | input: any dimension 82 | dim: dimension along which to apply the sparsemax 83 | 84 | Returns: 85 | the threshold value 86 | """ 87 | 88 | input_srt, _ = torch.sort(input, descending=True, dim=dim) 89 | input_cumsum = input_srt.cumsum(dim) - 1 90 | rhos = _make_ix_like(input, dim) 91 | support = rhos * input_srt > input_cumsum 92 | 93 | support_size = support.sum(dim=dim).unsqueeze(dim) 94 | tau = input_cumsum.gather(dim, support_size - 1) 95 | tau /= support_size.to(input.dtype) 96 | return tau, support_size 97 | 98 | 99 | sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) 100 | sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) 101 | 102 | 103 | class Entmax15Function(Function): 104 | """ 105 | An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See 106 | :cite:`https://arxiv.org/abs/1905.05702 for detailed description. 107 | Source: https://github.com/deep-spin/entmax 108 | """ 109 | 110 | @staticmethod 111 | def forward(ctx, input, dim=-1): 112 | ctx.dim = dim 113 | 114 | max_val, _ = input.max(dim=dim, keepdim=True) 115 | input = input - max_val # same numerical stability trick as for softmax 116 | input = input / 2 # divide by 2 to solve actual Entmax 117 | 118 | tau_star, _ = Entmax15Function._threshold_and_support(input, dim) 119 | output = torch.clamp(input - tau_star, min=0) ** 2 120 | ctx.save_for_backward(output) 121 | return output 122 | 123 | @staticmethod 124 | def backward(ctx, grad_output): 125 | Y, = ctx.saved_tensors 126 | gppr = Y.sqrt() # = 1 / g'' (Y) 127 | dX = grad_output * gppr 128 | q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) 129 | q = q.unsqueeze(ctx.dim) 130 | dX -= q * gppr 131 | return dX, None 132 | 133 | @staticmethod 134 | def _threshold_and_support(input, dim=-1): 135 | Xsrt, _ = torch.sort(input, descending=True, dim=dim) 136 | 137 | rho = _make_ix_like(input, dim) 138 | mean = Xsrt.cumsum(dim) / rho 139 | mean_sq = (Xsrt ** 2).cumsum(dim) / rho 140 | ss = rho * (mean_sq - mean ** 2) 141 | delta = (1 - ss) / rho 142 | 143 | # NOTE this is not exactly the same as in reference algo 144 | # Fortunately it seems the clamped values never wrongly 145 | # get selected by tau <= sorted_z. Prove this! 
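        # Note: each tau candidate equals mean - sqrt((1 - ss) / rho) for the
        # corresponding sorted-prefix length; the clamp below keeps the square root
        # well-defined when (1 - ss) / rho is negative, and (per the note above) such
        # clamped candidates are not the ones picked by the support-size test.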
146 | delta_nz = torch.clamp(delta, 0) 147 | tau = mean - torch.sqrt(delta_nz) 148 | 149 | support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) 150 | tau_star = tau.gather(dim, support_size - 1) 151 | return tau_star, support_size 152 | 153 | 154 | class Entmoid15(Function): 155 | """ A highly optimized equivalent of labda x: Entmax15([x, 0]) """ 156 | 157 | @staticmethod 158 | def forward(ctx, input): 159 | output = Entmoid15._forward(input) 160 | ctx.save_for_backward(output) 161 | return output 162 | 163 | @staticmethod 164 | @script 165 | def _forward(input): 166 | input, is_pos = abs(input), input >= 0 167 | tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 168 | tau.masked_fill_(tau <= input, 2.0) 169 | y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 170 | return torch.where(is_pos, 1 - y_neg, y_neg) 171 | 172 | @staticmethod 173 | def backward(ctx, grad_output): 174 | return Entmoid15._backward(ctx.saved_tensors[0], grad_output) 175 | 176 | @staticmethod 177 | @script 178 | def _backward(output, grad_output): 179 | gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() 180 | grad_input = grad_output * gppr0 181 | q = grad_input / (gppr0 + gppr1) 182 | grad_input -= q * gppr0 183 | return grad_input 184 | 185 | 186 | entmax15 = lambda input, dim=-1: Entmax15Function.apply(input, dim) 187 | entmoid15 = Entmoid15.apply 188 | 189 | 190 | class Lambda(nn.Module): 191 | def __init__(self, func): 192 | super().__init__() 193 | self.func = func 194 | 195 | def forward(self, *args, **kwargs): 196 | return self.func(*args, **kwargs) 197 | 198 | 199 | class ModuleWithInit(nn.Module): 200 | """ Base class for pytorch module with data-aware initializer on first batch """ 201 | def __init__(self): 202 | super().__init__() 203 | self._is_initialized_tensor = nn.Parameter(torch.tensor(0, dtype=torch.uint8), requires_grad=False) 204 | self._is_initialized_bool = None 205 | # Note: this module uses a separate flag self._is_initialized so as to achieve both 206 | # * persistence: is_initialized is saved alongside model in state_dict 207 | # * speed: model doesn't need to cache 208 | # please DO NOT use these flags in child modules 209 | 210 | def initialize(self, *args, **kwargs): 211 | """ initialize module tensors using first batch of data """ 212 | raise NotImplementedError("Please implement ") 213 | 214 | def __call__(self, *args, **kwargs): 215 | if self._is_initialized_bool is None: 216 | self._is_initialized_bool = bool(self._is_initialized_tensor.item()) 217 | if not self._is_initialized_bool: 218 | self.initialize(*args, **kwargs) 219 | self._is_initialized_tensor.data[...] 
= 1 220 | self._is_initialized_bool = True 221 | return super().__call__(*args, **kwargs) 222 | -------------------------------------------------------------------------------- /ResNet/lib/node/odst.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | from warnings import warn 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from .nn_utils import ModuleWithInit, sparsemax, sparsemoid 10 | from .utils import check_numpy 11 | 12 | 13 | class ODST(ModuleWithInit): 14 | def __init__(self, in_features, num_trees, depth=6, tree_dim=1, flatten_output=True, 15 | choice_function=sparsemax, bin_function=sparsemoid, 16 | initialize_response_=nn.init.normal_, initialize_selection_logits_=nn.init.uniform_, 17 | threshold_init_beta=1.0, threshold_init_cutoff=1.0, 18 | ): 19 | """ 20 | Oblivious Differentiable Sparsemax Trees. http://tinyurl.com/odst-readmore 21 | One can drop (sic!) this module anywhere instead of nn.Linear 22 | :param in_features: number of features in the input tensor 23 | :param num_trees: number of trees in this layer 24 | :param tree_dim: number of response channels in the response of individual tree 25 | :param depth: number of splits in every tree 26 | :param flatten_output: if False, returns [..., num_trees, tree_dim], 27 | by default returns [..., num_trees * tree_dim] 28 | :param choice_function: f(tensor, dim) -> R_simplex computes feature weights s.t. f(tensor, dim).sum(dim) == 1 29 | :param bin_function: f(tensor) -> R[0, 1], computes tree leaf weights 30 | 31 | :param initialize_response_: in-place initializer for tree output tensor 32 | :param initialize_selection_logits_: in-place initializer for logits that select features for the tree 33 | both thresholds and scales are initialized with data-aware init (or .load_state_dict) 34 | :param threshold_init_beta: initializes threshold to a q-th quantile of data points 35 | where q ~ Beta(:threshold_init_beta:, :threshold_init_beta:) 36 | If this param is set to 1, initial thresholds will have the same distribution as data points 37 | If greater than 1 (e.g. 10), thresholds will be closer to median data value 38 | If less than 1 (e.g. 0.1), thresholds will approach min/max data values. 39 | 40 | :param threshold_init_cutoff: threshold log-temperatures initializer, \in (0, inf) 41 | By default(1.0), log-remperatures are initialized in such a way that all bin selectors 42 | end up in the linear region of sparse-sigmoid. The temperatures are then scaled by this parameter. 
43 | Setting this value > 1.0 will result in some margin between data points and sparse-sigmoid cutoff value 44 | Setting this value < 1.0 will cause (1 - value) part of data points to end up in flat sparse-sigmoid region 45 | For instance, threshold_init_cutoff = 0.9 will set 10% points equal to 0.0 or 1.0 46 | Setting this value > 1.0 will result in a margin between data points and sparse-sigmoid cutoff value 47 | All points will be between (0.5 - 0.5 / threshold_init_cutoff) and (0.5 + 0.5 / threshold_init_cutoff) 48 | """ 49 | super().__init__() 50 | self.depth, self.num_trees, self.tree_dim, self.flatten_output = depth, num_trees, tree_dim, flatten_output 51 | self.choice_function, self.bin_function = choice_function, bin_function 52 | self.threshold_init_beta, self.threshold_init_cutoff = threshold_init_beta, threshold_init_cutoff 53 | 54 | self.response = nn.Parameter(torch.zeros([num_trees, tree_dim, 2 ** depth]), requires_grad=True) 55 | initialize_response_(self.response) 56 | 57 | self.feature_selection_logits = nn.Parameter( 58 | torch.zeros([in_features, num_trees, depth]), requires_grad=True 59 | ) 60 | initialize_selection_logits_(self.feature_selection_logits) 61 | 62 | self.feature_thresholds = nn.Parameter( 63 | torch.full([num_trees, depth], float('nan'), dtype=torch.float32), requires_grad=True 64 | ) # nan values will be initialized on first batch (data-aware init) 65 | 66 | self.log_temperatures = nn.Parameter( 67 | torch.full([num_trees, depth], float('nan'), dtype=torch.float32), requires_grad=True 68 | ) 69 | 70 | # binary codes for mapping between 1-hot vectors and bin indices 71 | with torch.no_grad(): 72 | indices = torch.arange(2 ** self.depth) 73 | offsets = 2 ** torch.arange(self.depth) 74 | bin_codes = (indices.view(1, -1) // offsets.view(-1, 1) % 2).to(torch.float32) 75 | bin_codes_1hot = torch.stack([bin_codes, 1.0 - bin_codes], dim=-1) 76 | self.bin_codes_1hot = nn.Parameter(bin_codes_1hot, requires_grad=False) 77 | # ^-- [depth, 2 ** depth, 2] 78 | 79 | def forward(self, input): 80 | assert len(input.shape) >= 2 81 | if len(input.shape) > 2: 82 | return self.forward(input.view(-1, input.shape[-1])).view(*input.shape[:-1], -1) 83 | # new input shape: [batch_size, in_features] 84 | 85 | feature_logits = self.feature_selection_logits 86 | feature_selectors = self.choice_function(feature_logits, dim=0) 87 | # ^--[in_features, num_trees, depth] 88 | 89 | feature_values = torch.einsum('bi,ind->bnd', input, feature_selectors) 90 | # ^--[batch_size, num_trees, depth] 91 | 92 | threshold_logits = (feature_values - self.feature_thresholds) * torch.exp(-self.log_temperatures) 93 | 94 | threshold_logits = torch.stack([-threshold_logits, threshold_logits], dim=-1) 95 | # ^--[batch_size, num_trees, depth, 2] 96 | 97 | bins = self.bin_function(threshold_logits) 98 | # ^--[batch_size, num_trees, depth, 2], approximately binary 99 | 100 | bin_matches = torch.einsum('btds,dcs->btdc', bins, self.bin_codes_1hot) 101 | # ^--[batch_size, num_trees, depth, 2 ** depth] 102 | 103 | response_weights = torch.prod(bin_matches, dim=-2) 104 | # ^-- [batch_size, num_trees, 2 ** depth] 105 | 106 | response = torch.einsum('bnd,ncd->bnc', response_weights, self.response) 107 | # ^-- [batch_size, num_trees, tree_dim] 108 | 109 | return response.flatten(1, 2) if self.flatten_output else response 110 | 111 | def initialize(self, input, eps=1e-6): 112 | # data-aware initializer 113 | assert len(input.shape) == 2 114 | if input.shape[0] < 1000: 115 | warn("Data-aware initialization is 
performed on less than 1000 data points. This may cause instability." 116 | "To avoid potential problems, run this model on a data batch with at least 1000 data samples." 117 | "You can do so manually before training. Use with torch.no_grad() for memory efficiency.") 118 | with torch.no_grad(): 119 | feature_selectors = self.choice_function(self.feature_selection_logits, dim=0) 120 | # ^--[in_features, num_trees, depth] 121 | 122 | feature_values = torch.einsum('bi,ind->bnd', input, feature_selectors) 123 | # ^--[batch_size, num_trees, depth] 124 | 125 | # initialize thresholds: sample random percentiles of data 126 | percentiles_q = 100 * np.random.beta(self.threshold_init_beta, self.threshold_init_beta, 127 | size=[self.num_trees, self.depth]) 128 | self.feature_thresholds.data[...] = torch.as_tensor( 129 | list(map(np.percentile, check_numpy(feature_values.flatten(1, 2).t()), percentiles_q.flatten())), 130 | dtype=feature_values.dtype, device=feature_values.device 131 | ).view(self.num_trees, self.depth) 132 | 133 | # init temperatures: make sure enough data points are in the linear region of sparse-sigmoid 134 | temperatures = np.percentile(check_numpy(abs(feature_values - self.feature_thresholds)), 135 | q=100 * min(1.0, self.threshold_init_cutoff), axis=0) 136 | 137 | # if threshold_init_cutoff > 1, scale everything down by it 138 | temperatures /= max(1.0, self.threshold_init_cutoff) 139 | self.log_temperatures.data[...] = torch.log(torch.as_tensor(temperatures) + eps) 140 | 141 | def __repr__(self): 142 | return "{}(in_features={}, num_trees={}, depth={}, tree_dim={}, flatten_output={})".format( 143 | self.__class__.__name__, self.feature_selection_logits.shape[0], 144 | self.num_trees, self.depth, self.tree_dim, self.flatten_output 145 | ) 146 | 147 | -------------------------------------------------------------------------------- /ResNet/lib/node/utils.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import contextlib 3 | import gc 4 | import glob 5 | import hashlib 6 | import os 7 | import time 8 | 9 | import numpy as np 10 | import requests 11 | import torch 12 | from tqdm import tqdm 13 | 14 | 15 | def download(url, filename, delete_if_interrupted=True, chunk_size=4096): 16 | """ saves file from url to filename with a fancy progressbar """ 17 | try: 18 | with open(filename, "wb") as f: 19 | print("Downloading {} > {}".format(url, filename)) 20 | response = requests.get(url, stream=True) 21 | total_length = response.headers.get('content-length') 22 | 23 | if total_length is None: # no content length header 24 | f.write(response.content) 25 | else: 26 | total_length = int(total_length) 27 | with tqdm(total=total_length) as progressbar: 28 | for data in response.iter_content(chunk_size=chunk_size): 29 | if data: # filter-out keep-alive chunks 30 | f.write(data) 31 | progressbar.update(len(data)) 32 | except Exception as e: 33 | if delete_if_interrupted: 34 | print("Removing incomplete download {}.".format(filename)) 35 | os.remove(filename) 36 | raise e 37 | return filename 38 | 39 | 40 | def iterate_minibatches(*tensors, batch_size, shuffle=True, epochs=1, 41 | allow_incomplete=True, callback=lambda x:x): 42 | indices = np.arange(len(tensors[0])) 43 | upper_bound = int((np.ceil if allow_incomplete else np.floor) (len(indices) / batch_size)) * batch_size 44 | epoch = 0 45 | while True: 46 | if shuffle: 47 | np.random.shuffle(indices) 48 | for batch_start in callback(range(0, upper_bound, 
batch_size)): 49 | batch_ix = indices[batch_start: batch_start + batch_size] 50 | batch = [tensor[batch_ix] for tensor in tensors] 51 | yield batch if len(tensors) > 1 else batch[0] 52 | epoch += 1 53 | if epoch >= epochs: 54 | break 55 | 56 | 57 | def process_in_chunks(function, *args, batch_size, out=None, **kwargs): 58 | """ 59 | Computes output by applying batch-parallel function to large data tensor in chunks 60 | :param function: a function(*[x[indices, ...] for x in args]) -> out[indices, ...] 61 | :param args: one or many tensors, each [num_instances, ...] 62 | :param batch_size: maximum chunk size processed in one go 63 | :param out: memory buffer for out, defaults to torch.zeros of appropriate size and type 64 | :returns: function(data), computed in a memory-efficient way 65 | """ 66 | total_size = args[0].shape[0] 67 | first_output = function(*[x[0: batch_size] for x in args]) 68 | output_shape = (total_size,) + tuple(first_output.shape[1:]) 69 | if out is None: 70 | out = torch.zeros(*output_shape, dtype=first_output.dtype, device=first_output.device, 71 | layout=first_output.layout, **kwargs) 72 | 73 | out[0: batch_size] = first_output 74 | for i in range(batch_size, total_size, batch_size): 75 | batch_ix = slice(i, min(i + batch_size, total_size)) 76 | out[batch_ix] = function(*[x[batch_ix] for x in args]) 77 | return out 78 | 79 | 80 | def check_numpy(x): 81 | """ Makes sure x is a numpy array """ 82 | if isinstance(x, torch.Tensor): 83 | x = x.detach().cpu().numpy() 84 | x = np.asarray(x) 85 | assert isinstance(x, np.ndarray) 86 | return x 87 | 88 | 89 | @contextlib.contextmanager 90 | def nop_ctx(): 91 | yield None 92 | 93 | 94 | def get_latest_file(pattern): 95 | list_of_files = glob.glob(pattern) # * means all if need specific format then *.csv 96 | assert len(list_of_files) > 0, "No files found: " + pattern 97 | return max(list_of_files, key=os.path.getctime) 98 | 99 | 100 | def md5sum(fname): 101 | """ Computes mdp checksum of a file """ 102 | hash_md5 = hashlib.md5() 103 | with open(fname, "rb") as f: 104 | for chunk in iter(lambda: f.read(4096), b""): 105 | hash_md5.update(chunk) 106 | return hash_md5.hexdigest() 107 | 108 | 109 | def free_memory(sleep_time=0.1): 110 | """ Black magic function to free torch memory and some jupyter whims """ 111 | gc.collect() 112 | torch.cuda.synchronize() 113 | gc.collect() 114 | torch.cuda.empty_cache() 115 | time.sleep(sleep_time) 116 | 117 | def to_float_str(element): 118 | try: 119 | return str(float(element)) 120 | except ValueError: 121 | return element 122 | -------------------------------------------------------------------------------- /ResNet/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | 
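                # Note: Kaiming (He) normal init with mode='fan_in' is matched to the
                # ReLU activations applied in forward(); the bias init below follows
                # nn.Linear's default uniform bound of 1/sqrt(fan_in).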
torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 | ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /ResNet/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . 
import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = 
Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /ResNet/resnet_ft.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import math 3 | import typing as ty 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import zero 11 | from torch import Tensor 12 | 13 | import lib 14 | 15 | 16 | # %% 17 | class ResNet(nn.Module): 18 | def __init__( 19 | self, 20 | *, 21 | d_numerical: int, 22 | categories: ty.Optional[ty.List[int]], 23 | d_embedding: int, 24 | d: int, 25 | d_hidden_factor: float, 26 | n_layers: int, 27 | activation: str, 28 | normalization: str, 29 | hidden_dropout: float, 30 | residual_dropout: float, 31 | d_out: int, 32 | ) -> None: 33 | super().__init__() 34 | 35 | def make_normalization(): 36 | return {'batchnorm': nn.BatchNorm1d, 'layernorm': nn.LayerNorm}[ 37 
| normalization 38 | ](d) 39 | 40 | self.main_activation = lib.get_activation_fn(activation) 41 | self.last_activation = lib.get_nonglu_activation_fn(activation) 42 | self.residual_dropout = residual_dropout 43 | self.hidden_dropout = hidden_dropout 44 | 45 | d_in = d_numerical 46 | d_hidden = int(d * d_hidden_factor) 47 | 48 | if categories is not None: 49 | d_in += len(categories) * d_embedding 50 | category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0) 51 | self.register_buffer('category_offsets', category_offsets) 52 | self.categories = torch.tensor(np.subtract(categories, 1).tolist()) 53 | self.category_embeddings = nn.Embedding(sum(categories), d_embedding) 54 | self.unknown_value = np.iinfo('int64').max - 3 55 | nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) 56 | print(f'{self.category_embeddings.weight.shape=}') 57 | 58 | self.first_layer = nn.Linear(d_in, d) 59 | self.layers = nn.ModuleList( 60 | [ 61 | nn.ModuleDict( 62 | { 63 | 'norm': make_normalization(), 64 | 'linear0': nn.Linear( 65 | d, d_hidden * (2 if activation.endswith('glu') else 1) 66 | ), 67 | 'linear1': nn.Linear(d_hidden, d), 68 | } 69 | ) 70 | for _ in range(n_layers) 71 | ] 72 | ) 73 | self.last_normalization = make_normalization() 74 | self.head = nn.Linear(d, d_out) 75 | 76 | def forward(self, x_num: Tensor, x_cat: ty.Optional[Tensor]) -> Tensor: 77 | x = [] 78 | if x_num is not None: 79 | x.append(x_num) 80 | if x_cat is not None: 81 | x_cat = torch.where(x_cat == self.unknown_value, self.categories.to(x_cat.device), x_cat) 82 | x.append( 83 | self.category_embeddings(x_cat + self.category_offsets[None]).view( 84 | x_cat.size(0), -1 85 | ) 86 | ) 87 | x = torch.cat(x, dim=-1) 88 | 89 | x = self.first_layer(x) 90 | for layer in self.layers: 91 | layer = ty.cast(ty.Dict[str, nn.Module], layer) 92 | z = x 93 | if x.shape[0] > 1: 94 | z = layer['norm'](z) 95 | z = layer['linear0'](z) 96 | z = self.main_activation(z) 97 | if self.hidden_dropout: 98 | z = F.dropout(z, self.hidden_dropout, self.training) 99 | z = layer['linear1'](z) 100 | if self.residual_dropout: 101 | z = F.dropout(z, self.residual_dropout, self.training) 102 | x = x + z 103 | if x.shape[0] > 1: 104 | x = self.last_normalization(x) 105 | x = self.last_activation(x) 106 | x = self.head(x) 107 | x = x.squeeze(-1) 108 | return x 109 | -------------------------------------------------------------------------------- /ResNet/resnext.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import math 3 | import typing as ty 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import zero 11 | from torch import Tensor 12 | 13 | import lib 14 | 15 | 16 | # %% 17 | class ResNext(nn.Module): 18 | def __init__( 19 | self, 20 | *, 21 | d_numerical: int, 22 | categories: ty.Optional[ty.List[int]], 23 | d_embedding: int, 24 | d: int, 25 | d_hidden_factor: float, 26 | n_layers: int, 27 | activation: str, 28 | normalization: str, 29 | hidden_dropout: float, 30 | residual_dropout: float, 31 | d_out: int, 32 | cardinality: int, 33 | ) -> None: 34 | super().__init__() 35 | 36 | def make_normalization(): 37 | return {'batchnorm': nn.BatchNorm1d, 'layernorm': nn.LayerNorm}[ 38 | normalization 39 | ](d) 40 | 41 | self.main_activation = lib.get_activation_fn(activation) 42 | self.last_activation = lib.get_nonglu_activation_fn(activation) 43 | self.residual_dropout = residual_dropout 44 | self.hidden_dropout = 
hidden_dropout 45 | self.cardinality = cardinality 46 | 47 | d_in = d_numerical 48 | d_hidden = int(d * d_hidden_factor) 49 | d_hidden_per_path = int(d_hidden / self.cardinality) 50 | 51 | if categories is not None: 52 | d_in += len(categories) * d_embedding 53 | category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0) 54 | self.register_buffer('category_offsets', category_offsets) 55 | self.category_embeddings = nn.Embedding(sum(categories), d_embedding) 56 | self.categories = torch.tensor(np.subtract(categories, 1).tolist()) 57 | self.unknown_value = np.iinfo('int64').max - 3 58 | nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) 59 | print(f'{self.category_embeddings.weight.shape=}') 60 | 61 | self.first_layer = nn.Linear(d_in, d) 62 | self.layers = nn.ModuleList( 63 | [ 64 | nn.ModuleDict( 65 | { 66 | 'norm': make_normalization(), 67 | 'linear0': nn.ModuleList([nn.Linear(d, d_hidden_per_path) for _ in range(cardinality)]), 68 | 'linear1': nn.ModuleList([nn.Linear(d_hidden_per_path, d) for _ in range(cardinality)]), 69 | } 70 | ) 71 | for _ in range(n_layers) 72 | ] 73 | ) 74 | self.last_normalization = make_normalization() 75 | self.head = nn.Linear(d, d_out) 76 | 77 | def forward(self, x_num: Tensor, x_cat: ty.Optional[Tensor]) -> Tensor: 78 | x = [] 79 | if x_num is not None: 80 | x.append(x_num) 81 | if x_cat is not None: 82 | x_cat = torch.where(x_cat == self.unknown_value, self.categories.to(x_cat.device), x_cat) 83 | x.append( 84 | self.category_embeddings(x_cat + self.category_offsets[None]).view( 85 | x_cat.size(0), -1 86 | ) 87 | ) 88 | x = torch.cat(x, dim=-1) 89 | 90 | x = self.first_layer(x) 91 | for layer in self.layers: 92 | layer = ty.cast(ty.Dict[str, nn.Module], layer) 93 | z = x 94 | z = layer['norm'](z) if z.shape[0] > 1 else z 95 | path_outputs = [] 96 | for i in range(self.cardinality): 97 | path_output = layer['linear0'][i](z) 98 | path_output = self.main_activation(path_output) 99 | if self.hidden_dropout: 100 | path_output = F.dropout(path_output, p=self.hidden_dropout, training=self.training) 101 | path_output = layer['linear1'][i](path_output) 102 | if self.residual_dropout: 103 | path_output = F.dropout(path_output, self.residual_dropout, self.training) 104 | path_outputs.append(path_output) 105 | z = sum(path_outputs) 106 | x = x + z 107 | x = self.last_normalization(x) if x.shape[0] > 1 else x 108 | x = self.last_activation(x) 109 | x = self.head(x) 110 | x = x.squeeze(-1) 111 | return x 112 | -------------------------------------------------------------------------------- /TabNet/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pytorch_tabnet.utils import define_device 3 | import numpy as np 4 | 5 | 6 | class RegressionSMOTE(): 7 | """ 8 | Apply SMOTE 9 | 10 | This will average a percentage p of the elements in the batch with other elements. 11 | The target will be averaged as well (this might work with binary classification 12 | and certain loss), following a beta distribution. 13 | """ 14 | def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): 15 | "" 16 | self.seed = seed 17 | self._set_seed() 18 | self.device = define_device(device_name) 19 | self.alpha = alpha 20 | self.beta = beta 21 | self.p = p 22 | if (p < 0.) or (p > 1.0): 23 | raise ValueError("Value of p should be between 0. 
and 1.") 24 | 25 | def _set_seed(self): 26 | torch.manual_seed(self.seed) 27 | np.random.seed(self.seed) 28 | return 29 | 30 | def __call__(self, X, y): 31 | batch_size = X.shape[0] 32 | random_values = torch.rand(batch_size, device=self.device) 33 | idx_to_change = random_values < self.p 34 | 35 | # ensure that first element to switch has probability > 0.5 36 | np_betas = np.random.beta(self.alpha, self.beta, batch_size) / 2 + 0.5 37 | random_betas = torch.from_numpy(np_betas).to(self.device).float() 38 | index_permute = torch.randperm(batch_size, device=self.device) 39 | 40 | X[idx_to_change] = random_betas[idx_to_change, None] * X[idx_to_change] 41 | X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view(X[idx_to_change].size()) # noqa 42 | 43 | y[idx_to_change] = random_betas[idx_to_change, None] * y[idx_to_change] 44 | y[idx_to_change] += (1 - random_betas[idx_to_change, None]) * y[index_permute][idx_to_change].view(y[idx_to_change].size()) # noqa 45 | 46 | return X, y 47 | 48 | 49 | class ClassificationSMOTE(): 50 | """ 51 | Apply SMOTE for classification tasks. 52 | 53 | This will average a percentage p of the elements in the batch with other elements. 54 | The target will stay unchanged and keep the value of the most important row in the mix. 55 | """ 56 | def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): 57 | "" 58 | self.seed = seed 59 | self._set_seed() 60 | self.device = define_device(device_name) 61 | self.alpha = alpha 62 | self.beta = beta 63 | self.p = p 64 | if (p < 0.) or (p > 1.0): 65 | raise ValueError("Value of p should be between 0. and 1.") 66 | 67 | def _set_seed(self): 68 | torch.manual_seed(self.seed) 69 | np.random.seed(self.seed) 70 | return 71 | 72 | def __call__(self, X, y): 73 | batch_size = X.shape[0] 74 | random_values = torch.rand(batch_size, device=self.device) 75 | idx_to_change = random_values < self.p 76 | 77 | # ensure that first element to switch has probability > 0.5 78 | np_betas = np.random.beta(self.alpha, self.beta, batch_size) / 2 + 0.5 79 | random_betas = torch.from_numpy(np_betas).to(self.device).float() 80 | index_permute = torch.randperm(batch_size, device=self.device) 81 | 82 | X[idx_to_change] = random_betas[idx_to_change, None] * X[idx_to_change] 83 | X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view(X[idx_to_change].size()) # noqa 84 | 85 | return X, y 86 | -------------------------------------------------------------------------------- /TabNet/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from icecream import install 2 | 3 | install() 4 | 5 | from . import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .metrics import * # noqa 9 | from .util import * # noqa 10 | -------------------------------------------------------------------------------- /TabNet/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | -------------------------------------------------------------------------------- /TabNet/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . 
import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | if y_info: 21 | if y_info['policy'] == 'mean_std': 22 | rmse *= y_info['std'] 23 | else: 24 | assert False 25 | return {'rmse': rmse, 'score': -rmse} 26 | else: 27 | assert task_type in (util.BINCLASS, util.MULTICLASS) 28 | labels = None 29 | if classification_mode == 'probs': 30 | probs = prediction 31 | elif classification_mode == 'logits': 32 | probs = ( 33 | scipy.special.expit(prediction) 34 | if task_type == util.BINCLASS 35 | else scipy.special.softmax(prediction, axis=1) 36 | ) 37 | else: 38 | assert classification_mode == 'labels' 39 | probs = None 40 | labels = prediction 41 | if labels is None: 42 | labels = ( 43 | np.round(probs).astype('int64') 44 | if task_type == util.BINCLASS 45 | else probs.argmax(axis=1) # type: ignore[code] 46 | ) 47 | 48 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 49 | if task_type == util.BINCLASS: 50 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 51 | else: 52 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 53 | result['score'] = result['roc_auc'] # type: ignore[code] 54 | return result # type: ignore[code] 55 | 56 | 57 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 58 | precision = 3 59 | summary = {} 60 | for k, v in metrics[1].items(): 61 | if k.isdigit(): 62 | continue 63 | k = { 64 | 'score': 'SCORE', 65 | 'accuracy': 'acc', 66 | 'roc_auc': 'roc_auc', 67 | 'macro avg': 'm', 68 | 'weighted avg': 'w', 69 | }.get(k, k) 70 | if isinstance(v, float): 71 | v = round(v, precision) 72 | summary[k] = v 73 | else: 74 | v = { 75 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 76 | x, x 77 | ): round(v[x], precision) 78 | for x in v 79 | } 80 | for item in v.items(): 81 | summary[k + item[0]] = item[1] 82 | 83 | s = [f'score = {summary.pop("SCORE"):.3f}'] 84 | for k, v in summary.items(): 85 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 86 | s.append(f'{k} = {v}') 87 | return ' | '.join(s) 88 | -------------------------------------------------------------------------------- /TabNet/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 
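                    # Note: U(-1/sqrt(fan_in), 1/sqrt(fan_in)) mirrors nn.Linear's
                    # default bias initialization for this randomly initialized
                    # data-generating MLP.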
37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 | ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /TabNet/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . 
import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = 
Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /TabNet/multitask.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.special import softmax 4 | from pytorch_tabnet.utils import SparsePredictDataset, PredictDataset, filter_weights 5 | from pytorch_tabnet.abstract_model import TabModel 6 | from pytorch_tabnet.multiclass_utils import infer_multitask_output, check_output_dim 7 | from torch.utils.data import DataLoader 8 | import scipy 9 | 10 | 11 | class TabNetMultiTaskClassifier(TabModel): 12 | def __post_init__(self): 13 | super(TabNetMultiTaskClassifier, self).__post_init__() 14 | self._task = 'classification' 15 | self._default_loss = torch.nn.functional.cross_entropy 16 | self._default_metric = 'logloss' 17 | 18 | def prepare_target(self, y): 19 | y_mapped = y.copy() 20 | for task_idx in range(y.shape[1]): 21 | 
task_mapper = self.target_mapper[task_idx] 22 | y_mapped[:, task_idx] = np.vectorize(task_mapper.get)(y[:, task_idx]) 23 | return y_mapped 24 | 25 | def compute_loss(self, y_pred, y_true): 26 | """ 27 | Computes the loss according to network output and targets 28 | 29 | Parameters 30 | ---------- 31 | y_pred : list of tensors 32 | Output of network 33 | y_true : LongTensor 34 | Targets label encoded 35 | 36 | Returns 37 | ------- 38 | loss : torch.Tensor 39 | output of loss function(s) 40 | 41 | """ 42 | loss = 0 43 | y_true = y_true.long() 44 | if isinstance(self.loss_fn, list): 45 | # if you specify a different loss for each task 46 | for task_loss, task_output, task_id in zip( 47 | self.loss_fn, y_pred, range(len(self.loss_fn)) 48 | ): 49 | loss += task_loss(task_output, y_true[:, task_id]) 50 | else: 51 | # same loss function is applied to all tasks 52 | for task_id, task_output in enumerate(y_pred): 53 | loss += self.loss_fn(task_output, y_true[:, task_id]) 54 | 55 | loss /= len(y_pred) 56 | return loss 57 | 58 | def stack_batches(self, list_y_true, list_y_score): 59 | y_true = np.vstack(list_y_true) 60 | y_score = [] 61 | for i in range(len(self.output_dim)): 62 | score = np.vstack([x[i] for x in list_y_score]) 63 | score = softmax(score, axis=1) 64 | y_score.append(score) 65 | return y_true, y_score 66 | 67 | def update_fit_params(self, X_train, y_train, eval_set, weights): 68 | output_dim, train_labels = infer_multitask_output(y_train) 69 | for _, y in eval_set: 70 | for task_idx in range(y.shape[1]): 71 | check_output_dim(train_labels[task_idx], y[:, task_idx]) 72 | self.output_dim = output_dim 73 | self.classes_ = train_labels 74 | self.target_mapper = [ 75 | {class_label: index for index, class_label in enumerate(classes)} 76 | for classes in self.classes_ 77 | ] 78 | self.preds_mapper = [ 79 | {str(index): str(class_label) for index, class_label in enumerate(classes)} 80 | for classes in self.classes_ 81 | ] 82 | self.updated_weights = weights 83 | filter_weights(self.updated_weights) 84 | 85 | def predict(self, X): 86 | """ 87 | Make predictions on a batch (valid) 88 | 89 | Parameters 90 | ---------- 91 | X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` 92 | Input data 93 | 94 | Returns 95 | ------- 96 | results : np.array 97 | Predictions of the most probable class 98 | """ 99 | self.network.eval() 100 | 101 | if scipy.sparse.issparse(X): 102 | dataloader = DataLoader( 103 | SparsePredictDataset(X), 104 | batch_size=self.batch_size, 105 | shuffle=False, 106 | ) 107 | else: 108 | dataloader = DataLoader( 109 | PredictDataset(X), 110 | batch_size=self.batch_size, 111 | shuffle=False, 112 | ) 113 | 114 | results = {} 115 | for data in dataloader: 116 | data = data.to(self.device).float() 117 | output, _ = self.network(data) 118 | predictions = [ 119 | torch.argmax(torch.nn.Softmax(dim=1)(task_output), dim=1) 120 | .cpu() 121 | .detach() 122 | .numpy() 123 | .reshape(-1) 124 | for task_output in output 125 | ] 126 | 127 | for task_idx in range(len(self.output_dim)): 128 | results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] 129 | # stack all task individually 130 | results = [np.hstack(task_res) for task_res in results.values()] 131 | # map all task individually 132 | results = [ 133 | np.vectorize(self.preds_mapper[task_idx].get)(task_res.astype(str)) 134 | for task_idx, task_res in enumerate(results) 135 | ] 136 | return results 137 | 138 | def predict_proba(self, X): 139 | """ 140 | Make predictions for classification on a batch (valid) 141 
| 142 | Parameters 143 | ---------- 144 | X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` 145 | Input data 146 | 147 | Returns 148 | ------- 149 | res : list of np.ndarray 150 | 151 | """ 152 | self.network.eval() 153 | 154 | if scipy.sparse.issparse(X): 155 | dataloader = DataLoader( 156 | SparsePredictDataset(X), 157 | batch_size=self.batch_size, 158 | shuffle=False, 159 | ) 160 | else: 161 | dataloader = DataLoader( 162 | PredictDataset(X), 163 | batch_size=self.batch_size, 164 | shuffle=False, 165 | ) 166 | 167 | results = {} 168 | for data in dataloader: 169 | data = data.to(self.device).float() 170 | output, _ = self.network(data) 171 | predictions = [ 172 | torch.nn.Softmax(dim=1)(task_output).cpu().detach().numpy() 173 | for task_output in output 174 | ] 175 | for task_idx in range(len(self.output_dim)): 176 | results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] 177 | res = [np.vstack(task_res) for task_res in results.values()] 178 | return res 179 | -------------------------------------------------------------------------------- /TabNet/pretraining_utils.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from pytorch_tabnet.utils import ( 3 | create_sampler, 4 | SparsePredictDataset, 5 | PredictDataset, 6 | check_input 7 | ) 8 | import scipy 9 | 10 | 11 | def create_dataloaders( 12 | X_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory 13 | ): 14 | """ 15 | Create dataloaders with or without subsampling depending on weights and balanced. 16 | 17 | Parameters 18 | ---------- 19 | X_train : np.ndarray or scipy.sparse.csr_matrix 20 | Training data 21 | eval_set : list of np.array (for Xs and ys) or scipy.sparse.csr_matrix (for Xs) 22 | List of eval sets 23 | weights : either 0, 1, dict or iterable 24 | if 0 (default) : no weights will be applied 25 | if 1 : classification only, will balanced class with inverse frequency 26 | if dict : keys are corresponding class values are sample weights 27 | if iterable : list or np array must be of length equal to nb elements 28 | in the training set 29 | batch_size : int 30 | how many samples per batch to load 31 | num_workers : int 32 | how many subprocesses to use for data loading. 0 means that the data 33 | will be loaded in the main process 34 | drop_last : bool 35 | set to True to drop the last incomplete batch, if the dataset size is not 36 | divisible by the batch size. 
If False and the size of dataset is not 37 | divisible by the batch size, then the last batch will be smaller 38 | pin_memory : bool 39 | Whether to pin GPU memory during training 40 | 41 | Returns 42 | ------- 43 | train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader 44 | Training and validation dataloaders 45 | """ 46 | need_shuffle, sampler = create_sampler(weights, X_train) 47 | 48 | if scipy.sparse.issparse(X_train): 49 | train_dataloader = DataLoader( 50 | SparsePredictDataset(X_train), 51 | batch_size=batch_size, 52 | sampler=sampler, 53 | shuffle=need_shuffle, 54 | num_workers=num_workers, 55 | drop_last=drop_last, 56 | pin_memory=pin_memory, 57 | ) 58 | else: 59 | train_dataloader = DataLoader( 60 | PredictDataset(X_train), 61 | batch_size=batch_size, 62 | sampler=sampler, 63 | shuffle=need_shuffle, 64 | num_workers=num_workers, 65 | drop_last=drop_last, 66 | pin_memory=pin_memory, 67 | ) 68 | 69 | valid_dataloaders = [] 70 | for X in eval_set: 71 | if scipy.sparse.issparse(X): 72 | valid_dataloaders.append( 73 | DataLoader( 74 | SparsePredictDataset(X), 75 | batch_size=batch_size, 76 | sampler=sampler, 77 | shuffle=need_shuffle, 78 | num_workers=num_workers, 79 | drop_last=drop_last, 80 | pin_memory=pin_memory, 81 | ) 82 | ) 83 | else: 84 | valid_dataloaders.append( 85 | DataLoader( 86 | PredictDataset(X), 87 | batch_size=batch_size, 88 | sampler=sampler, 89 | shuffle=need_shuffle, 90 | num_workers=num_workers, 91 | drop_last=drop_last, 92 | pin_memory=pin_memory, 93 | ) 94 | ) 95 | 96 | return train_dataloader, valid_dataloaders 97 | 98 | 99 | def validate_eval_set(eval_set, eval_name, X_train): 100 | """Check if the shapes of eval_set are compatible with X_train. 101 | 102 | Parameters 103 | ---------- 104 | eval_set : List of numpy array 105 | The list evaluation set. 106 | The last one is used for early stopping 107 | X_train : np.ndarray 108 | Train owned products 109 | 110 | Returns 111 | ------- 112 | eval_names : list of str 113 | Validated list of eval_names. 114 | 115 | """ 116 | eval_names = eval_name or [f"val_{i}" for i in range(len(eval_set))] 117 | assert len(eval_set) == len( 118 | eval_names 119 | ), "eval_set and eval_name have not the same length" 120 | 121 | for set_nb, X in enumerate(eval_set): 122 | check_input(X) 123 | msg = ( 124 | f"Number of columns is different between eval set {set_nb}" 125 | + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" 126 | ) 127 | assert X.shape[1] == X_train.shape[1], msg 128 | return eval_names 129 | -------------------------------------------------------------------------------- /TabNet/tab_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.special import softmax 4 | from tabnet_utils import SparsePredictDataset, PredictDataset, filter_weights 5 | from abstract_model import TabModel 6 | from multiclass_utils import infer_output_dim, check_output_dim 7 | from torch.utils.data import DataLoader 8 | import scipy 9 | 10 | 11 | class TabNetClassifier(TabModel): 12 | def __post_init__(self): 13 | super(TabNetClassifier, self).__post_init__() 14 | self._task = 'classification' 15 | self._default_loss = torch.nn.functional.cross_entropy 16 | self._default_metric = 'accuracy' 17 | 18 | def weight_updater(self, weights): 19 | """ 20 | Updates weights dictionary according to target_mapper. 21 | 22 | Parameters 23 | ---------- 24 | weights : bool or dict 25 | Given weights for balancing training. 
26 | 27 | Returns 28 | ------- 29 | bool or dict 30 | Same bool if weights are bool, updated dict otherwise. 31 | 32 | """ 33 | if isinstance(weights, int): 34 | return weights 35 | elif isinstance(weights, dict): 36 | return {self.target_mapper[key]: value for key, value in weights.items()} 37 | else: 38 | return weights 39 | 40 | def prepare_target(self, y): 41 | return np.vectorize(self.target_mapper.get)(y) 42 | 43 | def compute_loss(self, y_pred, y_true): 44 | return self.loss_fn(y_pred, y_true.long()) 45 | 46 | def update_fit_params( 47 | self, 48 | X_train, 49 | y_train, 50 | eval_set, 51 | weights, 52 | ): 53 | output_dim, train_labels = infer_output_dim(y_train) 54 | for X, y in eval_set: 55 | check_output_dim(train_labels, y) 56 | self.output_dim = output_dim 57 | self._default_metric = ('auc' if self.output_dim == 2 else 'accuracy') 58 | self.classes_ = train_labels 59 | self.target_mapper = { 60 | class_label: index for index, class_label in enumerate(self.classes_) 61 | } 62 | self.preds_mapper = { 63 | str(index): class_label for index, class_label in enumerate(self.classes_) 64 | } 65 | self.updated_weights = self.weight_updater(weights) 66 | 67 | def stack_batches(self, list_y_true, list_y_score): 68 | y_true = np.hstack(list_y_true) 69 | y_score = np.vstack(list_y_score) 70 | y_score = softmax(y_score, axis=1) 71 | return y_true, y_score 72 | 73 | def predict_func(self, outputs): 74 | outputs = np.argmax(outputs, axis=1) 75 | return np.vectorize(self.preds_mapper.get)(outputs.astype(str)) 76 | 77 | def predict_proba(self, X): 78 | """ 79 | Make predictions for classification on a batch (valid) 80 | 81 | Parameters 82 | ---------- 83 | X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` 84 | Input data 85 | 86 | Returns 87 | ------- 88 | res : np.ndarray 89 | 90 | """ 91 | self.network.eval() 92 | 93 | if scipy.sparse.issparse(X): 94 | dataloader = DataLoader( 95 | SparsePredictDataset(X), 96 | batch_size=self.batch_size, 97 | shuffle=False, 98 | ) 99 | else: 100 | dataloader = DataLoader( 101 | PredictDataset(X), 102 | batch_size=self.batch_size, 103 | shuffle=False, 104 | ) 105 | 106 | results = [] 107 | for batch_nb, data in enumerate(dataloader): 108 | data = data.to(self.device).float() 109 | 110 | output, M_loss = self.network(data) 111 | predictions = torch.nn.Softmax(dim=1)(output).cpu().detach().numpy() 112 | results.append(predictions) 113 | res = np.vstack(results) 114 | return res 115 | 116 | 117 | class TabNetRegressor(TabModel): 118 | def __post_init__(self): 119 | super(TabNetRegressor, self).__post_init__() 120 | self._task = 'regression' 121 | self._default_loss = torch.nn.functional.mse_loss 122 | self._default_metric = 'mse' 123 | 124 | def prepare_target(self, y): 125 | return y 126 | 127 | def compute_loss(self, y_pred, y_true): 128 | return self.loss_fn(y_pred, y_true) 129 | 130 | def update_fit_params( 131 | self, 132 | X_train, 133 | y_train, 134 | eval_set, 135 | weights 136 | ): 137 | if len(y_train.shape) != 2: 138 | msg = "Targets should be 2D : (n_samples, n_regression) " + \ 139 | f"but y_train.shape={y_train.shape} given.\n" + \ 140 | "Use reshape(-1, 1) for single regression." 
141 | raise ValueError(msg) 142 | self.output_dim = y_train.shape[1] 143 | self.preds_mapper = None 144 | 145 | self.updated_weights = weights 146 | filter_weights(self.updated_weights) 147 | 148 | def predict_func(self, outputs): 149 | return outputs 150 | 151 | def stack_batches(self, list_y_true, list_y_score): 152 | y_true = np.vstack(list_y_true) 153 | y_score = np.vstack(list_y_score) 154 | return y_true, y_score 155 | -------------------------------------------------------------------------------- /TabNet/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def set_random_seed(seed): 7 | """ 8 | Set the seed for random number generation in Python, NumPy, and PyTorch. 9 | 10 | Args: 11 | seed (int): The seed value to use for all random number generators. 12 | """ 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | 17 | if torch.cuda.is_available(): 18 | torch.cuda.manual_seed(seed) 19 | torch.cuda.manual_seed_all(seed) 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /TabPFN/run_tabpfn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openml 6 | from sklearn.metrics import accuracy_score, roc_auc_score 7 | from sklearn.model_selection import StratifiedKFold 8 | from sklearn.preprocessing import LabelEncoder 9 | 10 | from tabpfn import TabPFNClassifier 11 | from utils import set_random_seed 12 | import torch 13 | import numpy as np 14 | import pandas as pd 15 | 16 | 17 | def main(args: argparse.Namespace): 18 | 19 | seed = args.seed 20 | set_random_seed(seed) 21 | outer_fold = args.outer_fold 22 | dataset_id = args.dataset_id 23 | dataset = openml.datasets.get_dataset(dataset_id, download_data=False) 24 | 25 | X, y, categorical_indicator, attribute_names = dataset.get_data( 26 | dataset_format='dataframe', 27 | target=dataset.default_target_attribute, 28 | ) 29 | 30 | categorical_column_names = X.columns[categorical_indicator] 31 | X = pd.get_dummies(X, columns=categorical_column_names) 32 | 33 | label_encoder = LabelEncoder() 34 | label_encoder.fit(y) 35 | 36 | skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 37 | splits = list(skf.split(X, y)) 38 | train_idx, test_idx = splits[outer_fold] 39 | nr_classes = len(np.unique(y)) 40 | X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] 41 | y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] 42 | 43 | y_train = label_encoder.transform(y_train) 44 | y_test = label_encoder.transform(y_test) 45 | 46 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 47 | classifier = TabPFNClassifier(device=device, seed=seed, N_ensemble_configurations=32) 48 | 49 | classifier.fit(X_train, y_train) 50 | p_eval = classifier.predict_proba(X_test) 51 | y_eval = classifier.predict(X_test) 52 | if nr_classes == 2: 53 | p_eval = p_eval[:, 1] 54 | 55 | auroc_test_value = roc_auc_score(y_test, p_eval, multi_class='ovo') 56 | 57 | acc_test_value = accuracy_score(y_test, y_eval) 58 | 59 | result_path = os.path.join( 60 | args.output_dir, 61 | 'tabpfn', 62 | f'{dataset_id}', 63 | f'{outer_fold}', 64 | ) 65 | 66 | os.makedirs(result_path, exist_ok=True) 67 | result_dict = { 68 | 'test_auroc': auroc_test_value, 69 | 'test_acc': acc_test_value, 70 | } 71 | 72 | 
with open(os.path.join(result_path, 'result.json'), 'w') as f: 73 | json.dump(result_dict, f) 74 | 75 | 76 | if __name__ == "__main__": 77 | 78 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 79 | 80 | parser.add_argument( 81 | '--seed', 82 | type=int, 83 | default=0, 84 | help='Random seed', 85 | ) 86 | parser.add_argument( 87 | '--outer_fold', 88 | type=int, 89 | default=2, 90 | help='Outer fold iteration.', 91 | ) 92 | parser.add_argument( 93 | '--dataset_id', 94 | type=int, 95 | default=31, 96 | help='Dataset id', 97 | ) 98 | parser.add_argument( 99 | '--output_dir', 100 | type=str, 101 | default='.', 102 | help='Directory to save the results', 103 | ) 104 | 105 | args = parser.parse_args() 106 | 107 | main(args) -------------------------------------------------------------------------------- /TabPFN/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def set_random_seed(seed): 7 | """ 8 | Set the seed for random number generation in Python, NumPy, and PyTorch. 9 | 10 | Args: 11 | seed (int): The seed value to use for all random number generators. 12 | """ 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | 17 | if torch.cuda.is_available(): 18 | torch.cuda.manual_seed(seed) 19 | torch.cuda.manual_seed_all(seed) 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /XGBoost/evaluate_30_trials.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import time 4 | 5 | import numpy as np 6 | import optuna 7 | import scipy 8 | import zero 9 | import torch.nn as nn 10 | import torch 11 | import torch.nn.functional as F 12 | from sklearn.metrics import roc_auc_score, accuracy_score 13 | from xgboost import XGBClassifier 14 | 15 | import lib 16 | import wandb 17 | 18 | from sklearn.model_selection import StratifiedKFold 19 | from utils import set_random_seed 20 | 21 | # Create the parser 22 | parser = argparse.ArgumentParser(description="Train a model with specified parameters.") 23 | 24 | # Add the arguments 25 | parser.add_argument('--experiment_name', type=str, default='test', 26 | help='The name of the experiment. Default is "test".') 27 | parser.add_argument('--dataset', type=int, default=54, 28 | help='The dataset ID to use. Default is 45068 (adult).') 29 | parser.add_argument('--seed', type=int, default=0, 30 | help='The random seed for reproducibility. Default is 42.') 31 | parser.add_argument('--normalization', type=str, default='quantile', choices=['quantile', 'standard'], 32 | help='The normalization to use for the numerical features. Default is "quantile".') 33 | parser.add_argument('--cat_nan_policy', type=str, default='new', choices=['new', 'most_frequent'], 34 | help='The policy to use for handling nan values in categorical features. Default is "new".') 35 | parser.add_argument('--cat_policy', type=str, default='indices', choices=['indices', 'ohe'], 36 | help='The policy to use for handling categorical features. Default is "indices".') 37 | parser.add_argument('--outer_fold', type=int, default=0, help='The outer fold to use. Default is 0') 38 | parser.add_argument('--n_trials', type=int, default=100, 39 | help='The number of trials to use for HPO. 
Default is 100') 40 | parser.add_argument('--tune', action='store_true', help='Whether to tune the hyperparameters using Optuna') 41 | 42 | args = parser.parse_args() 43 | 44 | 45 | def load_best_config(project_name, dataset_name, outer_fold, num_trials=30): 46 | api = wandb.Api() 47 | target_run_name = f"{dataset_name}_outerFold_{outer_fold}" 48 | runs = api.runs(project_name) 49 | 50 | target_run = None 51 | for run in runs: 52 | if run.name == target_run_name: 53 | target_run = run 54 | break 55 | 56 | if not target_run: 57 | raise ValueError(f"No run found with name: {target_run_name}") 58 | 59 | # First scan for the best average_test_rocauc 60 | best_rocauc = 0 # Looking for the highest rocauc 61 | best_step = None 62 | history = target_run.scan_history(keys=['average_test_rocauc']) 63 | for i, row in enumerate(history): 64 | if i >= num_trials: 65 | break 66 | if 'average_test_rocauc' in row and row['average_test_rocauc'] > best_rocauc: 67 | best_rocauc = row['average_test_rocauc'] 68 | best_step = i 69 | 70 | if best_step is None: 71 | raise ValueError("Best rocauc not found within the first 30 trials") 72 | 73 | # Second scan for the HPs at the best step 74 | hp_keys = ['max_depth', 'min_child_weight', 'subsample', 'learning_rate', 'colsample_bylevel', 'colsample_bytree', 75 | 'gamma', 'reg_lambda', 'reg_alpha'] 76 | best_config = None 77 | history = target_run.scan_history(keys=hp_keys) 78 | for i, row in enumerate(history): 79 | if i == best_step: 80 | best_config = {key: row[key] for key in hp_keys if key in row} 81 | break 82 | 83 | if best_config: 84 | return best_config 85 | else: 86 | raise ValueError("HPs not found for the best rocauc step") 87 | 88 | 89 | def run_single_outer_fold(outer_fold, D, outer_folds): 90 | outer_train_idx, outer_test_idx = outer_folds[outer_fold] 91 | 92 | best_params = load_best_config('t4tab/XGBoost_optuna', D.info['dataset_name'], args.outer_fold) 93 | 94 | hyperparameters = { 95 | 'max_depth': best_params['max_depth'], 96 | 'min_child_weight': best_params['min_child_weight'], 97 | 'subsample': best_params['subsample'], 98 | 'learning_rate': best_params['learning_rate'], 99 | 'colsample_bylevel': best_params['colsample_bylevel'], 100 | 'colsample_bytree': best_params['colsample_bytree'], 101 | 'gamma': best_params['gamma'], 102 | 'reg_lambda': best_params['reg_lambda'], 103 | 'reg_alpha': best_params['reg_alpha'] 104 | } 105 | X_outer_preprocessed = D.build_X( 106 | normalization='quantile', 107 | num_nan_policy='mean', 108 | cat_nan_policy='new', 109 | cat_policy='ohe', 110 | seed=args.seed, 111 | train_idx=outer_train_idx, 112 | test_idx=outer_test_idx, 113 | ) 114 | set_random_seed(args.seed) 115 | Y, y_info = D.build_y(train_idx=outer_train_idx, test_idx=outer_test_idx) 116 | 117 | booster = "gbtree" 118 | early_stopping_rounds = 50 119 | n_estimators = 2000 120 | eval_metric = 'auc' 121 | model = XGBClassifier(booster=booster, 122 | n_estimators=n_estimators, 123 | tree_method='gpu_hist', 124 | disable_default_eval_metric=True, 125 | use_label_encoder=False) 126 | if args.tune: 127 | model.set_params(**hyperparameters) 128 | unique_classes, class_counts = np.unique(Y[outer_train_idx], axis=0, return_counts=True) 129 | nr_classes = len(unique_classes) 130 | 131 | model.fit(X_outer_preprocessed[outer_train_idx], Y[outer_train_idx], 132 | eval_set=[(X_outer_preprocessed[outer_test_idx], Y[outer_test_idx])], 133 | eval_metric=custom_auc_eval if D.is_multiclass else eval_metric, 134 | early_stopping_rounds=early_stopping_rounds, 135 | 
verbose=False) 136 | 137 | train_predictions_labels = model.predict(X_outer_preprocessed[outer_train_idx]) 138 | test_predictions_labels = model.predict(X_outer_preprocessed[outer_test_idx]) 139 | if D.is_multiclass: 140 | train_predictions_probabilities = model.predict_proba(X_outer_preprocessed[outer_train_idx]) 141 | test_predictions_probabilities = model.predict_proba(X_outer_preprocessed[outer_test_idx]) 142 | else: 143 | train_predictions_probabilities = model.predict_proba(X_outer_preprocessed[outer_train_idx])[:, 1] 144 | test_predictions_probabilities = model.predict_proba(X_outer_preprocessed[outer_test_idx])[:, 1] 145 | 146 | # calculate the balanced accuracy 147 | train_rocauc = roc_auc_score(Y[outer_train_idx], train_predictions_probabilities, 148 | multi_class='raise' if nr_classes == 2 else 'ovo') 149 | train_accuracy = accuracy_score(Y[outer_train_idx], train_predictions_labels) 150 | test_rocauc = roc_auc_score(Y[outer_test_idx], test_predictions_probabilities, 151 | multi_class='raise' if nr_classes == 2 else 'ovo') 152 | test_accuracy = accuracy_score(Y[outer_test_idx], test_predictions_labels) 153 | print(f"Finished outer fold {outer_fold}") 154 | 155 | output_info = { 156 | 'train_rocauc': train_rocauc, 157 | 'train_accuracy': train_accuracy, 158 | 'test_accuracy': test_accuracy, 159 | f'best_test_rocauc_outer_fold_{outer_fold}': test_rocauc, 160 | } 161 | wandb.log(output_info) 162 | wandb.finish() 163 | 164 | 165 | def custom_auc_eval(y_pred, dtrain): 166 | y_true = dtrain.get_label() 167 | 168 | y_pred = scipy.special.softmax(y_pred, axis=1) 169 | y_pred_sums = np.sum(y_pred, axis=1) 170 | if not np.allclose(y_pred_sums, 1.0): 171 | print("Probabilities do not sum to 1.0 for some instances.") 172 | y_pred = y_pred / y_pred_sums[:, np.newaxis] 173 | auc = roc_auc_score(y_true, y_pred, multi_class='ovo') 174 | 175 | return 'auc', auc 176 | 177 | 178 | if __name__ == "__main__": 179 | # %% 180 | set_random_seed(args.seed) 181 | D = lib.Dataset.from_openml(args.dataset) 182 | run_name = f"{D.info['dataset_name']}_outerFold_{args.outer_fold}" 183 | wandb.init(project=args.experiment_name, 184 | name=run_name, 185 | config=args) 186 | outer_kfold = StratifiedKFold(n_splits=10, shuffle=True) 187 | outer_folds = list(outer_kfold.split(D.X, D.y)) 188 | run_single_outer_fold(args.outer_fold, D, outer_folds) -------------------------------------------------------------------------------- /XGBoost/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from icecream import install 2 | 3 | install() 4 | 5 | from . import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .metrics import * # noqa 9 | from .util import * # noqa 10 | -------------------------------------------------------------------------------- /XGBoost/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | -------------------------------------------------------------------------------- /XGBoost/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . 
import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | 21 | return {'rmse': rmse, 'score': -rmse} 22 | else: 23 | assert task_type in (util.BINCLASS, util.MULTICLASS) 24 | labels = None 25 | if classification_mode == 'probs': 26 | probs = prediction 27 | elif classification_mode == 'logits': 28 | probs = ( 29 | scipy.special.expit(prediction) 30 | if task_type == util.BINCLASS 31 | else scipy.special.softmax(prediction, axis=1) 32 | ) 33 | else: 34 | assert classification_mode == 'labels' 35 | probs = None 36 | labels = prediction 37 | if labels is None: 38 | labels = ( 39 | np.round(probs).astype('int64') 40 | if task_type == util.BINCLASS 41 | else probs.argmax(axis=1) # type: ignore[code] 42 | ) 43 | 44 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 45 | if task_type == util.BINCLASS: 46 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 47 | else: 48 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 49 | result['score'] = result['roc_auc'] # type: ignore[code] 50 | return result # type: ignore[code] 51 | 52 | 53 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 54 | precision = 3 55 | summary = {} 56 | for k, v in metrics[1].items(): 57 | if k.isdigit(): 58 | continue 59 | k = { 60 | 'score': 'SCORE', 61 | 'accuracy': 'acc', 62 | 'roc_auc': 'roc_auc', 63 | 'macro avg': 'm', 64 | 'weighted avg': 'w', 65 | }.get(k, k) 66 | if isinstance(v, float): 67 | v = round(v, precision) 68 | summary[k] = v 69 | else: 70 | v = { 71 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 72 | x, x 73 | ): round(v[x], precision) 74 | for x in v 75 | } 76 | for item in v.items(): 77 | summary[k + item[0]] = item[1] 78 | 79 | s = [f'score = {summary.pop("SCORE"):.3f}'] 80 | for k, v in summary.items(): 81 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 82 | s.append(f'{k} = {v}') 83 | return ' | '.join(s) 84 | -------------------------------------------------------------------------------- /XGBoost/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def 
forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 | ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /XGBoost/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . 
import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = 
Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /XGBoost/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def set_random_seed(seed): 7 | """ 8 | Set the seed for random number generation in Python, NumPy, and PyTorch. 9 | 10 | Args: 11 | seed (int): The seed value to use for all random number generators. 
12 | """ 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | 17 | if torch.cuda.is_available(): 18 | torch.cuda.manual_seed(seed) 19 | torch.cuda.manual_seed_all(seed) 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /saint/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /saint/README.md: -------------------------------------------------------------------------------- 1 | This repository is the official PyTorch implementation of SAINT. Find the paper on [arxiv](https://arxiv.org/abs/2106.01342) 2 | 3 | # SAINT: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pre-Training 4 | 5 | 6 | ![Overview](pipeline.png) 7 | 8 | 9 | 10 | ## Requirements 11 | 12 | We recommend using `anaconda` or `miniconda` for python. Our code has been tested with `python=3.8` on linux. 
13 | 14 | Create a conda environment from the yml file and activate it. 15 | ``` 16 | conda env create -f saint_environment.yml 17 | conda activate saint_env 18 | ``` 19 | 20 | Make sure the following requirements are met 21 | 22 | * torch>=1.8.1 23 | * torchvision>=0.9.1 24 | 25 | ### Optional 26 | We used wandb to update our logs. But it is optional. 27 | ``` 28 | conda install -c conda-forge wandb 29 | ``` 30 | 31 | 32 | ## Training & Evaluation 33 | 34 | In each of our experiments, we use a single Nvidia GeForce RTX 2080Ti GPU. 35 | 36 | 37 | To train the model(s) in the paper, run this command: 38 | 39 | ``` 40 | python train.py --dset_id --task --attentiontype 41 | ``` 42 | 43 | Pretraining is useful when there are few training data samples. Sample code looks like this. (Use train_robust.py file for pretraining and robustness experiments) 44 | ``` 45 | python train_robust.py --dset_id --task --attentiontype --pretrain --pt_tasks --pt_aug --ssl_samples 46 | ``` 47 | 48 | 49 | 50 | ### Arguments 51 | * `--dset_id` : Dataset id from OpenML. Works with all the datasets mentioned in the paper. Works with all OpenML datasets. 52 | * `--task` : The task we want to perform. Pick from 'regression','multiclass', or 'binary'. 53 | * `--attentiontype` : Variant of SAINT. 'col' refers to SAINT-s variant, 'row' is SAINT-i, and 'colrow' refers to SAINT. 54 | * `--embedding_size` : Size of the feature embeddings 55 | * `--transformer_depth` : Depth of the model. Number of stages. 56 | * `--attention_heads` : Number of attention heads in each Attention layer. 57 | * `--cont_embeddings` : Style of embedding continuous data. 58 | * `--pretrain` : To enable pretraining 59 | * `--pt_tasks` : Losses we want to use for pretraining. Multiple arguments can be passed. 60 | * `--pt_aug` : Types of data augmentations used in pretraining. Multiple arguments are allowed. We support only mixup and CutMix right now. 61 | * `--ssl_samples` : Number of labeled samples used in semi-supervised experiments. 62 | * `--pt_projhead_style` : Projection head style used in contrastive pipeline. 63 | * `--nce_temp` : Temperature used in contrastive loss function. 64 | * `--active_log` : To update the logs onto wandb. This is optional 65 | 66 | #### Most of the hyperparameters are hardcoded in train.py file. For datasets with really high number of features, we suggest using smaller batchsize, lower embedding dimension and fewer number of heads. 67 | 68 | ### Evaluation 69 | 70 | We choose the best model by evaluating the model on validation dataset. The AuROC(for binary classification datasets), Accuracy (for multiclass classification datasets), and RMSE (for regression datasets) of the best model on test datasets is printed after training is completed. If wandb is enabled, they are logged to 'test_auroc_bestep', 'test_accuracy_bestep', 'test_rmse_bestep' variables. 71 | 72 | 73 | 74 | ## What's new in this version? 75 | * Regression and multiclass classification models are added. 76 | * Data can be accessed directly from openml just by calling the id of the dataset. 77 | 78 | 79 | ## Acknowledgements 80 | 81 | We would like to thank the following public repo from which we borrowed various utilites. 82 | - https://github.com/lucidrains/tab-transformer-pytorch 83 | 84 | ## License 85 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file. 
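
## Example run

A full invocation, given here only as an illustration, could look like the following. The dataset id 1487 is taken from the binary task list in `data_openml.py`; the remaining values are placeholders rather than the tuned settings reported in the paper.

```
python train.py --dset_id 1487 --task binary --attentiontype colrow --embedding_size 32 --transformer_depth 6 --attention_heads 8 --active_log
```

Every flag above is described in the Arguments list earlier in this README; drop `--active_log` if you do not want wandb logging.
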
86 | 87 | ## Cite us 88 | 89 | ``` 90 | @article{somepalli2021saint, 91 | title={SAINT: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pre-Training}, 92 | author={Somepalli, Gowthami and Goldblum, Micah and Schwarzschild, Avi and Bruss, C Bayan and Goldstein, Tom}, 93 | journal={arXiv preprint arXiv:2106.01342}, 94 | year={2021} 95 | } 96 | 97 | ``` 98 | -------------------------------------------------------------------------------- /saint/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def embed_data_mask(x_categ, x_cont, cat_mask, con_mask, model, vision_dset=False): 6 | device = x_cont.device if x_cont is not None else x_categ.device 7 | 8 | # Embed categorical data if available 9 | if x_categ is not None and model.embeds is not None: 10 | x_categ = torch.where(x_categ == model.unknown_value, model.categories.to(x_categ.device), x_categ) 11 | x_categ = x_categ + model.categories_offset.type_as(x_categ) 12 | x_categ_enc = model.embeds(x_categ) 13 | cat_mask_temp = cat_mask + model.cat_mask_offset.type_as(cat_mask) 14 | cat_mask_temp = model.mask_embeds_cat(cat_mask_temp) 15 | cat_mask_unsqueezed = cat_mask.unsqueeze(-1) 16 | 17 | assert x_categ_enc.shape[0] == cat_mask_temp.shape[0] == cat_mask_unsqueezed.shape[0], \ 18 | f"Mismatch in batch size. x_categ_enc: {x_categ_enc.shape[0]}, cat_mask_temp: {cat_mask_temp.shape[0]}, " \ 19 | f"cat_mask_unsqueezed: {cat_mask_unsqueezed.shape[0]} " 20 | 21 | assert x_categ_enc.shape[1] == cat_mask_temp.shape[1] == cat_mask_unsqueezed.shape[1], \ 22 | f"Mismatch in sequence length. x_categ_enc: {x_categ_enc.shape[1]}, cat_mask_temp: {cat_mask_temp.shape[1]}," \ 23 | f" cat_mask_unsqueezed: {cat_mask_unsqueezed.shape[1]} " 24 | 25 | assert x_categ_enc.shape[2] == cat_mask_temp.shape[2], \ 26 | f"Mismatch in embedding size. x_categ_enc: {x_categ_enc.shape[2]}, cat_mask_temp: {cat_mask_temp.shape[2]}" 27 | 28 | assert cat_mask_unsqueezed.shape[2] == 1, \ 29 | f"cat_mask_unsqueezed should have a singleton dimension. 
Found: {cat_mask_unsqueezed.shape[2]}" 30 | 31 | x_categ_enc = torch.where(cat_mask_unsqueezed == 0, cat_mask_temp, x_categ_enc) 32 | 33 | else: 34 | x_categ_enc = None 35 | 36 | # Embed continuous data if available 37 | if x_cont is not None: 38 | n1, n2 = x_cont.shape 39 | if model.cont_embeddings == 'MLP': 40 | x_cont_enc = torch.empty(n1, n2, model.dim, device=device) 41 | for i in range(model.num_continuous): 42 | x_cont_enc[:, i, :] = model.simple_MLP[i](x_cont[:, i]) 43 | else: 44 | raise Exception('This case should not work!') 45 | 46 | con_mask_temp = con_mask + model.con_mask_offset.type_as(con_mask) 47 | con_mask_temp = model.mask_embeds_cont(con_mask_temp) 48 | x_cont_enc[con_mask == 0] = con_mask_temp[con_mask == 0] 49 | else: 50 | x_cont_enc = None 51 | 52 | # Handle vision dataset specific logic 53 | if vision_dset and x_categ is not None: 54 | pos = np.tile(np.arange(x_categ.shape[-1]), (x_categ.shape[0], 1)) 55 | pos = torch.from_numpy(pos).to(device) 56 | pos_enc = model.pos_encodings(pos) 57 | x_categ_enc += pos_enc 58 | 59 | return x_categ, x_categ_enc, x_cont_enc 60 | 61 | 62 | def mixup_data(x1, x2, lam=1.0, y=None, use_cuda=True): 63 | '''Returns mixed inputs, pairs of targets''' 64 | 65 | batch_size = x1.size()[0] 66 | if use_cuda: 67 | index = torch.randperm(batch_size).cuda() 68 | else: 69 | index = torch.randperm(batch_size) 70 | 71 | mixed_x1 = lam * x1 + (1 - lam) * x1[index, :] 72 | mixed_x2 = lam * x2 + (1 - lam) * x2[index, :] 73 | if y is not None: 74 | y_a, y_b = y, y[index] 75 | return mixed_x1, mixed_x2, y_a, y_b 76 | 77 | return mixed_x1, mixed_x2 78 | 79 | 80 | def add_noise(x_categ, x_cont, noise_params={'noise_type': ['cutmix'], 'lambda': 0.1}): 81 | lam = noise_params['lambda'] 82 | device = x_categ.device 83 | batch_size = x_categ.size()[0] 84 | 85 | if 'cutmix' in noise_params['noise_type']: 86 | index = torch.randperm(batch_size) 87 | cat_corr = torch.from_numpy(np.random.choice(2, (x_categ.shape), p=[lam, 1 - lam])).to(device) 88 | con_corr = torch.from_numpy(np.random.choice(2, (x_cont.shape), p=[lam, 1 - lam])).to(device) 89 | x1, x2 = x_categ[index, :], x_cont[index, :] 90 | x_categ_corr, x_cont_corr = x_categ.clone().detach(), x_cont.clone().detach() 91 | x_categ_corr[cat_corr == 0] = x1[cat_corr == 0] 92 | x_cont_corr[con_corr == 0] = x2[con_corr == 0] 93 | return x_categ_corr, x_cont_corr 94 | elif noise_params['noise_type'] == 'missing': 95 | x_categ_mask = np.random.choice(2, (x_categ.shape), p=[lam, 1 - lam]) 96 | x_cont_mask = np.random.choice(2, (x_cont.shape), p=[lam, 1 - lam]) 97 | x_categ_mask = torch.from_numpy(x_categ_mask).to(device) 98 | x_cont_mask = torch.from_numpy(x_cont_mask).to(device) 99 | return torch.mul(x_categ, x_categ_mask), torch.mul(x_cont, x_cont_mask) 100 | 101 | else: 102 | print("yet to write this") 103 | -------------------------------------------------------------------------------- /saint/data_openml.py: -------------------------------------------------------------------------------- 1 | import openml 2 | import numpy as np 3 | from sklearn.preprocessing import LabelEncoder 4 | import pandas as pd 5 | from torch.utils.data import Dataset 6 | 7 | 8 | def simple_lapsed_time(text, lapsed): 9 | hours, rem = divmod(lapsed, 3600) 10 | minutes, seconds = divmod(rem, 60) 11 | print(text+": {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)) 12 | 13 | 14 | def task_dset_ids(task): 15 | dataset_ids = { 16 | 'binary': [1487,44,1590,42178,1111,31,42733,1494,1017,4134], 17 | 'multiclass': [188, 1596, 4541, 
40664, 40685, 40687, 40975, 41166, 41169, 42734], 18 | 'regression':[541, 42726, 42727, 422, 42571, 42705, 42728, 42563, 42724, 42729] 19 | } 20 | 21 | return dataset_ids[task] 22 | 23 | def concat_data(X,y): 24 | # import ipdb; ipdb.set_trace() 25 | return pd.concat([pd.DataFrame(X['data']), pd.DataFrame(y['data'][:,0].tolist(),columns=['target'])], axis=1) 26 | 27 | 28 | def data_split(X,y,nan_mask,indices): 29 | x_d = { 30 | 'data': X.values[indices], 31 | 'mask': nan_mask.values[indices] 32 | } 33 | 34 | if x_d['data'].shape != x_d['mask'].shape: 35 | raise'Shape of data not same as that of nan mask!' 36 | 37 | y_d = { 38 | 'data': y[indices].reshape(-1, 1) 39 | } 40 | return x_d, y_d 41 | 42 | 43 | def data_prep_openml(ds_id, seed, task, datasplit=[.65, .15, .2]): 44 | 45 | np.random.seed(seed) 46 | dataset = openml.datasets.get_dataset(ds_id) 47 | 48 | X, y, categorical_indicator, attribute_names = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute) 49 | if ds_id == 42178: 50 | categorical_indicator = [True, False, True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,False, False] 51 | tmp = [x if (x != ' ') else '0' for x in X['TotalCharges'].tolist()] 52 | X['TotalCharges'] = [float(i) for i in tmp ] 53 | y = y[X.TotalCharges != 0] 54 | X = X[X.TotalCharges != 0] 55 | X.reset_index(drop=True, inplace=True) 56 | print(y.shape, X.shape) 57 | if ds_id in [42728,42705,42729,42571]: 58 | # import ipdb; ipdb.set_trace() 59 | X, y = X[:50000], y[:50000] 60 | X.reset_index(drop=True, inplace=True) 61 | categorical_columns = X.columns[list(np.where(np.array(categorical_indicator)==True)[0])].tolist() 62 | cont_columns = list(set(X.columns.tolist()) - set(categorical_columns)) 63 | 64 | cat_idxs = list(np.where(np.array(categorical_indicator)==True)[0]) 65 | con_idxs = list(set(range(len(X.columns))) - set(cat_idxs)) 66 | 67 | for col in categorical_columns: 68 | X[col] = X[col].astype("object") 69 | 70 | X["Set"] = np.random.choice(["train", "valid", "test"], p = datasplit, size=(X.shape[0],)) 71 | 72 | train_indices = X[X.Set=="train"].index 73 | valid_indices = X[X.Set=="valid"].index 74 | test_indices = X[X.Set=="test"].index 75 | 76 | X = X.drop(columns=['Set']) 77 | temp = X.fillna("MissingValue") 78 | nan_mask = temp.ne("MissingValue").astype(int) 79 | 80 | cat_dims = [] 81 | for col in categorical_columns: 82 | # X[col] = X[col].cat.add_categories("MissingValue") 83 | X[col] = X[col].fillna("MissingValue") 84 | l_enc = LabelEncoder() 85 | X[col] = l_enc.fit_transform(X[col].values) 86 | cat_dims.append(len(l_enc.classes_)) 87 | for col in cont_columns: 88 | # X[col].fillna("MissingValue",inplace=True) 89 | X.fillna(X.loc[train_indices, col].mean(), inplace=True) 90 | y = y.values 91 | if task != 'regression': 92 | l_enc = LabelEncoder() 93 | y = l_enc.fit_transform(y) 94 | X_train, y_train = data_split(X,y,nan_mask,train_indices) 95 | X_valid, y_valid = data_split(X,y,nan_mask,valid_indices) 96 | X_test, y_test = data_split(X,y,nan_mask,test_indices) 97 | 98 | train_mean, train_std = np.array(X_train['data'][:,con_idxs],dtype=np.float32).mean(0), np.array(X_train['data'][:,con_idxs],dtype=np.float32).std(0) 99 | train_std = np.where(train_std < 1e-6, 1e-6, train_std) 100 | # import ipdb; ipdb.set_trace() 101 | return cat_dims, cat_idxs, con_idxs, X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_std 102 | 103 | 104 | 105 | 106 | class DataSetCatCon(Dataset): 107 | def __init__(self, X, Y, 
cat_cols,task='clf',continuous_mean_std=None): 108 | 109 | cat_cols = list(cat_cols) 110 | X_mask = X['mask'].copy() 111 | X = X['data'].copy() 112 | con_cols = list(set(np.arange(X.shape[1])) - set(cat_cols)) 113 | self.X1 = X[:,cat_cols].copy().astype(np.int64) #categorical columns 114 | self.X2 = X[:,con_cols].copy().astype(np.float32) #numerical columns 115 | self.X1_mask = X_mask[:,cat_cols].copy().astype(np.int64) #categorical columns 116 | self.X2_mask = X_mask[:,con_cols].copy().astype(np.int64) #numerical columns 117 | if task == 'clf': 118 | self.y = Y['data']#.astype(np.float32) 119 | else: 120 | self.y = Y['data'].astype(np.float32) 121 | self.cls = np.zeros_like(self.y,dtype=int) 122 | self.cls_mask = np.ones_like(self.y,dtype=int) 123 | if continuous_mean_std is not None: 124 | mean, std = continuous_mean_std 125 | self.X2 = (self.X2 - mean) / std 126 | 127 | def __len__(self): 128 | return len(self.y) 129 | 130 | def __getitem__(self, idx): 131 | # X1 has categorical data, X2 has continuous 132 | return np.concatenate((self.cls[idx], self.X1[idx])), self.X2[idx],self.y[idx], np.concatenate((self.cls_mask[idx], self.X1_mask[idx])), self.X2_mask[idx] 133 | 134 | -------------------------------------------------------------------------------- /saint/models/__init__.py: -------------------------------------------------------------------------------- 1 | from models.pretrainmodel import SAINT 2 | from models.pretrainmodel_vision import SAINT_vision 3 | -------------------------------------------------------------------------------- /saint/models/pretrainmodel.py: -------------------------------------------------------------------------------- 1 | from .model import * 2 | 3 | 4 | class sep_MLP(nn.Module): 5 | def __init__(self, dim, len_feats, categories): 6 | super(sep_MLP, self).__init__() 7 | self.len_feats = len_feats 8 | self.layers = nn.ModuleList([]) 9 | for i in range(len_feats): 10 | self.layers.append(simple_MLP([dim, 5 * dim, categories[i]])) 11 | 12 | def forward(self, x): 13 | y_pred = list([]) 14 | for i in range(self.len_feats): 15 | x_i = x[:, i, :] 16 | pred = self.layers[i](x_i) 17 | y_pred.append(pred) 18 | return y_pred 19 | 20 | 21 | class SAINT(nn.Module): 22 | def __init__( 23 | self, 24 | *, 25 | categories, 26 | num_continuous, 27 | dim, 28 | depth, 29 | heads, 30 | dim_head=16, 31 | dim_out=1, 32 | mlp_hidden_mults=(4, 2), 33 | mlp_act=None, 34 | num_special_tokens=0, 35 | attn_dropout=0., 36 | ff_dropout=0., 37 | cont_embeddings='MLP', 38 | scalingfactor=10, 39 | attentiontype='col', 40 | final_mlp_style='common', 41 | y_dim=2 42 | ): 43 | super().__init__() 44 | if categories is not None: 45 | assert all(map(lambda n: n > 0, categories)), 'number of each category must be positive' 46 | 47 | # categories related calculations 48 | self.categories = torch.tensor(np.subtract(categories, 1).tolist()) 49 | self.unknown_value = -1 50 | self.num_categories = len(categories) 51 | self.num_unique_categories = sum(categories) 52 | 53 | # create category embeddings table 54 | 55 | self.num_special_tokens = num_special_tokens 56 | self.total_tokens = self.num_unique_categories + num_special_tokens 57 | 58 | # for automatically offsetting unique category ids to the correct position in the categories embedding table 59 | 60 | categories_offset = F.pad(torch.tensor(list(categories)), (1, 0), value=num_special_tokens) 61 | categories_offset = categories_offset.cumsum(dim=-1)[:-1] 62 | 63 | self.register_buffer('categories_offset', categories_offset) 64 | else: 65 | 
self.num_categories = 0 66 | self.num_unique_categories = 0 67 | self.total_tokens = 0 68 | 69 | self.norm = nn.LayerNorm(num_continuous) 70 | self.num_continuous = num_continuous 71 | self.dim = dim 72 | self.cont_embeddings = cont_embeddings 73 | self.attentiontype = attentiontype 74 | self.final_mlp_style = final_mlp_style 75 | 76 | if self.cont_embeddings == 'MLP': 77 | self.simple_MLP = nn.ModuleList([simple_MLP([1, 100, self.dim]) for _ in range(self.num_continuous)]) 78 | input_size = (dim * self.num_categories) + (dim * num_continuous) 79 | nfeats = self.num_categories + num_continuous 80 | elif self.cont_embeddings == 'pos_singleMLP': 81 | self.simple_MLP = nn.ModuleList([simple_MLP([1, 100, self.dim]) for _ in range(1)]) 82 | input_size = (dim * self.num_categories) + (dim * num_continuous) 83 | nfeats = self.num_categories + num_continuous 84 | else: 85 | print('Continous features are not passed through attention') 86 | input_size = (dim * self.num_categories) + num_continuous 87 | nfeats = self.num_categories 88 | 89 | # transformer 90 | if attentiontype == 'col': 91 | self.transformer = Transformer( 92 | num_tokens=self.total_tokens, 93 | dim=dim, 94 | depth=depth, 95 | heads=heads, 96 | dim_head=dim_head, 97 | attn_dropout=attn_dropout, 98 | ff_dropout=ff_dropout 99 | ) 100 | elif attentiontype in ['row', 'colrow']: 101 | self.transformer = RowColTransformer( 102 | num_tokens=self.total_tokens, 103 | dim=dim, 104 | nfeats=nfeats, 105 | depth=depth, 106 | heads=heads, 107 | dim_head=dim_head, 108 | attn_dropout=attn_dropout, 109 | ff_dropout=ff_dropout, 110 | style=attentiontype 111 | ) 112 | 113 | l = input_size // 8 114 | hidden_dimensions = list(map(lambda t: l * t, mlp_hidden_mults)) 115 | all_dimensions = [input_size, *hidden_dimensions, dim_out] 116 | 117 | self.mlp = MLP(all_dimensions, act=mlp_act) 118 | self.embeds = nn.Embedding(self.total_tokens, self.dim) # .to(device) 119 | 120 | cat_mask_offset = F.pad(torch.Tensor(self.num_categories).fill_(2).type(torch.int8), (1, 0), value=0) 121 | cat_mask_offset = cat_mask_offset.cumsum(dim=-1)[:-1] 122 | 123 | con_mask_offset = F.pad(torch.Tensor(self.num_continuous).fill_(2).type(torch.int8), (1, 0), value=0) 124 | con_mask_offset = con_mask_offset.cumsum(dim=-1)[:-1] 125 | 126 | self.register_buffer('cat_mask_offset', cat_mask_offset) 127 | self.register_buffer('con_mask_offset', con_mask_offset) 128 | 129 | self.mask_embeds_cat = nn.Embedding(self.num_categories * 2, self.dim) 130 | self.mask_embeds_cont = nn.Embedding(self.num_continuous * 2, self.dim) 131 | self.single_mask = nn.Embedding(2, self.dim) 132 | self.pos_encodings = nn.Embedding(self.num_categories + self.num_continuous, self.dim) 133 | 134 | if self.final_mlp_style == 'common': 135 | self.mlp1 = simple_MLP([dim, (self.total_tokens) * 2, self.total_tokens]) 136 | self.mlp2 = simple_MLP([dim, (self.num_continuous), 1]) 137 | 138 | else: 139 | self.mlp1 = sep_MLP(dim, self.num_categories, categories) 140 | self.mlp2 = sep_MLP(dim, self.num_continuous, np.ones(self.num_continuous).astype(int)) 141 | 142 | self.mlpfory = simple_MLP([dim, 1000, y_dim]) 143 | self.pt_mlp = simple_MLP([dim * (self.num_continuous + self.num_categories), 144 | 6 * dim * (self.num_continuous + self.num_categories) // 5, 145 | dim * (self.num_continuous + self.num_categories) // 2]) 146 | self.pt_mlp2 = simple_MLP([dim * (self.num_continuous + self.num_categories), 147 | 6 * dim * (self.num_continuous + self.num_categories) // 5, 148 | dim * (self.num_continuous + self.num_categories) 
// 2]) 149 | 150 | def forward(self, x_categ, x_cont): 151 | if x_categ is None: 152 | # Handle the case when only continuous data is provided 153 | if self.cont_embeddings == 'MLP': 154 | x_cont = torch.stack([self.simple_MLP[i](x_cont[:, i].view(-1, 1)) for i in range(self.num_continuous)], 155 | dim=1) 156 | # Process continuous data 157 | x_cont = self.norm(x_cont) 158 | x = x_cont 159 | elif x_cont is None: 160 | # Handle the case when only categorical data is provided 161 | x_categ = torch.where(x_categ == self.unknown_value, self.categories.to(x_categ.device), x_categ) 162 | x = self.embeds(x_categ + self.categories_offset) 163 | else: 164 | # Handle the case when both categorical and continuous data is provided 165 | x_categ = torch.where(x_categ == self.unknown_value, self.categories.to(x_categ.device), x_categ) 166 | x_categ = self.embeds(x_categ + self.categories_offset) 167 | if self.cont_embeddings == 'MLP': 168 | x_cont = torch.stack([self.simple_MLP[i](x_cont[:, i].view(-1, 1)) for i in range(self.num_continuous)], 169 | dim=1) 170 | x_cont = self.norm(x_cont) 171 | x = torch.cat((x_categ, x_cont), dim=1) 172 | 173 | # Proceed with the rest of the forward pass 174 | x = self.transformer(x) 175 | cat_outs = self.mlp1(x[:, :self.num_categories, :]) if x_categ is not None else None 176 | con_outs = self.mlp2(x[:, self.num_categories:, :]) if x_cont is not None else None 177 | return cat_outs, con_outs 178 | -------------------------------------------------------------------------------- /saint/models/pretrainmodel_vision.py: -------------------------------------------------------------------------------- 1 | from .model import * 2 | 3 | 4 | class sep_MLP(nn.Module): 5 | def __init__(self,dim,len_feats,categories): 6 | super(sep_MLP, self).__init__() 7 | self.len_feats = len_feats 8 | self.layers = nn.ModuleList([]) 9 | for i in range(len_feats): 10 | self.layers.append(simple_MLP([dim,5*dim, categories[i]])) 11 | 12 | 13 | def forward(self, x): 14 | y_pred = list([]) 15 | for i in range(self.len_feats): 16 | x_i = x[:,i,:] 17 | pred = self.layers[i](x_i) 18 | y_pred.append(pred) 19 | return y_pred 20 | 21 | class SAINT_vision(nn.Module): 22 | def __init__( 23 | self, 24 | *, 25 | categories, 26 | num_continuous, 27 | dim, 28 | depth, 29 | heads, 30 | dim_head = 16, 31 | dim_out = 1, 32 | mlp_hidden_mults = (4, 2), 33 | mlp_act = None, 34 | num_special_tokens = 0, 35 | continuous_mean_std = None, 36 | attn_dropout = 0., 37 | ff_dropout = 0., 38 | cont_embeddings = 'MLP', 39 | scalingfactor = 10, 40 | attentiontype = 'col', 41 | final_mlp_style = 'common', 42 | y_dim = 2 43 | ): 44 | super().__init__() 45 | assert all(map(lambda n: n > 0, categories)), 'number of each category must be positive' 46 | 47 | # categories related calculations 48 | 49 | self.num_categories = len(categories) 50 | self.num_unique_categories = sum(categories) 51 | 52 | # create category embeddings table 53 | 54 | self.num_special_tokens = num_special_tokens 55 | self.total_tokens = categories[-1] + 256 56 | 57 | # for automatically offsetting unique category ids to the correct position in the categories embedding table 58 | 59 | categories_offset = torch.tensor(np.append(np.repeat(0, self.num_categories-1),[256])) 60 | self.register_buffer('categories_offset', categories_offset) 61 | 62 | 63 | self.norm = nn.LayerNorm(num_continuous) 64 | self.num_continuous = num_continuous 65 | self.dim = dim 66 | self.cont_embeddings = cont_embeddings 67 | self.attentiontype = attentiontype 68 | self.final_mlp_style = 
final_mlp_style 69 | 70 | if self.cont_embeddings == 'MLP': 71 | self.simple_MLP = nn.ModuleList([simple_MLP([1,100,self.dim]) for _ in range(self.num_continuous)]) 72 | input_size = (dim * self.num_categories) + (dim * num_continuous) 73 | nfeats = self.num_categories + num_continuous 74 | else: 75 | print('Continous features are not passed through attention') 76 | input_size = (dim * self.num_categories) + num_continuous 77 | nfeats = self.num_categories 78 | 79 | # transformer 80 | if attentiontype == 'col': 81 | self.transformer = Transformer( 82 | num_tokens = self.total_tokens, 83 | dim = dim, 84 | depth = depth, 85 | heads = heads, 86 | dim_head = dim_head, 87 | attn_dropout = attn_dropout, 88 | ff_dropout = ff_dropout 89 | ) 90 | elif attentiontype in ['row','colrow'] : 91 | self.transformer = RowColTransformer( 92 | num_tokens = self.total_tokens, 93 | dim = dim, 94 | nfeats= nfeats, 95 | depth = depth, 96 | heads = heads, 97 | dim_head = dim_head, 98 | attn_dropout = attn_dropout, 99 | ff_dropout = ff_dropout, 100 | style = attentiontype 101 | ) 102 | 103 | l = input_size // 8 104 | hidden_dimensions = list(map(lambda t: l * t, mlp_hidden_mults)) 105 | all_dimensions = [input_size, *hidden_dimensions, dim_out] 106 | 107 | self.mlp = MLP(all_dimensions, act = mlp_act) 108 | self.embeds = nn.Embedding(self.total_tokens, self.dim) 109 | 110 | cat_mask_offset = torch.tensor(np.append(np.repeat(0, self.num_categories-1),[2])) 111 | con_mask_offset = torch.empty(0) 112 | 113 | self.register_buffer('cat_mask_offset', cat_mask_offset) 114 | self.register_buffer('con_mask_offset', con_mask_offset) 115 | 116 | self.mask_embeds_cat = nn.Embedding(4, self.dim) 117 | self.mask_embeds_cont = nn.Embedding(4, self.dim) 118 | self.pos_encodings = nn.Embedding(self.num_categories, self.dim) 119 | if self.final_mlp_style == 'common': 120 | self.mlp1 = simple_MLP([dim,(self.total_tokens)*2, self.total_tokens]) 121 | self.mlp2 = simple_MLP([dim ,(self.num_continuous), 1]) 122 | 123 | else: 124 | self.mlp1 = sep_MLP(dim,self.num_categories,categories) 125 | self.mlp2 = sep_MLP(dim,self.num_continuous,np.ones(self.num_continuous).astype(int)) 126 | 127 | 128 | self.mlpfory = simple_MLP([dim ,100, y_dim]) 129 | 130 | 131 | def forward(self, x_categ, x_cont): 132 | x = self.transformer(x_categ, x_cont) 133 | y_reps = x[:,self.num_categories-1,:] 134 | y_outs = self.mlpfory(y_reps) 135 | return y_outs 136 | -------------------------------------------------------------------------------- /saint/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnuremberg/Revisiting-MLPs/b17e3bf1a663a5605b5e727929cfc779de4211df/saint/pipeline.png -------------------------------------------------------------------------------- /saint/pretraining.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from baselines.data_openml import data_prep_openml,task_dset_ids,DataSetCatCon 5 | from torch.utils.data import DataLoader 6 | import torch.optim as optim 7 | from augmentations import embed_data_mask 8 | from augmentations import add_noise 9 | 10 | import os 11 | import numpy as np 12 | 13 | def SAINT_pretrain(model,cat_idxs,X_train,y_train,continuous_mean_std,opt,device): 14 | train_ds = DataSetCatCon(X_train, y_train, cat_idxs,opt.dtask, continuous_mean_std) 15 | trainloader = DataLoader(train_ds, batch_size=opt.batchsize, shuffle=True,num_workers=4) 16 | 
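    # NOTE (annotation inferred from the body of SAINT_pretrain below; not part of the original source):
    # `opt` is expected to be an argparse-style namespace providing at least
    #   opt.batchsize, opt.vision_dset, opt.dtask, opt.pretrain_epochs,
    #   opt.pt_aug (list of augmentations, e.g. ['cutmix', 'mixup']), opt.pt_aug_lam, opt.mixup_lam,
    #   opt.pt_tasks (e.g. ['contrastive', 'denoising']), opt.pt_projhead_style ('diff' or 'same'),
    #   opt.nce_temp, and the loss weights opt.lam0 .. opt.lam3.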
vision_dset = opt.vision_dset 17 | optimizer = optim.AdamW(model.parameters(),lr=0.0001) 18 | pt_aug_dict = { 19 | 'noise_type' : opt.pt_aug, 20 | 'lambda' : opt.pt_aug_lam 21 | } 22 | criterion1 = nn.CrossEntropyLoss() 23 | criterion2 = nn.MSELoss() 24 | print("Pretraining begins!") 25 | for epoch in range(opt.pretrain_epochs): 26 | model.train() 27 | running_loss = 0.0 28 | for i, data in enumerate(trainloader, 0): 29 | optimizer.zero_grad() 30 | x_categ, x_cont, _ ,cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device) 31 | 32 | # embed_data_mask function is used to embed both categorical and continuous data. 33 | if 'cutmix' in opt.pt_aug: 34 | from augmentations import add_noise 35 | x_categ_corr, x_cont_corr = add_noise(x_categ,x_cont, noise_params = pt_aug_dict) 36 | _ , x_categ_enc_2, x_cont_enc_2 = embed_data_mask(x_categ_corr, x_cont_corr, cat_mask, con_mask,model,vision_dset) 37 | else: 38 | _ , x_categ_enc_2, x_cont_enc_2 = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 39 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 40 | 41 | if 'mixup' in opt.pt_aug: 42 | from augmentations import mixup_data 43 | x_categ_enc_2, x_cont_enc_2 = mixup_data(x_categ_enc_2, x_cont_enc_2 , lam=opt.mixup_lam) 44 | loss = 0 45 | if 'contrastive' in opt.pt_tasks: 46 | aug_features_1 = model.transformer(x_categ_enc, x_cont_enc) 47 | aug_features_2 = model.transformer(x_categ_enc_2, x_cont_enc_2) 48 | aug_features_1 = (aug_features_1 / aug_features_1.norm(dim=-1, keepdim=True)).flatten(1,2) 49 | aug_features_2 = (aug_features_2 / aug_features_2.norm(dim=-1, keepdim=True)).flatten(1,2) 50 | if opt.pt_projhead_style == 'diff': 51 | aug_features_1 = model.pt_mlp(aug_features_1) 52 | aug_features_2 = model.pt_mlp2(aug_features_2) 53 | elif opt.pt_projhead_style == 'same': 54 | aug_features_1 = model.pt_mlp(aug_features_1) 55 | aug_features_2 = model.pt_mlp(aug_features_2) 56 | else: 57 | print('Not using projection head') 58 | logits_per_aug1 = aug_features_1 @ aug_features_2.t()/opt.nce_temp 59 | logits_per_aug2 = aug_features_2 @ aug_features_1.t()/opt.nce_temp 60 | targets = torch.arange(logits_per_aug1.size(0)).to(logits_per_aug1.device) 61 | loss_1 = criterion1(logits_per_aug1, targets) 62 | loss_2 = criterion1(logits_per_aug2, targets) 63 | loss = opt.lam0*(loss_1 + loss_2)/2 64 | elif 'contrastive_sim' in opt.pt_tasks: 65 | aug_features_1 = model.transformer(x_categ_enc, x_cont_enc) 66 | aug_features_2 = model.transformer(x_categ_enc_2, x_cont_enc_2) 67 | aug_features_1 = (aug_features_1 / aug_features_1.norm(dim=-1, keepdim=True)).flatten(1,2) 68 | aug_features_2 = (aug_features_2 / aug_features_2.norm(dim=-1, keepdim=True)).flatten(1,2) 69 | aug_features_1 = model.pt_mlp(aug_features_1) 70 | aug_features_2 = model.pt_mlp2(aug_features_2) 71 | c1 = aug_features_1 @ aug_features_2.t() 72 | loss+= opt.lam1*torch.diagonal(-1*c1).add_(1).pow_(2).sum() 73 | if 'denoising' in opt.pt_tasks: 74 | cat_outs, con_outs = model(x_categ_enc_2, x_cont_enc_2) 75 | # if con_outs.shape(-1) != 0: 76 | # import ipdb; ipdb.set_trace() 77 | if len(con_outs) > 0: 78 | con_outs = torch.cat(con_outs,dim=1) 79 | l2 = criterion2(con_outs, x_cont) 80 | else: 81 | l2 = 0 82 | l1 = 0 83 | # import ipdb; ipdb.set_trace() 84 | n_cat = x_categ.shape[-1] 85 | for j in range(1,n_cat): 86 | l1+= criterion1(cat_outs[j],x_categ[:,j]) 87 | loss += opt.lam2*l1 + opt.lam3*l2 88 | loss.backward() 89 
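        # At this point `loss` sums whichever pretraining objectives opt.pt_tasks enabled:
        # the InfoNCE-style contrastive term on the two augmented views (weighted by opt.lam0),
        # or its normalized-similarity variant (opt.lam1), plus the denoising reconstruction
        # terms for categorical (opt.lam2) and continuous (opt.lam3) features. backward() above
        # and the step below update the encoder on that combined objective.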
| optimizer.step() 90 | running_loss += loss.item() 91 | 92 | print(f'Epoch: {epoch}, Running Loss: {running_loss}') 93 | 94 | print('END OF PRETRAINING!') 95 | return model 96 | # if opt.active_log: 97 | # wandb.log({'pt_epoch': epoch ,'pretrain_epoch_loss': running_loss 98 | # }) 99 | -------------------------------------------------------------------------------- /saint/saint_environment.yml: -------------------------------------------------------------------------------- 1 | name: saint_env 2 | channels: 3 | - anaconda 4 | - pytorch 5 | - rwest 6 | - vgauthier 7 | - conda-forge 8 | - defaults 9 | - ostrokach 10 | dependencies: 11 | - _libgcc_mutex=0.1=conda_forge 12 | - _openmp_mutex=4.5=1_gnu 13 | - _py-xgboost-mutex=2.0=cpu_0 14 | - anyio=3.2.1=py38h578d9bd_0 15 | - argh=0.26.2=pyh9f0ad1d_1002 16 | - argon2-cffi=20.1.0=py38h497a2fe_2 17 | - async_generator=1.10=py_0 18 | - attrs=21.2.0=pyhd8ed1ab_0 19 | - babel=2.9.1=pyh44b312d_0 20 | - backcall=0.2.0=pyh9f0ad1d_0 21 | - backports=1.0=py_2 22 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 23 | - blas=1.0=mkl 24 | - bleach=3.3.1=pyhd8ed1ab_0 25 | - brotlipy=0.7.0=py38h497a2fe_1001 26 | - bzip2=1.0.8=h7f98852_4 27 | - ca-certificates=2021.5.30=ha878542_0 28 | - certifi=2021.5.30=py38h578d9bd_0 29 | - cffi=1.14.5=py38ha65f79e_0 30 | - chardet=4.0.0=py38h578d9bd_1 31 | - click=8.0.1=py38h578d9bd_0 32 | - configparser=5.0.2=pyhd8ed1ab_0 33 | - cryptography=3.4.7=py38ha5dfef3_0 34 | - cudatoolkit=11.1.1=h6406543_8 35 | - cycler=0.10.0=py_2 36 | - dbus=1.13.18=hb2f20db_0 37 | - debugpy=1.3.0=py38h709712a_0 38 | - decorator=5.0.9=pyhd8ed1ab_0 39 | - defusedxml=0.7.1=pyhd8ed1ab_0 40 | - docker-pycreds=0.4.0=py_0 41 | - einops=0.3.0=py_0 42 | - entrypoints=0.3=pyhd8ed1ab_1003 43 | - expat=2.4.1=h9c3ff4c_0 44 | - ffmpeg=4.3=hf484d3e_0 45 | - fontconfig=2.13.1=hba837de_1005 46 | - freetype=2.10.4=h0708190_1 47 | - gettext=0.19.8.1=h0b5b191_1005 48 | - gitdb=4.0.7=pyhd8ed1ab_0 49 | - gitpython=3.1.17=pyhd8ed1ab_0 50 | - glib=2.68.3=h9c3ff4c_0 51 | - glib-tools=2.68.3=h9c3ff4c_0 52 | - gmp=6.2.1=h58526e2_0 53 | - gnutls=3.6.13=h85f3911_1 54 | - gql=0.1.0=py_0 55 | - graphql-core=3.1.5=pyhd8ed1ab_0 56 | - gst-plugins-base=1.14.0=hbbd80ab_1 57 | - gstreamer=1.14.0=h28cd5cc_2 58 | - icu=58.2=hf484d3e_1000 59 | - idna=2.10=pyh9f0ad1d_0 60 | - importlib-metadata=4.6.1=py38h578d9bd_0 61 | - intel-openmp=2021.2.0=h06a4308_610 62 | - ipdb=0.13.9=pyhd8ed1ab_0 63 | - ipykernel=6.0.2=py38hd0cf306_0 64 | - ipython=7.25.0=py38hd0cf306_1 65 | - ipython_genutils=0.2.0=py_1 66 | - jedi=0.18.0=py38h578d9bd_2 67 | - jinja2=3.0.1=pyhd8ed1ab_0 68 | - joblib=0.17.0=py_0 69 | - jpeg=9b=h024ee3a_2 70 | - json5=0.9.5=pyh9f0ad1d_0 71 | - jsonschema=3.2.0=pyhd8ed1ab_3 72 | - jupyter_client=6.1.12=pyhd8ed1ab_0 73 | - jupyter_core=4.7.1=py38h578d9bd_0 74 | - jupyter_server=1.9.0=pyhd8ed1ab_0 75 | - jupyterlab=3.0.16=pyhd8ed1ab_0 76 | - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 77 | - jupyterlab_server=2.6.1=pyhd8ed1ab_0 78 | - kiwisolver=1.3.1=py38h1fd1430_1 79 | - lame=3.100=h7f98852_1001 80 | - lcms2=2.12=h3be6417_0 81 | - ld_impl_linux-64=2.35.1=hea4e1c9_2 82 | - liac-arff=2.5.0=pyhd8ed1ab_1 83 | - libffi=3.3=h58526e2_2 84 | - libgcc-ng=9.3.0=h2828fa1_19 85 | - libgfortran-ng=7.3.0=hdf63c60_0 86 | - libglib=2.68.3=h3e27bee_0 87 | - libgomp=9.3.0=h2828fa1_19 88 | - libiconv=1.16=h516909a_0 89 | - libidn2=2.3.1=h7f98852_0 90 | - libpng=1.6.37=h21135ba_2 91 | - libprotobuf=3.17.2=h780b84a_0 92 | - libsodium=1.0.18=h36c2ea0_1 93 | - libstdcxx-ng=9.3.0=h6de172a_19 94 | - 
libtiff=4.2.0=h85742a9_0 95 | - libunistring=0.9.10=h14c3975_0 96 | - libuuid=2.32.1=h7f98852_1000 97 | - libuv=1.41.0=h7f98852_0 98 | - libwebp-base=1.2.0=h7f98852_2 99 | - libxcb=1.13=h7f98852_1003 100 | - libxgboost=1.4.0=h9c3ff4c_0 101 | - libxml2=2.9.12=h03d6c58_0 102 | - lz4-c=1.9.3=h9c3ff4c_0 103 | - markupsafe=2.0.1=py38h497a2fe_0 104 | - matplotlib=3.4.2=py38h578d9bd_0 105 | - matplotlib-base=3.4.2=py38hcc49a3a_0 106 | - matplotlib-inline=0.1.2=pyhd8ed1ab_2 107 | - mistune=0.8.4=py38h497a2fe_1004 108 | - mkl=2021.2.0=h06a4308_296 109 | - mkl-service=2.4.0=py38h497a2fe_0 110 | - mkl_fft=1.3.0=py38h42c9631_2 111 | - mkl_random=1.2.2=py38h1abd341_0 112 | - nbclassic=0.3.1=pyhd8ed1ab_1 113 | - nbclient=0.5.3=pyhd8ed1ab_0 114 | - nbconvert=6.1.0=py38h578d9bd_0 115 | - nbformat=5.1.3=pyhd8ed1ab_0 116 | - ncurses=6.2=h58526e2_4 117 | - nest-asyncio=1.5.1=pyhd8ed1ab_0 118 | - nettle=3.6=he412f7d_0 119 | - ninja=1.10.2=h4bd325d_0 120 | - notebook=6.4.0=pyha770c72_0 121 | - numpy=1.20.2=py38h2d18471_0 122 | - numpy-base=1.20.2=py38hfae3a4d_0 123 | - nvidia-ml=7.352.0=py_0 124 | - olefile=0.46=pyh9f0ad1d_1 125 | - openh264=2.1.1=h780b84a_0 126 | - openml=0.11.0=pyhd8ed1ab_0 127 | - openssl=1.1.1k=h7f98852_0 128 | - packaging=21.0=pyhd8ed1ab_0 129 | - pandas=1.2.4=py38h1abd341_0 130 | - pandoc=2.14.0.3=h7f98852_0 131 | - pandocfilters=1.4.2=py_1 132 | - parso=0.8.2=pyhd8ed1ab_0 133 | - pathtools=0.1.2=py_1 134 | - pcre=8.45=h9c3ff4c_0 135 | - pexpect=4.8.0=pyh9f0ad1d_2 136 | - pickleshare=0.7.5=py_1003 137 | - pillow=8.2.0=py38he98fc37_0 138 | - pip=21.1.2=pyhd8ed1ab_0 139 | - prometheus_client=0.11.0=pyhd8ed1ab_0 140 | - promise=2.3=py38h578d9bd_3 141 | - prompt-toolkit=3.0.19=pyha770c72_0 142 | - protobuf=3.17.2=py38h709712a_0 143 | - psutil=5.8.0=py38h497a2fe_1 144 | - pthread-stubs=0.4=h36c2ea0_1001 145 | - ptyprocess=0.7.0=pyhd3deb0d_0 146 | - py-xgboost=1.4.0=py38h578d9bd_0 147 | - pycparser=2.20=pyh9f0ad1d_2 148 | - pygments=2.9.0=pyhd8ed1ab_0 149 | - pyopenssl=20.0.1=pyhd8ed1ab_0 150 | - pyparsing=2.4.7=pyh9f0ad1d_0 151 | - pyqt=5.9.2=py38h05f1152_4 152 | - pyrsistent=0.17.3=py38h497a2fe_2 153 | - pysocks=1.7.1=py38h578d9bd_3 154 | - python=3.8.10=h49503c6_1_cpython 155 | - python-dateutil=2.8.1=py_0 156 | - python-wget=3.2=py_0 157 | - python_abi=3.8=1_cp38 158 | - pytorch=1.8.1=py3.8_cuda11.1_cudnn8.0.5_0 159 | - pytz=2021.1=pyhd8ed1ab_0 160 | - pyyaml=5.4.1=py38h497a2fe_0 161 | - pyzmq=22.1.0=py38h2035c66_0 162 | - qt=5.9.7=h5867ecd_1 163 | - readline=8.1=h46c0cb4_0 164 | - requests=2.25.1=pyhd3deb0d_0 165 | - requests-unixsocket=0.2.0=py_0 166 | - scikit-learn=0.23.2=py38h0573a6f_0 167 | - scipy=1.6.2=py38had2a1c9_1 168 | - seaborn=0.11.0=py_0 169 | - send2trash=1.7.1=pyhd8ed1ab_0 170 | - sentry-sdk=1.1.0=pyhd8ed1ab_0 171 | - setuptools=49.6.0=py38h578d9bd_3 172 | - shortuuid=1.0.1=py38h578d9bd_4 173 | - sip=4.19.13=py38he6710b0_0 174 | - six=1.16.0=pyh6c4a22f_0 175 | - smmap=3.0.5=pyh44b312d_0 176 | - sniffio=1.2.0=py38h578d9bd_1 177 | - sqlite=3.35.5=h74cdb3f_0 178 | - subprocess32=3.5.4=py_1 179 | - terminado=0.10.1=py38h578d9bd_0 180 | - testpath=0.5.0=pyhd8ed1ab_0 181 | - threadpoolctl=2.1.0=pyh5ca1d4c_0 182 | - tk=8.6.10=h21135ba_1 183 | - torchvision=0.9.1=py38_cu111 184 | - tornado=6.1=py38h497a2fe_1 185 | - traitlets=5.0.5=py_0 186 | - typing_extensions=3.7.4.3=py_0 187 | - urllib3=1.26.5=pyhd8ed1ab_0 188 | - wandb=0.10.31=pyhd8ed1ab_0 189 | - watchdog=0.10.4=py38h578d9bd_0 190 | - wcwidth=0.2.5=pyh9f0ad1d_2 191 | - webencodings=0.5.1=py_1 192 | - 
websocket-client=0.57.0=py38h578d9bd_4 193 | - wget=1.20.1=h22169c7_0 194 | - wheel=0.36.2=pyhd3deb0d_0 195 | - xgboost=1.4.0=py38h578d9bd_0 196 | - xmltodict=0.12.0=py_0 197 | - xorg-libxau=1.0.9=h7f98852_0 198 | - xorg-libxdmcp=1.1.3=h7f98852_0 199 | - xz=5.2.5=h516909a_1 200 | - yaml=0.2.5=h516909a_0 201 | - zeromq=4.3.4=h9c3ff4c_0 202 | - zipp=3.5.0=pyhd8ed1ab_0 203 | - zlib=1.2.11=h516909a_1010 204 | - zstd=1.4.9=ha95c52a_0 -------------------------------------------------------------------------------- /saint/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.metrics import roc_auc_score, mean_squared_error 3 | import numpy as np 4 | from augmentations import embed_data_mask 5 | import torch.nn as nn 6 | 7 | def make_default_mask(x): 8 | mask = np.ones_like(x) 9 | mask[:,-1] = 0 10 | return mask 11 | 12 | def tag_gen(tag,y): 13 | return np.repeat(tag,len(y['data'])) 14 | 15 | 16 | def count_parameters(model): 17 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 18 | 19 | def get_scheduler(args, optimizer): 20 | if args.scheduler == 'cosine': 21 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs) 22 | elif args.scheduler == 'linear': 23 | scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, 24 | milestones=[args.epochs // 2.667, args.epochs // 1.6, args.epochs // 1.142], gamma=0.1) 25 | return scheduler 26 | 27 | def imputations_acc_justy(model,dloader,device): 28 | model.eval() 29 | m = nn.Softmax(dim=1) 30 | y_test = torch.empty(0).to(device) 31 | y_pred = torch.empty(0).to(device) 32 | prob = torch.empty(0).to(device) 33 | with torch.no_grad(): 34 | for i, data in enumerate(dloader, 0): 35 | x_categ, x_cont, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device) 36 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model) 37 | reps = model.transformer(x_categ_enc, x_cont_enc) 38 | y_reps = reps[:,model.num_categories-1,:] 39 | y_outs = model.mlpfory(y_reps) 40 | # import ipdb; ipdb.set_trace() 41 | y_test = torch.cat([y_test,x_categ[:,-1].float()],dim=0) 42 | y_pred = torch.cat([y_pred,torch.argmax(m(y_outs), dim=1).float()],dim=0) 43 | prob = torch.cat([prob,m(y_outs)[:,-1].float()],dim=0) 44 | 45 | correct_results_sum = (y_pred == y_test).sum().float() 46 | acc = correct_results_sum/y_test.shape[0]*100 47 | auc = roc_auc_score(y_score=prob.cpu(), y_true=y_test.cpu()) 48 | return acc, auc 49 | 50 | 51 | def multiclass_acc_justy(model,dloader,device): 52 | model.eval() 53 | vision_dset = True 54 | m = nn.Softmax(dim=1) 55 | y_test = torch.empty(0).to(device) 56 | y_pred = torch.empty(0).to(device) 57 | prob = torch.empty(0).to(device) 58 | with torch.no_grad(): 59 | for i, data in enumerate(dloader, 0): 60 | x_categ, x_cont, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device) 61 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 62 | reps = model.transformer(x_categ_enc, x_cont_enc) 63 | y_reps = reps[:,model.num_categories-1,:] 64 | y_outs = model.mlpfory(y_reps) 65 | # import ipdb; ipdb.set_trace() 66 | y_test = torch.cat([y_test,x_categ[:,-1].float()],dim=0) 67 | y_pred = torch.cat([y_pred,torch.argmax(m(y_outs), dim=1).float()],dim=0) 68 | 69 | correct_results_sum = (y_pred == y_test).sum().float() 70 | acc = correct_results_sum/y_test.shape[0]*100 71 | return acc, 0 72 | 73 | 
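# The two "_justy" helpers above read the label representation from the last categorical token
# (reps[:, model.num_categories - 1, :]), matching the vision variant's forward pass, whereas
# classification_scores and mean_sq_error below read it from the prepended [CLS] token at
# position 0 -- consistent with DataSetCatCon, which concatenates self.cls in front of the
# categorical columns.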
74 | def classification_scores(model, dloader, device, task,vision_dset): 75 | model.eval() 76 | m = nn.Softmax(dim=1) 77 | y_test = torch.empty(0).to(device) 78 | y_pred = torch.empty(0).to(device) 79 | prob = torch.empty(0).to(device) 80 | with torch.no_grad(): 81 | for i, data in enumerate(dloader, 0): 82 | x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device) 83 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 84 | reps = model.transformer(x_categ_enc, x_cont_enc) 85 | y_reps = reps[:,0,:] 86 | y_outs = model.mlpfory(y_reps) 87 | # import ipdb; ipdb.set_trace() 88 | y_test = torch.cat([y_test,y_gts.squeeze().float()],dim=0) 89 | y_pred = torch.cat([y_pred,torch.argmax(y_outs, dim=1).float()],dim=0) 90 | if task == 'binary': 91 | prob = torch.cat([prob,m(y_outs)[:,-1].float()],dim=0) 92 | 93 | correct_results_sum = (y_pred == y_test).sum().float() 94 | acc = correct_results_sum/y_test.shape[0]*100 95 | auc = 0 96 | if task == 'binary': 97 | auc = roc_auc_score(y_score=prob.cpu(), y_true=y_test.cpu()) 98 | return acc.cpu().numpy(), auc 99 | 100 | def mean_sq_error(model, dloader, device, vision_dset): 101 | model.eval() 102 | y_test = torch.empty(0).to(device) 103 | y_pred = torch.empty(0).to(device) 104 | with torch.no_grad(): 105 | for i, data in enumerate(dloader, 0): 106 | x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device) 107 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 108 | reps = model.transformer(x_categ_enc, x_cont_enc) 109 | y_reps = reps[:,0,:] 110 | y_outs = model.mlpfory(y_reps) 111 | y_test = torch.cat([y_test,y_gts.squeeze().float()],dim=0) 112 | y_pred = torch.cat([y_pred,y_outs],dim=0) 113 | # import ipdb; ipdb.set_trace() 114 | rmse = mean_squared_error(y_test.cpu(), y_pred.cpu(), squared=False) 115 | return rmse 116 | 117 | --------------------------------------------------------------------------------
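The sketch below is a minimal, illustrative wiring of the SAINT pieces listed above (data_openml.py, models/pretrainmodel.py, pretraining.py, utils.py); it is not a reproduction of run_saint.py. The dataset id, hyperparameter values, and the `opt` namespace are assumptions chosen for illustration, and the extra leading category accounts for the [CLS] column that DataSetCatCon prepends. Note also that pretraining.py imports data_openml through a `baselines.` package prefix, which does not match the flat saint/ layout shown here; that prefix would need to be dropped (or the package path adjusted) for the sketch to import.

import argparse
import numpy as np
import torch
from torch.utils.data import DataLoader

from data_openml import data_prep_openml, DataSetCatCon
from models import SAINT
from pretraining import SAINT_pretrain          # assumes its data_openml import has been flattened
from utils import classification_scores

# The mixup helper defaults to .cuda(), so a GPU is effectively assumed during pretraining.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare an OpenML binary-classification dataset (1487 is one of the ids in task_dset_ids).
cat_dims, cat_idxs, con_idxs, X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_std = \
    data_prep_openml(ds_id=1487, seed=0, task='binary')
continuous_mean_std = np.array([train_mean, train_std]).astype(np.float32)

train_ds = DataSetCatCon(X_train, y_train, cat_idxs, task='clf', continuous_mean_std=continuous_mean_std)
test_ds = DataSetCatCon(X_test, y_test, cat_idxs, task='clf', continuous_mean_std=continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=256, shuffle=True)
testloader = DataLoader(test_ds, batch_size=256, shuffle=False)

# DataSetCatCon prepends a [CLS] column to the categorical block, so the model is given one
# extra single-valued categorical feature in front of the real ones.
categories = np.append(np.array([1]), np.array(cat_dims)).astype(int)

model = SAINT(
    categories=tuple(categories),
    num_continuous=len(con_idxs),
    dim=32, depth=6, heads=8,
    attn_dropout=0.1, ff_dropout=0.1,
    cont_embeddings='MLP',
    attentiontype='colrow',
    final_mlp_style='sep',
    y_dim=2,
).to(device)

# Hypothetical option namespace mirroring the fields SAINT_pretrain reads; the values are
# illustrative defaults, not the settings used elsewhere in this repository.
opt = argparse.Namespace(
    batchsize=256, vision_dset=False, dtask='clf', pretrain_epochs=10,
    pt_aug=['cutmix', 'mixup'], pt_aug_lam=0.1, mixup_lam=0.3,
    pt_tasks=['contrastive'], pt_projhead_style='diff',
    nce_temp=0.7, lam0=0.5, lam1=10, lam2=1, lam3=10,
)
model = SAINT_pretrain(model, cat_idxs, X_train, y_train, continuous_mean_std, opt, device)

# After supervised fine-tuning of mlpfory on the labels (handled by run_saint.py, not shown
# here), evaluation reuses the helpers from utils.py.
acc, auc = classification_scores(model, testloader, device, task='binary', vision_dset=False)
print(f'test accuracy: {acc:.2f}, test auroc: {auc:.4f}')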