├── Catboost
    ├── evaluate_30_trials.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── run_catboost.py
    └── utils.py
├── FT_Transformer
    ├── .gitignore
    ├── LICENSE
    ├── README.md
    ├── bin
    │   ├── evaluate.py
    │   ├── evaluate_30_trials.py
    │   ├── ft_transformer.py
    │   ├── openmlcc18_tasks.txt
    │   └── run_ft.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── node
    │   │   ├── __init__.py
    │   │   ├── arch.py
    │   │   ├── nn_utils.py
    │   │   ├── odst.py
    │   │   └── utils.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── pyproject.toml
    ├── requirements.txt
    └── setup.cfg
├── README.md
├── ResNet
    ├── evaluate_30_trials.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── node
    │   │   ├── __init__.py
    │   │   ├── arch.py
    │   │   ├── nn_utils.py
    │   │   ├── odst.py
    │   │   └── utils.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── resnet_ft.py
    ├── resnext.py
    └── run_resnetFt.py
├── TabNet
    ├── abstract_model.py
    ├── augmentations.py
    ├── callbacks.py
    ├── evaluate_30_trials.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── metrics.py
    ├── multiclass_utils.py
    ├── multitask.py
    ├── pretraining.py
    ├── pretraining_utils.py
    ├── run_tabnet.py
    ├── sparsemax.py
    ├── tab_model.py
    ├── tab_network.py
    ├── tabnet_utils.py
    └── utils.py
├── TabPFN
    ├── run_tabpfn.py
    └── utils.py
├── XGBoost
    ├── evaluate_30_trials.py
    ├── lib
    │   ├── __init__.py
    │   ├── data.py
    │   ├── deep.py
    │   ├── env.py
    │   ├── metrics.py
    │   ├── synthetic_data.py
    │   └── util.py
    ├── run_xgboost.py
    └── utils.py
└── saint
    ├── .gitignore
    ├── LICENSE
    ├── README.md
    ├── augmentations.py
    ├── data_openml.py
    ├── evaluate_30_trials.py
    ├── models
        ├── __init__.py
        ├── model.py
        ├── pretrainmodel.py
        └── pretrainmodel_vision.py
    ├── pipeline.png
    ├── pretraining.py
    ├── run_saint.py
    ├── run_saint_test.py
    ├── run_saint_traditional.py
    ├── saint_environment.yml
    └── utils.py
--------------------------------------------------------------------------------
/Catboost/evaluate_30_trials.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import math
3 | import time
4 | 
5 | import numpy as np
6 | import optuna
7 | import pandas as pd
8 | import zero
9 | import torch.nn as nn
10 | import torch
11 | import torch.nn.functional as F
12 | from catboost import CatBoostClassifier
13 | from sklearn.metrics import roc_auc_score, accuracy_score
14 | from xgboost import XGBClassifier
15 | 
16 | import lib
17 | import wandb
18 | 
19 | from sklearn.model_selection import StratifiedKFold
20 | from utils import set_random_seed
21 | 
22 | # Create the parser
23 | parser = argparse.ArgumentParser(description="Train a model with specified parameters.")
24 | 
25 | # Add the arguments
26 | parser.add_argument('--experiment_name', type=str, default='test',
27 |                     help='The name of the experiment. Default is "test".')
28 | parser.add_argument('--dataset', type=int, default=23,
29 |                     help='The OpenML dataset ID to use. Default is 23.')
30 | parser.add_argument('--seed', type=int, default=0,
31 |                     help='The random seed for reproducibility. Default is 0.')
32 | parser.add_argument('--normalization', type=str, default='quantile', choices=['quantile', 'standard'],
33 |                     help='The normalization to use for the numerical features. Default is "quantile".')
34 | parser.add_argument('--cat_nan_policy', type=str, default='new', choices=['new', 'most_frequent'],
35 |                     help='The policy to use for handling nan values in categorical features. Default is "new".')
36 | parser.add_argument('--cat_policy', type=str, default='indices', choices=['indices', 'ohe'],
37 |                     help='The policy to use for handling categorical features. Default is "indices".')
38 | parser.add_argument('--outer_fold', type=int, default=0, help='The outer fold to use. Default is 0.')
39 | parser.add_argument('--n_trials', type=int, default=100,
40 |                     help='The number of trials to use for HPO. Default is 100.')
41 | parser.add_argument('--tune', action='store_true', help='Whether to tune the hyperparameters using Optuna')
42 | args = parser.parse_args()
43 | 
44 | 
45 | def load_best_config(project_name, dataset_name, outer_fold, num_trials=30):
46 |     api = wandb.Api()
47 |     target_run_name = f"{dataset_name}_outerFold_{outer_fold}"
48 |     runs = api.runs(project_name)
49 | 
50 |     target_run = None
51 |     for run in runs:
52 |         if run.name == target_run_name:
53 |             target_run = run
54 |             break
55 | 
56 |     if not target_run:
57 |         raise ValueError(f"No run found with name: {target_run_name}")
58 | 
59 |     # First scan for the best average_test_rocauc
60 |     best_rocauc = 0  # Looking for the highest rocauc
61 |     best_step = None
62 |     history = target_run.scan_history(keys=['average_test_rocauc'])
63 |     for i, row in enumerate(history):
64 |         if i >= num_trials:
65 |             break
66 |         if 'average_test_rocauc' in row and row['average_test_rocauc'] > best_rocauc:
67 |             best_rocauc = row['average_test_rocauc']
68 |             best_step = i
69 | 
70 |     if best_step is None:
71 |         raise ValueError(f"Best rocauc not found within the first {num_trials} trials")
72 | 
73 |     # Second scan for the HPs at the best step
74 |     hp_keys = ['max_depth', 'learning_rate', 'bagging_temperature', 'l2_leaf_reg', 'leaf_estimation_iterations']
75 |     best_config = None
76 |     history = target_run.scan_history(keys=hp_keys)
77 |     for i, row in enumerate(history):
78 |         if i == best_step:
79 |             best_config = {key: row[key] for key in hp_keys if key in row}
80 |             break
81 | 
82 |     if best_config:
83 |         return best_config
84 |     else:
85 |         raise ValueError("HPs not found for the best rocauc step")
86 | 
87 | 
88 | def run_single_outer_fold(outer_fold, D, outer_folds):
89 |     outer_train_idx, outer_test_idx = outer_folds[outer_fold]
90 | 
91 |     best_configuration = load_best_config('t4tab/CatboostFT_optuna_CPU', D.info['dataset_name'], args.outer_fold)
92 | 
93 |     X_outer_preprocessed = D.build_X(
94 |         normalization='quantile',
95 |         num_nan_policy='mean',
96 |         cat_nan_policy='new',
97 |         cat_policy='indices',
98 |         seed=args.seed,
99 |         train_idx=outer_train_idx,
100 |         test_idx=outer_test_idx,
101 |     )
102 |     set_random_seed(args.seed)
103 |     Y, y_info = D.build_y(train_idx=outer_train_idx, test_idx=outer_test_idx)
104 | 
105 |     N, C = X_outer_preprocessed
106 |     n_num_features = 0 if N is None else N[outer_train_idx].shape[1]
107 |     n_cat_features = 0 if C is None else C[outer_train_idx].shape[1]
108 |     n_features = n_num_features + n_cat_features
109 |     if N is None:
110 |         assert C is not None
111 |         X_outer_preprocessed = pd.DataFrame(C, columns=range(n_features))
112 |     elif C is None:
113 |         assert N is not None
114 |         X_outer_preprocessed = pd.DataFrame(N, columns=range(n_features))
115 |     else:
116 |         X_outer_preprocessed = pd.concat(
117 |             [
118 |                 pd.DataFrame(N, columns=range(n_num_features)),
119 |                 pd.DataFrame(C, columns=range(n_num_features, n_features)),
120 |             ],
121 |             axis=1
122 |         )
123 |     cat_features = list(range(n_num_features, n_features))
124 |     unique_classes, class_counts = np.unique(Y[outer_train_idx], axis=0, return_counts=True)
125 |     nr_classes = len(unique_classes)
126 |     model = CatBoostClassifier(
127 |         task_type='CPU',
128 |         loss_function='MultiClass' if nr_classes > 2 else 'Logloss',
129 |         eval_metric='AUC',
130 |         random_seed=args.seed,
131 |         early_stopping_rounds=50,
132 |         od_pval=0.001,
133 |         iterations=2000,
134 |         max_depth=best_configuration['max_depth'],
135 |         learning_rate=best_configuration['learning_rate'],
136 |         bagging_temperature=best_configuration['bagging_temperature'],
137 |         l2_leaf_reg=best_configuration['l2_leaf_reg'],
138 |         leaf_estimation_iterations=best_configuration['leaf_estimation_iterations'],
139 |     )
140 | 
141 |     model.fit(X_outer_preprocessed.iloc[outer_train_idx], Y[outer_train_idx],
142 |               eval_set=(X_outer_preprocessed.iloc[outer_test_idx], Y[outer_test_idx]),
143 |               cat_features=cat_features,
144 |               verbose=False)
145 | 
146 |     train_predictions_labels = model.predict(X_outer_preprocessed.iloc[outer_train_idx])
147 |     test_predictions_labels = model.predict(X_outer_preprocessed.iloc[outer_test_idx])
148 |     if D.is_multiclass:
149 |         train_predictions_probabilities = model.predict_proba(X_outer_preprocessed.iloc[outer_train_idx])
150 |         test_predictions_probabilities = model.predict_proba(X_outer_preprocessed.iloc[outer_test_idx])
151 |     else:
152 |         train_predictions_probabilities = model.predict_proba(X_outer_preprocessed.iloc[outer_train_idx])[:, 1]
153 |         test_predictions_probabilities = model.predict_proba(X_outer_preprocessed.iloc[outer_test_idx])[:, 1]
154 | 
155 |     # calculate ROC AUC and accuracy on the outer train/test splits
156 |     train_rocauc = roc_auc_score(Y[outer_train_idx], train_predictions_probabilities,
157 |                                  multi_class='raise' if nr_classes == 2 else 'ovo')
158 |     train_accuracy = accuracy_score(Y[outer_train_idx], train_predictions_labels)
159 |     test_rocauc = roc_auc_score(Y[outer_test_idx], test_predictions_probabilities,
160 |                                 multi_class='raise' if nr_classes == 2 else 'ovo')
161 |     test_accuracy = accuracy_score(Y[outer_test_idx], test_predictions_labels)
162 |     print(f"Finished outer fold {outer_fold}")
163 | 
164 |     output_info = {
165 |         'train_rocauc': train_rocauc,
166 |         'train_accuracy': train_accuracy,
167 |         'test_accuracy': test_accuracy,
168 |         f'best_test_rocauc_outer_fold_{outer_fold}': test_rocauc,
169 |     }
170 |     wandb.log(output_info)
171 |     wandb.finish()
172 | 
173 | 
174 | if __name__ == "__main__":
175 |     # %%
176 |     set_random_seed(args.seed)
177 |     D = lib.Dataset.from_openml(args.dataset)
178 |     run_name = f"{D.info['dataset_name']}_outerFold_{args.outer_fold}"
179 |     wandb.init(project=args.experiment_name,
180 |                name=run_name,
181 |                config=args)
182 |     outer_kfold = StratifiedKFold(n_splits=10, shuffle=True)
183 |     outer_folds = list(outer_kfold.split(D.X, D.y))
184 |     run_single_outer_fold(args.outer_fold, D, outer_folds)
185 | 
--------------------------------------------------------------------------------
/Catboost/lib/__init__.py:
--------------------------------------------------------------------------------
1 | from icecream import install
2 | 
3 | install()
4 | 
5 | from . 
import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .metrics import * # noqa 9 | from .util import * # noqa 10 | -------------------------------------------------------------------------------- /Catboost/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | -------------------------------------------------------------------------------- /Catboost/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | 21 | return {'rmse': rmse, 'score': -rmse} 22 | else: 23 | assert task_type in (util.BINCLASS, util.MULTICLASS) 24 | labels = None 25 | if classification_mode == 'probs': 26 | probs = prediction 27 | elif classification_mode == 'logits': 28 | probs = ( 29 | scipy.special.expit(prediction) 30 | if task_type == util.BINCLASS 31 | else scipy.special.softmax(prediction, axis=1) 32 | ) 33 | else: 34 | assert classification_mode == 'labels' 35 | probs = None 36 | labels = prediction 37 | if labels is None: 38 | labels = ( 39 | np.round(probs).astype('int64') 40 | if task_type == util.BINCLASS 41 | else probs.argmax(axis=1) # type: ignore[code] 42 | ) 43 | 44 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 45 | if task_type == util.BINCLASS: 46 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 47 | else: 48 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 49 | result['score'] = result['roc_auc'] # type: ignore[code] 50 | return result # type: ignore[code] 51 | 52 | 53 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 54 | precision = 3 55 | summary = {} 56 | for k, v in metrics[1].items(): 57 | if k.isdigit(): 58 | continue 59 | k = { 60 | 'score': 'SCORE', 61 | 'accuracy': 'acc', 62 | 'roc_auc': 'roc_auc', 63 | 'macro avg': 'm', 64 | 'weighted avg': 'w', 65 | }.get(k, k) 66 | if isinstance(v, float): 67 | v = round(v, precision) 68 | summary[k] = v 69 | else: 70 | v = { 71 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 72 | x, x 73 | ): round(v[x], precision) 74 | for x in v 75 | } 76 | for item in v.items(): 77 | summary[k + item[0]] = item[1] 78 | 79 | s = [f'score = {summary.pop("SCORE"):.3f}'] 80 | for k, v in summary.items(): 81 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 82 | s.append(f'{k} = {v}') 83 | return ' | '.join(s) 84 | -------------------------------------------------------------------------------- /Catboost/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: 
int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 | ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /Catboost/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | 
import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | 
output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /Catboost/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def set_random_seed(seed): 7 | """ 8 | Set the seed for random number generation in Python, NumPy, and PyTorch. 9 | 10 | Args: 11 | seed (int): The seed value to use for all random number generators. 
12 | """ 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | 17 | if torch.cuda.is_available(): 18 | torch.cuda.manual_seed(seed) 19 | torch.cuda.manual_seed_all(seed) 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /FT_Transformer/.gitignore: -------------------------------------------------------------------------------- 1 | # >>> GITHUB DEFAULT PYTHON .GIGIGNORE 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | # lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # <<< GITHUB DEFAULT PYTHON .GIGIGNORE 142 | 143 | # Data, checkpoints, etc. 
144 | data 145 | **/catboost_cached_datasets/** 146 | *.bin 147 | *.csv 148 | *.cbm 149 | *.npy 150 | *.pickle 151 | *.pt 152 | *.pth 153 | *.rar 154 | *.tar* 155 | *.tmp 156 | *.zip 157 | events.out.tfevents.* 158 | 159 | # Experiments 160 | output/**/*.* 161 | !output/**/stats.json 162 | !output/**/*.toml 163 | 164 | # Other 165 | .DS_Store 166 | .vscode/ 167 | .ruff_cache 168 | -------------------------------------------------------------------------------- /FT_Transformer/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Authors of "Revisiting Deep Learning Models for Tabular Data" 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /FT_Transformer/bin/openmlcc18_tasks.txt: -------------------------------------------------------------------------------- 1 | 3 6 11 12 14 15 16 18 22 23 28 29 31 32 37 44 46 50 54 151 182 188 38 307 300 458 469 1049 1050 1053 1063 1067 1068 1590 4134 1510 1489 1494 1497 1501 1480 1485 1486 1487 1468 1475 1462 1464 4534 6332 1461 4538 1478 23381 40499 40668 40966 40982 40994 40983 40975 40984 40979 41027 23517 40978 40670 40701 -------------------------------------------------------------------------------- /FT_Transformer/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from icecream import install 2 | 3 | install() 4 | 5 | from . 
import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .env import get_path # noqa 9 | from .metrics import * # noqa 10 | from .util import * # noqa 11 | -------------------------------------------------------------------------------- /FT_Transformer/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | PROJECT_DIR = Path(os.environ['PROJECT_DIR']).absolute().resolve() 5 | DATA_DIR = PROJECT_DIR / 'data' 6 | OUTPUT_DIR = PROJECT_DIR / 'output' 7 | 8 | 9 | def get_path(relative_path: str) -> Path: 10 | return ( 11 | Path(relative_path) 12 | if relative_path.startswith('/') 13 | else PROJECT_DIR / relative_path 14 | ) 15 | -------------------------------------------------------------------------------- /FT_Transformer/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | 21 | return {'rmse': rmse, 'score': -rmse} 22 | else: 23 | assert task_type in (util.BINCLASS, util.MULTICLASS) 24 | labels = None 25 | if classification_mode == 'probs': 26 | probs = prediction 27 | elif classification_mode == 'logits': 28 | probs = ( 29 | scipy.special.expit(prediction) 30 | if task_type == util.BINCLASS 31 | else scipy.special.softmax(prediction, axis=1) 32 | ) 33 | else: 34 | assert classification_mode == 'labels' 35 | probs = None 36 | labels = prediction 37 | if labels is None: 38 | labels = ( 39 | np.round(probs).astype('int64') 40 | if task_type == util.BINCLASS 41 | else probs.argmax(axis=1) # type: ignore[code] 42 | ) 43 | 44 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 45 | if task_type == util.BINCLASS: 46 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 47 | else: 48 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 49 | result['score'] = result['roc_auc'] # type: ignore[code] 50 | return result # type: ignore[code] 51 | 52 | 53 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 54 | precision = 3 55 | summary = {} 56 | for k, v in metrics[1].items(): 57 | if k.isdigit(): 58 | continue 59 | k = { 60 | 'score': 'SCORE', 61 | 'accuracy': 'acc', 62 | 'roc_auc': 'roc_auc', 63 | 'macro avg': 'm', 64 | 'weighted avg': 'w', 65 | }.get(k, k) 66 | if isinstance(v, float): 67 | v = round(v, precision) 68 | summary[k] = v 69 | else: 70 | v = { 71 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 72 | x, x 73 | ): round(v[x], precision) 74 | for x in v 75 | } 76 | for item in v.items(): 77 | summary[k + item[0]] = item[1] 78 | 79 | s = [f'score = {summary.pop("SCORE"):.3f}'] 80 | for k, v in summary.items(): 81 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 82 | s.append(f'{k} = {v}') 83 | return ' | '.join(s) 84 | -------------------------------------------------------------------------------- /FT_Transformer/lib/node/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | from .arch import * # noqa 3 | from .nn_utils import * # noqa 4 | from .odst import * # noqa 5 | from .utils import * # noqa 6 | -------------------------------------------------------------------------------- /FT_Transformer/lib/node/arch.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.utils.checkpoint import checkpoint as torch_checkpoint 6 | 7 | from .odst import ODST 8 | 9 | 10 | class DenseBlock(nn.Sequential): 11 | def __init__(self, input_dim, layer_dim, num_layers, tree_dim=1, max_features=None, 12 | input_dropout=0.0, flatten_output=True, Module=ODST, **kwargs): 13 | layers = [] 14 | for i in range(num_layers): 15 | oddt = Module(input_dim, layer_dim, tree_dim=tree_dim, flatten_output=True, **kwargs) 16 | input_dim = min(input_dim + layer_dim * tree_dim, max_features or float('inf')) 17 | layers.append(oddt) 18 | 19 | super().__init__(*layers) 20 | self.num_layers, self.layer_dim, self.tree_dim = num_layers, layer_dim, tree_dim 21 | self.max_features, self.flatten_output = max_features, flatten_output 22 | self.input_dropout = input_dropout 23 | 24 | def forward(self, x): 25 | initial_features = x.shape[-1] 26 | for layer in self: 27 | layer_inp = x 28 | if self.max_features is not None: 29 | tail_features = min(self.max_features, layer_inp.shape[-1]) - initial_features 30 | if tail_features != 0: 31 | layer_inp = torch.cat([layer_inp[..., :initial_features], layer_inp[..., -tail_features:]], dim=-1) 32 | if self.training and self.input_dropout: 33 | layer_inp = F.dropout(layer_inp, self.input_dropout) 34 | h = layer(layer_inp) 35 | x = torch.cat([x, h], dim=-1) 36 | 37 | outputs = x[..., initial_features:] 38 | if not self.flatten_output: 39 | outputs = outputs.view(*outputs.shape[:-1], self.num_layers * self.layer_dim, self.tree_dim) 40 | return outputs 41 | -------------------------------------------------------------------------------- /FT_Transformer/lib/node/nn_utils.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import contextlib 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.autograd import Function 10 | from torch.jit import script 11 | 12 | 13 | def to_one_hot(y, depth=None): 14 | r""" 15 | Takes integer with n dims and converts it to 1-hot representation with n + 1 dims. 16 | The n+1'st dimension will have zeros everywhere but at y'th index, where it will be equal to 1. 
17 | Args: 18 | y: input integer (IntTensor, LongTensor or Variable) of any shape 19 | depth (int): the size of the one hot dimension 20 | """ 21 | y_flat = y.to(torch.int64).view(-1, 1) 22 | depth = depth if depth is not None else int(torch.max(y_flat)) + 1 23 | y_one_hot = torch.zeros(y_flat.size()[0], depth, device=y.device).scatter_(1, y_flat, 1) 24 | y_one_hot = y_one_hot.view(*(tuple(y.shape) + (-1,))) 25 | return y_one_hot 26 | 27 | 28 | def _make_ix_like(input, dim=0): 29 | d = input.size(dim) 30 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 31 | view = [1] * input.dim() 32 | view[0] = -1 33 | return rho.view(view).transpose(0, dim) 34 | 35 | 36 | class SparsemaxFunction(Function): 37 | """ 38 | An implementation of sparsemax (Martins & Astudillo, 2016). See 39 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 40 | 41 | By Ben Peters and Vlad Niculae 42 | """ 43 | 44 | @staticmethod 45 | def forward(ctx, input, dim=-1): 46 | """sparsemax: normalizing sparse transform (a la softmax) 47 | 48 | Parameters: 49 | input (Tensor): any shape 50 | dim: dimension along which to apply sparsemax 51 | 52 | Returns: 53 | output (Tensor): same shape as input 54 | """ 55 | ctx.dim = dim 56 | max_val, _ = input.max(dim=dim, keepdim=True) 57 | input -= max_val # same numerical stability trick as for softmax 58 | tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) 59 | output = torch.clamp(input - tau, min=0) 60 | ctx.save_for_backward(supp_size, output) 61 | return output 62 | 63 | @staticmethod 64 | def backward(ctx, grad_output): 65 | supp_size, output = ctx.saved_tensors 66 | dim = ctx.dim 67 | grad_input = grad_output.clone() 68 | grad_input[output == 0] = 0 69 | 70 | v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 71 | v_hat = v_hat.unsqueeze(dim) 72 | grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 73 | return grad_input, None 74 | 75 | 76 | @staticmethod 77 | def _threshold_and_support(input, dim=-1): 78 | """Sparsemax building block: compute the threshold 79 | 80 | Args: 81 | input: any dimension 82 | dim: dimension along which to apply the sparsemax 83 | 84 | Returns: 85 | the threshold value 86 | """ 87 | 88 | input_srt, _ = torch.sort(input, descending=True, dim=dim) 89 | input_cumsum = input_srt.cumsum(dim) - 1 90 | rhos = _make_ix_like(input, dim) 91 | support = rhos * input_srt > input_cumsum 92 | 93 | support_size = support.sum(dim=dim).unsqueeze(dim) 94 | tau = input_cumsum.gather(dim, support_size - 1) 95 | tau /= support_size.to(input.dtype) 96 | return tau, support_size 97 | 98 | 99 | sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) 100 | sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) 101 | 102 | 103 | class Entmax15Function(Function): 104 | """ 105 | An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See 106 | :cite:`https://arxiv.org/abs/1905.05702 for detailed description. 
107 | Source: https://github.com/deep-spin/entmax 108 | """ 109 | 110 | @staticmethod 111 | def forward(ctx, input, dim=-1): 112 | ctx.dim = dim 113 | 114 | max_val, _ = input.max(dim=dim, keepdim=True) 115 | input = input - max_val # same numerical stability trick as for softmax 116 | input = input / 2 # divide by 2 to solve actual Entmax 117 | 118 | tau_star, _ = Entmax15Function._threshold_and_support(input, dim) 119 | output = torch.clamp(input - tau_star, min=0) ** 2 120 | ctx.save_for_backward(output) 121 | return output 122 | 123 | @staticmethod 124 | def backward(ctx, grad_output): 125 | Y, = ctx.saved_tensors 126 | gppr = Y.sqrt() # = 1 / g'' (Y) 127 | dX = grad_output * gppr 128 | q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) 129 | q = q.unsqueeze(ctx.dim) 130 | dX -= q * gppr 131 | return dX, None 132 | 133 | @staticmethod 134 | def _threshold_and_support(input, dim=-1): 135 | Xsrt, _ = torch.sort(input, descending=True, dim=dim) 136 | 137 | rho = _make_ix_like(input, dim) 138 | mean = Xsrt.cumsum(dim) / rho 139 | mean_sq = (Xsrt ** 2).cumsum(dim) / rho 140 | ss = rho * (mean_sq - mean ** 2) 141 | delta = (1 - ss) / rho 142 | 143 | # NOTE this is not exactly the same as in reference algo 144 | # Fortunately it seems the clamped values never wrongly 145 | # get selected by tau <= sorted_z. Prove this! 146 | delta_nz = torch.clamp(delta, 0) 147 | tau = mean - torch.sqrt(delta_nz) 148 | 149 | support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) 150 | tau_star = tau.gather(dim, support_size - 1) 151 | return tau_star, support_size 152 | 153 | 154 | class Entmoid15(Function): 155 | """ A highly optimized equivalent of labda x: Entmax15([x, 0]) """ 156 | 157 | @staticmethod 158 | def forward(ctx, input): 159 | output = Entmoid15._forward(input) 160 | ctx.save_for_backward(output) 161 | return output 162 | 163 | @staticmethod 164 | @script 165 | def _forward(input): 166 | input, is_pos = abs(input), input >= 0 167 | tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 168 | tau.masked_fill_(tau <= input, 2.0) 169 | y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 170 | return torch.where(is_pos, 1 - y_neg, y_neg) 171 | 172 | @staticmethod 173 | def backward(ctx, grad_output): 174 | return Entmoid15._backward(ctx.saved_tensors[0], grad_output) 175 | 176 | @staticmethod 177 | @script 178 | def _backward(output, grad_output): 179 | gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() 180 | grad_input = grad_output * gppr0 181 | q = grad_input / (gppr0 + gppr1) 182 | grad_input -= q * gppr0 183 | return grad_input 184 | 185 | 186 | entmax15 = lambda input, dim=-1: Entmax15Function.apply(input, dim) 187 | entmoid15 = Entmoid15.apply 188 | 189 | 190 | class Lambda(nn.Module): 191 | def __init__(self, func): 192 | super().__init__() 193 | self.func = func 194 | 195 | def forward(self, *args, **kwargs): 196 | return self.func(*args, **kwargs) 197 | 198 | 199 | class ModuleWithInit(nn.Module): 200 | """ Base class for pytorch module with data-aware initializer on first batch """ 201 | def __init__(self): 202 | super().__init__() 203 | self._is_initialized_tensor = nn.Parameter(torch.tensor(0, dtype=torch.uint8), requires_grad=False) 204 | self._is_initialized_bool = None 205 | # Note: this module uses a separate flag self._is_initialized so as to achieve both 206 | # * persistence: is_initialized is saved alongside model in state_dict 207 | # * speed: model doesn't need to cache 208 | # please DO NOT use these flags in child modules 209 | 210 | def initialize(self, *args, **kwargs): 211 | """ 
initialize module tensors using first batch of data """ 212 | raise NotImplementedError("Please implement ") 213 | 214 | def __call__(self, *args, **kwargs): 215 | if self._is_initialized_bool is None: 216 | self._is_initialized_bool = bool(self._is_initialized_tensor.item()) 217 | if not self._is_initialized_bool: 218 | self.initialize(*args, **kwargs) 219 | self._is_initialized_tensor.data[...] = 1 220 | self._is_initialized_bool = True 221 | return super().__call__(*args, **kwargs) 222 | -------------------------------------------------------------------------------- /FT_Transformer/lib/node/utils.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import contextlib 3 | import gc 4 | import glob 5 | import hashlib 6 | import os 7 | import time 8 | 9 | import numpy as np 10 | import requests 11 | import torch 12 | from tqdm import tqdm 13 | 14 | 15 | def download(url, filename, delete_if_interrupted=True, chunk_size=4096): 16 | """ saves file from url to filename with a fancy progressbar """ 17 | try: 18 | with open(filename, "wb") as f: 19 | print("Downloading {} > {}".format(url, filename)) 20 | response = requests.get(url, stream=True) 21 | total_length = response.headers.get('content-length') 22 | 23 | if total_length is None: # no content length header 24 | f.write(response.content) 25 | else: 26 | total_length = int(total_length) 27 | with tqdm(total=total_length) as progressbar: 28 | for data in response.iter_content(chunk_size=chunk_size): 29 | if data: # filter-out keep-alive chunks 30 | f.write(data) 31 | progressbar.update(len(data)) 32 | except Exception as e: 33 | if delete_if_interrupted: 34 | print("Removing incomplete download {}.".format(filename)) 35 | os.remove(filename) 36 | raise e 37 | return filename 38 | 39 | 40 | def iterate_minibatches(*tensors, batch_size, shuffle=True, epochs=1, 41 | allow_incomplete=True, callback=lambda x:x): 42 | indices = np.arange(len(tensors[0])) 43 | upper_bound = int((np.ceil if allow_incomplete else np.floor) (len(indices) / batch_size)) * batch_size 44 | epoch = 0 45 | while True: 46 | if shuffle: 47 | np.random.shuffle(indices) 48 | for batch_start in callback(range(0, upper_bound, batch_size)): 49 | batch_ix = indices[batch_start: batch_start + batch_size] 50 | batch = [tensor[batch_ix] for tensor in tensors] 51 | yield batch if len(tensors) > 1 else batch[0] 52 | epoch += 1 53 | if epoch >= epochs: 54 | break 55 | 56 | 57 | def process_in_chunks(function, *args, batch_size, out=None, **kwargs): 58 | """ 59 | Computes output by applying batch-parallel function to large data tensor in chunks 60 | :param function: a function(*[x[indices, ...] for x in args]) -> out[indices, ...] 61 | :param args: one or many tensors, each [num_instances, ...] 
62 | :param batch_size: maximum chunk size processed in one go 63 | :param out: memory buffer for out, defaults to torch.zeros of appropriate size and type 64 | :returns: function(data), computed in a memory-efficient way 65 | """ 66 | total_size = args[0].shape[0] 67 | first_output = function(*[x[0: batch_size] for x in args]) 68 | output_shape = (total_size,) + tuple(first_output.shape[1:]) 69 | if out is None: 70 | out = torch.zeros(*output_shape, dtype=first_output.dtype, device=first_output.device, 71 | layout=first_output.layout, **kwargs) 72 | 73 | out[0: batch_size] = first_output 74 | for i in range(batch_size, total_size, batch_size): 75 | batch_ix = slice(i, min(i + batch_size, total_size)) 76 | out[batch_ix] = function(*[x[batch_ix] for x in args]) 77 | return out 78 | 79 | 80 | def check_numpy(x): 81 | """ Makes sure x is a numpy array """ 82 | if isinstance(x, torch.Tensor): 83 | x = x.detach().cpu().numpy() 84 | x = np.asarray(x) 85 | assert isinstance(x, np.ndarray) 86 | return x 87 | 88 | 89 | @contextlib.contextmanager 90 | def nop_ctx(): 91 | yield None 92 | 93 | 94 | def get_latest_file(pattern): 95 | list_of_files = glob.glob(pattern) # * means all if need specific format then *.csv 96 | assert len(list_of_files) > 0, "No files found: " + pattern 97 | return max(list_of_files, key=os.path.getctime) 98 | 99 | 100 | def md5sum(fname): 101 | """ Computes mdp checksum of a file """ 102 | hash_md5 = hashlib.md5() 103 | with open(fname, "rb") as f: 104 | for chunk in iter(lambda: f.read(4096), b""): 105 | hash_md5.update(chunk) 106 | return hash_md5.hexdigest() 107 | 108 | 109 | def free_memory(sleep_time=0.1): 110 | """ Black magic function to free torch memory and some jupyter whims """ 111 | gc.collect() 112 | torch.cuda.synchronize() 113 | gc.collect() 114 | torch.cuda.empty_cache() 115 | time.sleep(sleep_time) 116 | 117 | def to_float_str(element): 118 | try: 119 | return str(float(element)) 120 | except ValueError: 121 | return element 122 | -------------------------------------------------------------------------------- /FT_Transformer/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 
| ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /FT_Transformer/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . 
import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = 
Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /FT_Transformer/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | skip_string_normalization = true 3 | 4 | [tool.isort] 5 | profile = "black" 6 | multi_line_output = 3 7 | known_first_party = ["lib"] 8 | -------------------------------------------------------------------------------- /FT_Transformer/requirements.txt: -------------------------------------------------------------------------------- 1 | catboost==0.24.4 2 | category-encoders==2.2.2 3 | lightgbm==3.2.1 4 | libzero==0.0.3.dev7 5 | numba==0.53.1 6 | optuna==2.6.0 7 | pandas==1.2.3 8 | pynvml==8.0.4 9 | pytomlpp==0.3.5 10 | scikit-learn==0.24.1 11 | scipy==1.6.1 12 | tensorboard==2.4.1 13 | tqdm==4.59.0 14 | xgboost==1.3.3 15 | 16 | # Tools 17 | black 18 | flake8 19 | icecream 20 | isort 21 | 22 | # Jupyter 23 | ipywidgets 24 
| jupyterlab 25 | jupyterlab-nvdashboard 26 | voila 27 | 28 | # Visualization 29 | bokeh 30 | colorcet 31 | holoviews 32 | matplotlib 33 | panel 34 | seaborn 35 | -------------------------------------------------------------------------------- /FT_Transformer/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | # E501 is about line length; it can be violated by Black, so ignore it 4 | ignore = E203, E501, W503 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Revisiting-MLPs 2 | 3 | To be updated... 4 | -------------------------------------------------------------------------------- /ResNet/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from icecream import install 2 | 3 | install() 4 | 5 | from . import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .env import get_path # noqa 9 | from .metrics import * # noqa 10 | from .util import * # noqa 11 | -------------------------------------------------------------------------------- /ResNet/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | PROJECT_DIR = Path(os.environ['PROJECT_DIR']).absolute().resolve() 5 | DATA_DIR = PROJECT_DIR / 'data' 6 | OUTPUT_DIR = PROJECT_DIR / 'output' 7 | 8 | 9 | def get_path(relative_path: str) -> Path: 10 | return ( 11 | Path(relative_path) 12 | if relative_path.startswith('/') 13 | else PROJECT_DIR / relative_path 14 | ) 15 | -------------------------------------------------------------------------------- /ResNet/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . 
import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | 21 | return {'rmse': rmse, 'score': -rmse} 22 | else: 23 | assert task_type in (util.BINCLASS, util.MULTICLASS) 24 | labels = None 25 | if classification_mode == 'probs': 26 | probs = prediction 27 | elif classification_mode == 'logits': 28 | probs = ( 29 | scipy.special.expit(prediction) 30 | if task_type == util.BINCLASS 31 | else scipy.special.softmax(prediction, axis=1) 32 | ) 33 | else: 34 | assert classification_mode == 'labels' 35 | probs = None 36 | labels = prediction 37 | if labels is None: 38 | labels = ( 39 | np.round(probs).astype('int64') 40 | if task_type == util.BINCLASS 41 | else probs.argmax(axis=1) # type: ignore[code] 42 | ) 43 | 44 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 45 | if task_type == util.BINCLASS: 46 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 47 | else: 48 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 49 | result['score'] = result['roc_auc'] # type: ignore[code] 50 | return result # type: ignore[code] 51 | 52 | 53 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 54 | precision = 3 55 | summary = {} 56 | for k, v in metrics[1].items(): 57 | if k.isdigit(): 58 | continue 59 | k = { 60 | 'score': 'SCORE', 61 | 'accuracy': 'acc', 62 | 'roc_auc': 'roc_auc', 63 | 'macro avg': 'm', 64 | 'weighted avg': 'w', 65 | }.get(k, k) 66 | if isinstance(v, float): 67 | v = round(v, precision) 68 | summary[k] = v 69 | else: 70 | v = { 71 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 72 | x, x 73 | ): round(v[x], precision) 74 | for x in v 75 | } 76 | for item in v.items(): 77 | summary[k + item[0]] = item[1] 78 | 79 | s = [f'score = {summary.pop("SCORE"):.3f}'] 80 | for k, v in summary.items(): 81 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 82 | s.append(f'{k} = {v}') 83 | return ' | '.join(s) 84 | -------------------------------------------------------------------------------- /ResNet/lib/node/__init__.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | from .arch import * # noqa 3 | from .nn_utils import * # noqa 4 | from .odst import * # noqa 5 | from .utils import * # noqa 6 | -------------------------------------------------------------------------------- /ResNet/lib/node/arch.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.utils.checkpoint import checkpoint as torch_checkpoint 6 | 7 | from .odst import ODST 8 | 9 | 10 | class DenseBlock(nn.Sequential): 11 | def __init__(self, input_dim, layer_dim, num_layers, tree_dim=1, max_features=None, 12 | input_dropout=0.0, flatten_output=True, Module=ODST, **kwargs): 13 | layers = [] 14 | for i in range(num_layers): 15 | oddt = Module(input_dim, layer_dim, tree_dim=tree_dim, flatten_output=True, **kwargs) 16 | input_dim = min(input_dim + layer_dim * tree_dim, max_features or float('inf')) 17 | layers.append(oddt) 18 | 19 | 
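        # Note: each ODST layer receives the original features concatenated with all
        # previous layers' outputs, so its input width grows by layer_dim * tree_dim
        # per layer (optionally capped by max_features).
        # Illustrative usage sketch (values are arbitrary, flatten_output=True):
        #   block = DenseBlock(input_dim=54, layer_dim=128, num_layers=2, tree_dim=3)
        #   out = block(torch.randn(256, 54))  # -> shape [256, 2 * 128 * 3]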
super().__init__(*layers) 20 | self.num_layers, self.layer_dim, self.tree_dim = num_layers, layer_dim, tree_dim 21 | self.max_features, self.flatten_output = max_features, flatten_output 22 | self.input_dropout = input_dropout 23 | 24 | def forward(self, x): 25 | initial_features = x.shape[-1] 26 | for layer in self: 27 | layer_inp = x 28 | if self.max_features is not None: 29 | tail_features = min(self.max_features, layer_inp.shape[-1]) - initial_features 30 | if tail_features != 0: 31 | layer_inp = torch.cat([layer_inp[..., :initial_features], layer_inp[..., -tail_features:]], dim=-1) 32 | if self.training and self.input_dropout: 33 | layer_inp = F.dropout(layer_inp, self.input_dropout) 34 | h = layer(layer_inp) 35 | x = torch.cat([x, h], dim=-1) 36 | 37 | outputs = x[..., initial_features:] 38 | if not self.flatten_output: 39 | outputs = outputs.view(*outputs.shape[:-1], self.num_layers * self.layer_dim, self.tree_dim) 40 | return outputs 41 | -------------------------------------------------------------------------------- /ResNet/lib/node/nn_utils.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import contextlib 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.autograd import Function 10 | from torch.jit import script 11 | 12 | 13 | def to_one_hot(y, depth=None): 14 | r""" 15 | Takes integer with n dims and converts it to 1-hot representation with n + 1 dims. 16 | The n+1'st dimension will have zeros everywhere but at y'th index, where it will be equal to 1. 17 | Args: 18 | y: input integer (IntTensor, LongTensor or Variable) of any shape 19 | depth (int): the size of the one hot dimension 20 | """ 21 | y_flat = y.to(torch.int64).view(-1, 1) 22 | depth = depth if depth is not None else int(torch.max(y_flat)) + 1 23 | y_one_hot = torch.zeros(y_flat.size()[0], depth, device=y.device).scatter_(1, y_flat, 1) 24 | y_one_hot = y_one_hot.view(*(tuple(y.shape) + (-1,))) 25 | return y_one_hot 26 | 27 | 28 | def _make_ix_like(input, dim=0): 29 | d = input.size(dim) 30 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) 31 | view = [1] * input.dim() 32 | view[0] = -1 33 | return rho.view(view).transpose(0, dim) 34 | 35 | 36 | class SparsemaxFunction(Function): 37 | """ 38 | An implementation of sparsemax (Martins & Astudillo, 2016). See 39 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 
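    Illustrative usage sketch: the ``sparsemax`` helper defined at module level below
    can be called as ``probs = sparsemax(logits, dim=-1)``; the result sums to 1 along
    ``dim`` and typically contains exact zeros.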
40 | 41 | By Ben Peters and Vlad Niculae 42 | """ 43 | 44 | @staticmethod 45 | def forward(ctx, input, dim=-1): 46 | """sparsemax: normalizing sparse transform (a la softmax) 47 | 48 | Parameters: 49 | input (Tensor): any shape 50 | dim: dimension along which to apply sparsemax 51 | 52 | Returns: 53 | output (Tensor): same shape as input 54 | """ 55 | ctx.dim = dim 56 | max_val, _ = input.max(dim=dim, keepdim=True) 57 | input -= max_val # same numerical stability trick as for softmax 58 | tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) 59 | output = torch.clamp(input - tau, min=0) 60 | ctx.save_for_backward(supp_size, output) 61 | return output 62 | 63 | @staticmethod 64 | def backward(ctx, grad_output): 65 | supp_size, output = ctx.saved_tensors 66 | dim = ctx.dim 67 | grad_input = grad_output.clone() 68 | grad_input[output == 0] = 0 69 | 70 | v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() 71 | v_hat = v_hat.unsqueeze(dim) 72 | grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) 73 | return grad_input, None 74 | 75 | 76 | @staticmethod 77 | def _threshold_and_support(input, dim=-1): 78 | """Sparsemax building block: compute the threshold 79 | 80 | Args: 81 | input: any dimension 82 | dim: dimension along which to apply the sparsemax 83 | 84 | Returns: 85 | the threshold value 86 | """ 87 | 88 | input_srt, _ = torch.sort(input, descending=True, dim=dim) 89 | input_cumsum = input_srt.cumsum(dim) - 1 90 | rhos = _make_ix_like(input, dim) 91 | support = rhos * input_srt > input_cumsum 92 | 93 | support_size = support.sum(dim=dim).unsqueeze(dim) 94 | tau = input_cumsum.gather(dim, support_size - 1) 95 | tau /= support_size.to(input.dtype) 96 | return tau, support_size 97 | 98 | 99 | sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) 100 | sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) 101 | 102 | 103 | class Entmax15Function(Function): 104 | """ 105 | An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See 106 | :cite:`https://arxiv.org/abs/1905.05702 for detailed description. 107 | Source: https://github.com/deep-spin/entmax 108 | """ 109 | 110 | @staticmethod 111 | def forward(ctx, input, dim=-1): 112 | ctx.dim = dim 113 | 114 | max_val, _ = input.max(dim=dim, keepdim=True) 115 | input = input - max_val # same numerical stability trick as for softmax 116 | input = input / 2 # divide by 2 to solve actual Entmax 117 | 118 | tau_star, _ = Entmax15Function._threshold_and_support(input, dim) 119 | output = torch.clamp(input - tau_star, min=0) ** 2 120 | ctx.save_for_backward(output) 121 | return output 122 | 123 | @staticmethod 124 | def backward(ctx, grad_output): 125 | Y, = ctx.saved_tensors 126 | gppr = Y.sqrt() # = 1 / g'' (Y) 127 | dX = grad_output * gppr 128 | q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) 129 | q = q.unsqueeze(ctx.dim) 130 | dX -= q * gppr 131 | return dX, None 132 | 133 | @staticmethod 134 | def _threshold_and_support(input, dim=-1): 135 | Xsrt, _ = torch.sort(input, descending=True, dim=dim) 136 | 137 | rho = _make_ix_like(input, dim) 138 | mean = Xsrt.cumsum(dim) / rho 139 | mean_sq = (Xsrt ** 2).cumsum(dim) / rho 140 | ss = rho * (mean_sq - mean ** 2) 141 | delta = (1 - ss) / rho 142 | 143 | # NOTE this is not exactly the same as in reference algo 144 | # Fortunately it seems the clamped values never wrongly 145 | # get selected by tau <= sorted_z. Prove this! 
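        # Note: each tau candidate equals mean - sqrt((1 - ss) / rho) for the
        # corresponding sorted-prefix length; the clamp below keeps the square root
        # well-defined when (1 - ss) / rho is negative, and (per the note above) such
        # clamped candidates are not the ones picked by the support-size test.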
146 | delta_nz = torch.clamp(delta, 0) 147 | tau = mean - torch.sqrt(delta_nz) 148 | 149 | support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) 150 | tau_star = tau.gather(dim, support_size - 1) 151 | return tau_star, support_size 152 | 153 | 154 | class Entmoid15(Function): 155 | """ A highly optimized equivalent of labda x: Entmax15([x, 0]) """ 156 | 157 | @staticmethod 158 | def forward(ctx, input): 159 | output = Entmoid15._forward(input) 160 | ctx.save_for_backward(output) 161 | return output 162 | 163 | @staticmethod 164 | @script 165 | def _forward(input): 166 | input, is_pos = abs(input), input >= 0 167 | tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 168 | tau.masked_fill_(tau <= input, 2.0) 169 | y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 170 | return torch.where(is_pos, 1 - y_neg, y_neg) 171 | 172 | @staticmethod 173 | def backward(ctx, grad_output): 174 | return Entmoid15._backward(ctx.saved_tensors[0], grad_output) 175 | 176 | @staticmethod 177 | @script 178 | def _backward(output, grad_output): 179 | gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() 180 | grad_input = grad_output * gppr0 181 | q = grad_input / (gppr0 + gppr1) 182 | grad_input -= q * gppr0 183 | return grad_input 184 | 185 | 186 | entmax15 = lambda input, dim=-1: Entmax15Function.apply(input, dim) 187 | entmoid15 = Entmoid15.apply 188 | 189 | 190 | class Lambda(nn.Module): 191 | def __init__(self, func): 192 | super().__init__() 193 | self.func = func 194 | 195 | def forward(self, *args, **kwargs): 196 | return self.func(*args, **kwargs) 197 | 198 | 199 | class ModuleWithInit(nn.Module): 200 | """ Base class for pytorch module with data-aware initializer on first batch """ 201 | def __init__(self): 202 | super().__init__() 203 | self._is_initialized_tensor = nn.Parameter(torch.tensor(0, dtype=torch.uint8), requires_grad=False) 204 | self._is_initialized_bool = None 205 | # Note: this module uses a separate flag self._is_initialized so as to achieve both 206 | # * persistence: is_initialized is saved alongside model in state_dict 207 | # * speed: model doesn't need to cache 208 | # please DO NOT use these flags in child modules 209 | 210 | def initialize(self, *args, **kwargs): 211 | """ initialize module tensors using first batch of data """ 212 | raise NotImplementedError("Please implement ") 213 | 214 | def __call__(self, *args, **kwargs): 215 | if self._is_initialized_bool is None: 216 | self._is_initialized_bool = bool(self._is_initialized_tensor.item()) 217 | if not self._is_initialized_bool: 218 | self.initialize(*args, **kwargs) 219 | self._is_initialized_tensor.data[...] 
= 1 220 | self._is_initialized_bool = True 221 | return super().__call__(*args, **kwargs) 222 | -------------------------------------------------------------------------------- /ResNet/lib/node/odst.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | from warnings import warn 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from .nn_utils import ModuleWithInit, sparsemax, sparsemoid 10 | from .utils import check_numpy 11 | 12 | 13 | class ODST(ModuleWithInit): 14 | def __init__(self, in_features, num_trees, depth=6, tree_dim=1, flatten_output=True, 15 | choice_function=sparsemax, bin_function=sparsemoid, 16 | initialize_response_=nn.init.normal_, initialize_selection_logits_=nn.init.uniform_, 17 | threshold_init_beta=1.0, threshold_init_cutoff=1.0, 18 | ): 19 | """ 20 | Oblivious Differentiable Sparsemax Trees. http://tinyurl.com/odst-readmore 21 | One can drop (sic!) this module anywhere instead of nn.Linear 22 | :param in_features: number of features in the input tensor 23 | :param num_trees: number of trees in this layer 24 | :param tree_dim: number of response channels in the response of individual tree 25 | :param depth: number of splits in every tree 26 | :param flatten_output: if False, returns [..., num_trees, tree_dim], 27 | by default returns [..., num_trees * tree_dim] 28 | :param choice_function: f(tensor, dim) -> R_simplex computes feature weights s.t. f(tensor, dim).sum(dim) == 1 29 | :param bin_function: f(tensor) -> R[0, 1], computes tree leaf weights 30 | 31 | :param initialize_response_: in-place initializer for tree output tensor 32 | :param initialize_selection_logits_: in-place initializer for logits that select features for the tree 33 | both thresholds and scales are initialized with data-aware init (or .load_state_dict) 34 | :param threshold_init_beta: initializes threshold to a q-th quantile of data points 35 | where q ~ Beta(:threshold_init_beta:, :threshold_init_beta:) 36 | If this param is set to 1, initial thresholds will have the same distribution as data points 37 | If greater than 1 (e.g. 10), thresholds will be closer to median data value 38 | If less than 1 (e.g. 0.1), thresholds will approach min/max data values. 39 | 40 | :param threshold_init_cutoff: threshold log-temperatures initializer, \in (0, inf) 41 | By default(1.0), log-remperatures are initialized in such a way that all bin selectors 42 | end up in the linear region of sparse-sigmoid. The temperatures are then scaled by this parameter. 
43 | Setting this value > 1.0 will result in some margin between data points and sparse-sigmoid cutoff value 44 | Setting this value < 1.0 will cause (1 - value) part of data points to end up in flat sparse-sigmoid region 45 | For instance, threshold_init_cutoff = 0.9 will set 10% points equal to 0.0 or 1.0 46 | Setting this value > 1.0 will result in a margin between data points and sparse-sigmoid cutoff value 47 | All points will be between (0.5 - 0.5 / threshold_init_cutoff) and (0.5 + 0.5 / threshold_init_cutoff) 48 | """ 49 | super().__init__() 50 | self.depth, self.num_trees, self.tree_dim, self.flatten_output = depth, num_trees, tree_dim, flatten_output 51 | self.choice_function, self.bin_function = choice_function, bin_function 52 | self.threshold_init_beta, self.threshold_init_cutoff = threshold_init_beta, threshold_init_cutoff 53 | 54 | self.response = nn.Parameter(torch.zeros([num_trees, tree_dim, 2 ** depth]), requires_grad=True) 55 | initialize_response_(self.response) 56 | 57 | self.feature_selection_logits = nn.Parameter( 58 | torch.zeros([in_features, num_trees, depth]), requires_grad=True 59 | ) 60 | initialize_selection_logits_(self.feature_selection_logits) 61 | 62 | self.feature_thresholds = nn.Parameter( 63 | torch.full([num_trees, depth], float('nan'), dtype=torch.float32), requires_grad=True 64 | ) # nan values will be initialized on first batch (data-aware init) 65 | 66 | self.log_temperatures = nn.Parameter( 67 | torch.full([num_trees, depth], float('nan'), dtype=torch.float32), requires_grad=True 68 | ) 69 | 70 | # binary codes for mapping between 1-hot vectors and bin indices 71 | with torch.no_grad(): 72 | indices = torch.arange(2 ** self.depth) 73 | offsets = 2 ** torch.arange(self.depth) 74 | bin_codes = (indices.view(1, -1) // offsets.view(-1, 1) % 2).to(torch.float32) 75 | bin_codes_1hot = torch.stack([bin_codes, 1.0 - bin_codes], dim=-1) 76 | self.bin_codes_1hot = nn.Parameter(bin_codes_1hot, requires_grad=False) 77 | # ^-- [depth, 2 ** depth, 2] 78 | 79 | def forward(self, input): 80 | assert len(input.shape) >= 2 81 | if len(input.shape) > 2: 82 | return self.forward(input.view(-1, input.shape[-1])).view(*input.shape[:-1], -1) 83 | # new input shape: [batch_size, in_features] 84 | 85 | feature_logits = self.feature_selection_logits 86 | feature_selectors = self.choice_function(feature_logits, dim=0) 87 | # ^--[in_features, num_trees, depth] 88 | 89 | feature_values = torch.einsum('bi,ind->bnd', input, feature_selectors) 90 | # ^--[batch_size, num_trees, depth] 91 | 92 | threshold_logits = (feature_values - self.feature_thresholds) * torch.exp(-self.log_temperatures) 93 | 94 | threshold_logits = torch.stack([-threshold_logits, threshold_logits], dim=-1) 95 | # ^--[batch_size, num_trees, depth, 2] 96 | 97 | bins = self.bin_function(threshold_logits) 98 | # ^--[batch_size, num_trees, depth, 2], approximately binary 99 | 100 | bin_matches = torch.einsum('btds,dcs->btdc', bins, self.bin_codes_1hot) 101 | # ^--[batch_size, num_trees, depth, 2 ** depth] 102 | 103 | response_weights = torch.prod(bin_matches, dim=-2) 104 | # ^-- [batch_size, num_trees, 2 ** depth] 105 | 106 | response = torch.einsum('bnd,ncd->bnc', response_weights, self.response) 107 | # ^-- [batch_size, num_trees, tree_dim] 108 | 109 | return response.flatten(1, 2) if self.flatten_output else response 110 | 111 | def initialize(self, input, eps=1e-6): 112 | # data-aware initializer 113 | assert len(input.shape) == 2 114 | if input.shape[0] < 1000: 115 | warn("Data-aware initialization is 
performed on less than 1000 data points. This may cause instability." 116 | "To avoid potential problems, run this model on a data batch with at least 1000 data samples." 117 | "You can do so manually before training. Use with torch.no_grad() for memory efficiency.") 118 | with torch.no_grad(): 119 | feature_selectors = self.choice_function(self.feature_selection_logits, dim=0) 120 | # ^--[in_features, num_trees, depth] 121 | 122 | feature_values = torch.einsum('bi,ind->bnd', input, feature_selectors) 123 | # ^--[batch_size, num_trees, depth] 124 | 125 | # initialize thresholds: sample random percentiles of data 126 | percentiles_q = 100 * np.random.beta(self.threshold_init_beta, self.threshold_init_beta, 127 | size=[self.num_trees, self.depth]) 128 | self.feature_thresholds.data[...] = torch.as_tensor( 129 | list(map(np.percentile, check_numpy(feature_values.flatten(1, 2).t()), percentiles_q.flatten())), 130 | dtype=feature_values.dtype, device=feature_values.device 131 | ).view(self.num_trees, self.depth) 132 | 133 | # init temperatures: make sure enough data points are in the linear region of sparse-sigmoid 134 | temperatures = np.percentile(check_numpy(abs(feature_values - self.feature_thresholds)), 135 | q=100 * min(1.0, self.threshold_init_cutoff), axis=0) 136 | 137 | # if threshold_init_cutoff > 1, scale everything down by it 138 | temperatures /= max(1.0, self.threshold_init_cutoff) 139 | self.log_temperatures.data[...] = torch.log(torch.as_tensor(temperatures) + eps) 140 | 141 | def __repr__(self): 142 | return "{}(in_features={}, num_trees={}, depth={}, tree_dim={}, flatten_output={})".format( 143 | self.__class__.__name__, self.feature_selection_logits.shape[0], 144 | self.num_trees, self.depth, self.tree_dim, self.flatten_output 145 | ) 146 | 147 | -------------------------------------------------------------------------------- /ResNet/lib/node/utils.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Qwicen/node 2 | import contextlib 3 | import gc 4 | import glob 5 | import hashlib 6 | import os 7 | import time 8 | 9 | import numpy as np 10 | import requests 11 | import torch 12 | from tqdm import tqdm 13 | 14 | 15 | def download(url, filename, delete_if_interrupted=True, chunk_size=4096): 16 | """ saves file from url to filename with a fancy progressbar """ 17 | try: 18 | with open(filename, "wb") as f: 19 | print("Downloading {} > {}".format(url, filename)) 20 | response = requests.get(url, stream=True) 21 | total_length = response.headers.get('content-length') 22 | 23 | if total_length is None: # no content length header 24 | f.write(response.content) 25 | else: 26 | total_length = int(total_length) 27 | with tqdm(total=total_length) as progressbar: 28 | for data in response.iter_content(chunk_size=chunk_size): 29 | if data: # filter-out keep-alive chunks 30 | f.write(data) 31 | progressbar.update(len(data)) 32 | except Exception as e: 33 | if delete_if_interrupted: 34 | print("Removing incomplete download {}.".format(filename)) 35 | os.remove(filename) 36 | raise e 37 | return filename 38 | 39 | 40 | def iterate_minibatches(*tensors, batch_size, shuffle=True, epochs=1, 41 | allow_incomplete=True, callback=lambda x:x): 42 | indices = np.arange(len(tensors[0])) 43 | upper_bound = int((np.ceil if allow_incomplete else np.floor) (len(indices) / batch_size)) * batch_size 44 | epoch = 0 45 | while True: 46 | if shuffle: 47 | np.random.shuffle(indices) 48 | for batch_start in callback(range(0, upper_bound, 
batch_size)): 49 | batch_ix = indices[batch_start: batch_start + batch_size] 50 | batch = [tensor[batch_ix] for tensor in tensors] 51 | yield batch if len(tensors) > 1 else batch[0] 52 | epoch += 1 53 | if epoch >= epochs: 54 | break 55 | 56 | 57 | def process_in_chunks(function, *args, batch_size, out=None, **kwargs): 58 | """ 59 | Computes output by applying batch-parallel function to large data tensor in chunks 60 | :param function: a function(*[x[indices, ...] for x in args]) -> out[indices, ...] 61 | :param args: one or many tensors, each [num_instances, ...] 62 | :param batch_size: maximum chunk size processed in one go 63 | :param out: memory buffer for out, defaults to torch.zeros of appropriate size and type 64 | :returns: function(data), computed in a memory-efficient way 65 | """ 66 | total_size = args[0].shape[0] 67 | first_output = function(*[x[0: batch_size] for x in args]) 68 | output_shape = (total_size,) + tuple(first_output.shape[1:]) 69 | if out is None: 70 | out = torch.zeros(*output_shape, dtype=first_output.dtype, device=first_output.device, 71 | layout=first_output.layout, **kwargs) 72 | 73 | out[0: batch_size] = first_output 74 | for i in range(batch_size, total_size, batch_size): 75 | batch_ix = slice(i, min(i + batch_size, total_size)) 76 | out[batch_ix] = function(*[x[batch_ix] for x in args]) 77 | return out 78 | 79 | 80 | def check_numpy(x): 81 | """ Makes sure x is a numpy array """ 82 | if isinstance(x, torch.Tensor): 83 | x = x.detach().cpu().numpy() 84 | x = np.asarray(x) 85 | assert isinstance(x, np.ndarray) 86 | return x 87 | 88 | 89 | @contextlib.contextmanager 90 | def nop_ctx(): 91 | yield None 92 | 93 | 94 | def get_latest_file(pattern): 95 | list_of_files = glob.glob(pattern) # * means all if need specific format then *.csv 96 | assert len(list_of_files) > 0, "No files found: " + pattern 97 | return max(list_of_files, key=os.path.getctime) 98 | 99 | 100 | def md5sum(fname): 101 | """ Computes mdp checksum of a file """ 102 | hash_md5 = hashlib.md5() 103 | with open(fname, "rb") as f: 104 | for chunk in iter(lambda: f.read(4096), b""): 105 | hash_md5.update(chunk) 106 | return hash_md5.hexdigest() 107 | 108 | 109 | def free_memory(sleep_time=0.1): 110 | """ Black magic function to free torch memory and some jupyter whims """ 111 | gc.collect() 112 | torch.cuda.synchronize() 113 | gc.collect() 114 | torch.cuda.empty_cache() 115 | time.sleep(sleep_time) 116 | 117 | def to_float_str(element): 118 | try: 119 | return str(float(element)) 120 | except ValueError: 121 | return element 122 | -------------------------------------------------------------------------------- /ResNet/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | 
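                # Note: Kaiming (He) normal init with mode='fan_in' is matched to the
                # ReLU activations applied in forward(); the bias init below follows
                # nn.Linear's default uniform bound of 1/sqrt(fan_in).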
torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 | ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /ResNet/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . 
import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = 
Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /ResNet/resnet_ft.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import math 3 | import typing as ty 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import zero 11 | from torch import Tensor 12 | 13 | import lib 14 | 15 | 16 | # %% 17 | class ResNet(nn.Module): 18 | def __init__( 19 | self, 20 | *, 21 | d_numerical: int, 22 | categories: ty.Optional[ty.List[int]], 23 | d_embedding: int, 24 | d: int, 25 | d_hidden_factor: float, 26 | n_layers: int, 27 | activation: str, 28 | normalization: str, 29 | hidden_dropout: float, 30 | residual_dropout: float, 31 | d_out: int, 32 | ) -> None: 33 | super().__init__() 34 | 35 | def make_normalization(): 36 | return {'batchnorm': nn.BatchNorm1d, 'layernorm': nn.LayerNorm}[ 37 
| normalization 38 | ](d) 39 | 40 | self.main_activation = lib.get_activation_fn(activation) 41 | self.last_activation = lib.get_nonglu_activation_fn(activation) 42 | self.residual_dropout = residual_dropout 43 | self.hidden_dropout = hidden_dropout 44 | 45 | d_in = d_numerical 46 | d_hidden = int(d * d_hidden_factor) 47 | 48 | if categories is not None: 49 | d_in += len(categories) * d_embedding 50 | category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0) 51 | self.register_buffer('category_offsets', category_offsets) 52 | self.categories = torch.tensor(np.subtract(categories, 1).tolist()) 53 | self.category_embeddings = nn.Embedding(sum(categories), d_embedding) 54 | self.unknown_value = np.iinfo('int64').max - 3 55 | nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) 56 | print(f'{self.category_embeddings.weight.shape=}') 57 | 58 | self.first_layer = nn.Linear(d_in, d) 59 | self.layers = nn.ModuleList( 60 | [ 61 | nn.ModuleDict( 62 | { 63 | 'norm': make_normalization(), 64 | 'linear0': nn.Linear( 65 | d, d_hidden * (2 if activation.endswith('glu') else 1) 66 | ), 67 | 'linear1': nn.Linear(d_hidden, d), 68 | } 69 | ) 70 | for _ in range(n_layers) 71 | ] 72 | ) 73 | self.last_normalization = make_normalization() 74 | self.head = nn.Linear(d, d_out) 75 | 76 | def forward(self, x_num: Tensor, x_cat: ty.Optional[Tensor]) -> Tensor: 77 | x = [] 78 | if x_num is not None: 79 | x.append(x_num) 80 | if x_cat is not None: 81 | x_cat = torch.where(x_cat == self.unknown_value, self.categories.to(x_cat.device), x_cat) 82 | x.append( 83 | self.category_embeddings(x_cat + self.category_offsets[None]).view( 84 | x_cat.size(0), -1 85 | ) 86 | ) 87 | x = torch.cat(x, dim=-1) 88 | 89 | x = self.first_layer(x) 90 | for layer in self.layers: 91 | layer = ty.cast(ty.Dict[str, nn.Module], layer) 92 | z = x 93 | if x.shape[0] > 1: 94 | z = layer['norm'](z) 95 | z = layer['linear0'](z) 96 | z = self.main_activation(z) 97 | if self.hidden_dropout: 98 | z = F.dropout(z, self.hidden_dropout, self.training) 99 | z = layer['linear1'](z) 100 | if self.residual_dropout: 101 | z = F.dropout(z, self.residual_dropout, self.training) 102 | x = x + z 103 | if x.shape[0] > 1: 104 | x = self.last_normalization(x) 105 | x = self.last_activation(x) 106 | x = self.head(x) 107 | x = x.squeeze(-1) 108 | return x 109 | -------------------------------------------------------------------------------- /ResNet/resnext.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import math 3 | import typing as ty 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import zero 11 | from torch import Tensor 12 | 13 | import lib 14 | 15 | 16 | # %% 17 | class ResNext(nn.Module): 18 | def __init__( 19 | self, 20 | *, 21 | d_numerical: int, 22 | categories: ty.Optional[ty.List[int]], 23 | d_embedding: int, 24 | d: int, 25 | d_hidden_factor: float, 26 | n_layers: int, 27 | activation: str, 28 | normalization: str, 29 | hidden_dropout: float, 30 | residual_dropout: float, 31 | d_out: int, 32 | cardinality: int, 33 | ) -> None: 34 | super().__init__() 35 | 36 | def make_normalization(): 37 | return {'batchnorm': nn.BatchNorm1d, 'layernorm': nn.LayerNorm}[ 38 | normalization 39 | ](d) 40 | 41 | self.main_activation = lib.get_activation_fn(activation) 42 | self.last_activation = lib.get_nonglu_activation_fn(activation) 43 | self.residual_dropout = residual_dropout 44 | self.hidden_dropout = 
hidden_dropout 45 | self.cardinality = cardinality 46 | 47 | d_in = d_numerical 48 | d_hidden = int(d * d_hidden_factor) 49 | d_hidden_per_path = int(d_hidden / self.cardinality) 50 | 51 | if categories is not None: 52 | d_in += len(categories) * d_embedding 53 | category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0) 54 | self.register_buffer('category_offsets', category_offsets) 55 | self.category_embeddings = nn.Embedding(sum(categories), d_embedding) 56 | self.categories = torch.tensor(np.subtract(categories, 1).tolist()) 57 | self.unknown_value = np.iinfo('int64').max - 3 58 | nn.init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5)) 59 | print(f'{self.category_embeddings.weight.shape=}') 60 | 61 | self.first_layer = nn.Linear(d_in, d) 62 | self.layers = nn.ModuleList( 63 | [ 64 | nn.ModuleDict( 65 | { 66 | 'norm': make_normalization(), 67 | 'linear0': nn.ModuleList([nn.Linear(d, d_hidden_per_path) for _ in range(cardinality)]), 68 | 'linear1': nn.ModuleList([nn.Linear(d_hidden_per_path, d) for _ in range(cardinality)]), 69 | } 70 | ) 71 | for _ in range(n_layers) 72 | ] 73 | ) 74 | self.last_normalization = make_normalization() 75 | self.head = nn.Linear(d, d_out) 76 | 77 | def forward(self, x_num: Tensor, x_cat: ty.Optional[Tensor]) -> Tensor: 78 | x = [] 79 | if x_num is not None: 80 | x.append(x_num) 81 | if x_cat is not None: 82 | x_cat = torch.where(x_cat == self.unknown_value, self.categories.to(x_cat.device), x_cat) 83 | x.append( 84 | self.category_embeddings(x_cat + self.category_offsets[None]).view( 85 | x_cat.size(0), -1 86 | ) 87 | ) 88 | x = torch.cat(x, dim=-1) 89 | 90 | x = self.first_layer(x) 91 | for layer in self.layers: 92 | layer = ty.cast(ty.Dict[str, nn.Module], layer) 93 | z = x 94 | z = layer['norm'](z) if z.shape[0] > 1 else z 95 | path_outputs = [] 96 | for i in range(self.cardinality): 97 | path_output = layer['linear0'][i](z) 98 | path_output = self.main_activation(path_output) 99 | if self.hidden_dropout: 100 | path_output = F.dropout(path_output, p=self.hidden_dropout, training=self.training) 101 | path_output = layer['linear1'][i](path_output) 102 | if self.residual_dropout: 103 | path_output = F.dropout(path_output, self.residual_dropout, self.training) 104 | path_outputs.append(path_output) 105 | z = sum(path_outputs) 106 | x = x + z 107 | x = self.last_normalization(x) if x.shape[0] > 1 else x 108 | x = self.last_activation(x) 109 | x = self.head(x) 110 | x = x.squeeze(-1) 111 | return x 112 | -------------------------------------------------------------------------------- /TabNet/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pytorch_tabnet.utils import define_device 3 | import numpy as np 4 | 5 | 6 | class RegressionSMOTE(): 7 | """ 8 | Apply SMOTE 9 | 10 | This will average a percentage p of the elements in the batch with other elements. 11 | The target will be averaged as well (this might work with binary classification 12 | and certain loss), following a beta distribution. 13 | """ 14 | def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): 15 | "" 16 | self.seed = seed 17 | self._set_seed() 18 | self.device = define_device(device_name) 19 | self.alpha = alpha 20 | self.beta = beta 21 | self.p = p 22 | if (p < 0.) or (p > 1.0): 23 | raise ValueError("Value of p should be between 0. 
and 1.") 24 | 25 | def _set_seed(self): 26 | torch.manual_seed(self.seed) 27 | np.random.seed(self.seed) 28 | return 29 | 30 | def __call__(self, X, y): 31 | batch_size = X.shape[0] 32 | random_values = torch.rand(batch_size, device=self.device) 33 | idx_to_change = random_values < self.p 34 | 35 | # ensure that first element to switch has probability > 0.5 36 | np_betas = np.random.beta(self.alpha, self.beta, batch_size) / 2 + 0.5 37 | random_betas = torch.from_numpy(np_betas).to(self.device).float() 38 | index_permute = torch.randperm(batch_size, device=self.device) 39 | 40 | X[idx_to_change] = random_betas[idx_to_change, None] * X[idx_to_change] 41 | X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view(X[idx_to_change].size()) # noqa 42 | 43 | y[idx_to_change] = random_betas[idx_to_change, None] * y[idx_to_change] 44 | y[idx_to_change] += (1 - random_betas[idx_to_change, None]) * y[index_permute][idx_to_change].view(y[idx_to_change].size()) # noqa 45 | 46 | return X, y 47 | 48 | 49 | class ClassificationSMOTE(): 50 | """ 51 | Apply SMOTE for classification tasks. 52 | 53 | This will average a percentage p of the elements in the batch with other elements. 54 | The target will stay unchanged and keep the value of the most important row in the mix. 55 | """ 56 | def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): 57 | "" 58 | self.seed = seed 59 | self._set_seed() 60 | self.device = define_device(device_name) 61 | self.alpha = alpha 62 | self.beta = beta 63 | self.p = p 64 | if (p < 0.) or (p > 1.0): 65 | raise ValueError("Value of p should be between 0. and 1.") 66 | 67 | def _set_seed(self): 68 | torch.manual_seed(self.seed) 69 | np.random.seed(self.seed) 70 | return 71 | 72 | def __call__(self, X, y): 73 | batch_size = X.shape[0] 74 | random_values = torch.rand(batch_size, device=self.device) 75 | idx_to_change = random_values < self.p 76 | 77 | # ensure that first element to switch has probability > 0.5 78 | np_betas = np.random.beta(self.alpha, self.beta, batch_size) / 2 + 0.5 79 | random_betas = torch.from_numpy(np_betas).to(self.device).float() 80 | index_permute = torch.randperm(batch_size, device=self.device) 81 | 82 | X[idx_to_change] = random_betas[idx_to_change, None] * X[idx_to_change] 83 | X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view(X[idx_to_change].size()) # noqa 84 | 85 | return X, y 86 | -------------------------------------------------------------------------------- /TabNet/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from icecream import install 2 | 3 | install() 4 | 5 | from . import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .metrics import * # noqa 9 | from .util import * # noqa 10 | -------------------------------------------------------------------------------- /TabNet/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | -------------------------------------------------------------------------------- /TabNet/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . 
import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | if y_info: 21 | if y_info['policy'] == 'mean_std': 22 | rmse *= y_info['std'] 23 | else: 24 | assert False 25 | return {'rmse': rmse, 'score': -rmse} 26 | else: 27 | assert task_type in (util.BINCLASS, util.MULTICLASS) 28 | labels = None 29 | if classification_mode == 'probs': 30 | probs = prediction 31 | elif classification_mode == 'logits': 32 | probs = ( 33 | scipy.special.expit(prediction) 34 | if task_type == util.BINCLASS 35 | else scipy.special.softmax(prediction, axis=1) 36 | ) 37 | else: 38 | assert classification_mode == 'labels' 39 | probs = None 40 | labels = prediction 41 | if labels is None: 42 | labels = ( 43 | np.round(probs).astype('int64') 44 | if task_type == util.BINCLASS 45 | else probs.argmax(axis=1) # type: ignore[code] 46 | ) 47 | 48 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 49 | if task_type == util.BINCLASS: 50 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 51 | else: 52 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 53 | result['score'] = result['roc_auc'] # type: ignore[code] 54 | return result # type: ignore[code] 55 | 56 | 57 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 58 | precision = 3 59 | summary = {} 60 | for k, v in metrics[1].items(): 61 | if k.isdigit(): 62 | continue 63 | k = { 64 | 'score': 'SCORE', 65 | 'accuracy': 'acc', 66 | 'roc_auc': 'roc_auc', 67 | 'macro avg': 'm', 68 | 'weighted avg': 'w', 69 | }.get(k, k) 70 | if isinstance(v, float): 71 | v = round(v, precision) 72 | summary[k] = v 73 | else: 74 | v = { 75 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 76 | x, x 77 | ): round(v[x], precision) 78 | for x in v 79 | } 80 | for item in v.items(): 81 | summary[k + item[0]] = item[1] 82 | 83 | s = [f'score = {summary.pop("SCORE"):.3f}'] 84 | for k, v in summary.items(): 85 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 86 | s.append(f'{k} = {v}') 87 | return ' | '.join(s) 88 | -------------------------------------------------------------------------------- /TabNet/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 
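                    # Note: U(-1/sqrt(fan_in), 1/sqrt(fan_in)) mirrors nn.Linear's
                    # default bias initialization for this randomly initialized
                    # data-generating MLP.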
37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 | ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /TabNet/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . 
import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = 
Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /TabNet/multitask.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.special import softmax 4 | from pytorch_tabnet.utils import SparsePredictDataset, PredictDataset, filter_weights 5 | from pytorch_tabnet.abstract_model import TabModel 6 | from pytorch_tabnet.multiclass_utils import infer_multitask_output, check_output_dim 7 | from torch.utils.data import DataLoader 8 | import scipy 9 | 10 | 11 | class TabNetMultiTaskClassifier(TabModel): 12 | def __post_init__(self): 13 | super(TabNetMultiTaskClassifier, self).__post_init__() 14 | self._task = 'classification' 15 | self._default_loss = torch.nn.functional.cross_entropy 16 | self._default_metric = 'logloss' 17 | 18 | def prepare_target(self, y): 19 | y_mapped = y.copy() 20 | for task_idx in range(y.shape[1]): 21 | 
task_mapper = self.target_mapper[task_idx] 22 | y_mapped[:, task_idx] = np.vectorize(task_mapper.get)(y[:, task_idx]) 23 | return y_mapped 24 | 25 | def compute_loss(self, y_pred, y_true): 26 | """ 27 | Computes the loss according to network output and targets 28 | 29 | Parameters 30 | ---------- 31 | y_pred : list of tensors 32 | Output of network 33 | y_true : LongTensor 34 | Targets label encoded 35 | 36 | Returns 37 | ------- 38 | loss : torch.Tensor 39 | output of loss function(s) 40 | 41 | """ 42 | loss = 0 43 | y_true = y_true.long() 44 | if isinstance(self.loss_fn, list): 45 | # if you specify a different loss for each task 46 | for task_loss, task_output, task_id in zip( 47 | self.loss_fn, y_pred, range(len(self.loss_fn)) 48 | ): 49 | loss += task_loss(task_output, y_true[:, task_id]) 50 | else: 51 | # same loss function is applied to all tasks 52 | for task_id, task_output in enumerate(y_pred): 53 | loss += self.loss_fn(task_output, y_true[:, task_id]) 54 | 55 | loss /= len(y_pred) 56 | return loss 57 | 58 | def stack_batches(self, list_y_true, list_y_score): 59 | y_true = np.vstack(list_y_true) 60 | y_score = [] 61 | for i in range(len(self.output_dim)): 62 | score = np.vstack([x[i] for x in list_y_score]) 63 | score = softmax(score, axis=1) 64 | y_score.append(score) 65 | return y_true, y_score 66 | 67 | def update_fit_params(self, X_train, y_train, eval_set, weights): 68 | output_dim, train_labels = infer_multitask_output(y_train) 69 | for _, y in eval_set: 70 | for task_idx in range(y.shape[1]): 71 | check_output_dim(train_labels[task_idx], y[:, task_idx]) 72 | self.output_dim = output_dim 73 | self.classes_ = train_labels 74 | self.target_mapper = [ 75 | {class_label: index for index, class_label in enumerate(classes)} 76 | for classes in self.classes_ 77 | ] 78 | self.preds_mapper = [ 79 | {str(index): str(class_label) for index, class_label in enumerate(classes)} 80 | for classes in self.classes_ 81 | ] 82 | self.updated_weights = weights 83 | filter_weights(self.updated_weights) 84 | 85 | def predict(self, X): 86 | """ 87 | Make predictions on a batch (valid) 88 | 89 | Parameters 90 | ---------- 91 | X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` 92 | Input data 93 | 94 | Returns 95 | ------- 96 | results : np.array 97 | Predictions of the most probable class 98 | """ 99 | self.network.eval() 100 | 101 | if scipy.sparse.issparse(X): 102 | dataloader = DataLoader( 103 | SparsePredictDataset(X), 104 | batch_size=self.batch_size, 105 | shuffle=False, 106 | ) 107 | else: 108 | dataloader = DataLoader( 109 | PredictDataset(X), 110 | batch_size=self.batch_size, 111 | shuffle=False, 112 | ) 113 | 114 | results = {} 115 | for data in dataloader: 116 | data = data.to(self.device).float() 117 | output, _ = self.network(data) 118 | predictions = [ 119 | torch.argmax(torch.nn.Softmax(dim=1)(task_output), dim=1) 120 | .cpu() 121 | .detach() 122 | .numpy() 123 | .reshape(-1) 124 | for task_output in output 125 | ] 126 | 127 | for task_idx in range(len(self.output_dim)): 128 | results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] 129 | # stack all task individually 130 | results = [np.hstack(task_res) for task_res in results.values()] 131 | # map all task individually 132 | results = [ 133 | np.vectorize(self.preds_mapper[task_idx].get)(task_res.astype(str)) 134 | for task_idx, task_res in enumerate(results) 135 | ] 136 | return results 137 | 138 | def predict_proba(self, X): 139 | """ 140 | Make predictions for classification on a batch (valid) 141 
| 142 | Parameters 143 | ---------- 144 | X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` 145 | Input data 146 | 147 | Returns 148 | ------- 149 | res : list of np.ndarray 150 | 151 | """ 152 | self.network.eval() 153 | 154 | if scipy.sparse.issparse(X): 155 | dataloader = DataLoader( 156 | SparsePredictDataset(X), 157 | batch_size=self.batch_size, 158 | shuffle=False, 159 | ) 160 | else: 161 | dataloader = DataLoader( 162 | PredictDataset(X), 163 | batch_size=self.batch_size, 164 | shuffle=False, 165 | ) 166 | 167 | results = {} 168 | for data in dataloader: 169 | data = data.to(self.device).float() 170 | output, _ = self.network(data) 171 | predictions = [ 172 | torch.nn.Softmax(dim=1)(task_output).cpu().detach().numpy() 173 | for task_output in output 174 | ] 175 | for task_idx in range(len(self.output_dim)): 176 | results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] 177 | res = [np.vstack(task_res) for task_res in results.values()] 178 | return res 179 | -------------------------------------------------------------------------------- /TabNet/pretraining_utils.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from pytorch_tabnet.utils import ( 3 | create_sampler, 4 | SparsePredictDataset, 5 | PredictDataset, 6 | check_input 7 | ) 8 | import scipy 9 | 10 | 11 | def create_dataloaders( 12 | X_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory 13 | ): 14 | """ 15 | Create dataloaders with or without subsampling depending on weights and balanced. 16 | 17 | Parameters 18 | ---------- 19 | X_train : np.ndarray or scipy.sparse.csr_matrix 20 | Training data 21 | eval_set : list of np.array (for Xs and ys) or scipy.sparse.csr_matrix (for Xs) 22 | List of eval sets 23 | weights : either 0, 1, dict or iterable 24 | if 0 (default) : no weights will be applied 25 | if 1 : classification only, will balanced class with inverse frequency 26 | if dict : keys are corresponding class values are sample weights 27 | if iterable : list or np array must be of length equal to nb elements 28 | in the training set 29 | batch_size : int 30 | how many samples per batch to load 31 | num_workers : int 32 | how many subprocesses to use for data loading. 0 means that the data 33 | will be loaded in the main process 34 | drop_last : bool 35 | set to True to drop the last incomplete batch, if the dataset size is not 36 | divisible by the batch size. 
If False and the size of dataset is not 37 | divisible by the batch size, then the last batch will be smaller 38 | pin_memory : bool 39 | Whether to pin GPU memory during training 40 | 41 | Returns 42 | ------- 43 | train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader 44 | Training and validation dataloaders 45 | """ 46 | need_shuffle, sampler = create_sampler(weights, X_train) 47 | 48 | if scipy.sparse.issparse(X_train): 49 | train_dataloader = DataLoader( 50 | SparsePredictDataset(X_train), 51 | batch_size=batch_size, 52 | sampler=sampler, 53 | shuffle=need_shuffle, 54 | num_workers=num_workers, 55 | drop_last=drop_last, 56 | pin_memory=pin_memory, 57 | ) 58 | else: 59 | train_dataloader = DataLoader( 60 | PredictDataset(X_train), 61 | batch_size=batch_size, 62 | sampler=sampler, 63 | shuffle=need_shuffle, 64 | num_workers=num_workers, 65 | drop_last=drop_last, 66 | pin_memory=pin_memory, 67 | ) 68 | 69 | valid_dataloaders = [] 70 | for X in eval_set: 71 | if scipy.sparse.issparse(X): 72 | valid_dataloaders.append( 73 | DataLoader( 74 | SparsePredictDataset(X), 75 | batch_size=batch_size, 76 | sampler=sampler, 77 | shuffle=need_shuffle, 78 | num_workers=num_workers, 79 | drop_last=drop_last, 80 | pin_memory=pin_memory, 81 | ) 82 | ) 83 | else: 84 | valid_dataloaders.append( 85 | DataLoader( 86 | PredictDataset(X), 87 | batch_size=batch_size, 88 | sampler=sampler, 89 | shuffle=need_shuffle, 90 | num_workers=num_workers, 91 | drop_last=drop_last, 92 | pin_memory=pin_memory, 93 | ) 94 | ) 95 | 96 | return train_dataloader, valid_dataloaders 97 | 98 | 99 | def validate_eval_set(eval_set, eval_name, X_train): 100 | """Check if the shapes of eval_set are compatible with X_train. 101 | 102 | Parameters 103 | ---------- 104 | eval_set : List of numpy array 105 | The list evaluation set. 106 | The last one is used for early stopping 107 | X_train : np.ndarray 108 | Train owned products 109 | 110 | Returns 111 | ------- 112 | eval_names : list of str 113 | Validated list of eval_names. 114 | 115 | """ 116 | eval_names = eval_name or [f"val_{i}" for i in range(len(eval_set))] 117 | assert len(eval_set) == len( 118 | eval_names 119 | ), "eval_set and eval_name have not the same length" 120 | 121 | for set_nb, X in enumerate(eval_set): 122 | check_input(X) 123 | msg = ( 124 | f"Number of columns is different between eval set {set_nb}" 125 | + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" 126 | ) 127 | assert X.shape[1] == X_train.shape[1], msg 128 | return eval_names 129 | -------------------------------------------------------------------------------- /TabNet/tab_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.special import softmax 4 | from tabnet_utils import SparsePredictDataset, PredictDataset, filter_weights 5 | from abstract_model import TabModel 6 | from multiclass_utils import infer_output_dim, check_output_dim 7 | from torch.utils.data import DataLoader 8 | import scipy 9 | 10 | 11 | class TabNetClassifier(TabModel): 12 | def __post_init__(self): 13 | super(TabNetClassifier, self).__post_init__() 14 | self._task = 'classification' 15 | self._default_loss = torch.nn.functional.cross_entropy 16 | self._default_metric = 'accuracy' 17 | 18 | def weight_updater(self, weights): 19 | """ 20 | Updates weights dictionary according to target_mapper. 21 | 22 | Parameters 23 | ---------- 24 | weights : bool or dict 25 | Given weights for balancing training. 
26 | 27 | Returns 28 | ------- 29 | bool or dict 30 | Same bool if weights are bool, updated dict otherwise. 31 | 32 | """ 33 | if isinstance(weights, int): 34 | return weights 35 | elif isinstance(weights, dict): 36 | return {self.target_mapper[key]: value for key, value in weights.items()} 37 | else: 38 | return weights 39 | 40 | def prepare_target(self, y): 41 | return np.vectorize(self.target_mapper.get)(y) 42 | 43 | def compute_loss(self, y_pred, y_true): 44 | return self.loss_fn(y_pred, y_true.long()) 45 | 46 | def update_fit_params( 47 | self, 48 | X_train, 49 | y_train, 50 | eval_set, 51 | weights, 52 | ): 53 | output_dim, train_labels = infer_output_dim(y_train) 54 | for X, y in eval_set: 55 | check_output_dim(train_labels, y) 56 | self.output_dim = output_dim 57 | self._default_metric = ('auc' if self.output_dim == 2 else 'accuracy') 58 | self.classes_ = train_labels 59 | self.target_mapper = { 60 | class_label: index for index, class_label in enumerate(self.classes_) 61 | } 62 | self.preds_mapper = { 63 | str(index): class_label for index, class_label in enumerate(self.classes_) 64 | } 65 | self.updated_weights = self.weight_updater(weights) 66 | 67 | def stack_batches(self, list_y_true, list_y_score): 68 | y_true = np.hstack(list_y_true) 69 | y_score = np.vstack(list_y_score) 70 | y_score = softmax(y_score, axis=1) 71 | return y_true, y_score 72 | 73 | def predict_func(self, outputs): 74 | outputs = np.argmax(outputs, axis=1) 75 | return np.vectorize(self.preds_mapper.get)(outputs.astype(str)) 76 | 77 | def predict_proba(self, X): 78 | """ 79 | Make predictions for classification on a batch (valid) 80 | 81 | Parameters 82 | ---------- 83 | X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` 84 | Input data 85 | 86 | Returns 87 | ------- 88 | res : np.ndarray 89 | 90 | """ 91 | self.network.eval() 92 | 93 | if scipy.sparse.issparse(X): 94 | dataloader = DataLoader( 95 | SparsePredictDataset(X), 96 | batch_size=self.batch_size, 97 | shuffle=False, 98 | ) 99 | else: 100 | dataloader = DataLoader( 101 | PredictDataset(X), 102 | batch_size=self.batch_size, 103 | shuffle=False, 104 | ) 105 | 106 | results = [] 107 | for batch_nb, data in enumerate(dataloader): 108 | data = data.to(self.device).float() 109 | 110 | output, M_loss = self.network(data) 111 | predictions = torch.nn.Softmax(dim=1)(output).cpu().detach().numpy() 112 | results.append(predictions) 113 | res = np.vstack(results) 114 | return res 115 | 116 | 117 | class TabNetRegressor(TabModel): 118 | def __post_init__(self): 119 | super(TabNetRegressor, self).__post_init__() 120 | self._task = 'regression' 121 | self._default_loss = torch.nn.functional.mse_loss 122 | self._default_metric = 'mse' 123 | 124 | def prepare_target(self, y): 125 | return y 126 | 127 | def compute_loss(self, y_pred, y_true): 128 | return self.loss_fn(y_pred, y_true) 129 | 130 | def update_fit_params( 131 | self, 132 | X_train, 133 | y_train, 134 | eval_set, 135 | weights 136 | ): 137 | if len(y_train.shape) != 2: 138 | msg = "Targets should be 2D : (n_samples, n_regression) " + \ 139 | f"but y_train.shape={y_train.shape} given.\n" + \ 140 | "Use reshape(-1, 1) for single regression." 
141 | raise ValueError(msg) 142 | self.output_dim = y_train.shape[1] 143 | self.preds_mapper = None 144 | 145 | self.updated_weights = weights 146 | filter_weights(self.updated_weights) 147 | 148 | def predict_func(self, outputs): 149 | return outputs 150 | 151 | def stack_batches(self, list_y_true, list_y_score): 152 | y_true = np.vstack(list_y_true) 153 | y_score = np.vstack(list_y_score) 154 | return y_true, y_score 155 | -------------------------------------------------------------------------------- /TabNet/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def set_random_seed(seed): 7 | """ 8 | Set the seed for random number generation in Python, NumPy, and PyTorch. 9 | 10 | Args: 11 | seed (int): The seed value to use for all random number generators. 12 | """ 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | 17 | if torch.cuda.is_available(): 18 | torch.cuda.manual_seed(seed) 19 | torch.cuda.manual_seed_all(seed) 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /TabPFN/run_tabpfn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openml 6 | from sklearn.metrics import accuracy_score, roc_auc_score 7 | from sklearn.model_selection import StratifiedKFold 8 | from sklearn.preprocessing import LabelEncoder 9 | 10 | from tabpfn import TabPFNClassifier 11 | from utils import set_random_seed 12 | import torch 13 | import numpy as np 14 | import pandas as pd 15 | 16 | 17 | def main(args: argparse.Namespace): 18 | 19 | seed = args.seed 20 | set_random_seed(seed) 21 | outer_fold = args.outer_fold 22 | dataset_id = args.dataset_id 23 | dataset = openml.datasets.get_dataset(dataset_id, download_data=False) 24 | 25 | X, y, categorical_indicator, attribute_names = dataset.get_data( 26 | dataset_format='dataframe', 27 | target=dataset.default_target_attribute, 28 | ) 29 | 30 | categorical_column_names = X.columns[categorical_indicator] 31 | X = pd.get_dummies(X, columns=categorical_column_names) 32 | 33 | label_encoder = LabelEncoder() 34 | label_encoder.fit(y) 35 | 36 | skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 37 | splits = list(skf.split(X, y)) 38 | train_idx, test_idx = splits[outer_fold] 39 | nr_classes = len(np.unique(y)) 40 | X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] 41 | y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] 42 | 43 | y_train = label_encoder.transform(y_train) 44 | y_test = label_encoder.transform(y_test) 45 | 46 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 47 | classifier = TabPFNClassifier(device=device, seed=seed, N_ensemble_configurations=32) 48 | 49 | classifier.fit(X_train, y_train) 50 | p_eval = classifier.predict_proba(X_test) 51 | y_eval = classifier.predict(X_test) 52 | if nr_classes == 2: 53 | p_eval = p_eval[:, 1] 54 | 55 | auroc_test_value = roc_auc_score(y_test, p_eval, multi_class='ovo') 56 | 57 | acc_test_value = accuracy_score(y_test, y_eval) 58 | 59 | result_path = os.path.join( 60 | args.output_dir, 61 | 'tabpfn', 62 | f'{dataset_id}', 63 | f'{outer_fold}', 64 | ) 65 | 66 | os.makedirs(result_path, exist_ok=True) 67 | result_dict = { 68 | 'test_auroc': auroc_test_value, 69 | 'test_acc': acc_test_value, 70 | } 71 | 72 | 
with open(os.path.join(result_path, 'result.json'), 'w') as f: 73 | json.dump(result_dict, f) 74 | 75 | 76 | if __name__ == "__main__": 77 | 78 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 79 | 80 | parser.add_argument( 81 | '--seed', 82 | type=int, 83 | default=0, 84 | help='Random seed', 85 | ) 86 | parser.add_argument( 87 | '--outer_fold', 88 | type=int, 89 | default=2, 90 | help='Outer fold iteration.', 91 | ) 92 | parser.add_argument( 93 | '--dataset_id', 94 | type=int, 95 | default=31, 96 | help='Dataset id', 97 | ) 98 | parser.add_argument( 99 | '--output_dir', 100 | type=str, 101 | default='.', 102 | help='Directory to save the results', 103 | ) 104 | 105 | args = parser.parse_args() 106 | 107 | main(args) -------------------------------------------------------------------------------- /TabPFN/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def set_random_seed(seed): 7 | """ 8 | Set the seed for random number generation in Python, NumPy, and PyTorch. 9 | 10 | Args: 11 | seed (int): The seed value to use for all random number generators. 12 | """ 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | 17 | if torch.cuda.is_available(): 18 | torch.cuda.manual_seed(seed) 19 | torch.cuda.manual_seed_all(seed) 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /XGBoost/evaluate_30_trials.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import time 4 | 5 | import numpy as np 6 | import optuna 7 | import scipy 8 | import zero 9 | import torch.nn as nn 10 | import torch 11 | import torch.nn.functional as F 12 | from sklearn.metrics import roc_auc_score, accuracy_score 13 | from xgboost import XGBClassifier 14 | 15 | import lib 16 | import wandb 17 | 18 | from sklearn.model_selection import StratifiedKFold 19 | from utils import set_random_seed 20 | 21 | # Create the parser 22 | parser = argparse.ArgumentParser(description="Train a model with specified parameters.") 23 | 24 | # Add the arguments 25 | parser.add_argument('--experiment_name', type=str, default='test', 26 | help='The name of the experiment. Default is "test".') 27 | parser.add_argument('--dataset', type=int, default=54, 28 | help='The dataset ID to use. Default is 45068 (adult).') 29 | parser.add_argument('--seed', type=int, default=0, 30 | help='The random seed for reproducibility. Default is 42.') 31 | parser.add_argument('--normalization', type=str, default='quantile', choices=['quantile', 'standard'], 32 | help='The normalization to use for the numerical features. Default is "quantile".') 33 | parser.add_argument('--cat_nan_policy', type=str, default='new', choices=['new', 'most_frequent'], 34 | help='The policy to use for handling nan values in categorical features. Default is "new".') 35 | parser.add_argument('--cat_policy', type=str, default='indices', choices=['indices', 'ohe'], 36 | help='The policy to use for handling categorical features. Default is "indices".') 37 | parser.add_argument('--outer_fold', type=int, default=0, help='The outer fold to use. Default is 0') 38 | parser.add_argument('--n_trials', type=int, default=100, 39 | help='The number of trials to use for HPO. 
Default is 100') 40 | parser.add_argument('--tune', action='store_true', help='Whether to tune the hyperparameters using Optuna') 41 | 42 | args = parser.parse_args() 43 | 44 | 45 | def load_best_config(project_name, dataset_name, outer_fold, num_trials=30): 46 | api = wandb.Api() 47 | target_run_name = f"{dataset_name}_outerFold_{outer_fold}" 48 | runs = api.runs(project_name) 49 | 50 | target_run = None 51 | for run in runs: 52 | if run.name == target_run_name: 53 | target_run = run 54 | break 55 | 56 | if not target_run: 57 | raise ValueError(f"No run found with name: {target_run_name}") 58 | 59 | # First scan for the best average_test_rocauc 60 | best_rocauc = 0 # Looking for the highest rocauc 61 | best_step = None 62 | history = target_run.scan_history(keys=['average_test_rocauc']) 63 | for i, row in enumerate(history): 64 | if i >= num_trials: 65 | break 66 | if 'average_test_rocauc' in row and row['average_test_rocauc'] > best_rocauc: 67 | best_rocauc = row['average_test_rocauc'] 68 | best_step = i 69 | 70 | if best_step is None: 71 | raise ValueError("Best rocauc not found within the first 30 trials") 72 | 73 | # Second scan for the HPs at the best step 74 | hp_keys = ['max_depth', 'min_child_weight', 'subsample', 'learning_rate', 'colsample_bylevel', 'colsample_bytree', 75 | 'gamma', 'reg_lambda', 'reg_alpha'] 76 | best_config = None 77 | history = target_run.scan_history(keys=hp_keys) 78 | for i, row in enumerate(history): 79 | if i == best_step: 80 | best_config = {key: row[key] for key in hp_keys if key in row} 81 | break 82 | 83 | if best_config: 84 | return best_config 85 | else: 86 | raise ValueError("HPs not found for the best rocauc step") 87 | 88 | 89 | def run_single_outer_fold(outer_fold, D, outer_folds): 90 | outer_train_idx, outer_test_idx = outer_folds[outer_fold] 91 | 92 | best_params = load_best_config('t4tab/XGBoost_optuna', D.info['dataset_name'], args.outer_fold) 93 | 94 | hyperparameters = { 95 | 'max_depth': best_params['max_depth'], 96 | 'min_child_weight': best_params['min_child_weight'], 97 | 'subsample': best_params['subsample'], 98 | 'learning_rate': best_params['learning_rate'], 99 | 'colsample_bylevel': best_params['colsample_bylevel'], 100 | 'colsample_bytree': best_params['colsample_bytree'], 101 | 'gamma': best_params['gamma'], 102 | 'reg_lambda': best_params['reg_lambda'], 103 | 'reg_alpha': best_params['reg_alpha'] 104 | } 105 | X_outer_preprocessed = D.build_X( 106 | normalization='quantile', 107 | num_nan_policy='mean', 108 | cat_nan_policy='new', 109 | cat_policy='ohe', 110 | seed=args.seed, 111 | train_idx=outer_train_idx, 112 | test_idx=outer_test_idx, 113 | ) 114 | set_random_seed(args.seed) 115 | Y, y_info = D.build_y(train_idx=outer_train_idx, test_idx=outer_test_idx) 116 | 117 | booster = "gbtree" 118 | early_stopping_rounds = 50 119 | n_estimators = 2000 120 | eval_metric = 'auc' 121 | model = XGBClassifier(booster=booster, 122 | n_estimators=n_estimators, 123 | tree_method='gpu_hist', 124 | disable_default_eval_metric=True, 125 | use_label_encoder=False) 126 | if args.tune: 127 | model.set_params(**hyperparameters) 128 | unique_classes, class_counts = np.unique(Y[outer_train_idx], axis=0, return_counts=True) 129 | nr_classes = len(unique_classes) 130 | 131 | model.fit(X_outer_preprocessed[outer_train_idx], Y[outer_train_idx], 132 | eval_set=[(X_outer_preprocessed[outer_test_idx], Y[outer_test_idx])], 133 | eval_metric=custom_auc_eval if D.is_multiclass else eval_metric, 134 | early_stopping_rounds=early_stopping_rounds, 135 | 
verbose=False) 136 | 137 | train_predictions_labels = model.predict(X_outer_preprocessed[outer_train_idx]) 138 | test_predictions_labels = model.predict(X_outer_preprocessed[outer_test_idx]) 139 | if D.is_multiclass: 140 | train_predictions_probabilities = model.predict_proba(X_outer_preprocessed[outer_train_idx]) 141 | test_predictions_probabilities = model.predict_proba(X_outer_preprocessed[outer_test_idx]) 142 | else: 143 | train_predictions_probabilities = model.predict_proba(X_outer_preprocessed[outer_train_idx])[:, 1] 144 | test_predictions_probabilities = model.predict_proba(X_outer_preprocessed[outer_test_idx])[:, 1] 145 | 146 | # calculate the balanced accuracy 147 | train_rocauc = roc_auc_score(Y[outer_train_idx], train_predictions_probabilities, 148 | multi_class='raise' if nr_classes == 2 else 'ovo') 149 | train_accuracy = accuracy_score(Y[outer_train_idx], train_predictions_labels) 150 | test_rocauc = roc_auc_score(Y[outer_test_idx], test_predictions_probabilities, 151 | multi_class='raise' if nr_classes == 2 else 'ovo') 152 | test_accuracy = accuracy_score(Y[outer_test_idx], test_predictions_labels) 153 | print(f"Finished outer fold {outer_fold}") 154 | 155 | output_info = { 156 | 'train_rocauc': train_rocauc, 157 | 'train_accuracy': train_accuracy, 158 | 'test_accuracy': test_accuracy, 159 | f'best_test_rocauc_outer_fold_{outer_fold}': test_rocauc, 160 | } 161 | wandb.log(output_info) 162 | wandb.finish() 163 | 164 | 165 | def custom_auc_eval(y_pred, dtrain): 166 | y_true = dtrain.get_label() 167 | 168 | y_pred = scipy.special.softmax(y_pred, axis=1) 169 | y_pred_sums = np.sum(y_pred, axis=1) 170 | if not np.allclose(y_pred_sums, 1.0): 171 | print("Probabilities do not sum to 1.0 for some instances.") 172 | y_pred = y_pred / y_pred_sums[:, np.newaxis] 173 | auc = roc_auc_score(y_true, y_pred, multi_class='ovo') 174 | 175 | return 'auc', auc 176 | 177 | 178 | if __name__ == "__main__": 179 | # %% 180 | set_random_seed(args.seed) 181 | D = lib.Dataset.from_openml(args.dataset) 182 | run_name = f"{D.info['dataset_name']}_outerFold_{args.outer_fold}" 183 | wandb.init(project=args.experiment_name, 184 | name=run_name, 185 | config=args) 186 | outer_kfold = StratifiedKFold(n_splits=10, shuffle=True) 187 | outer_folds = list(outer_kfold.split(D.X, D.y)) 188 | run_single_outer_fold(args.outer_fold, D, outer_folds) -------------------------------------------------------------------------------- /XGBoost/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from icecream import install 2 | 3 | install() 4 | 5 | from . import env # noqa 6 | from .data import * # noqa 7 | from .deep import * # noqa 8 | from .metrics import * # noqa 9 | from .util import * # noqa 10 | -------------------------------------------------------------------------------- /XGBoost/lib/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | -------------------------------------------------------------------------------- /XGBoost/lib/metrics.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | import numpy as np 4 | import scipy.special 5 | import sklearn.metrics as skm 6 | 7 | from . 
import util 8 | 9 | 10 | def calculate_metrics( 11 | task_type: str, 12 | y: np.ndarray, 13 | prediction: np.ndarray, 14 | classification_mode: str, 15 | y_info: ty.Optional[ty.Dict[str, ty.Any]], 16 | ) -> ty.Dict[str, float]: 17 | if task_type == util.REGRESSION: 18 | del classification_mode 19 | rmse = skm.mean_squared_error(y, prediction) ** 0.5 # type: ignore[code] 20 | 21 | return {'rmse': rmse, 'score': -rmse} 22 | else: 23 | assert task_type in (util.BINCLASS, util.MULTICLASS) 24 | labels = None 25 | if classification_mode == 'probs': 26 | probs = prediction 27 | elif classification_mode == 'logits': 28 | probs = ( 29 | scipy.special.expit(prediction) 30 | if task_type == util.BINCLASS 31 | else scipy.special.softmax(prediction, axis=1) 32 | ) 33 | else: 34 | assert classification_mode == 'labels' 35 | probs = None 36 | labels = prediction 37 | if labels is None: 38 | labels = ( 39 | np.round(probs).astype('int64') 40 | if task_type == util.BINCLASS 41 | else probs.argmax(axis=1) # type: ignore[code] 42 | ) 43 | 44 | result = skm.classification_report(y, labels, output_dict=True) # type: ignore[code] 45 | if task_type == util.BINCLASS: 46 | result['roc_auc'] = skm.roc_auc_score(y, probs) # type: ignore[code] 47 | else: 48 | result['roc_auc'] = skm.roc_auc_score(y, probs, multi_class='ovo') # type: ignore[code] 49 | result['score'] = result['roc_auc'] # type: ignore[code] 50 | return result # type: ignore[code] 51 | 52 | 53 | def make_summary(metrics: ty.Dict[str, ty.Any]) -> str: 54 | precision = 3 55 | summary = {} 56 | for k, v in metrics[1].items(): 57 | if k.isdigit(): 58 | continue 59 | k = { 60 | 'score': 'SCORE', 61 | 'accuracy': 'acc', 62 | 'roc_auc': 'roc_auc', 63 | 'macro avg': 'm', 64 | 'weighted avg': 'w', 65 | }.get(k, k) 66 | if isinstance(v, float): 67 | v = round(v, precision) 68 | summary[k] = v 69 | else: 70 | v = { 71 | {'precision': 'p', 'recall': 'r', 'f1-score': 'f1', 'support': 's'}.get( 72 | x, x 73 | ): round(v[x], precision) 74 | for x in v 75 | } 76 | for item in v.items(): 77 | summary[k + item[0]] = item[1] 78 | 79 | s = [f'score = {summary.pop("SCORE"):.3f}'] 80 | for k, v in summary.items(): 81 | if k not in ['mp', 'mr', 'wp', 'wr']: # just to save screen space 82 | s.append(f'{k} = {v}') 83 | return ' | '.join(s) 84 | -------------------------------------------------------------------------------- /XGBoost/lib/synthetic_data.py: -------------------------------------------------------------------------------- 1 | "Code used to generate data for experiments with synthetic data" 2 | import math 3 | import typing as ty 4 | 5 | import numba 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from numba.experimental import jitclass 10 | from tqdm.auto import tqdm 11 | 12 | 13 | class MLP(nn.Module): 14 | def __init__( 15 | self, 16 | *, 17 | d_in: int, 18 | d_layers: ty.List[int], 19 | d_out: int, 20 | bias: bool = True, 21 | ) -> None: 22 | super().__init__() 23 | self.layers = nn.ModuleList( 24 | [ 25 | nn.Linear(d_layers[i - 1] if i else d_in, x, bias=bias) 26 | for i, x in enumerate(d_layers) 27 | ] 28 | ) 29 | self.head = nn.Linear(d_layers[-1] if d_layers else d_in, d_out) 30 | 31 | def init_weights(m): 32 | if isinstance(m, nn.Linear): 33 | torch.nn.init.kaiming_normal_(m.weight, mode='fan_in') 34 | if m.bias is not None: 35 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight) 36 | bound = 1 / math.sqrt(fan_in) 37 | torch.nn.init.uniform_(m.bias, -bound, bound) 38 | 39 | self.apply(init_weights) 40 | 41 | def 
forward(self, x: torch.Tensor) -> torch.Tensor: 42 | for layer in self.layers: 43 | x = layer(x) 44 | x = torch.relu(x) 45 | x = self.head(x) 46 | x = x.squeeze(-1) 47 | return x 48 | 49 | 50 | @jitclass( 51 | spec=[ 52 | ('left_children', numba.int64[:]), 53 | ('right_children', numba.int64[:]), 54 | ('feature', numba.int64[:]), 55 | ('threshold', numba.float32[:]), 56 | ('value', numba.float32[:]), 57 | ('is_leaf', numba.int64[:]), 58 | ] 59 | ) 60 | class Tree: 61 | "Randomly initialized decision tree" 62 | 63 | def __init__(self, n_features, n_nodes, max_depth): 64 | assert (2 ** np.arange(max_depth + 1)).sum() >= n_nodes, "Too much nodes" 65 | 66 | self.left_children = np.ones(n_nodes, dtype=np.int64) * -1 67 | self.right_children = np.ones(n_nodes, dtype=np.int64) * -1 68 | self.feature = np.random.randint(0, n_features, (n_nodes,)) 69 | self.threshold = np.random.randn(n_nodes).astype(np.float32) 70 | self.value = np.random.randn(n_nodes).astype(np.float32) 71 | depth = np.zeros(n_nodes, dtype=np.int64) 72 | 73 | # Root is 0 74 | self.is_leaf = np.zeros(n_nodes, dtype=np.int64) 75 | self.is_leaf[0] = 1 76 | 77 | # Keep adding nodes while we can (new node must have 2 children) 78 | while True: 79 | idx = np.flatnonzero(self.is_leaf)[np.random.choice(self.is_leaf.sum())] 80 | if depth[idx] < max_depth: 81 | unused = np.flatnonzero( 82 | (self.left_children == -1) 83 | & (self.right_children == -1) 84 | & ~self.is_leaf 85 | ) 86 | if len(unused) < 2: 87 | break 88 | 89 | lr_child = unused[np.random.permutation(unused.shape[0])[:2]] 90 | self.is_leaf[lr_child] = 1 91 | self.is_leaf[lr_child] = 1 92 | depth[lr_child] = depth[idx] + 1 93 | self.left_children[idx] = lr_child[0] 94 | self.right_children[idx] = lr_child[1] 95 | self.is_leaf[idx] = 0 96 | 97 | def apply(self, x): 98 | y = np.zeros(x.shape[0]) 99 | 100 | for i in range(x.shape[0]): 101 | idx = 0 102 | 103 | while not self.is_leaf[idx]: 104 | if x[i, self.feature[idx]] < self.threshold[idx]: 105 | idx = self.left_children[idx] 106 | else: 107 | idx = self.right_children[idx] 108 | 109 | y[i] = self.value[idx] 110 | 111 | return y 112 | 113 | 114 | class TreeEnsemble: 115 | "Combine multiple trees" 116 | 117 | def __init__(self, *, n_trees, n_features, n_nodes, max_depth): 118 | self.trees = [ 119 | Tree(n_features=n_features, n_nodes=n_nodes, max_depth=max_depth) 120 | for _ in range(n_trees) 121 | ] 122 | 123 | def apply(self, x): 124 | return np.mean([t.apply(x) for t in tqdm(self.trees)], axis=0) 125 | -------------------------------------------------------------------------------- /XGBoost/lib/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import shutil 8 | import sys 9 | import time 10 | import typing as ty 11 | from copy import deepcopy 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | import pynvml 16 | import pytomlpp as toml 17 | import torch 18 | 19 | from . 
import env 20 | 21 | TRAIN = 'train' 22 | VAL = 'val' 23 | TEST = 'test' 24 | PARTS = [TRAIN, VAL, TEST] 25 | 26 | BINCLASS = 'binclass' 27 | MULTICLASS = 'multiclass' 28 | REGRESSION = 'regression' 29 | TASK_TYPES = [BINCLASS, MULTICLASS, REGRESSION] 30 | 31 | 32 | def load_json(path: ty.Union[Path, str]) -> ty.Any: 33 | return json.loads(Path(path).read_text()) 34 | 35 | 36 | def dump_json(x: ty.Any, path: ty.Union[Path, str], *args, **kwargs) -> None: 37 | Path(path).write_text(json.dumps(x, *args, **kwargs) + '\n') 38 | 39 | 40 | def load_toml(path: ty.Union[Path, str]) -> ty.Any: 41 | return toml.loads(Path(path).read_text()) 42 | 43 | 44 | def dump_toml(x: ty.Any, path: ty.Union[Path, str]) -> None: 45 | Path(path).write_text(toml.dumps(x) + '\n') 46 | 47 | 48 | def load_pickle(path: ty.Union[Path, str]) -> ty.Any: 49 | return pickle.loads(Path(path).read_bytes()) 50 | 51 | 52 | def dump_pickle(x: ty.Any, path: ty.Union[Path, str]) -> None: 53 | Path(path).write_bytes(pickle.dumps(x)) 54 | 55 | 56 | def load(path: ty.Union[Path, str]) -> ty.Any: 57 | return globals()[f'load_{Path(path).suffix[1:]}'](path) 58 | 59 | 60 | def load_config( 61 | argv: ty.Optional[ty.List[str]] = None, 62 | ) -> ty.Tuple[ty.Dict[str, ty.Any], Path]: 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('config', metavar='FILE') 65 | parser.add_argument('-o', '--output', metavar='DIR') 66 | parser.add_argument('-f', '--force', action='store_true') 67 | parser.add_argument('--continue', action='store_true', dest='continue_') 68 | if argv is None: 69 | argv = sys.argv[1:] 70 | args = parser.parse_args(argv) 71 | 72 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 73 | if snapshot_dir and Path(snapshot_dir).joinpath('CHECKPOINTS_RESTORED').exists(): 74 | assert args.continue_ 75 | 76 | config_path = Path(args.config).absolute() 77 | output_dir = ( 78 | Path(args.output) 79 | if args.output 80 | else config_path.parent.joinpath(config_path.stem) 81 | ).absolute() 82 | sep = '=' * (8 + max(len(str(config_path)), len(str(output_dir)))) # type: ignore[code] 83 | print(sep, f'Config: {config_path}', f'Output: {output_dir}', sep, sep='\n') 84 | 85 | assert config_path.exists() 86 | config = load_toml(config_path) 87 | 88 | environment: ty.Dict[str, ty.Any] = {} 89 | if torch.cuda.is_available(): # type: ignore[code] 90 | cvd = os.environ.get('CUDA_VISIBLE_DEVICES') 91 | pynvml.nvmlInit() 92 | environment['devices'] = { 93 | 'CUDA_VISIBLE_DEVICES': cvd, 94 | 'torch.version.cuda': torch.version.cuda, 95 | 'torch.backends.cudnn.version()': torch.backends.cudnn.version(), # type: ignore[code] 96 | 'torch.cuda.nccl.version()': torch.cuda.nccl.version(), # type: ignore[code] 97 | 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 98 | } 99 | if cvd: 100 | for i in map(int, cvd.split(',')): 101 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 102 | environment['devices'][i] = { 103 | 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 104 | 'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle).total, 105 | } 106 | 107 | return config, output_dir 108 | 109 | 110 | def dump_stats(stats: dict, output_dir: Path, final: bool = False) -> None: 111 | dump_json(stats, output_dir / 'stats.json', indent=4) 112 | json_output_path = os.environ.get('JSON_OUTPUT_FILE') 113 | if final: 114 | output_dir.joinpath('DONE').touch() 115 | if json_output_path: 116 | try: 117 | key = str(output_dir.relative_to(env.PROJECT_DIR)) 118 | except ValueError: 119 | pass 120 | else: 121 | json_output_path = 
Path(json_output_path) 122 | try: 123 | json_data = json.loads(json_output_path.read_text()) 124 | except (FileNotFoundError, json.decoder.JSONDecodeError): 125 | json_data = {} 126 | json_data[key] = stats 127 | json_output_path.write_text(json.dumps(json_data)) 128 | shutil.copyfile( 129 | json_output_path, 130 | os.path.join(os.environ['SNAPSHOT_PATH'], 'json_output.json'), 131 | ) 132 | 133 | 134 | _LAST_SNAPSHOT_TIME = None 135 | 136 | 137 | def backup_output(output_dir: Path) -> None: 138 | backup_dir = os.environ.get('TMP_OUTPUT_PATH') 139 | snapshot_dir = os.environ.get('SNAPSHOT_PATH') 140 | if backup_dir is None: 141 | assert snapshot_dir is None 142 | return 143 | assert snapshot_dir is not None 144 | 145 | try: 146 | relative_output_dir = output_dir.relative_to(env.PROJECT_DIR) 147 | except ValueError: 148 | return 149 | 150 | for dir_ in [backup_dir, snapshot_dir]: 151 | new_output_dir = dir_ / relative_output_dir 152 | prev_backup_output_dir = new_output_dir.with_name(new_output_dir.name + '_prev') 153 | new_output_dir.parent.mkdir(exist_ok=True, parents=True) 154 | if new_output_dir.exists(): 155 | new_output_dir.rename(prev_backup_output_dir) 156 | shutil.copytree(output_dir, new_output_dir) 157 | if prev_backup_output_dir.exists(): 158 | shutil.rmtree(prev_backup_output_dir) 159 | 160 | global _LAST_SNAPSHOT_TIME 161 | if _LAST_SNAPSHOT_TIME is None or time.time() - _LAST_SNAPSHOT_TIME > 10 * 60: 162 | pass 163 | _LAST_SNAPSHOT_TIME = time.time() 164 | print('The snapshot was saved!') 165 | 166 | 167 | def raise_unknown(unknown_what: str, unknown_value: ty.Any): 168 | raise ValueError(f'Unknown {unknown_what}: {unknown_value}') 169 | 170 | 171 | def merge_defaults(kwargs: dict, default_kwargs: dict) -> dict: 172 | x = deepcopy(default_kwargs) 173 | x.update(kwargs) 174 | return x 175 | 176 | 177 | def set_seeds(seed: int) -> None: 178 | random.seed(seed) 179 | np.random.seed(seed) 180 | 181 | 182 | def format_seconds(seconds: float) -> str: 183 | return str(datetime.timedelta(seconds=round(seconds))) 184 | 185 | 186 | def get_categories( 187 | X_cat: ty.Optional[torch.Tensor], train_idx: ty.List, test_idx: ty.List 188 | ) -> ty.Optional[ty.List[int]]: 189 | if X_cat is None: 190 | return None 191 | else: 192 | categories_count = [] 193 | for i in range(X_cat.shape[1]): 194 | # Combine unique categories from both training and testing indices for each feature 195 | unique_categories = set(X_cat[train_idx][:, i].tolist()) | set(X_cat[test_idx][:, i].tolist()) 196 | categories_count.append(len(unique_categories)) 197 | return categories_count 198 | -------------------------------------------------------------------------------- /XGBoost/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def set_random_seed(seed): 7 | """ 8 | Set the seed for random number generation in Python, NumPy, and PyTorch. 9 | 10 | Args: 11 | seed (int): The seed value to use for all random number generators. 
12 | """ 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | 17 | if torch.cuda.is_available(): 18 | torch.cuda.manual_seed(seed) 19 | torch.cuda.manual_seed_all(seed) 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /saint/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /saint/README.md: -------------------------------------------------------------------------------- 1 | This repository is the official PyTorch implementation of SAINT. Find the paper on [arxiv](https://arxiv.org/abs/2106.01342) 2 | 3 | # SAINT: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pre-Training 4 | 5 | 6 | ![Overview](pipeline.png) 7 | 8 | 9 | 10 | ## Requirements 11 | 12 | We recommend using `anaconda` or `miniconda` for python. Our code has been tested with `python=3.8` on linux. 
13 | 14 | Create a conda environment from the yml file and activate it. 15 | ``` 16 | conda env create -f saint_environment.yml 17 | conda activate saint_env 18 | ``` 19 | 20 | Make sure the following requirements are met 21 | 22 | * torch>=1.8.1 23 | * torchvision>=0.9.1 24 | 25 | ### Optional 26 | We used wandb to update our logs. But it is optional. 27 | ``` 28 | conda install -c conda-forge wandb 29 | ``` 30 | 31 | 32 | ## Training & Evaluation 33 | 34 | In each of our experiments, we use a single Nvidia GeForce RTX 2080Ti GPU. 35 | 36 | 37 | To train the model(s) in the paper, run this command: 38 | 39 | ``` 40 | python train.py --dset_id --task --attentiontype 41 | ``` 42 | 43 | Pretraining is useful when there are few training data samples. Sample code looks like this. (Use train_robust.py file for pretraining and robustness experiments) 44 | ``` 45 | python train_robust.py --dset_id --task --attentiontype --pretrain --pt_tasks --pt_aug --ssl_samples 46 | ``` 47 | 48 | 49 | 50 | ### Arguments 51 | * `--dset_id` : Dataset id from OpenML. Works with all the datasets mentioned in the paper. Works with all OpenML datasets. 52 | * `--task` : The task we want to perform. Pick from 'regression','multiclass', or 'binary'. 53 | * `--attentiontype` : Variant of SAINT. 'col' refers to SAINT-s variant, 'row' is SAINT-i, and 'colrow' refers to SAINT. 54 | * `--embedding_size` : Size of the feature embeddings 55 | * `--transformer_depth` : Depth of the model. Number of stages. 56 | * `--attention_heads` : Number of attention heads in each Attention layer. 57 | * `--cont_embeddings` : Style of embedding continuous data. 58 | * `--pretrain` : To enable pretraining 59 | * `--pt_tasks` : Losses we want to use for pretraining. Multiple arguments can be passed. 60 | * `--pt_aug` : Types of data augmentations used in pretraining. Multiple arguments are allowed. We support only mixup and CutMix right now. 61 | * `--ssl_samples` : Number of labeled samples used in semi-supervised experiments. 62 | * `--pt_projhead_style` : Projection head style used in contrastive pipeline. 63 | * `--nce_temp` : Temperature used in contrastive loss function. 64 | * `--active_log` : To update the logs onto wandb. This is optional 65 | 66 | #### Most of the hyperparameters are hardcoded in train.py file. For datasets with really high number of features, we suggest using smaller batchsize, lower embedding dimension and fewer number of heads. 67 | 68 | ### Evaluation 69 | 70 | We choose the best model by evaluating the model on validation dataset. The AuROC(for binary classification datasets), Accuracy (for multiclass classification datasets), and RMSE (for regression datasets) of the best model on test datasets is printed after training is completed. If wandb is enabled, they are logged to 'test_auroc_bestep', 'test_accuracy_bestep', 'test_rmse_bestep' variables. 71 | 72 | 73 | 74 | ## What's new in this version? 75 | * Regression and multiclass classification models are added. 76 | * Data can be accessed directly from openml just by calling the id of the dataset. 77 | 78 | 79 | ## Acknowledgements 80 | 81 | We would like to thank the following public repo from which we borrowed various utilites. 82 | - https://github.com/lucidrains/tab-transformer-pytorch 83 | 84 | ## License 85 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file. 
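
## Example run

A full invocation, given here only as an illustration, could look like the following. The dataset id 1487 is taken from the binary task list in `data_openml.py`; the remaining values are placeholders rather than the tuned settings reported in the paper.

```
python train.py --dset_id 1487 --task binary --attentiontype colrow --embedding_size 32 --transformer_depth 6 --attention_heads 8 --active_log
```

Every flag above is described in the Arguments list earlier in this README; drop `--active_log` if you do not want wandb logging.
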
86 | 87 | ## Cite us 88 | 89 | ``` 90 | @article{somepalli2021saint, 91 | title={SAINT: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pre-Training}, 92 | author={Somepalli, Gowthami and Goldblum, Micah and Schwarzschild, Avi and Bruss, C Bayan and Goldstein, Tom}, 93 | journal={arXiv preprint arXiv:2106.01342}, 94 | year={2021} 95 | } 96 | 97 | ``` 98 | -------------------------------------------------------------------------------- /saint/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def embed_data_mask(x_categ, x_cont, cat_mask, con_mask, model, vision_dset=False): 6 | device = x_cont.device if x_cont is not None else x_categ.device 7 | 8 | # Embed categorical data if available 9 | if x_categ is not None and model.embeds is not None: 10 | x_categ = torch.where(x_categ == model.unknown_value, model.categories.to(x_categ.device), x_categ) 11 | x_categ = x_categ + model.categories_offset.type_as(x_categ) 12 | x_categ_enc = model.embeds(x_categ) 13 | cat_mask_temp = cat_mask + model.cat_mask_offset.type_as(cat_mask) 14 | cat_mask_temp = model.mask_embeds_cat(cat_mask_temp) 15 | cat_mask_unsqueezed = cat_mask.unsqueeze(-1) 16 | 17 | assert x_categ_enc.shape[0] == cat_mask_temp.shape[0] == cat_mask_unsqueezed.shape[0], \ 18 | f"Mismatch in batch size. x_categ_enc: {x_categ_enc.shape[0]}, cat_mask_temp: {cat_mask_temp.shape[0]}, " \ 19 | f"cat_mask_unsqueezed: {cat_mask_unsqueezed.shape[0]} " 20 | 21 | assert x_categ_enc.shape[1] == cat_mask_temp.shape[1] == cat_mask_unsqueezed.shape[1], \ 22 | f"Mismatch in sequence length. x_categ_enc: {x_categ_enc.shape[1]}, cat_mask_temp: {cat_mask_temp.shape[1]}," \ 23 | f" cat_mask_unsqueezed: {cat_mask_unsqueezed.shape[1]} " 24 | 25 | assert x_categ_enc.shape[2] == cat_mask_temp.shape[2], \ 26 | f"Mismatch in embedding size. x_categ_enc: {x_categ_enc.shape[2]}, cat_mask_temp: {cat_mask_temp.shape[2]}" 27 | 28 | assert cat_mask_unsqueezed.shape[2] == 1, \ 29 | f"cat_mask_unsqueezed should have a singleton dimension. 
Found: {cat_mask_unsqueezed.shape[2]}" 30 | 31 | x_categ_enc = torch.where(cat_mask_unsqueezed == 0, cat_mask_temp, x_categ_enc) 32 | 33 | else: 34 | x_categ_enc = None 35 | 36 | # Embed continuous data if available 37 | if x_cont is not None: 38 | n1, n2 = x_cont.shape 39 | if model.cont_embeddings == 'MLP': 40 | x_cont_enc = torch.empty(n1, n2, model.dim, device=device) 41 | for i in range(model.num_continuous): 42 | x_cont_enc[:, i, :] = model.simple_MLP[i](x_cont[:, i]) 43 | else: 44 | raise Exception('This case should not work!') 45 | 46 | con_mask_temp = con_mask + model.con_mask_offset.type_as(con_mask) 47 | con_mask_temp = model.mask_embeds_cont(con_mask_temp) 48 | x_cont_enc[con_mask == 0] = con_mask_temp[con_mask == 0] 49 | else: 50 | x_cont_enc = None 51 | 52 | # Handle vision dataset specific logic 53 | if vision_dset and x_categ is not None: 54 | pos = np.tile(np.arange(x_categ.shape[-1]), (x_categ.shape[0], 1)) 55 | pos = torch.from_numpy(pos).to(device) 56 | pos_enc = model.pos_encodings(pos) 57 | x_categ_enc += pos_enc 58 | 59 | return x_categ, x_categ_enc, x_cont_enc 60 | 61 | 62 | def mixup_data(x1, x2, lam=1.0, y=None, use_cuda=True): 63 | '''Returns mixed inputs, pairs of targets''' 64 | 65 | batch_size = x1.size()[0] 66 | if use_cuda: 67 | index = torch.randperm(batch_size).cuda() 68 | else: 69 | index = torch.randperm(batch_size) 70 | 71 | mixed_x1 = lam * x1 + (1 - lam) * x1[index, :] 72 | mixed_x2 = lam * x2 + (1 - lam) * x2[index, :] 73 | if y is not None: 74 | y_a, y_b = y, y[index] 75 | return mixed_x1, mixed_x2, y_a, y_b 76 | 77 | return mixed_x1, mixed_x2 78 | 79 | 80 | def add_noise(x_categ, x_cont, noise_params={'noise_type': ['cutmix'], 'lambda': 0.1}): 81 | lam = noise_params['lambda'] 82 | device = x_categ.device 83 | batch_size = x_categ.size()[0] 84 | 85 | if 'cutmix' in noise_params['noise_type']: 86 | index = torch.randperm(batch_size) 87 | cat_corr = torch.from_numpy(np.random.choice(2, (x_categ.shape), p=[lam, 1 - lam])).to(device) 88 | con_corr = torch.from_numpy(np.random.choice(2, (x_cont.shape), p=[lam, 1 - lam])).to(device) 89 | x1, x2 = x_categ[index, :], x_cont[index, :] 90 | x_categ_corr, x_cont_corr = x_categ.clone().detach(), x_cont.clone().detach() 91 | x_categ_corr[cat_corr == 0] = x1[cat_corr == 0] 92 | x_cont_corr[con_corr == 0] = x2[con_corr == 0] 93 | return x_categ_corr, x_cont_corr 94 | elif noise_params['noise_type'] == 'missing': 95 | x_categ_mask = np.random.choice(2, (x_categ.shape), p=[lam, 1 - lam]) 96 | x_cont_mask = np.random.choice(2, (x_cont.shape), p=[lam, 1 - lam]) 97 | x_categ_mask = torch.from_numpy(x_categ_mask).to(device) 98 | x_cont_mask = torch.from_numpy(x_cont_mask).to(device) 99 | return torch.mul(x_categ, x_categ_mask), torch.mul(x_cont, x_cont_mask) 100 | 101 | else: 102 | print("yet to write this") 103 | -------------------------------------------------------------------------------- /saint/data_openml.py: -------------------------------------------------------------------------------- 1 | import openml 2 | import numpy as np 3 | from sklearn.preprocessing import LabelEncoder 4 | import pandas as pd 5 | from torch.utils.data import Dataset 6 | 7 | 8 | def simple_lapsed_time(text, lapsed): 9 | hours, rem = divmod(lapsed, 3600) 10 | minutes, seconds = divmod(rem, 60) 11 | print(text+": {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)) 12 | 13 | 14 | def task_dset_ids(task): 15 | dataset_ids = { 16 | 'binary': [1487,44,1590,42178,1111,31,42733,1494,1017,4134], 17 | 'multiclass': [188, 1596, 4541, 
40664, 40685, 40687, 40975, 41166, 41169, 42734], 18 | 'regression':[541, 42726, 42727, 422, 42571, 42705, 42728, 42563, 42724, 42729] 19 | } 20 | 21 | return dataset_ids[task] 22 | 23 | def concat_data(X,y): 24 | # import ipdb; ipdb.set_trace() 25 | return pd.concat([pd.DataFrame(X['data']), pd.DataFrame(y['data'][:,0].tolist(),columns=['target'])], axis=1) 26 | 27 | 28 | def data_split(X,y,nan_mask,indices): 29 | x_d = { 30 | 'data': X.values[indices], 31 | 'mask': nan_mask.values[indices] 32 | } 33 | 34 | if x_d['data'].shape != x_d['mask'].shape: 35 | raise'Shape of data not same as that of nan mask!' 36 | 37 | y_d = { 38 | 'data': y[indices].reshape(-1, 1) 39 | } 40 | return x_d, y_d 41 | 42 | 43 | def data_prep_openml(ds_id, seed, task, datasplit=[.65, .15, .2]): 44 | 45 | np.random.seed(seed) 46 | dataset = openml.datasets.get_dataset(ds_id) 47 | 48 | X, y, categorical_indicator, attribute_names = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute) 49 | if ds_id == 42178: 50 | categorical_indicator = [True, False, True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,False, False] 51 | tmp = [x if (x != ' ') else '0' for x in X['TotalCharges'].tolist()] 52 | X['TotalCharges'] = [float(i) for i in tmp ] 53 | y = y[X.TotalCharges != 0] 54 | X = X[X.TotalCharges != 0] 55 | X.reset_index(drop=True, inplace=True) 56 | print(y.shape, X.shape) 57 | if ds_id in [42728,42705,42729,42571]: 58 | # import ipdb; ipdb.set_trace() 59 | X, y = X[:50000], y[:50000] 60 | X.reset_index(drop=True, inplace=True) 61 | categorical_columns = X.columns[list(np.where(np.array(categorical_indicator)==True)[0])].tolist() 62 | cont_columns = list(set(X.columns.tolist()) - set(categorical_columns)) 63 | 64 | cat_idxs = list(np.where(np.array(categorical_indicator)==True)[0]) 65 | con_idxs = list(set(range(len(X.columns))) - set(cat_idxs)) 66 | 67 | for col in categorical_columns: 68 | X[col] = X[col].astype("object") 69 | 70 | X["Set"] = np.random.choice(["train", "valid", "test"], p = datasplit, size=(X.shape[0],)) 71 | 72 | train_indices = X[X.Set=="train"].index 73 | valid_indices = X[X.Set=="valid"].index 74 | test_indices = X[X.Set=="test"].index 75 | 76 | X = X.drop(columns=['Set']) 77 | temp = X.fillna("MissingValue") 78 | nan_mask = temp.ne("MissingValue").astype(int) 79 | 80 | cat_dims = [] 81 | for col in categorical_columns: 82 | # X[col] = X[col].cat.add_categories("MissingValue") 83 | X[col] = X[col].fillna("MissingValue") 84 | l_enc = LabelEncoder() 85 | X[col] = l_enc.fit_transform(X[col].values) 86 | cat_dims.append(len(l_enc.classes_)) 87 | for col in cont_columns: 88 | # X[col].fillna("MissingValue",inplace=True) 89 | X.fillna(X.loc[train_indices, col].mean(), inplace=True) 90 | y = y.values 91 | if task != 'regression': 92 | l_enc = LabelEncoder() 93 | y = l_enc.fit_transform(y) 94 | X_train, y_train = data_split(X,y,nan_mask,train_indices) 95 | X_valid, y_valid = data_split(X,y,nan_mask,valid_indices) 96 | X_test, y_test = data_split(X,y,nan_mask,test_indices) 97 | 98 | train_mean, train_std = np.array(X_train['data'][:,con_idxs],dtype=np.float32).mean(0), np.array(X_train['data'][:,con_idxs],dtype=np.float32).std(0) 99 | train_std = np.where(train_std < 1e-6, 1e-6, train_std) 100 | # import ipdb; ipdb.set_trace() 101 | return cat_dims, cat_idxs, con_idxs, X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_std 102 | 103 | 104 | 105 | 106 | class DataSetCatCon(Dataset): 107 | def __init__(self, X, Y, 
cat_cols,task='clf',continuous_mean_std=None): 108 | 109 | cat_cols = list(cat_cols) 110 | X_mask = X['mask'].copy() 111 | X = X['data'].copy() 112 | con_cols = list(set(np.arange(X.shape[1])) - set(cat_cols)) 113 | self.X1 = X[:,cat_cols].copy().astype(np.int64) #categorical columns 114 | self.X2 = X[:,con_cols].copy().astype(np.float32) #numerical columns 115 | self.X1_mask = X_mask[:,cat_cols].copy().astype(np.int64) #categorical columns 116 | self.X2_mask = X_mask[:,con_cols].copy().astype(np.int64) #numerical columns 117 | if task == 'clf': 118 | self.y = Y['data']#.astype(np.float32) 119 | else: 120 | self.y = Y['data'].astype(np.float32) 121 | self.cls = np.zeros_like(self.y,dtype=int) 122 | self.cls_mask = np.ones_like(self.y,dtype=int) 123 | if continuous_mean_std is not None: 124 | mean, std = continuous_mean_std 125 | self.X2 = (self.X2 - mean) / std 126 | 127 | def __len__(self): 128 | return len(self.y) 129 | 130 | def __getitem__(self, idx): 131 | # X1 has categorical data, X2 has continuous 132 | return np.concatenate((self.cls[idx], self.X1[idx])), self.X2[idx],self.y[idx], np.concatenate((self.cls_mask[idx], self.X1_mask[idx])), self.X2_mask[idx] 133 | 134 | -------------------------------------------------------------------------------- /saint/models/__init__.py: -------------------------------------------------------------------------------- 1 | from models.pretrainmodel import SAINT 2 | from models.pretrainmodel_vision import SAINT_vision 3 | -------------------------------------------------------------------------------- /saint/models/pretrainmodel.py: -------------------------------------------------------------------------------- 1 | from .model import * 2 | 3 | 4 | class sep_MLP(nn.Module): 5 | def __init__(self, dim, len_feats, categories): 6 | super(sep_MLP, self).__init__() 7 | self.len_feats = len_feats 8 | self.layers = nn.ModuleList([]) 9 | for i in range(len_feats): 10 | self.layers.append(simple_MLP([dim, 5 * dim, categories[i]])) 11 | 12 | def forward(self, x): 13 | y_pred = list([]) 14 | for i in range(self.len_feats): 15 | x_i = x[:, i, :] 16 | pred = self.layers[i](x_i) 17 | y_pred.append(pred) 18 | return y_pred 19 | 20 | 21 | class SAINT(nn.Module): 22 | def __init__( 23 | self, 24 | *, 25 | categories, 26 | num_continuous, 27 | dim, 28 | depth, 29 | heads, 30 | dim_head=16, 31 | dim_out=1, 32 | mlp_hidden_mults=(4, 2), 33 | mlp_act=None, 34 | num_special_tokens=0, 35 | attn_dropout=0., 36 | ff_dropout=0., 37 | cont_embeddings='MLP', 38 | scalingfactor=10, 39 | attentiontype='col', 40 | final_mlp_style='common', 41 | y_dim=2 42 | ): 43 | super().__init__() 44 | if categories is not None: 45 | assert all(map(lambda n: n > 0, categories)), 'number of each category must be positive' 46 | 47 | # categories related calculations 48 | self.categories = torch.tensor(np.subtract(categories, 1).tolist()) 49 | self.unknown_value = -1 50 | self.num_categories = len(categories) 51 | self.num_unique_categories = sum(categories) 52 | 53 | # create category embeddings table 54 | 55 | self.num_special_tokens = num_special_tokens 56 | self.total_tokens = self.num_unique_categories + num_special_tokens 57 | 58 | # for automatically offsetting unique category ids to the correct position in the categories embedding table 59 | 60 | categories_offset = F.pad(torch.tensor(list(categories)), (1, 0), value=num_special_tokens) 61 | categories_offset = categories_offset.cumsum(dim=-1)[:-1] 62 | 63 | self.register_buffer('categories_offset', categories_offset) 64 | else: 65 | 
self.num_categories = 0 66 | self.num_unique_categories = 0 67 | self.total_tokens = 0 68 | 69 | self.norm = nn.LayerNorm(num_continuous) 70 | self.num_continuous = num_continuous 71 | self.dim = dim 72 | self.cont_embeddings = cont_embeddings 73 | self.attentiontype = attentiontype 74 | self.final_mlp_style = final_mlp_style 75 | 76 | if self.cont_embeddings == 'MLP': 77 | self.simple_MLP = nn.ModuleList([simple_MLP([1, 100, self.dim]) for _ in range(self.num_continuous)]) 78 | input_size = (dim * self.num_categories) + (dim * num_continuous) 79 | nfeats = self.num_categories + num_continuous 80 | elif self.cont_embeddings == 'pos_singleMLP': 81 | self.simple_MLP = nn.ModuleList([simple_MLP([1, 100, self.dim]) for _ in range(1)]) 82 | input_size = (dim * self.num_categories) + (dim * num_continuous) 83 | nfeats = self.num_categories + num_continuous 84 | else: 85 | print('Continous features are not passed through attention') 86 | input_size = (dim * self.num_categories) + num_continuous 87 | nfeats = self.num_categories 88 | 89 | # transformer 90 | if attentiontype == 'col': 91 | self.transformer = Transformer( 92 | num_tokens=self.total_tokens, 93 | dim=dim, 94 | depth=depth, 95 | heads=heads, 96 | dim_head=dim_head, 97 | attn_dropout=attn_dropout, 98 | ff_dropout=ff_dropout 99 | ) 100 | elif attentiontype in ['row', 'colrow']: 101 | self.transformer = RowColTransformer( 102 | num_tokens=self.total_tokens, 103 | dim=dim, 104 | nfeats=nfeats, 105 | depth=depth, 106 | heads=heads, 107 | dim_head=dim_head, 108 | attn_dropout=attn_dropout, 109 | ff_dropout=ff_dropout, 110 | style=attentiontype 111 | ) 112 | 113 | l = input_size // 8 114 | hidden_dimensions = list(map(lambda t: l * t, mlp_hidden_mults)) 115 | all_dimensions = [input_size, *hidden_dimensions, dim_out] 116 | 117 | self.mlp = MLP(all_dimensions, act=mlp_act) 118 | self.embeds = nn.Embedding(self.total_tokens, self.dim) # .to(device) 119 | 120 | cat_mask_offset = F.pad(torch.Tensor(self.num_categories).fill_(2).type(torch.int8), (1, 0), value=0) 121 | cat_mask_offset = cat_mask_offset.cumsum(dim=-1)[:-1] 122 | 123 | con_mask_offset = F.pad(torch.Tensor(self.num_continuous).fill_(2).type(torch.int8), (1, 0), value=0) 124 | con_mask_offset = con_mask_offset.cumsum(dim=-1)[:-1] 125 | 126 | self.register_buffer('cat_mask_offset', cat_mask_offset) 127 | self.register_buffer('con_mask_offset', con_mask_offset) 128 | 129 | self.mask_embeds_cat = nn.Embedding(self.num_categories * 2, self.dim) 130 | self.mask_embeds_cont = nn.Embedding(self.num_continuous * 2, self.dim) 131 | self.single_mask = nn.Embedding(2, self.dim) 132 | self.pos_encodings = nn.Embedding(self.num_categories + self.num_continuous, self.dim) 133 | 134 | if self.final_mlp_style == 'common': 135 | self.mlp1 = simple_MLP([dim, (self.total_tokens) * 2, self.total_tokens]) 136 | self.mlp2 = simple_MLP([dim, (self.num_continuous), 1]) 137 | 138 | else: 139 | self.mlp1 = sep_MLP(dim, self.num_categories, categories) 140 | self.mlp2 = sep_MLP(dim, self.num_continuous, np.ones(self.num_continuous).astype(int)) 141 | 142 | self.mlpfory = simple_MLP([dim, 1000, y_dim]) 143 | self.pt_mlp = simple_MLP([dim * (self.num_continuous + self.num_categories), 144 | 6 * dim * (self.num_continuous + self.num_categories) // 5, 145 | dim * (self.num_continuous + self.num_categories) // 2]) 146 | self.pt_mlp2 = simple_MLP([dim * (self.num_continuous + self.num_categories), 147 | 6 * dim * (self.num_continuous + self.num_categories) // 5, 148 | dim * (self.num_continuous + self.num_categories) 
// 2]) 149 | 150 | def forward(self, x_categ, x_cont): 151 | if x_categ is None: 152 | # Handle the case when only continuous data is provided 153 | if self.cont_embeddings == 'MLP': 154 | x_cont = torch.stack([self.simple_MLP[i](x_cont[:, i].view(-1, 1)) for i in range(self.num_continuous)], 155 | dim=1) 156 | # Process continuous data 157 | x_cont = self.norm(x_cont) 158 | x = x_cont 159 | elif x_cont is None: 160 | # Handle the case when only categorical data is provided 161 | x_categ = torch.where(x_categ == self.unknown_value, self.categories.to(x_categ.device), x_categ) 162 | x = self.embeds(x_categ + self.categories_offset) 163 | else: 164 | # Handle the case when both categorical and continuous data is provided 165 | x_categ = torch.where(x_categ == self.unknown_value, self.categories.to(x_categ.device), x_categ) 166 | x_categ = self.embeds(x_categ + self.categories_offset) 167 | if self.cont_embeddings == 'MLP': 168 | x_cont = torch.stack([self.simple_MLP[i](x_cont[:, i].view(-1, 1)) for i in range(self.num_continuous)], 169 | dim=1) 170 | x_cont = self.norm(x_cont) 171 | x = torch.cat((x_categ, x_cont), dim=1) 172 | 173 | # Proceed with the rest of the forward pass 174 | x = self.transformer(x) 175 | cat_outs = self.mlp1(x[:, :self.num_categories, :]) if x_categ is not None else None 176 | con_outs = self.mlp2(x[:, self.num_categories:, :]) if x_cont is not None else None 177 | return cat_outs, con_outs 178 | -------------------------------------------------------------------------------- /saint/models/pretrainmodel_vision.py: -------------------------------------------------------------------------------- 1 | from .model import * 2 | 3 | 4 | class sep_MLP(nn.Module): 5 | def __init__(self,dim,len_feats,categories): 6 | super(sep_MLP, self).__init__() 7 | self.len_feats = len_feats 8 | self.layers = nn.ModuleList([]) 9 | for i in range(len_feats): 10 | self.layers.append(simple_MLP([dim,5*dim, categories[i]])) 11 | 12 | 13 | def forward(self, x): 14 | y_pred = list([]) 15 | for i in range(self.len_feats): 16 | x_i = x[:,i,:] 17 | pred = self.layers[i](x_i) 18 | y_pred.append(pred) 19 | return y_pred 20 | 21 | class SAINT_vision(nn.Module): 22 | def __init__( 23 | self, 24 | *, 25 | categories, 26 | num_continuous, 27 | dim, 28 | depth, 29 | heads, 30 | dim_head = 16, 31 | dim_out = 1, 32 | mlp_hidden_mults = (4, 2), 33 | mlp_act = None, 34 | num_special_tokens = 0, 35 | continuous_mean_std = None, 36 | attn_dropout = 0., 37 | ff_dropout = 0., 38 | cont_embeddings = 'MLP', 39 | scalingfactor = 10, 40 | attentiontype = 'col', 41 | final_mlp_style = 'common', 42 | y_dim = 2 43 | ): 44 | super().__init__() 45 | assert all(map(lambda n: n > 0, categories)), 'number of each category must be positive' 46 | 47 | # categories related calculations 48 | 49 | self.num_categories = len(categories) 50 | self.num_unique_categories = sum(categories) 51 | 52 | # create category embeddings table 53 | 54 | self.num_special_tokens = num_special_tokens 55 | self.total_tokens = categories[-1] + 256 56 | 57 | # for automatically offsetting unique category ids to the correct position in the categories embedding table 58 | 59 | categories_offset = torch.tensor(np.append(np.repeat(0, self.num_categories-1),[256])) 60 | self.register_buffer('categories_offset', categories_offset) 61 | 62 | 63 | self.norm = nn.LayerNorm(num_continuous) 64 | self.num_continuous = num_continuous 65 | self.dim = dim 66 | self.cont_embeddings = cont_embeddings 67 | self.attentiontype = attentiontype 68 | self.final_mlp_style = 
final_mlp_style 69 | 70 | if self.cont_embeddings == 'MLP': 71 | self.simple_MLP = nn.ModuleList([simple_MLP([1,100,self.dim]) for _ in range(self.num_continuous)]) 72 | input_size = (dim * self.num_categories) + (dim * num_continuous) 73 | nfeats = self.num_categories + num_continuous 74 | else: 75 | print('Continous features are not passed through attention') 76 | input_size = (dim * self.num_categories) + num_continuous 77 | nfeats = self.num_categories 78 | 79 | # transformer 80 | if attentiontype == 'col': 81 | self.transformer = Transformer( 82 | num_tokens = self.total_tokens, 83 | dim = dim, 84 | depth = depth, 85 | heads = heads, 86 | dim_head = dim_head, 87 | attn_dropout = attn_dropout, 88 | ff_dropout = ff_dropout 89 | ) 90 | elif attentiontype in ['row','colrow'] : 91 | self.transformer = RowColTransformer( 92 | num_tokens = self.total_tokens, 93 | dim = dim, 94 | nfeats= nfeats, 95 | depth = depth, 96 | heads = heads, 97 | dim_head = dim_head, 98 | attn_dropout = attn_dropout, 99 | ff_dropout = ff_dropout, 100 | style = attentiontype 101 | ) 102 | 103 | l = input_size // 8 104 | hidden_dimensions = list(map(lambda t: l * t, mlp_hidden_mults)) 105 | all_dimensions = [input_size, *hidden_dimensions, dim_out] 106 | 107 | self.mlp = MLP(all_dimensions, act = mlp_act) 108 | self.embeds = nn.Embedding(self.total_tokens, self.dim) 109 | 110 | cat_mask_offset = torch.tensor(np.append(np.repeat(0, self.num_categories-1),[2])) 111 | con_mask_offset = torch.empty(0) 112 | 113 | self.register_buffer('cat_mask_offset', cat_mask_offset) 114 | self.register_buffer('con_mask_offset', con_mask_offset) 115 | 116 | self.mask_embeds_cat = nn.Embedding(4, self.dim) 117 | self.mask_embeds_cont = nn.Embedding(4, self.dim) 118 | self.pos_encodings = nn.Embedding(self.num_categories, self.dim) 119 | if self.final_mlp_style == 'common': 120 | self.mlp1 = simple_MLP([dim,(self.total_tokens)*2, self.total_tokens]) 121 | self.mlp2 = simple_MLP([dim ,(self.num_continuous), 1]) 122 | 123 | else: 124 | self.mlp1 = sep_MLP(dim,self.num_categories,categories) 125 | self.mlp2 = sep_MLP(dim,self.num_continuous,np.ones(self.num_continuous).astype(int)) 126 | 127 | 128 | self.mlpfory = simple_MLP([dim ,100, y_dim]) 129 | 130 | 131 | def forward(self, x_categ, x_cont): 132 | x = self.transformer(x_categ, x_cont) 133 | y_reps = x[:,self.num_categories-1,:] 134 | y_outs = self.mlpfory(y_reps) 135 | return y_outs 136 | -------------------------------------------------------------------------------- /saint/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnuremberg/Revisiting-MLPs/b17e3bf1a663a5605b5e727929cfc779de4211df/saint/pipeline.png -------------------------------------------------------------------------------- /saint/pretraining.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from baselines.data_openml import data_prep_openml,task_dset_ids,DataSetCatCon 5 | from torch.utils.data import DataLoader 6 | import torch.optim as optim 7 | from augmentations import embed_data_mask 8 | from augmentations import add_noise 9 | 10 | import os 11 | import numpy as np 12 | 13 | def SAINT_pretrain(model,cat_idxs,X_train,y_train,continuous_mean_std,opt,device): 14 | train_ds = DataSetCatCon(X_train, y_train, cat_idxs,opt.dtask, continuous_mean_std) 15 | trainloader = DataLoader(train_ds, batch_size=opt.batchsize, shuffle=True,num_workers=4) 16 | 
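    # NOTE (annotation inferred from the body of SAINT_pretrain below; not part of the original source):
    # `opt` is expected to be an argparse-style namespace providing at least
    #   opt.batchsize, opt.vision_dset, opt.dtask, opt.pretrain_epochs,
    #   opt.pt_aug (list of augmentations, e.g. ['cutmix', 'mixup']), opt.pt_aug_lam, opt.mixup_lam,
    #   opt.pt_tasks (e.g. ['contrastive', 'denoising']), opt.pt_projhead_style ('diff' or 'same'),
    #   opt.nce_temp, and the loss weights opt.lam0 .. opt.lam3.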
vision_dset = opt.vision_dset 17 | optimizer = optim.AdamW(model.parameters(),lr=0.0001) 18 | pt_aug_dict = { 19 | 'noise_type' : opt.pt_aug, 20 | 'lambda' : opt.pt_aug_lam 21 | } 22 | criterion1 = nn.CrossEntropyLoss() 23 | criterion2 = nn.MSELoss() 24 | print("Pretraining begins!") 25 | for epoch in range(opt.pretrain_epochs): 26 | model.train() 27 | running_loss = 0.0 28 | for i, data in enumerate(trainloader, 0): 29 | optimizer.zero_grad() 30 | x_categ, x_cont, _ ,cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device) 31 | 32 | # embed_data_mask function is used to embed both categorical and continuous data. 33 | if 'cutmix' in opt.pt_aug: 34 | from augmentations import add_noise 35 | x_categ_corr, x_cont_corr = add_noise(x_categ,x_cont, noise_params = pt_aug_dict) 36 | _ , x_categ_enc_2, x_cont_enc_2 = embed_data_mask(x_categ_corr, x_cont_corr, cat_mask, con_mask,model,vision_dset) 37 | else: 38 | _ , x_categ_enc_2, x_cont_enc_2 = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 39 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 40 | 41 | if 'mixup' in opt.pt_aug: 42 | from augmentations import mixup_data 43 | x_categ_enc_2, x_cont_enc_2 = mixup_data(x_categ_enc_2, x_cont_enc_2 , lam=opt.mixup_lam) 44 | loss = 0 45 | if 'contrastive' in opt.pt_tasks: 46 | aug_features_1 = model.transformer(x_categ_enc, x_cont_enc) 47 | aug_features_2 = model.transformer(x_categ_enc_2, x_cont_enc_2) 48 | aug_features_1 = (aug_features_1 / aug_features_1.norm(dim=-1, keepdim=True)).flatten(1,2) 49 | aug_features_2 = (aug_features_2 / aug_features_2.norm(dim=-1, keepdim=True)).flatten(1,2) 50 | if opt.pt_projhead_style == 'diff': 51 | aug_features_1 = model.pt_mlp(aug_features_1) 52 | aug_features_2 = model.pt_mlp2(aug_features_2) 53 | elif opt.pt_projhead_style == 'same': 54 | aug_features_1 = model.pt_mlp(aug_features_1) 55 | aug_features_2 = model.pt_mlp(aug_features_2) 56 | else: 57 | print('Not using projection head') 58 | logits_per_aug1 = aug_features_1 @ aug_features_2.t()/opt.nce_temp 59 | logits_per_aug2 = aug_features_2 @ aug_features_1.t()/opt.nce_temp 60 | targets = torch.arange(logits_per_aug1.size(0)).to(logits_per_aug1.device) 61 | loss_1 = criterion1(logits_per_aug1, targets) 62 | loss_2 = criterion1(logits_per_aug2, targets) 63 | loss = opt.lam0*(loss_1 + loss_2)/2 64 | elif 'contrastive_sim' in opt.pt_tasks: 65 | aug_features_1 = model.transformer(x_categ_enc, x_cont_enc) 66 | aug_features_2 = model.transformer(x_categ_enc_2, x_cont_enc_2) 67 | aug_features_1 = (aug_features_1 / aug_features_1.norm(dim=-1, keepdim=True)).flatten(1,2) 68 | aug_features_2 = (aug_features_2 / aug_features_2.norm(dim=-1, keepdim=True)).flatten(1,2) 69 | aug_features_1 = model.pt_mlp(aug_features_1) 70 | aug_features_2 = model.pt_mlp2(aug_features_2) 71 | c1 = aug_features_1 @ aug_features_2.t() 72 | loss+= opt.lam1*torch.diagonal(-1*c1).add_(1).pow_(2).sum() 73 | if 'denoising' in opt.pt_tasks: 74 | cat_outs, con_outs = model(x_categ_enc_2, x_cont_enc_2) 75 | # if con_outs.shape(-1) != 0: 76 | # import ipdb; ipdb.set_trace() 77 | if len(con_outs) > 0: 78 | con_outs = torch.cat(con_outs,dim=1) 79 | l2 = criterion2(con_outs, x_cont) 80 | else: 81 | l2 = 0 82 | l1 = 0 83 | # import ipdb; ipdb.set_trace() 84 | n_cat = x_categ.shape[-1] 85 | for j in range(1,n_cat): 86 | l1+= criterion1(cat_outs[j],x_categ[:,j]) 87 | loss += opt.lam2*l1 + opt.lam3*l2 88 | loss.backward() 89 
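        # At this point `loss` sums whichever pretraining objectives opt.pt_tasks enabled:
        # the InfoNCE-style contrastive term on the two augmented views (weighted by opt.lam0),
        # or its normalized-similarity variant (opt.lam1), plus the denoising reconstruction
        # terms for categorical (opt.lam2) and continuous (opt.lam3) features. backward() above
        # and the step below update the encoder on that combined objective.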
| optimizer.step() 90 | running_loss += loss.item() 91 | 92 | print(f'Epoch: {epoch}, Running Loss: {running_loss}') 93 | 94 | print('END OF PRETRAINING!') 95 | return model 96 | # if opt.active_log: 97 | # wandb.log({'pt_epoch': epoch ,'pretrain_epoch_loss': running_loss 98 | # }) 99 | -------------------------------------------------------------------------------- /saint/saint_environment.yml: -------------------------------------------------------------------------------- 1 | name: saint_env 2 | channels: 3 | - anaconda 4 | - pytorch 5 | - rwest 6 | - vgauthier 7 | - conda-forge 8 | - defaults 9 | - ostrokach 10 | dependencies: 11 | - _libgcc_mutex=0.1=conda_forge 12 | - _openmp_mutex=4.5=1_gnu 13 | - _py-xgboost-mutex=2.0=cpu_0 14 | - anyio=3.2.1=py38h578d9bd_0 15 | - argh=0.26.2=pyh9f0ad1d_1002 16 | - argon2-cffi=20.1.0=py38h497a2fe_2 17 | - async_generator=1.10=py_0 18 | - attrs=21.2.0=pyhd8ed1ab_0 19 | - babel=2.9.1=pyh44b312d_0 20 | - backcall=0.2.0=pyh9f0ad1d_0 21 | - backports=1.0=py_2 22 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 23 | - blas=1.0=mkl 24 | - bleach=3.3.1=pyhd8ed1ab_0 25 | - brotlipy=0.7.0=py38h497a2fe_1001 26 | - bzip2=1.0.8=h7f98852_4 27 | - ca-certificates=2021.5.30=ha878542_0 28 | - certifi=2021.5.30=py38h578d9bd_0 29 | - cffi=1.14.5=py38ha65f79e_0 30 | - chardet=4.0.0=py38h578d9bd_1 31 | - click=8.0.1=py38h578d9bd_0 32 | - configparser=5.0.2=pyhd8ed1ab_0 33 | - cryptography=3.4.7=py38ha5dfef3_0 34 | - cudatoolkit=11.1.1=h6406543_8 35 | - cycler=0.10.0=py_2 36 | - dbus=1.13.18=hb2f20db_0 37 | - debugpy=1.3.0=py38h709712a_0 38 | - decorator=5.0.9=pyhd8ed1ab_0 39 | - defusedxml=0.7.1=pyhd8ed1ab_0 40 | - docker-pycreds=0.4.0=py_0 41 | - einops=0.3.0=py_0 42 | - entrypoints=0.3=pyhd8ed1ab_1003 43 | - expat=2.4.1=h9c3ff4c_0 44 | - ffmpeg=4.3=hf484d3e_0 45 | - fontconfig=2.13.1=hba837de_1005 46 | - freetype=2.10.4=h0708190_1 47 | - gettext=0.19.8.1=h0b5b191_1005 48 | - gitdb=4.0.7=pyhd8ed1ab_0 49 | - gitpython=3.1.17=pyhd8ed1ab_0 50 | - glib=2.68.3=h9c3ff4c_0 51 | - glib-tools=2.68.3=h9c3ff4c_0 52 | - gmp=6.2.1=h58526e2_0 53 | - gnutls=3.6.13=h85f3911_1 54 | - gql=0.1.0=py_0 55 | - graphql-core=3.1.5=pyhd8ed1ab_0 56 | - gst-plugins-base=1.14.0=hbbd80ab_1 57 | - gstreamer=1.14.0=h28cd5cc_2 58 | - icu=58.2=hf484d3e_1000 59 | - idna=2.10=pyh9f0ad1d_0 60 | - importlib-metadata=4.6.1=py38h578d9bd_0 61 | - intel-openmp=2021.2.0=h06a4308_610 62 | - ipdb=0.13.9=pyhd8ed1ab_0 63 | - ipykernel=6.0.2=py38hd0cf306_0 64 | - ipython=7.25.0=py38hd0cf306_1 65 | - ipython_genutils=0.2.0=py_1 66 | - jedi=0.18.0=py38h578d9bd_2 67 | - jinja2=3.0.1=pyhd8ed1ab_0 68 | - joblib=0.17.0=py_0 69 | - jpeg=9b=h024ee3a_2 70 | - json5=0.9.5=pyh9f0ad1d_0 71 | - jsonschema=3.2.0=pyhd8ed1ab_3 72 | - jupyter_client=6.1.12=pyhd8ed1ab_0 73 | - jupyter_core=4.7.1=py38h578d9bd_0 74 | - jupyter_server=1.9.0=pyhd8ed1ab_0 75 | - jupyterlab=3.0.16=pyhd8ed1ab_0 76 | - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 77 | - jupyterlab_server=2.6.1=pyhd8ed1ab_0 78 | - kiwisolver=1.3.1=py38h1fd1430_1 79 | - lame=3.100=h7f98852_1001 80 | - lcms2=2.12=h3be6417_0 81 | - ld_impl_linux-64=2.35.1=hea4e1c9_2 82 | - liac-arff=2.5.0=pyhd8ed1ab_1 83 | - libffi=3.3=h58526e2_2 84 | - libgcc-ng=9.3.0=h2828fa1_19 85 | - libgfortran-ng=7.3.0=hdf63c60_0 86 | - libglib=2.68.3=h3e27bee_0 87 | - libgomp=9.3.0=h2828fa1_19 88 | - libiconv=1.16=h516909a_0 89 | - libidn2=2.3.1=h7f98852_0 90 | - libpng=1.6.37=h21135ba_2 91 | - libprotobuf=3.17.2=h780b84a_0 92 | - libsodium=1.0.18=h36c2ea0_1 93 | - libstdcxx-ng=9.3.0=h6de172a_19 94 | - 
libtiff=4.2.0=h85742a9_0 95 | - libunistring=0.9.10=h14c3975_0 96 | - libuuid=2.32.1=h7f98852_1000 97 | - libuv=1.41.0=h7f98852_0 98 | - libwebp-base=1.2.0=h7f98852_2 99 | - libxcb=1.13=h7f98852_1003 100 | - libxgboost=1.4.0=h9c3ff4c_0 101 | - libxml2=2.9.12=h03d6c58_0 102 | - lz4-c=1.9.3=h9c3ff4c_0 103 | - markupsafe=2.0.1=py38h497a2fe_0 104 | - matplotlib=3.4.2=py38h578d9bd_0 105 | - matplotlib-base=3.4.2=py38hcc49a3a_0 106 | - matplotlib-inline=0.1.2=pyhd8ed1ab_2 107 | - mistune=0.8.4=py38h497a2fe_1004 108 | - mkl=2021.2.0=h06a4308_296 109 | - mkl-service=2.4.0=py38h497a2fe_0 110 | - mkl_fft=1.3.0=py38h42c9631_2 111 | - mkl_random=1.2.2=py38h1abd341_0 112 | - nbclassic=0.3.1=pyhd8ed1ab_1 113 | - nbclient=0.5.3=pyhd8ed1ab_0 114 | - nbconvert=6.1.0=py38h578d9bd_0 115 | - nbformat=5.1.3=pyhd8ed1ab_0 116 | - ncurses=6.2=h58526e2_4 117 | - nest-asyncio=1.5.1=pyhd8ed1ab_0 118 | - nettle=3.6=he412f7d_0 119 | - ninja=1.10.2=h4bd325d_0 120 | - notebook=6.4.0=pyha770c72_0 121 | - numpy=1.20.2=py38h2d18471_0 122 | - numpy-base=1.20.2=py38hfae3a4d_0 123 | - nvidia-ml=7.352.0=py_0 124 | - olefile=0.46=pyh9f0ad1d_1 125 | - openh264=2.1.1=h780b84a_0 126 | - openml=0.11.0=pyhd8ed1ab_0 127 | - openssl=1.1.1k=h7f98852_0 128 | - packaging=21.0=pyhd8ed1ab_0 129 | - pandas=1.2.4=py38h1abd341_0 130 | - pandoc=2.14.0.3=h7f98852_0 131 | - pandocfilters=1.4.2=py_1 132 | - parso=0.8.2=pyhd8ed1ab_0 133 | - pathtools=0.1.2=py_1 134 | - pcre=8.45=h9c3ff4c_0 135 | - pexpect=4.8.0=pyh9f0ad1d_2 136 | - pickleshare=0.7.5=py_1003 137 | - pillow=8.2.0=py38he98fc37_0 138 | - pip=21.1.2=pyhd8ed1ab_0 139 | - prometheus_client=0.11.0=pyhd8ed1ab_0 140 | - promise=2.3=py38h578d9bd_3 141 | - prompt-toolkit=3.0.19=pyha770c72_0 142 | - protobuf=3.17.2=py38h709712a_0 143 | - psutil=5.8.0=py38h497a2fe_1 144 | - pthread-stubs=0.4=h36c2ea0_1001 145 | - ptyprocess=0.7.0=pyhd3deb0d_0 146 | - py-xgboost=1.4.0=py38h578d9bd_0 147 | - pycparser=2.20=pyh9f0ad1d_2 148 | - pygments=2.9.0=pyhd8ed1ab_0 149 | - pyopenssl=20.0.1=pyhd8ed1ab_0 150 | - pyparsing=2.4.7=pyh9f0ad1d_0 151 | - pyqt=5.9.2=py38h05f1152_4 152 | - pyrsistent=0.17.3=py38h497a2fe_2 153 | - pysocks=1.7.1=py38h578d9bd_3 154 | - python=3.8.10=h49503c6_1_cpython 155 | - python-dateutil=2.8.1=py_0 156 | - python-wget=3.2=py_0 157 | - python_abi=3.8=1_cp38 158 | - pytorch=1.8.1=py3.8_cuda11.1_cudnn8.0.5_0 159 | - pytz=2021.1=pyhd8ed1ab_0 160 | - pyyaml=5.4.1=py38h497a2fe_0 161 | - pyzmq=22.1.0=py38h2035c66_0 162 | - qt=5.9.7=h5867ecd_1 163 | - readline=8.1=h46c0cb4_0 164 | - requests=2.25.1=pyhd3deb0d_0 165 | - requests-unixsocket=0.2.0=py_0 166 | - scikit-learn=0.23.2=py38h0573a6f_0 167 | - scipy=1.6.2=py38had2a1c9_1 168 | - seaborn=0.11.0=py_0 169 | - send2trash=1.7.1=pyhd8ed1ab_0 170 | - sentry-sdk=1.1.0=pyhd8ed1ab_0 171 | - setuptools=49.6.0=py38h578d9bd_3 172 | - shortuuid=1.0.1=py38h578d9bd_4 173 | - sip=4.19.13=py38he6710b0_0 174 | - six=1.16.0=pyh6c4a22f_0 175 | - smmap=3.0.5=pyh44b312d_0 176 | - sniffio=1.2.0=py38h578d9bd_1 177 | - sqlite=3.35.5=h74cdb3f_0 178 | - subprocess32=3.5.4=py_1 179 | - terminado=0.10.1=py38h578d9bd_0 180 | - testpath=0.5.0=pyhd8ed1ab_0 181 | - threadpoolctl=2.1.0=pyh5ca1d4c_0 182 | - tk=8.6.10=h21135ba_1 183 | - torchvision=0.9.1=py38_cu111 184 | - tornado=6.1=py38h497a2fe_1 185 | - traitlets=5.0.5=py_0 186 | - typing_extensions=3.7.4.3=py_0 187 | - urllib3=1.26.5=pyhd8ed1ab_0 188 | - wandb=0.10.31=pyhd8ed1ab_0 189 | - watchdog=0.10.4=py38h578d9bd_0 190 | - wcwidth=0.2.5=pyh9f0ad1d_2 191 | - webencodings=0.5.1=py_1 192 | - 
websocket-client=0.57.0=py38h578d9bd_4 193 | - wget=1.20.1=h22169c7_0 194 | - wheel=0.36.2=pyhd3deb0d_0 195 | - xgboost=1.4.0=py38h578d9bd_0 196 | - xmltodict=0.12.0=py_0 197 | - xorg-libxau=1.0.9=h7f98852_0 198 | - xorg-libxdmcp=1.1.3=h7f98852_0 199 | - xz=5.2.5=h516909a_1 200 | - yaml=0.2.5=h516909a_0 201 | - zeromq=4.3.4=h9c3ff4c_0 202 | - zipp=3.5.0=pyhd8ed1ab_0 203 | - zlib=1.2.11=h516909a_1010 204 | - zstd=1.4.9=ha95c52a_0 -------------------------------------------------------------------------------- /saint/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.metrics import roc_auc_score, mean_squared_error 3 | import numpy as np 4 | from augmentations import embed_data_mask 5 | import torch.nn as nn 6 | 7 | def make_default_mask(x): 8 | mask = np.ones_like(x) 9 | mask[:,-1] = 0 10 | return mask 11 | 12 | def tag_gen(tag,y): 13 | return np.repeat(tag,len(y['data'])) 14 | 15 | 16 | def count_parameters(model): 17 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 18 | 19 | def get_scheduler(args, optimizer): 20 | if args.scheduler == 'cosine': 21 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs) 22 | elif args.scheduler == 'linear': 23 | scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, 24 | milestones=[args.epochs // 2.667, args.epochs // 1.6, args.epochs // 1.142], gamma=0.1) 25 | return scheduler 26 | 27 | def imputations_acc_justy(model,dloader,device): 28 | model.eval() 29 | m = nn.Softmax(dim=1) 30 | y_test = torch.empty(0).to(device) 31 | y_pred = torch.empty(0).to(device) 32 | prob = torch.empty(0).to(device) 33 | with torch.no_grad(): 34 | for i, data in enumerate(dloader, 0): 35 | x_categ, x_cont, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device) 36 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model) 37 | reps = model.transformer(x_categ_enc, x_cont_enc) 38 | y_reps = reps[:,model.num_categories-1,:] 39 | y_outs = model.mlpfory(y_reps) 40 | # import ipdb; ipdb.set_trace() 41 | y_test = torch.cat([y_test,x_categ[:,-1].float()],dim=0) 42 | y_pred = torch.cat([y_pred,torch.argmax(m(y_outs), dim=1).float()],dim=0) 43 | prob = torch.cat([prob,m(y_outs)[:,-1].float()],dim=0) 44 | 45 | correct_results_sum = (y_pred == y_test).sum().float() 46 | acc = correct_results_sum/y_test.shape[0]*100 47 | auc = roc_auc_score(y_score=prob.cpu(), y_true=y_test.cpu()) 48 | return acc, auc 49 | 50 | 51 | def multiclass_acc_justy(model,dloader,device): 52 | model.eval() 53 | vision_dset = True 54 | m = nn.Softmax(dim=1) 55 | y_test = torch.empty(0).to(device) 56 | y_pred = torch.empty(0).to(device) 57 | prob = torch.empty(0).to(device) 58 | with torch.no_grad(): 59 | for i, data in enumerate(dloader, 0): 60 | x_categ, x_cont, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device) 61 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 62 | reps = model.transformer(x_categ_enc, x_cont_enc) 63 | y_reps = reps[:,model.num_categories-1,:] 64 | y_outs = model.mlpfory(y_reps) 65 | # import ipdb; ipdb.set_trace() 66 | y_test = torch.cat([y_test,x_categ[:,-1].float()],dim=0) 67 | y_pred = torch.cat([y_pred,torch.argmax(m(y_outs), dim=1).float()],dim=0) 68 | 69 | correct_results_sum = (y_pred == y_test).sum().float() 70 | acc = correct_results_sum/y_test.shape[0]*100 71 | return acc, 0 72 | 73 | 
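# The two "_justy" helpers above read the label representation from the last categorical token
# (reps[:, model.num_categories - 1, :]), matching the vision variant's forward pass, whereas
# classification_scores and mean_sq_error below read it from the prepended [CLS] token at
# position 0 -- consistent with DataSetCatCon, which concatenates self.cls in front of the
# categorical columns.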
74 | def classification_scores(model, dloader, device, task,vision_dset): 75 | model.eval() 76 | m = nn.Softmax(dim=1) 77 | y_test = torch.empty(0).to(device) 78 | y_pred = torch.empty(0).to(device) 79 | prob = torch.empty(0).to(device) 80 | with torch.no_grad(): 81 | for i, data in enumerate(dloader, 0): 82 | x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device) 83 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 84 | reps = model.transformer(x_categ_enc, x_cont_enc) 85 | y_reps = reps[:,0,:] 86 | y_outs = model.mlpfory(y_reps) 87 | # import ipdb; ipdb.set_trace() 88 | y_test = torch.cat([y_test,y_gts.squeeze().float()],dim=0) 89 | y_pred = torch.cat([y_pred,torch.argmax(y_outs, dim=1).float()],dim=0) 90 | if task == 'binary': 91 | prob = torch.cat([prob,m(y_outs)[:,-1].float()],dim=0) 92 | 93 | correct_results_sum = (y_pred == y_test).sum().float() 94 | acc = correct_results_sum/y_test.shape[0]*100 95 | auc = 0 96 | if task == 'binary': 97 | auc = roc_auc_score(y_score=prob.cpu(), y_true=y_test.cpu()) 98 | return acc.cpu().numpy(), auc 99 | 100 | def mean_sq_error(model, dloader, device, vision_dset): 101 | model.eval() 102 | y_test = torch.empty(0).to(device) 103 | y_pred = torch.empty(0).to(device) 104 | with torch.no_grad(): 105 | for i, data in enumerate(dloader, 0): 106 | x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device) 107 | _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset) 108 | reps = model.transformer(x_categ_enc, x_cont_enc) 109 | y_reps = reps[:,0,:] 110 | y_outs = model.mlpfory(y_reps) 111 | y_test = torch.cat([y_test,y_gts.squeeze().float()],dim=0) 112 | y_pred = torch.cat([y_pred,y_outs],dim=0) 113 | # import ipdb; ipdb.set_trace() 114 | rmse = mean_squared_error(y_test.cpu(), y_pred.cpu(), squared=False) 115 | return rmse 116 | 117 | --------------------------------------------------------------------------------
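The sketch below is a minimal, illustrative wiring of the SAINT pieces listed above (data_openml.py, models/pretrainmodel.py, pretraining.py, utils.py); it is not a reproduction of run_saint.py. The dataset id, hyperparameter values, and the `opt` namespace are assumptions chosen for illustration, and the extra leading category accounts for the [CLS] column that DataSetCatCon prepends. Note also that pretraining.py imports data_openml through a `baselines.` package prefix, which does not match the flat saint/ layout shown here; that prefix would need to be dropped (or the package path adjusted) for the sketch to import.

import argparse
import numpy as np
import torch
from torch.utils.data import DataLoader

from data_openml import data_prep_openml, DataSetCatCon
from models import SAINT
from pretraining import SAINT_pretrain          # assumes its data_openml import has been flattened
from utils import classification_scores

# The mixup helper defaults to .cuda(), so a GPU is effectively assumed during pretraining.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare an OpenML binary-classification dataset (1487 is one of the ids in task_dset_ids).
cat_dims, cat_idxs, con_idxs, X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_std = \
    data_prep_openml(ds_id=1487, seed=0, task='binary')
continuous_mean_std = np.array([train_mean, train_std]).astype(np.float32)

train_ds = DataSetCatCon(X_train, y_train, cat_idxs, task='clf', continuous_mean_std=continuous_mean_std)
test_ds = DataSetCatCon(X_test, y_test, cat_idxs, task='clf', continuous_mean_std=continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=256, shuffle=True)
testloader = DataLoader(test_ds, batch_size=256, shuffle=False)

# DataSetCatCon prepends a [CLS] column to the categorical block, so the model is given one
# extra single-valued categorical feature in front of the real ones.
categories = np.append(np.array([1]), np.array(cat_dims)).astype(int)

model = SAINT(
    categories=tuple(categories),
    num_continuous=len(con_idxs),
    dim=32, depth=6, heads=8,
    attn_dropout=0.1, ff_dropout=0.1,
    cont_embeddings='MLP',
    attentiontype='colrow',
    final_mlp_style='sep',
    y_dim=2,
).to(device)

# Hypothetical option namespace mirroring the fields SAINT_pretrain reads; the values are
# illustrative defaults, not the settings used elsewhere in this repository.
opt = argparse.Namespace(
    batchsize=256, vision_dset=False, dtask='clf', pretrain_epochs=10,
    pt_aug=['cutmix', 'mixup'], pt_aug_lam=0.1, mixup_lam=0.3,
    pt_tasks=['contrastive'], pt_projhead_style='diff',
    nce_temp=0.7, lam0=0.5, lam1=10, lam2=1, lam3=10,
)
model = SAINT_pretrain(model, cat_idxs, X_train, y_train, continuous_mean_std, opt, device)

# After supervised fine-tuning of mlpfory on the labels (handled by run_saint.py, not shown
# here), evaluation reuses the helpers from utils.py.
acc, auc = classification_scores(model, testloader, device, task='binary', vision_dset=False)
print(f'test accuracy: {acc:.2f}, test auroc: {auc:.4f}')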