├── logs └── .gitignore ├── datasets └── .gitignore ├── pics └── NNG-Mix.png ├── baseline ├── DeepSAD │ ├── src │ │ ├── baselines │ │ │ ├── shallow_ssad │ │ │ │ ├── __init__.py │ │ │ │ └── ssad_convex.py │ │ │ ├── __init__.py │ │ │ ├── SemiDGM.py │ │ │ ├── isoforest.py │ │ │ ├── kde.py │ │ │ ├── ocsvm.py │ │ │ └── ssad.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── torchvision_dataset.py │ │ │ ├── base_net.py │ │ │ ├── base_dataset.py │ │ │ ├── base_trainer.py │ │ │ └── odds_dataset.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── visualization │ │ │ │ └── plot_images_grid.py │ │ │ └── misc.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── main.py │ │ │ ├── odds.py │ │ │ ├── preprocessing.py │ │ │ ├── mnist.py │ │ │ ├── cifar10.py │ │ │ └── fmnist.py │ │ ├── optim │ │ │ ├── __init__.py │ │ │ ├── variational.py │ │ │ ├── vae_trainer.py │ │ │ ├── ae_trainer.py │ │ │ ├── DeepSAD_trainer.py │ │ │ └── SemiDGM_trainer.py │ │ ├── networks │ │ │ ├── __init__.py │ │ │ ├── inference │ │ │ │ └── distributions.py │ │ │ ├── main.py │ │ │ ├── layers │ │ │ │ ├── stochastic.py │ │ │ │ └── standard.py │ │ │ ├── mnist_LeNet.py │ │ │ ├── mlp.py │ │ │ ├── fmnist_LeNet.py │ │ │ ├── cifar10_LeNet.py │ │ │ ├── dgm.py │ │ │ └── vae.py │ │ ├── run.py │ │ ├── deepsad.py │ │ ├── baseline_ocsvm.py │ │ ├── baseline_ssad.py │ │ ├── baseline_kde.py │ │ └── baseline_isoforest.py │ ├── imgs │ │ └── fig1.png │ ├── ae_results.json │ ├── requirements.txt │ ├── LICENSE │ └── README.md └── Supervised.py ├── requirement.txt ├── README.md └── myutils.py /logs/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pics/NNG-Mix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donghao51/NNG-Mix/HEAD/pics/NNG-Mix.png -------------------------------------------------------------------------------- /baseline/DeepSAD/src/baselines/shallow_ssad/__init__.py: -------------------------------------------------------------------------------- 1 | from .ssad_convex import ConvexSSAD 2 | -------------------------------------------------------------------------------- /baseline/DeepSAD/imgs/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/donghao51/NNG-Mix/HEAD/baseline/DeepSAD/imgs/fig1.png -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | torch==1.11.0+cu113 2 | torchvision==0.12.0+cu113 3 | numpy==1.23.5 4 | pandas==1.4.2 5 | scipy==1.10.1 6 | copulas 7 | scikit-learn 8 | -------------------------------------------------------------------------------- /baseline/DeepSAD/ae_results.json: -------------------------------------------------------------------------------- 1 | {"train_time": 49.85617208480835, "test_aucroc": 0.45984687500000004, "test_aucpr": 0.13354349144694128, "test_time": 0.12167739868164062} -------------------------------------------------------------------------------- /baseline/DeepSAD/src/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import * 2 | 
from .torchvision_dataset import * 3 | from .odds_dataset import * 4 | from .base_net import * 5 | from .base_trainer import * 6 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | from .visualization.plot_images_grid import plot_images_grid 3 | from .misc import enumerate_discrete, log_sum_exp, binary_cross_entropy 4 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from .SemiDGM import SemiDeepGenerativeModel 2 | from .ocsvm import OCSVM 3 | from .kde import KDE 4 | from .isoforest import IsoForest 5 | from .ssad import SSAD 6 | from .shallow_ssad.ssad_convex import ConvexSSAD 7 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import load_dataset 2 | # from .mnist import MNIST_Dataset 3 | # from .fmnist import FashionMNIST_Dataset 4 | # from .cifar10 import CIFAR10_Dataset 5 | from .odds import ODDSADDataset 6 | from .preprocessing import * 7 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .DeepSAD_trainer import DeepSADTrainer 2 | from .ae_trainer import AETrainer 3 | from .SemiDGM_trainer import SemiDeepGenerativeTrainer 4 | from .vae_trainer import VAETrainer 5 | from .variational import SVI, ImportanceWeightedSampler 6 | -------------------------------------------------------------------------------- /baseline/DeepSAD/requirements.txt: -------------------------------------------------------------------------------- 1 | Click==7.0 2 | cvxopt==1.2.3 3 | cycler==0.10.0 4 | joblib==0.13.2 5 | kiwisolver==1.1.0 6 | matplotlib==3.1.0 7 | numpy==1.16.4 8 | pandas==0.24.2 9 | Pillow==6.0.0 10 | pyparsing==2.4.0 11 | python-dateutil==2.8.0 12 | pytz==2019.1 13 | scikit-learn==0.21.2 14 | scipy==1.3.0 15 | seaborn==0.9.0 16 | six==1.12.0 17 | torch==1.1.0 18 | torchvision==0.3.0 19 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/datasets/main.py: -------------------------------------------------------------------------------- 1 | # from .mnist import MNIST_Dataset 2 | # from .fmnist import FashionMNIST_Dataset 3 | # from .cifar10 import CIFAR10_Dataset 4 | from .odds import ODDSADDataset 5 | 6 | 7 | def load_dataset(data, train=True): 8 | """Loads the dataset.""" 9 | 10 | # for tabular data 11 | dataset = ODDSADDataset(data=data, train=train) 12 | 13 | return dataset 14 | -------------------------------------------------------------------------------- /baseline/Supervised.py: -------------------------------------------------------------------------------- 1 | from sklearn.neural_network import MLPClassifier 2 | 3 | from myutils import Utils 4 | 5 | class supervised(): 6 | def __init__(self, seed:int, model_name:str=None): 7 | self.seed = seed 8 | self.utils = Utils() 9 | 10 | self.model_name = model_name 11 | self.model_dict = {'MLP':MLPClassifier} 12 | 13 | def fit(self, X_train, y_train, ratio=None): 14 | self.model = 
self.model_dict[self.model_name](random_state=self.seed)
15 |
16 |         # fitting
17 |         self.model.fit(X_train, y_train)
18 |
19 |         return self
20 |
21 |     def predict_score(self, X):
22 |         score = self.model.predict_proba(X)[:, 1]
23 |         return score
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import build_network, build_autoencoder
2 | # from .mnist_LeNet import MNIST_LeNet, MNIST_LeNet_Decoder, MNIST_LeNet_Autoencoder
3 | # from .fmnist_LeNet import FashionMNIST_LeNet, FashionMNIST_LeNet_Decoder, FashionMNIST_LeNet_Autoencoder
4 | # from .cifar10_LeNet import CIFAR10_LeNet, CIFAR10_LeNet_Decoder, CIFAR10_LeNet_Autoencoder
5 | from .mlp import MLP, MLP_Decoder, MLP_Autoencoder
6 | from .layers.stochastic import GaussianSample
7 | from .layers.standard import Standardize
8 | from .inference.distributions import log_standard_gaussian, log_gaussian, log_standard_categorical
9 | from .vae import VariationalAutoencoder, Encoder, Decoder
10 | from .dgm import DeepGenerativeModel, StackedDeepGenerativeModel
11 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/utils/config.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | class Config(object):
5 |     """Base class for experimental setting/configuration."""
6 |
7 |     def __init__(self, settings):
8 |         self.settings = settings
9 |
10 |     def load_config(self, import_json):
11 |         """Load settings dict from import_json (path/filename.json) JSON-file."""
12 |
13 |         with open(import_json, 'r') as fp:
14 |             settings = json.load(fp)
15 |
16 |         for key, value in settings.items():
17 |             self.settings[key] = value
18 |
19 |     def save_config(self, export_json):
20 |         """Save settings dict to export_json (path/filename.json) JSON-file."""
21 |
22 |         with open(export_json, 'w') as fp:
23 |             json.dump(self.settings, fp)
24 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/torchvision_dataset.py:
--------------------------------------------------------------------------------
1 | from .base_dataset import BaseADDataset
2 | from torch.utils.data import DataLoader
3 |
4 |
5 | class TorchvisionDataset(BaseADDataset):
6 |     """TorchvisionDataset class for datasets already implemented in torchvision.datasets."""
7 |
8 |     def __init__(self, root: str):
9 |         super().__init__(root)
10 |
11 |     def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> (
12 |             DataLoader, DataLoader):
13 |         train_loader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=shuffle_train,
14 |                                   num_workers=num_workers, drop_last=True)
15 |         test_loader = DataLoader(dataset=self.test_set, batch_size=batch_size, shuffle=shuffle_test,
16 |                                  num_workers=num_workers, drop_last=False)
17 |         return train_loader, test_loader
18 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/base_net.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch.nn as nn
3 | import numpy as np
4 |
5 |
6 | class BaseNet(nn.Module):
7 |     """Base class for all neural networks."""
8 |
9 |     def __init__(self):
10 |         super().__init__()
11 |         self.logger = logging.getLogger(self.__class__.__name__)
12 |         self.rep_dim = None  #
representation dimensionality, i.e. dim of the code layer or last layer 13 | 14 | def forward(self, *input): 15 | """ 16 | Forward pass logic 17 | :return: Network output 18 | """ 19 | raise NotImplementedError 20 | 21 | def summary(self): 22 | """Network summary.""" 23 | net_parameters = filter(lambda p: p.requires_grad, self.parameters()) 24 | params = sum([np.prod(p.size()) for p in net_parameters]) 25 | self.logger.info('Trainable parameters: {}'.format(params)) 26 | self.logger.info(self) 27 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/utils/visualization/plot_images_grid.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import matplotlib 3 | matplotlib.use('Agg') # or 'PS', 'PDF', 'SVG' 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from torchvision.utils import make_grid 8 | 9 | 10 | def plot_images_grid(x: torch.tensor, export_img, title: str = '', nrow=8, padding=2, normalize=False, pad_value=0): 11 | """Plot 4D Tensor of images of shape (B x C x H x W) as a grid.""" 12 | 13 | grid = make_grid(x, nrow=nrow, padding=padding, normalize=normalize, pad_value=pad_value) 14 | npgrid = grid.cpu().numpy() 15 | 16 | plt.imshow(np.transpose(npgrid, (1, 2, 0)), interpolation='nearest') 17 | 18 | ax = plt.gca() 19 | ax.xaxis.set_visible(False) 20 | ax.yaxis.set_visible(False) 21 | 22 | if not (title == ''): 23 | plt.title(title) 24 | 25 | plt.savefig(export_img, bbox_inches='tight', pad_inches=0.1) 26 | plt.clf() 27 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/base/base_dataset.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from torch.utils.data import DataLoader 3 | 4 | 5 | class BaseADDataset(ABC): 6 | """Anomaly detection dataset base class.""" 7 | 8 | def __init__(self, root: str): 9 | super().__init__() 10 | self.root = root # root path to data 11 | 12 | self.n_classes = 2 # 0: normal, 1: outlier 13 | self.normal_classes = None # tuple with original class labels that define the normal class 14 | self.outlier_classes = None # tuple with original class labels that define the outlier class 15 | 16 | self.train_set = None # must be of type torch.utils.data.Dataset 17 | self.test_set = None # must be of type torch.utils.data.Dataset 18 | 19 | @abstractmethod 20 | def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> ( 21 | DataLoader, DataLoader): 22 | """Implement data loaders of type torch.utils.data.DataLoader for train_set and test_set.""" 23 | pass 24 | 25 | def __repr__(self): 26 | return self.__class__.__name__ 27 | -------------------------------------------------------------------------------- /baseline/DeepSAD/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 lukasruff 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission 
notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/base/base_trainer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from .base_dataset import BaseADDataset 3 | from .base_net import BaseNet 4 | 5 | 6 | class BaseTrainer(ABC): 7 | """Trainer base class.""" 8 | 9 | def __init__(self, optimizer_name: str, lr: float, n_epochs: int, lr_milestones: tuple, batch_size: int, 10 | weight_decay: float, device: str, n_jobs_dataloader: int): 11 | super().__init__() 12 | self.optimizer_name = optimizer_name 13 | self.lr = lr 14 | self.n_epochs = n_epochs 15 | self.lr_milestones = lr_milestones 16 | self.batch_size = batch_size 17 | self.weight_decay = weight_decay 18 | self.device = device 19 | self.n_jobs_dataloader = n_jobs_dataloader 20 | 21 | @abstractmethod 22 | def train(self, dataset: BaseADDataset, net: BaseNet) -> BaseNet: 23 | """ 24 | Implement train method that trains the given network using the train_set of dataset. 25 | :return: Trained net 26 | """ 27 | pass 28 | 29 | @abstractmethod 30 | def test(self, dataset: BaseADDataset, net: BaseNet): 31 | """ 32 | Implement test method that evaluates the test_set of dataset on the given network. 33 | """ 34 | pass 35 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/networks/inference/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch 7 | def log_standard_gaussian(x): 8 | """ 9 | Evaluates the log pdf of a standard normal distribution at x. 10 | 11 | :param x: point to evaluate 12 | :return: log N(x|0,I) 13 | """ 14 | return torch.sum(-0.5 * math.log(2 * math.pi) - x ** 2 / 2, dim=-1) 15 | 16 | 17 | def log_gaussian(x, mu, log_var): 18 | """ 19 | Evaluates the log pdf of a normal distribution parametrized by mu and log_var at x. 20 | 21 | :param x: point to evaluate 22 | :param mu: mean 23 | :param log_var: log variance 24 | :return: log N(x|µ,σI) 25 | """ 26 | log_pdf = -0.5 * math.log(2 * math.pi) - log_var / 2 - (x - mu)**2 / (2 * torch.exp(log_var)) 27 | return torch.sum(log_pdf, dim=-1) 28 | 29 | 30 | def log_standard_categorical(p): 31 | """ 32 | Computes the cross-entropy between a (one-hot) categorical vector and a standard (uniform) categorical distribution. 
33 |     :param p: one-hot categorical distribution
34 |     :return: H(p,u)
35 |     """
36 |     eps = 1e-8
37 |     prior = F.softmax(torch.ones_like(p), dim=1)  # Uniform prior over y
38 |     prior.requires_grad = False
39 |     cross_entropy = -torch.sum(p * torch.log(prior + eps), dim=1)
40 |
41 |     return cross_entropy
42 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/main.py:
--------------------------------------------------------------------------------
1 | # from .mnist_LeNet import MNIST_LeNet, MNIST_LeNet_Autoencoder
2 | # from .fmnist_LeNet import FashionMNIST_LeNet, FashionMNIST_LeNet_Autoencoder
3 | # from .cifar10_LeNet import CIFAR10_LeNet, CIFAR10_LeNet_Autoencoder
4 | from .mlp import MLP, MLP_Autoencoder
5 | from .vae import VariationalAutoencoder
6 | from .dgm import DeepGenerativeModel, StackedDeepGenerativeModel
7 |
8 |
9 | # Note: this differs from the original implementation,
10 | # where each dataset has its own network architecture (which is weird).
11 | # Note: bias must be set to False, otherwise DeepSAD may suffer from mode collapse (also mentioned in the original paper).
12 | def build_network(net_name, input_size, ae_net=None):
13 |     """Builds the neural network."""
14 |     net = None
15 |
16 |     if net_name == 'mnist_LeNet':
17 |         net = MNIST_LeNet()
18 |
19 |     elif net_name == 'fmnist_LeNet':
20 |         net = FashionMNIST_LeNet()
21 |
22 |     elif net_name == 'cifar10_LeNet':
23 |         net = CIFAR10_LeNet()
24 |
25 |     else:
26 |         net = MLP(x_dim=input_size, h_dims=[100, 20], rep_dim=10, bias=False)
27 |
28 |     return net
29 |
30 | def build_autoencoder(net_name, input_size):
31 |     """Builds the corresponding autoencoder network."""
32 |     ae_net = None
33 |
34 |     if net_name == 'mnist_LeNet':
35 |         ae_net = MNIST_LeNet_Autoencoder()
36 |
37 |     elif net_name == 'fmnist_LeNet':
38 |         ae_net = FashionMNIST_LeNet_Autoencoder()
39 |
40 |     elif net_name == 'cifar10_LeNet':
41 |         ae_net = CIFAR10_LeNet_Autoencoder()
42 |
43 |     else:
44 |         ae_net = MLP_Autoencoder(x_dim=input_size, h_dims=[100, 20], rep_dim=10, bias=False)
45 |
46 |     return ae_net
47 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/odds.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import DataLoader, Subset
2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset
3 | from baseline.DeepSAD.src.base.odds_dataset import ODDSDataset
4 | from .preprocessing import create_semisupervised_setting
5 |
6 | import torch
7 |
8 |
9 | class ODDSADDataset(BaseADDataset):
10 |
11 |     def __init__(self, data, train):
12 |         super().__init__(self)
13 |
14 |         # Define normal and outlier classes
15 |         self.n_classes = 2  # 0: normal, 1: outlier
16 |         self.normal_classes = (0,)
17 |         self.outlier_classes = (1,)
18 |
19 |         # training or testing dataset
20 |         self.train = train
21 |
22 |         if self.train:
23 |             # Get training set
24 |             self.train_set = ODDSDataset(data=data, train=True)
25 |         else:
26 |             # Get testing set
27 |             self.test_set = ODDSDataset(data=data, train=False)
28 |
29 |     def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> (
30 |             DataLoader, DataLoader):
31 |
32 |         if self.train:
33 |             train_loader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=shuffle_train,
34 |                                       num_workers=num_workers, drop_last=True)
35 |             return train_loader
36 |         else:
37 |             test_loader = DataLoader(dataset=self.test_set, batch_size=batch_size, shuffle=shuffle_test,
38 |                                      num_workers=num_workers, drop_last=False)
39 |             return test_loader
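
For reference, a minimal usage sketch of this tabular pipeline (illustrative only; the arrays below are random placeholders, but the dict keys match what `ODDSDataset` expects):

```python
import numpy as np
from baseline.DeepSAD.src.datasets.main import load_dataset

# Toy data dict; in the real pipeline these arrays come from the ADBench datasets.
data = {
    'X_train': np.random.randn(256, 6).astype(np.float32),
    'y_train': np.zeros(256, dtype=np.int64),   # 0: normal/unlabeled, 1: labeled anomaly
    'X_test': np.random.randn(64, 6).astype(np.float32),
    'y_test': np.random.randint(0, 2, 64),
}

train_set = load_dataset(data, train=True)       # builds an ODDSADDataset
train_loader = train_set.loaders(batch_size=32)  # a single DataLoader when train=True

for sample, target, semi_target, index in train_loader:
    break  # each batch is the 4-tuple defined by ODDSDataset.__getitem__
```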
-------------------------------------------------------------------------------- /baseline/DeepSAD/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.autograd import Variable 4 | 5 | 6 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch 7 | def enumerate_discrete(x, y_dim): 8 | """ 9 | Generates a 'torch.Tensor' of size batch_size x n_labels of the given label. 10 | 11 | :param x: tensor with batch size to mimic 12 | :param y_dim: number of total labels 13 | :return variable 14 | """ 15 | 16 | def batch(batch_size, label): 17 | labels = (torch.ones(batch_size, 1) * label).type(torch.LongTensor) 18 | y = torch.zeros((batch_size, y_dim)) 19 | y.scatter_(1, labels, 1) 20 | return y.type(torch.LongTensor) 21 | 22 | batch_size = x.size(0) 23 | generated = torch.cat([batch(batch_size, i) for i in range(y_dim)]) 24 | 25 | if x.is_cuda: 26 | generated = generated.to(x.device) 27 | 28 | return Variable(generated.float()) 29 | 30 | 31 | def log_sum_exp(tensor, dim=-1, sum_op=torch.sum): 32 | """ 33 | Uses the LogSumExp (LSE) as an approximation for the sum in a log-domain. 34 | 35 | :param tensor: Tensor to compute LSE over 36 | :param dim: dimension to perform operation over 37 | :param sum_op: reductive operation to be applied, e.g. torch.sum or torch.mean 38 | :return: LSE 39 | """ 40 | max, _ = torch.max(tensor, dim=dim, keepdim=True) 41 | return torch.log(sum_op(torch.exp(tensor - max), dim=dim, keepdim=True) + 1e-8) + max 42 | 43 | 44 | def binary_cross_entropy(x, y): 45 | eps = 1e-8 46 | return -torch.sum(y * torch.log(x + eps) + (1 - y) * torch.log(1 - x + eps), dim=-1) 47 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/networks/layers/stochastic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from torch.autograd import Variable 6 | 7 | 8 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch 9 | class Stochastic(nn.Module): 10 | """ 11 | Base stochastic layer that uses the reparametrization trick (Kingma and Welling, 2013) to draw a sample from a 12 | distribution parametrized by mu and log_var. 13 | """ 14 | 15 | def __init__(self): 16 | super(Stochastic, self).__init__() 17 | 18 | def reparametrize(self, mu, log_var): 19 | epsilon = Variable(torch.randn(mu.size()), requires_grad=False) 20 | 21 | if mu.is_cuda: 22 | epsilon = epsilon.to(mu.device) 23 | 24 | # log_std = 0.5 * log_var 25 | # std = exp(log_std) 26 | std = log_var.mul(0.5).exp_() 27 | 28 | # z = std * epsilon + mu 29 | z = mu.addcmul(std, epsilon) 30 | 31 | return z 32 | 33 | def forward(self, x): 34 | raise NotImplementedError 35 | 36 | 37 | class GaussianSample(Stochastic): 38 | """ 39 | Layer that represents a sample from a Gaussian distribution. 
40 |     """
41 |
42 |     def __init__(self, in_features, out_features):
43 |         super(GaussianSample, self).__init__()
44 |         self.in_features = in_features
45 |         self.out_features = out_features
46 |
47 |         self.mu = nn.Linear(in_features, out_features)
48 |         self.log_var = nn.Linear(in_features, out_features)
49 |
50 |     def forward(self, x):
51 |         mu = self.mu(x)
52 |         log_var = F.softplus(self.log_var(x))
53 |         return self.reparametrize(mu, log_var), mu, log_var
54 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/odds_dataset.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from torch.utils.data import Dataset
3 | from scipy.io import loadmat
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import StandardScaler, MinMaxScaler
6 | from torchvision.datasets.utils import download_url
7 |
8 | import os
9 | import torch
10 | import pandas as pd
11 | import numpy as np
12 |
13 |
14 |
15 | class ODDSDataset(Dataset):
16 |     """
17 |     ODDSDataset class for datasets from Outlier Detection DataSets (ODDS): http://odds.cs.stonybrook.edu/
18 |
19 |     Dataset class with additional targets for the semi-supervised setting and modification of __getitem__ method
20 |     to also return the semi-supervised target as well as the index of a data sample.
21 |     """
22 |
23 |     def __init__(self, data, train=True):
24 |         super(Dataset, self).__init__()
25 |         self.train = train
26 |
27 |         if self.train:
28 |             self.data = torch.tensor(data['X_train'], dtype=torch.float32)
29 |             self.targets = torch.tensor(data['y_train'], dtype=torch.int64)
30 |         else:
31 |             self.data = torch.tensor(data['X_test'], dtype=torch.float32)
32 |             self.targets = torch.tensor(data['y_test'], dtype=torch.int64)
33 |
34 |         # self.semi_targets = torch.zeros_like(self.targets)
35 |         self.semi_targets = self.targets
36 |
37 |     def __getitem__(self, index):
38 |         """
39 |         Args:
40 |             index (int): Index
41 |
42 |         Returns:
43 |             tuple: (sample, target, semi_target, index)
44 |         """
45 |         sample, target, semi_target = self.data[index], int(self.targets[index]), int(self.semi_targets[index])
46 |
47 |         return sample, target, semi_target, index
48 |
49 |     def __len__(self):
50 |         return len(self.data)
51 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/layers/standard.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from torch.nn import Module
4 | from torch.nn import init
5 | from torch.nn.parameter import Parameter
6 |
7 |
8 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch
9 | class Standardize(Module):
10 |     """
11 |     Applies (element-wise) standardization with trainable translation parameter μ and scale parameter σ, i.e. computes
12 |     (x - μ) / σ where '/' is applied element-wise.
13 |
14 |     Args:
15 |         in_features: size of each input sample
16 |         out_features: size of each output sample
17 |         bias: If set to False, the layer will not learn a translation parameter μ.
18 |             Default: ``True``
19 |
20 |     Attributes:
21 |         mu: the learnable translation parameter μ.
22 |         std: the learnable scale parameter σ.
23 | """ 24 | __constants__ = ['mu'] 25 | 26 | def __init__(self, in_features, bias=True, eps=1e-6): 27 | super(Standardize, self).__init__() 28 | self.in_features = in_features 29 | self.out_features = in_features 30 | self.eps = eps 31 | self.std = Parameter(torch.Tensor(in_features)) 32 | if bias: 33 | self.mu = Parameter(torch.Tensor(in_features)) 34 | else: 35 | self.register_parameter('mu', None) 36 | self.reset_parameters() 37 | 38 | def reset_parameters(self): 39 | init.constant_(self.std, 1) 40 | if self.mu is not None: 41 | init.constant_(self.mu, 0) 42 | 43 | def forward(self, x): 44 | if self.mu is not None: 45 | x -= self.mu 46 | x = torch.div(x, self.std + self.eps) 47 | return x 48 | 49 | def extra_repr(self): 50 | return 'in_features={}, out_features={}, bias={}'.format( 51 | self.in_features, self.out_features, self.mu is not None 52 | ) 53 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/networks/mnist_LeNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class MNIST_LeNet(BaseNet): 9 | 10 | def __init__(self, rep_dim=32): 11 | super().__init__() 12 | 13 | self.rep_dim = rep_dim 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(1, 8, 5, bias=False, padding=2) 17 | self.bn1 = nn.BatchNorm2d(8, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(8, 4, 5, bias=False, padding=2) 19 | self.bn2 = nn.BatchNorm2d(4, eps=1e-04, affine=False) 20 | self.fc1 = nn.Linear(4 * 7 * 7, self.rep_dim, bias=False) 21 | 22 | def forward(self, x): 23 | x = x.view(-1, 1, 28, 28) 24 | x = self.conv1(x) 25 | x = self.pool(F.leaky_relu(self.bn1(x))) 26 | x = self.conv2(x) 27 | x = self.pool(F.leaky_relu(self.bn2(x))) 28 | x = x.view(int(x.size(0)), -1) 29 | x = self.fc1(x) 30 | return x 31 | 32 | 33 | class MNIST_LeNet_Decoder(BaseNet): 34 | 35 | def __init__(self, rep_dim=32): 36 | super().__init__() 37 | 38 | self.rep_dim = rep_dim 39 | 40 | # Decoder network 41 | self.deconv1 = nn.ConvTranspose2d(2, 4, 5, bias=False, padding=2) 42 | self.bn3 = nn.BatchNorm2d(4, eps=1e-04, affine=False) 43 | self.deconv2 = nn.ConvTranspose2d(4, 8, 5, bias=False, padding=3) 44 | self.bn4 = nn.BatchNorm2d(8, eps=1e-04, affine=False) 45 | self.deconv3 = nn.ConvTranspose2d(8, 1, 5, bias=False, padding=2) 46 | 47 | def forward(self, x): 48 | x = x.view(int(x.size(0)), int(self.rep_dim / 16), 4, 4) 49 | x = F.interpolate(F.leaky_relu(x), scale_factor=2) 50 | x = self.deconv1(x) 51 | x = F.interpolate(F.leaky_relu(self.bn3(x)), scale_factor=2) 52 | x = self.deconv2(x) 53 | x = F.interpolate(F.leaky_relu(self.bn4(x)), scale_factor=2) 54 | x = self.deconv3(x) 55 | x = torch.sigmoid(x) 56 | return x 57 | 58 | 59 | class MNIST_LeNet_Autoencoder(BaseNet): 60 | 61 | def __init__(self, rep_dim=32): 62 | super().__init__() 63 | 64 | self.rep_dim = rep_dim 65 | self.encoder = MNIST_LeNet(rep_dim=rep_dim) 66 | self.decoder = MNIST_LeNet_Decoder(rep_dim=rep_dim) 67 | 68 | def forward(self, x): 69 | x = self.encoder(x) 70 | x = self.decoder(x) 71 | return x 72 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/networks/mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | from baseline.DeepSAD.src.base.base_net import BaseNet 5 | 6 | 
7 | class MLP(BaseNet): 8 | 9 | def __init__(self, x_dim, h_dims=[128, 64], rep_dim=32, bias=False): 10 | super().__init__() 11 | 12 | self.rep_dim = rep_dim 13 | 14 | neurons = [x_dim, *h_dims] 15 | layers = [Linear_BN_leakyReLU(neurons[i - 1], neurons[i], bias=bias) for i in range(1, len(neurons))] 16 | 17 | self.hidden = nn.ModuleList(layers) 18 | self.code = nn.Linear(h_dims[-1], rep_dim, bias=bias) 19 | 20 | def forward(self, x): 21 | x = x.view(int(x.size(0)), -1) 22 | for layer in self.hidden: 23 | x = layer(x) 24 | return self.code(x) 25 | 26 | 27 | class MLP_Decoder(BaseNet): 28 | 29 | def __init__(self, x_dim, h_dims=[64, 128], rep_dim=32, bias=False): 30 | super().__init__() 31 | 32 | self.rep_dim = rep_dim 33 | 34 | neurons = [rep_dim, *h_dims] 35 | layers = [Linear_BN_leakyReLU(neurons[i - 1], neurons[i], bias=bias) for i in range(1, len(neurons))] 36 | 37 | self.hidden = nn.ModuleList(layers) 38 | self.reconstruction = nn.Linear(h_dims[-1], x_dim, bias=bias) 39 | self.output_activation = nn.Sigmoid() 40 | 41 | def forward(self, x): 42 | x = x.view(int(x.size(0)), -1) 43 | for layer in self.hidden: 44 | x = layer(x) 45 | x = self.reconstruction(x) 46 | return self.output_activation(x) 47 | 48 | 49 | class MLP_Autoencoder(BaseNet): 50 | 51 | def __init__(self, x_dim, h_dims=[128, 64], rep_dim=32, bias=False): 52 | super().__init__() 53 | 54 | self.rep_dim = rep_dim 55 | self.encoder = MLP(x_dim, h_dims, rep_dim, bias) 56 | self.decoder = MLP_Decoder(x_dim, list(reversed(h_dims)), rep_dim, bias) 57 | 58 | def forward(self, x): 59 | x = self.encoder(x) 60 | x = self.decoder(x) 61 | return x 62 | 63 | 64 | class Linear_BN_leakyReLU(nn.Module): 65 | """ 66 | A nn.Module that consists of a Linear layer followed by BatchNorm1d and a leaky ReLu activation 67 | """ 68 | 69 | def __init__(self, in_features, out_features, bias=False, eps=1e-04): 70 | super(Linear_BN_leakyReLU, self).__init__() 71 | 72 | self.linear = nn.Linear(in_features, out_features, bias=bias) 73 | self.bn = nn.BatchNorm1d(out_features, eps=eps, affine=bias) 74 | 75 | def forward(self, x): 76 | return F.leaky_relu(self.bn(self.linear(x))) 77 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/networks/fmnist_LeNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class FashionMNIST_LeNet(BaseNet): 9 | 10 | def __init__(self, rep_dim=64): 11 | super().__init__() 12 | 13 | self.rep_dim = rep_dim 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(1, 16, 5, bias=False, padding=2) 17 | self.bn2d1 = nn.BatchNorm2d(16, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(16, 32, 5, bias=False, padding=2) 19 | self.bn2d2 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 20 | self.fc1 = nn.Linear(32 * 7 * 7, 128, bias=False) 21 | self.bn1d1 = nn.BatchNorm1d(128, eps=1e-04, affine=False) 22 | self.fc2 = nn.Linear(128, self.rep_dim, bias=False) 23 | 24 | def forward(self, x): 25 | x = x.view(-1, 1, 28, 28) 26 | x = self.conv1(x) 27 | x = self.pool(F.leaky_relu(self.bn2d1(x))) 28 | x = self.conv2(x) 29 | x = self.pool(F.leaky_relu(self.bn2d2(x))) 30 | x = x.view(int(x.size(0)), -1) 31 | x = F.leaky_relu(self.bn1d1(self.fc1(x))) 32 | x = self.fc2(x) 33 | return x 34 | 35 | 36 | class FashionMNIST_LeNet_Decoder(BaseNet): 37 | 38 | def __init__(self, rep_dim=64): 39 | super().__init__() 40 | 41 | 
self.rep_dim = rep_dim 42 | 43 | self.fc3 = nn.Linear(self.rep_dim, 128, bias=False) 44 | self.bn1d2 = nn.BatchNorm1d(128, eps=1e-04, affine=False) 45 | self.deconv1 = nn.ConvTranspose2d(8, 32, 5, bias=False, padding=2) 46 | self.bn2d3 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 47 | self.deconv2 = nn.ConvTranspose2d(32, 16, 5, bias=False, padding=3) 48 | self.bn2d4 = nn.BatchNorm2d(16, eps=1e-04, affine=False) 49 | self.deconv3 = nn.ConvTranspose2d(16, 1, 5, bias=False, padding=2) 50 | 51 | def forward(self, x): 52 | x = self.bn1d2(self.fc3(x)) 53 | x = x.view(int(x.size(0)), int(128 / 16), 4, 4) 54 | x = F.interpolate(F.leaky_relu(x), scale_factor=2) 55 | x = self.deconv1(x) 56 | x = F.interpolate(F.leaky_relu(self.bn2d3(x)), scale_factor=2) 57 | x = self.deconv2(x) 58 | x = F.interpolate(F.leaky_relu(self.bn2d4(x)), scale_factor=2) 59 | x = self.deconv3(x) 60 | x = torch.sigmoid(x) 61 | return x 62 | 63 | 64 | class FashionMNIST_LeNet_Autoencoder(BaseNet): 65 | 66 | def __init__(self, rep_dim=64): 67 | super().__init__() 68 | 69 | self.rep_dim = rep_dim 70 | self.encoder = FashionMNIST_LeNet(rep_dim=rep_dim) 71 | self.decoder = FashionMNIST_LeNet_Decoder(rep_dim=rep_dim) 72 | 73 | def forward(self, x): 74 | x = self.encoder(x) 75 | x = self.decoder(x) 76 | return x 77 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/optim/variational.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | from torch import nn 5 | from itertools import repeat 6 | from baseline.DeepSAD.src.utils import enumerate_discrete, log_sum_exp 7 | from baseline.DeepSAD.src.networks import log_standard_categorical 8 | 9 | 10 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch 11 | class ImportanceWeightedSampler(object): 12 | """ 13 | Importance weighted sampler (Burda et al., 2015) to be used together with SVI. 14 | 15 | :param mc: number of Monte Carlo samples 16 | :param iw: number of Importance Weighted samples 17 | """ 18 | 19 | def __init__(self, mc=1, iw=1): 20 | self.mc = mc 21 | self.iw = iw 22 | 23 | def resample(self, x): 24 | return x.repeat(self.mc * self.iw, 1) 25 | 26 | def __call__(self, elbo): 27 | elbo = elbo.view(self.mc, self.iw, -1) 28 | elbo = torch.mean(log_sum_exp(elbo, dim=1, sum_op=torch.mean), dim=0) 29 | return elbo.view(-1) 30 | 31 | 32 | class SVI(nn.Module): 33 | """ 34 | Stochastic variational inference (SVI) optimizer for semi-supervised learning. 35 | 36 | :param model: semi-supervised model to evaluate 37 | :param likelihood: p(x|y,z) for example BCE or MSE 38 | :param beta: warm-up/scaling of KL-term 39 | :param sampler: sampler for x and y, e.g. 
for Monte Carlo 40 | """ 41 | 42 | base_sampler = ImportanceWeightedSampler(mc=1, iw=1) 43 | 44 | def __init__(self, model, likelihood=F.binary_cross_entropy, beta=repeat(1), sampler=base_sampler): 45 | super(SVI, self).__init__() 46 | self.model = model 47 | self.likelihood = likelihood 48 | self.sampler = sampler 49 | self.beta = beta 50 | 51 | def forward(self, x, y=None): 52 | is_labeled = False if y is None else True 53 | 54 | # Prepare for sampling 55 | xs, ys = (x, y) 56 | 57 | # Enumerate choices of label 58 | if not is_labeled: 59 | ys = enumerate_discrete(xs, self.model.y_dim) 60 | xs = xs.repeat(self.model.y_dim, 1) 61 | 62 | # Increase sampling dimension 63 | xs = self.sampler.resample(xs) 64 | ys = self.sampler.resample(ys) 65 | 66 | reconstruction = self.model(xs, ys) 67 | 68 | # p(x|y,z) 69 | likelihood = -self.likelihood(reconstruction, xs) 70 | 71 | # p(y) 72 | prior = -log_standard_categorical(ys) 73 | 74 | # Equivalent to -L(x, y) 75 | elbo = likelihood + prior - next(self.beta) * self.model.kl_divergence 76 | L = self.sampler(elbo) 77 | 78 | if is_labeled: 79 | return torch.mean(L) 80 | 81 | logits = self.model.classify(x) 82 | 83 | L = L.view_as(logits.t()).t() 84 | 85 | # Calculate entropy H(q(y|x)) and sum over all labels 86 | eps = 1e-8 87 | H = -torch.sum(torch.mul(logits, torch.log(logits + eps)), dim=-1) 88 | L = torch.sum(torch.mul(logits, L), dim=-1) 89 | 90 | # Equivalent to -U(x) 91 | U = L + H 92 | 93 | return torch.mean(U) 94 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/networks/cifar10_LeNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class CIFAR10_LeNet(BaseNet): 9 | 10 | def __init__(self, rep_dim=128): 11 | super().__init__() 12 | 13 | self.rep_dim = rep_dim 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 17 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 19 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 20 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 21 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 22 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 23 | 24 | def forward(self, x): 25 | x = x.view(-1, 3, 32, 32) 26 | x = self.conv1(x) 27 | x = self.pool(F.leaky_relu(self.bn2d1(x))) 28 | x = self.conv2(x) 29 | x = self.pool(F.leaky_relu(self.bn2d2(x))) 30 | x = self.conv3(x) 31 | x = self.pool(F.leaky_relu(self.bn2d3(x))) 32 | x = x.view(int(x.size(0)), -1) 33 | x = self.fc1(x) 34 | return x 35 | 36 | 37 | class CIFAR10_LeNet_Decoder(BaseNet): 38 | 39 | def __init__(self, rep_dim=128): 40 | super().__init__() 41 | 42 | self.rep_dim = rep_dim 43 | 44 | self.deconv1 = nn.ConvTranspose2d(int(self.rep_dim / (4 * 4)), 128, 5, bias=False, padding=2) 45 | nn.init.xavier_uniform_(self.deconv1.weight, gain=nn.init.calculate_gain('leaky_relu')) 46 | self.bn2d4 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 47 | self.deconv2 = nn.ConvTranspose2d(128, 64, 5, bias=False, padding=2) 48 | nn.init.xavier_uniform_(self.deconv2.weight, gain=nn.init.calculate_gain('leaky_relu')) 49 | self.bn2d5 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 50 | self.deconv3 = nn.ConvTranspose2d(64, 32, 5, bias=False, padding=2) 51 | nn.init.xavier_uniform_(self.deconv3.weight, 
gain=nn.init.calculate_gain('leaky_relu')) 52 | self.bn2d6 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 53 | self.deconv4 = nn.ConvTranspose2d(32, 3, 5, bias=False, padding=2) 54 | nn.init.xavier_uniform_(self.deconv4.weight, gain=nn.init.calculate_gain('leaky_relu')) 55 | 56 | def forward(self, x): 57 | x = x.view(int(x.size(0)), int(self.rep_dim / (4 * 4)), 4, 4) 58 | x = F.leaky_relu(x) 59 | x = self.deconv1(x) 60 | x = F.interpolate(F.leaky_relu(self.bn2d4(x)), scale_factor=2) 61 | x = self.deconv2(x) 62 | x = F.interpolate(F.leaky_relu(self.bn2d5(x)), scale_factor=2) 63 | x = self.deconv3(x) 64 | x = F.interpolate(F.leaky_relu(self.bn2d6(x)), scale_factor=2) 65 | x = self.deconv4(x) 66 | x = torch.sigmoid(x) 67 | return x 68 | 69 | 70 | class CIFAR10_LeNet_Autoencoder(BaseNet): 71 | 72 | def __init__(self, rep_dim=128): 73 | super().__init__() 74 | 75 | self.rep_dim = rep_dim 76 | self.encoder = CIFAR10_LeNet(rep_dim=rep_dim) 77 | self.decoder = CIFAR10_LeNet_Decoder(rep_dim=rep_dim) 78 | 79 | def forward(self, x): 80 | x = self.encoder(x) 81 | x = self.decoder(x) 82 | return x 83 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/datasets/preprocessing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def create_semisupervised_setting(labels, normal_classes, outlier_classes, known_outlier_classes, 6 | ratio_known_normal, ratio_known_outlier, ratio_pollution): 7 | """ 8 | Create a semi-supervised data setting. 9 | :param labels: np.array with labels of all dataset samples 10 | :param normal_classes: tuple with normal class labels 11 | :param outlier_classes: tuple with anomaly class labels 12 | :param known_outlier_classes: tuple with known (labeled) anomaly class labels 13 | :param ratio_known_normal: the desired ratio of known (labeled) normal samples 14 | :param ratio_known_outlier: the desired ratio of known (labeled) anomalous samples 15 | :param ratio_pollution: the desired pollution ratio of the unlabeled data with unknown (unlabeled) anomalies. 
16 | :return: tuple with list of sample indices, list of original labels, and list of semi-supervised labels 17 | """ 18 | idx_normal = np.argwhere(np.isin(labels, normal_classes)).flatten() 19 | idx_outlier = np.argwhere(np.isin(labels, outlier_classes)).flatten() 20 | idx_known_outlier_candidates = np.argwhere(np.isin(labels, known_outlier_classes)).flatten() 21 | 22 | n_normal = len(idx_normal) 23 | 24 | # Solve system of linear equations to obtain respective number of samples 25 | a = np.array([[1, 1, 0, 0], 26 | [(1-ratio_known_normal), -ratio_known_normal, -ratio_known_normal, -ratio_known_normal], 27 | [-ratio_known_outlier, -ratio_known_outlier, -ratio_known_outlier, (1-ratio_known_outlier)], 28 | [0, -ratio_pollution, (1-ratio_pollution), 0]]) 29 | b = np.array([n_normal, 0, 0, 0]) 30 | x = np.linalg.solve(a, b) 31 | 32 | # Get number of samples 33 | n_known_normal = int(x[0]) 34 | n_unlabeled_normal = int(x[1]) 35 | n_unlabeled_outlier = int(x[2]) 36 | n_known_outlier = int(x[3]) 37 | 38 | # Sample indices 39 | perm_normal = np.random.permutation(n_normal) 40 | perm_outlier = np.random.permutation(len(idx_outlier)) 41 | perm_known_outlier = np.random.permutation(len(idx_known_outlier_candidates)) 42 | 43 | idx_known_normal = idx_normal[perm_normal[:n_known_normal]].tolist() 44 | idx_unlabeled_normal = idx_normal[perm_normal[n_known_normal:n_known_normal+n_unlabeled_normal]].tolist() 45 | idx_unlabeled_outlier = idx_outlier[perm_outlier[:n_unlabeled_outlier]].tolist() 46 | idx_known_outlier = idx_known_outlier_candidates[perm_known_outlier[:n_known_outlier]].tolist() 47 | 48 | # Get original class labels 49 | labels_known_normal = labels[idx_known_normal].tolist() 50 | labels_unlabeled_normal = labels[idx_unlabeled_normal].tolist() 51 | labels_unlabeled_outlier = labels[idx_unlabeled_outlier].tolist() 52 | labels_known_outlier = labels[idx_known_outlier].tolist() 53 | 54 | # Get semi-supervised setting labels 55 | semi_labels_known_normal = np.ones(n_known_normal).astype(np.int32).tolist() 56 | semi_labels_unlabeled_normal = np.zeros(n_unlabeled_normal).astype(np.int32).tolist() 57 | semi_labels_unlabeled_outlier = np.zeros(n_unlabeled_outlier).astype(np.int32).tolist() 58 | semi_labels_known_outlier = (-np.ones(n_known_outlier).astype(np.int32)).tolist() 59 | 60 | # Create final lists 61 | list_idx = idx_known_normal + idx_unlabeled_normal + idx_unlabeled_outlier + idx_known_outlier 62 | list_labels = labels_known_normal + labels_unlabeled_normal + labels_unlabeled_outlier + labels_known_outlier 63 | list_semi_labels = (semi_labels_known_normal + semi_labels_unlabeled_normal + semi_labels_unlabeled_outlier 64 | + semi_labels_known_outlier) 65 | 66 | return list_idx, list_labels, list_semi_labels 67 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/datasets/mnist.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Subset 2 | from PIL import Image 3 | from torchvision.datasets import MNIST 4 | from baseline.DeepSAD.src.base.torchvision_dataset import TorchvisionDataset 5 | from .preprocessing import create_semisupervised_setting 6 | 7 | import torch 8 | import torchvision.transforms as transforms 9 | import random 10 | 11 | 12 | class MNIST_Dataset(TorchvisionDataset): 13 | 14 | def __init__(self, root: str, normal_class: int = 0, known_outlier_class: int = 1, n_known_outlier_classes: int = 0, 15 | ratio_known_normal: float = 0.0, ratio_known_outlier: float = 
0.0, ratio_pollution: float = 0.0): 16 | super().__init__(root) 17 | 18 | # Define normal and outlier classes 19 | self.n_classes = 2 # 0: normal, 1: outlier 20 | self.normal_classes = tuple([normal_class]) 21 | self.outlier_classes = list(range(0, 10)) 22 | self.outlier_classes.remove(normal_class) 23 | self.outlier_classes = tuple(self.outlier_classes) 24 | 25 | if n_known_outlier_classes == 0: 26 | self.known_outlier_classes = () 27 | elif n_known_outlier_classes == 1: 28 | self.known_outlier_classes = tuple([known_outlier_class]) 29 | else: 30 | self.known_outlier_classes = tuple(random.sample(self.outlier_classes, n_known_outlier_classes)) 31 | 32 | # MNIST preprocessing: feature scaling to [0, 1] 33 | transform = transforms.ToTensor() 34 | target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes)) 35 | 36 | # Get train set 37 | train_set = MyMNIST(root=self.root, train=True, transform=transform, target_transform=target_transform, 38 | download=True) 39 | 40 | # Create semi-supervised setting 41 | idx, _, semi_targets = create_semisupervised_setting(train_set.targets.cpu().data.numpy(), self.normal_classes, 42 | self.outlier_classes, self.known_outlier_classes, 43 | ratio_known_normal, ratio_known_outlier, ratio_pollution) 44 | train_set.semi_targets[idx] = torch.tensor(semi_targets) # set respective semi-supervised labels 45 | 46 | # Subset train_set to semi-supervised setup 47 | self.train_set = Subset(train_set, idx) 48 | 49 | # Get test set 50 | self.test_set = MyMNIST(root=self.root, train=False, transform=transform, target_transform=target_transform, 51 | download=True) 52 | 53 | 54 | class MyMNIST(MNIST): 55 | """ 56 | Torchvision MNIST class with additional targets for the semi-supervised setting and patch of __getitem__ method 57 | to also return the semi-supervised target as well as the index of a data sample. 58 | """ 59 | 60 | def __init__(self, *args, **kwargs): 61 | super(MyMNIST, self).__init__(*args, **kwargs) 62 | 63 | self.semi_targets = torch.zeros_like(self.targets) 64 | 65 | def __getitem__(self, index): 66 | """Override the original method of the MNIST class. 
67 |         Args:
68 |             index (int): Index
69 |
70 |         Returns:
71 |             tuple: (image, target, semi_target, index)
72 |         """
73 |         img, target, semi_target = self.data[index], int(self.targets[index]), int(self.semi_targets[index])
74 |
75 |         # doing this so that it is consistent with all other datasets
76 |         # to return a PIL Image
77 |         img = Image.fromarray(img.numpy(), mode='L')
78 |
79 |         if self.transform is not None:
80 |             img = self.transform(img)
81 |
82 |         if self.target_transform is not None:
83 |             target = self.target_transform(target)
84 |
85 |         return img, target, semi_target, index
86 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/cifar10.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Subset
2 | from PIL import Image
3 | from torchvision.datasets import CIFAR10
4 | from base.torchvision_dataset import TorchvisionDataset
5 | from .preprocessing import create_semisupervised_setting
6 |
7 | import torch
8 | import torchvision.transforms as transforms
9 | import random
10 | import numpy as np
11 |
12 |
13 | class CIFAR10_Dataset(TorchvisionDataset):
14 |
15 |     def __init__(self, root: str, normal_class: int = 5, known_outlier_class: int = 3, n_known_outlier_classes: int = 0,
16 |                  ratio_known_normal: float = 0.0, ratio_known_outlier: float = 0.0, ratio_pollution: float = 0.0):
17 |         super().__init__(root)
18 |
19 |         # Define normal and outlier classes
20 |         self.n_classes = 2  # 0: normal, 1: outlier
21 |         self.normal_classes = tuple([normal_class])
22 |         self.outlier_classes = list(range(0, 10))
23 |         self.outlier_classes.remove(normal_class)
24 |         self.outlier_classes = tuple(self.outlier_classes)
25 |
26 |         if n_known_outlier_classes == 0:
27 |             self.known_outlier_classes = ()
28 |         elif n_known_outlier_classes == 1:
29 |             self.known_outlier_classes = tuple([known_outlier_class])
30 |         else:
31 |             self.known_outlier_classes = tuple(random.sample(self.outlier_classes, n_known_outlier_classes))
32 |
33 |         # CIFAR-10 preprocessing: feature scaling to [0, 1]
34 |         transform = transforms.ToTensor()
35 |         target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes))
36 |
37 |         # Get train set
38 |         train_set = MyCIFAR10(root=self.root, train=True, transform=transform, target_transform=target_transform,
39 |                               download=True)
40 |
41 |         # Create semi-supervised setting
42 |         idx, _, semi_targets = create_semisupervised_setting(np.array(train_set.targets), self.normal_classes,
43 |                                                              self.outlier_classes, self.known_outlier_classes,
44 |                                                              ratio_known_normal, ratio_known_outlier, ratio_pollution)
45 |         train_set.semi_targets[idx] = torch.tensor(semi_targets)  # set respective semi-supervised labels
46 |
47 |         # Subset train_set to semi-supervised setup
48 |         self.train_set = Subset(train_set, idx)
49 |
50 |         # Get test set
51 |         self.test_set = MyCIFAR10(root=self.root, train=False, transform=transform, target_transform=target_transform,
52 |                                   download=True)
53 |
54 |
55 | class MyCIFAR10(CIFAR10):
56 |     """
57 |     Torchvision CIFAR10 class with additional targets for the semi-supervised setting and patch of __getitem__ method
58 |     to also return the semi-supervised target as well as the index of a data sample.
59 |     """
60 |
61 |     def __init__(self, *args, **kwargs):
62 |         super(MyCIFAR10, self).__init__(*args, **kwargs)
63 |
64 |         self.semi_targets = torch.zeros(len(self.targets), dtype=torch.int64)
65 |
66 |     def __getitem__(self, index):
67 |         """Override the original method of the CIFAR10 class.
68 |         Args:
69 |             index (int): Index
70 |
71 |         Returns:
72 |             tuple: (image, target, semi_target, index)
73 |         """
74 |         img, target, semi_target = self.data[index], self.targets[index], int(self.semi_targets[index])
75 |
76 |         # doing this so that it is consistent with all other datasets
77 |         # to return a PIL Image
78 |         img = Image.fromarray(img)
79 |
80 |         if self.transform is not None:
81 |             img = self.transform(img)
82 |
83 |         if self.target_transform is not None:
84 |             target = self.target_transform(target)
85 |
86 |         return img, target, semi_target, index
87 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/fmnist.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Subset
2 | from PIL import Image
3 | from torchvision.datasets import FashionMNIST
4 | from base.torchvision_dataset import TorchvisionDataset
5 | from .preprocessing import create_semisupervised_setting
6 |
7 | import torch
8 | import torchvision.transforms as transforms
9 | import random
10 |
11 |
12 | class FashionMNIST_Dataset(TorchvisionDataset):
13 |
14 |     def __init__(self, root: str, normal_class: int = 0, known_outlier_class: int = 1, n_known_outlier_classes: int = 0,
15 |                  ratio_known_normal: float = 0.0, ratio_known_outlier: float = 0.0, ratio_pollution: float = 0.0):
16 |         super().__init__(root)
17 |
18 |         # Define normal and outlier classes
19 |         self.n_classes = 2  # 0: normal, 1: outlier
20 |         self.normal_classes = tuple([normal_class])
21 |         self.outlier_classes = list(range(0, 10))
22 |         self.outlier_classes.remove(normal_class)
23 |         self.outlier_classes = tuple(self.outlier_classes)
24 |
25 |         if n_known_outlier_classes == 0:
26 |             self.known_outlier_classes = ()
27 |         elif n_known_outlier_classes == 1:
28 |             self.known_outlier_classes = tuple([known_outlier_class])
29 |         else:
30 |             self.known_outlier_classes = tuple(random.sample(self.outlier_classes, n_known_outlier_classes))
31 |
32 |         # FashionMNIST preprocessing: feature scaling to [0, 1]
33 |         transform = transforms.ToTensor()
34 |         target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes))
35 |
36 |         # Get train set
37 |         train_set = MyFashionMNIST(root=self.root, train=True, transform=transform, target_transform=target_transform,
38 |                                    download=True)
39 |
40 |         # Create semi-supervised setting
41 |         idx, _, semi_targets = create_semisupervised_setting(train_set.targets.cpu().data.numpy(), self.normal_classes,
42 |                                                              self.outlier_classes, self.known_outlier_classes,
43 |                                                              ratio_known_normal, ratio_known_outlier, ratio_pollution)
44 |         train_set.semi_targets[idx] = torch.tensor(semi_targets)  # set respective semi-supervised labels
45 |
46 |         # Subset train_set to semi-supervised setup
47 |         self.train_set = Subset(train_set, idx)
48 |
49 |         # Get test set
50 |         self.test_set = MyFashionMNIST(root=self.root, train=False, transform=transform,
51 |                                        target_transform=target_transform, download=True)
52 |
53 |
54 | class MyFashionMNIST(FashionMNIST):
55 |     """
56 |     Torchvision FashionMNIST class with additional targets for the semi-supervised setting and patch of __getitem__
57 |     method to also return the semi-supervised target as well as the index of a data sample.
58 |     """
59 |
60 |     def __init__(self, *args, **kwargs):
61 |         super(MyFashionMNIST, self).__init__(*args, **kwargs)
62 |
63 |         self.semi_targets = torch.zeros_like(self.targets)
64 |
65 |     def __getitem__(self, index):
66 |         """Override the original method of the MyFashionMNIST class.
67 |         Args:
68 |             index (int): Index
69 |
70 |         Returns:
71 |             tuple: (image, target, semi_target, index)
72 |         """
73 |         img, target, semi_target = self.data[index], int(self.targets[index]), int(self.semi_targets[index])
74 |
75 |         # doing this so that it is consistent with all other datasets
76 |         # to return a PIL Image
77 |         img = Image.fromarray(img.numpy(), mode='L')
78 |
79 |         if self.transform is not None:
80 |             img = self.transform(img)
81 |
82 |         if self.target_transform is not None:
83 |             target = self.target_transform(target)
84 |
85 |         return img, target, semi_target, index
86 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NNG-Mix
2 |
3 | This repository contains the implementation of the paper:
4 |
5 | **NNG-Mix: Improving Semi-supervised Anomaly Detection with Pseudo-anomaly Generation**
6 | [Hao Dong](https://sites.google.com/view/dong-hao/), [Gaëtan Frusque](https://frusquegaetan.github.io/), [Yue Zhao](https://viterbi-web.usc.edu/~yzhao010/), [Eleni Chatzi](https://chatzi.ibk.ethz.ch/about-us/people/prof-dr-eleni-chatzi.html) and [Olga Fink](https://people.epfl.ch/olga.fink?lang=en)
7 | The arXiv version of the paper is available [here](https://arxiv.org/abs/2311.11961).
8 |
9 | We investigate improving semi-supervised anomaly detection performance from a novel viewpoint, by generating additional pseudo-anomalies based on the limited labeled anomalies and a large amount of unlabeled data. We introduce NNG-Mix, a simple and effective pseudo-anomaly generation algorithm that optimally utilizes information from both labeled anomalies and unlabeled data.
10 |
11 |
12 | Nearest Neighbor Gaussian Mixup (NNG-Mix) makes good use of information from both labeled anomalies and unlabeled data to generate pseudo-anomalies effectively.
13 |
14 | ## Dataset
15 | Download `Classical`, `CV_by_ResNet18`, and `NLP_by_BERT` from [ADBench](https://github.com/Minqi824/ADBench/tree/main/adbench/datasets) and put them under the `datasets/` folder.
16 |
17 | ## Code
18 |
19 | Change `--ratio 1.0` to `--ratio 0.5` or `--ratio 0.1` for training with 5% or 1% available labeled anomalies.
20 | ### Classical Dataset
21 | <details>
22 | 23 | 24 | 25 | #### Train on Classical datasets with 10% available labeled anomalies using DeepSAD 26 | ``` 27 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg DeepSAD --dataset Classical --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2 28 | ``` 29 | 30 | #### Train on Classical datasets with 10% available labeled anomalies using MLP 31 | ``` 32 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg MLP --dataset Classical --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2 33 | ``` 34 | 35 |
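For intuition, the `--nn_k*`, `--nn_mix_gaussian*`, and `--mixup_*` flags above correspond to the three ingredients of NNG-Mix: nearest-neighbor search over the labeled anomalies and unlabeled data, Gaussian perturbation, and Beta-distributed mixup. The sketch below only illustrates this idea with invented function and variable names; it is not the implementation in `NNG_Mix.py`:

```
import numpy as np
from sklearn.neighbors import NearestNeighbors

def generate_pseudo_anomalies(X_anomaly, X_unlabeled, n_gen, nn_k=10,
                              gaussian_std=0.01, alpha=0.2, beta=0.2, seed=0):
    # Hypothetical sketch: mix each sampled labeled anomaly with one of its
    # nn_k nearest neighbors (searched over anomalies plus unlabeled data),
    # then perturb the mixture with Gaussian noise.
    rng = np.random.default_rng(seed)
    X_pool = np.concatenate([X_anomaly, X_unlabeled], axis=0)
    nn = NearestNeighbors(n_neighbors=nn_k).fit(X_pool)

    anchors = X_anomaly[rng.integers(0, len(X_anomaly), size=n_gen)]
    _, neigh_idx = nn.kneighbors(anchors)
    # For each anchor, pick one of its nn_k nearest neighbors at random
    chosen = neigh_idx[np.arange(n_gen), rng.integers(0, nn_k, size=n_gen)]
    neighbors = X_pool[chosen]

    # Mixup with lambda ~ Beta(alpha, beta), plus Gaussian perturbation
    lam = rng.beta(alpha, beta, size=(n_gen, 1))
    mixed = lam * anchors + (1.0 - lam) * neighbors
    return mixed + rng.normal(0.0, gaussian_std, size=mixed.shape)
```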
36 | 37 | ### CV Dataset 38 |
39 | 40 | 41 | 42 | #### Train on CV datasets with 10% available labeled anomalies using DeepSAD 43 | ``` 44 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg DeepSAD --dataset CV --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2 45 | ``` 46 | 47 | #### Train on CV datasets with 10% available labeled anomalies using MLP 48 | ``` 49 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg MLP --dataset CV --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.3 --mixup_alpha 0.2 --mixup_beta 0.2 50 | ``` 51 | 52 |
53 | 54 | 55 | ### NLP Dataset 56 |
57 | 58 | 59 | 60 | #### Train on NLP datasets with 10% available labeled anomalies using DeepSAD 61 | ``` 62 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg DeepSAD --dataset NLP --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2 63 | ``` 64 | 65 | #### Train on NLP datasets with 10% available labeled anomalies using MLP 66 | ``` 67 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg MLP --dataset NLP --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.3 --mixup_alpha 0.2 --mixup_beta 0.2 68 | ``` 69 | 70 |
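The commands for the three benchmark suites differ only in `--dataset`, `--alg`, and `--nn_mix_gaussian_std`, so sweeps are easy to script. For example, the following shell loop (using only the flags documented above) trains on the Classical datasets with 10%, 5%, and 1% labeled anomalies:

```
for ratio in 1.0 0.5 0.1; do
    python NNG_Mix.py --ratio $ratio --method nng_mix --seed 0 --alg DeepSAD \
        --dataset Classical --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian \
        --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2
done
```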
71 | 72 | ## Contact 73 | If you have any questions, please send an email to donghaospurs@gmail.com. 74 | 75 | ## Citation 76 | 77 | If you find our work useful in your research, please consider citing our paper: 78 | 79 | ``` 80 | @article{dong2023nngmix, 81 | author = {Hao Dong and Ga{\"e}tan Frusque and Yue Zhao and Eleni Chatzi and Olga Fink}, 82 | title = {{NNG-Mix: Improving Semi-supervised Anomaly Detection with Pseudo-anomaly Generation}}, 83 | journal = {arXiv preprint arXiv:2311.11961}, 84 | year = {2023}, 85 | } 86 | ``` 87 | 88 | ## Related Projects 89 | 90 | [MultiOOD](https://github.com/donghao51/MultiOOD): Scaling Out-of-Distribution Detection for Multiple Modalities 91 | 92 | ## Acknowledgement 93 | 94 | Many thanks to the excellent open-source project [ADBench](https://github.com/Minqi824/ADBench). 95 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/networks/dgm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from torch.nn import init 6 | from .vae import VariationalAutoencoder, Encoder, Decoder 7 | 8 | 9 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch 10 | class Classifier(nn.Module): 11 | """ 12 | Classifier network, i.e. q(y|x), for two classes (0: normal, 1: outlier) 13 | 14 | :param net: neural network class to use (as parameter to use the same network over different methods) 15 | """ 16 | 17 | def __init__(self, net, dims=None): 18 | super(Classifier, self).__init__() 19 | self.dims = dims 20 | if dims is None: 21 | self.net = net() 22 | self.logits = nn.Linear(self.net.rep_dim, 2) 23 | else: 24 | [x_dim, h_dim, y_dim] = dims 25 | self.dense = nn.Linear(x_dim, h_dim) 26 | self.logits = nn.Linear(h_dim, y_dim) 27 | 28 | def forward(self, x): 29 | if self.dims is None: 30 | x = self.net(x) 31 | else: 32 | x = F.relu(self.dense(x)) 33 | x = F.softmax(self.logits(x), dim=-1) 34 | return x 35 | 36 | 37 | class DeepGenerativeModel(VariationalAutoencoder): 38 | """ 39 | M2 model from the paper 'Semi-Supervised Learning with Deep Generative Models' (Kingma et al., 2014). 40 | 41 | The 'Generative semi-supervised model' (M2) is a probabilistic model that incorporates label information in both 42 | inference and generation. 43 | 44 | :param dims: dimensions of the model given by [input_dim, label_dim, latent_dim, [hidden_dims]]. 45 | :param classifier_net: classifier network class to use.
46 | """ 47 | 48 | def __init__(self, dims, classifier_net=None): 49 | [x_dim, self.y_dim, z_dim, h_dim] = dims 50 | super(DeepGenerativeModel, self).__init__([x_dim, z_dim, h_dim]) 51 | 52 | self.encoder = Encoder([x_dim + self.y_dim, h_dim, z_dim]) 53 | self.decoder = Decoder([z_dim + self.y_dim, list(reversed(h_dim)), x_dim]) 54 | if classifier_net is None: 55 | self.classifier = Classifier(net=None, dims=[x_dim, h_dim[0], self.y_dim]) 56 | else: 57 | self.classifier = Classifier(classifier_net) 58 | 59 | # Init linear layers 60 | for m in self.modules(): 61 | if isinstance(m, nn.Linear): 62 | init.xavier_normal_(m.weight.data) 63 | if m.bias is not None: 64 | m.bias.data.zero_() 65 | 66 | def forward(self, x, y): 67 | z, q_mu, q_log_var = self.encoder(torch.cat((x, y), dim=1)) 68 | self.kl_divergence = self._kld(z, (q_mu, q_log_var)) 69 | rec = self.decoder(torch.cat((z, y), dim=1)) 70 | 71 | return rec 72 | 73 | def classify(self, x): 74 | logits = self.classifier(x) 75 | return logits 76 | 77 | def sample(self, z, y): 78 | """ 79 | Samples from the Decoder to generate an x. 80 | 81 | :param z: latent normal variable 82 | :param y: label (one-hot encoded) 83 | :return: x 84 | """ 85 | y = y.float() 86 | x = self.decoder(torch.cat((z, y), dim=1)) 87 | return x 88 | 89 | 90 | class StackedDeepGenerativeModel(DeepGenerativeModel): 91 | def __init__(self, dims, features): 92 | """ 93 | M1+M2 model as described in (Kingma et al., 2014). 94 | 95 | :param dims: dimensions of the model given by [input_dim, label_dim, latent_dim, [hidden_dims]]. 96 | :param classifier_net: classifier network class to use. 97 | :param features: a pre-trained M1 model of class 'VariationalAutoencoder' trained on the same dataset. 98 | """ 99 | [x_dim, y_dim, z_dim, h_dim] = dims 100 | super(StackedDeepGenerativeModel, self).__init__([features.z_dim, y_dim, z_dim, h_dim]) 101 | 102 | # Be sure to reconstruct with the same dimensions 103 | in_features = self.decoder.reconstruction.in_features 104 | self.decoder.reconstruction = nn.Linear(in_features, x_dim) 105 | 106 | # Make vae feature model untrainable by freezing parameters 107 | self.features = features 108 | self.features.train(False) 109 | 110 | for param in self.features.parameters(): 111 | param.requires_grad = False 112 | 113 | def forward(self, x, y): 114 | # Sample a new latent x from the M1 model 115 | x_sample, _, _ = self.features.encoder(x) 116 | 117 | # Use the sample as new input to M2 118 | return super(StackedDeepGenerativeModel, self).forward(x_sample, y) 119 | 120 | def classify(self, x): 121 | _, x, _ = self.features.encoder(x) 122 | logits = self.classifier(x) 123 | return logits 124 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/run.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from .utils.config import Config 8 | from .utils.visualization.plot_images_grid import plot_images_grid 9 | from .deepsad import deepsad 10 | from .datasets.main import load_dataset 11 | from myutils import Utils 12 | 13 | class DeepSAD(): 14 | def __init__(self, seed, model_name='DeepSAD'): 15 | self.utils = Utils() 16 | self.device = self.utils.get_device() # get device 17 | self.seed = seed 18 | 19 | self.net_name = 'dense' 20 | self.xp_path = None 21 | self.load_config = None 22 | self.load_model = None 23 | self.eta = 1.0 # eta in the loss function 
24 | self.optimizer_name = 'adam' 25 | self.lr = 0.001 26 | self.n_epochs = 50 27 | self.lr_milestone = [0] 28 | self.batch_size = 128 29 | self.weight_decay = 1e-6 30 | self.pretrain = True # whether to use auto-encoder for pretraining 31 | self.ae_optimizer_name = 'adam' 32 | self.ae_lr = 0.001 33 | self.ae_n_epochs = 100 34 | self.ae_lr_milestone = [0] 35 | self.ae_batch_size = 128 36 | self.ae_weight_decay = 1e-6 37 | self.num_threads = 0 38 | self.n_jobs_dataloader = 0 39 | 40 | def fit(self, X_train, y_train, ratio=None): 41 | """ 42 | Train Deep SAD, a method for deep semi-supervised anomaly detection. 43 | 44 | :arg X_train: training data matrix of shape (n_samples, n_features). 45 | :arg y_train: training labels (1 for labeled anomalies, 0 otherwise). 46 | :arg ratio: unused; kept for interface consistency with the other baselines. 47 | """ 48 | 49 | # Set seed (using myutils) 50 | self.utils.set_seed(self.seed) 51 | 52 | # Set the number of threads used for parallelizing CPU operations 53 | if self.num_threads > 0: 54 | torch.set_num_threads(self.num_threads) 55 | logging.info('Computation device: %s' % self.device) 56 | logging.info('Number of threads: %d' % self.num_threads) 57 | logging.info('Number of dataloader workers: %d' % self.n_jobs_dataloader) 58 | 59 | # Load data 60 | data = {'X_train': X_train, 'y_train': y_train} 61 | dataset = load_dataset(data=data, train=True) 62 | input_size = dataset.train_set.data.size(1) # input size 63 | 64 | # Initialize DeepSAD model and set neural network phi 65 | self.deepSAD = deepsad(self.eta) 66 | self.deepSAD.set_network(self.net_name, input_size) 67 | 68 | # If specified, load Deep SAD model (center c, network weights, and possibly autoencoder weights) 69 | if self.load_model: 70 | self.deepSAD.load_model(model_path=self.load_model, load_ae=True, map_location=self.device) 71 | logging.info('Loading model from %s.'
% self.load_model) 72 | 73 | logging.info('Pretraining: %s' % self.pretrain) 74 | if self.pretrain: 75 | # Pretrain model on dataset (via autoencoder) 76 | self.deepSAD.pretrain(dataset, 77 | input_size, 78 | optimizer_name=self.ae_optimizer_name, 79 | lr=self.ae_lr, 80 | n_epochs=self.ae_n_epochs, 81 | lr_milestones=self.ae_lr_milestone, 82 | batch_size=self.ae_batch_size, 83 | weight_decay=self.ae_weight_decay, 84 | device=self.device, 85 | n_jobs_dataloader=self.n_jobs_dataloader) 86 | 87 | # Train model on dataset 88 | self.deepSAD.train(dataset, 89 | optimizer_name=self.optimizer_name, 90 | lr=self.lr, 91 | n_epochs=self.n_epochs, 92 | lr_milestones=self.lr_milestone, 93 | batch_size=self.batch_size, 94 | weight_decay=self.weight_decay, 95 | device=self.device, 96 | n_jobs_dataloader=self.n_jobs_dataloader) 97 | 98 | # Save results, model, and configuration 99 | # deepSAD.save_results(export_json=xp_path + '/results.json') 100 | # deepSAD.save_model(export_model=xp_path + '/model.tar') 101 | # cfg.save_config(export_json=xp_path + '/config.json') 102 | 103 | # Plot most anomalous and most normal test samples 104 | # indices, labels, scores = zip(*deepSAD.results['test_scores']) 105 | # indices, labels, scores = np.array(indices), np.array(labels), np.array(scores) 106 | # idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score 107 | # idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score 108 | 109 | return self 110 | 111 | def predict_score(self, X): 112 | # input randomly generated y label for consistence 113 | dataset = load_dataset(data={'X_test': X, 'y_test': np.random.choice([0, 1], X.shape[0])}, train=False) 114 | score = self.deepSAD.test(dataset, device=self.device, n_jobs_dataloader=self.n_jobs_dataloader) 115 | 116 | return score -------------------------------------------------------------------------------- /baseline/DeepSAD/src/networks/vae.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from torch.nn import init 4 | 5 | from .layers.stochastic import GaussianSample 6 | from .inference.distributions import log_standard_gaussian, log_gaussian 7 | 8 | 9 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch 10 | class Encoder(nn.Module): 11 | """ 12 | Encoder, i.e. the inference network. 13 | 14 | Attempts to infer the latent probability distribution p(z|x) from the data x by fitting a 15 | variational distribution q_φ(z|x). Returns the two parameters of the distribution (µ, log σ²). 16 | 17 | :param dims: dimensions of the network given by [input_dim, [hidden_dims], latent_dim]. 18 | """ 19 | 20 | def __init__(self, dims, sample_layer=GaussianSample): 21 | super(Encoder, self).__init__() 22 | 23 | [x_dim, h_dim, z_dim] = dims 24 | neurons = [x_dim, *h_dim] 25 | linear_layers = [nn.Linear(neurons[i-1], neurons[i]) for i in range(1, len(neurons))] 26 | 27 | self.hidden = nn.ModuleList(linear_layers) 28 | self.sample = sample_layer(h_dim[-1], z_dim) 29 | 30 | def forward(self, x): 31 | for layer in self.hidden: 32 | x = F.relu(layer(x)) 33 | return self.sample(x) 34 | 35 | 36 | class Decoder(nn.Module): 37 | """ 38 | Decoder, i.e. the generative network. 39 | 40 | Generates samples from an approximation p_θ(x|z) of the original distribution p(x) 41 | by transforming a latent representation z. 
42 | 43 | :param dims: dimensions of the network given by [latent_dim, [hidden_dims], input_dim]. 44 | """ 45 | 46 | def __init__(self, dims): 47 | super(Decoder, self).__init__() 48 | 49 | [z_dim, h_dim, x_dim] = dims 50 | neurons = [z_dim, *h_dim] 51 | linear_layers = [nn.Linear(neurons[i-1], neurons[i]) for i in range(1, len(neurons))] 52 | 53 | self.hidden = nn.ModuleList(linear_layers) 54 | self.reconstruction = nn.Linear(h_dim[-1], x_dim) 55 | self.output_activation = nn.Sigmoid() 56 | 57 | def forward(self, x): 58 | for layer in self.hidden: 59 | x = F.relu(layer(x)) 60 | return self.output_activation(self.reconstruction(x)) 61 | 62 | 63 | class VariationalAutoencoder(nn.Module): 64 | """ 65 | Variational Autoencoder (VAE) (Kingma and Welling, 2013) model consisting of an encoder-decoder pair for which 66 | a variational distribution is fitted to the encoder. 67 | Also known as the M1 model in (Kingma et al., 2014). 68 | 69 | :param dims: dimensions of the networks given by [input_dim, latent_dim, [hidden_dims]]. Encoder and decoder 70 | are built symmetrically. 71 | """ 72 | 73 | def __init__(self, dims): 74 | super(VariationalAutoencoder, self).__init__() 75 | 76 | [x_dim, z_dim, h_dim] = dims 77 | self.z_dim = z_dim 78 | self.flow = None 79 | 80 | self.encoder = Encoder([x_dim, h_dim, z_dim]) 81 | self.decoder = Decoder([z_dim, list(reversed(h_dim)), x_dim]) 82 | self.kl_divergence = 0 83 | 84 | # Init linear layers 85 | for m in self.modules(): 86 | if isinstance(m, nn.Linear): 87 | init.xavier_normal_(m.weight.data) 88 | if m.bias is not None: 89 | m.bias.data.zero_() 90 | 91 | def _kld(self, z, q_param, p_param=None): 92 | """ 93 | Computes the KL-divergence of some latent variable z. 94 | 95 | KL(q||p) = - ∫ q(z) log [ p(z) / q(z) ] = - E_q[ log p(z) - log q(z) ] 96 | 97 | :param z: sample from the q-distribution 98 | :param q_param: (mu, log_var) of the q-distribution 99 | :param p_param: (mu, log_var) of the p-distribution 100 | :return: KL(q||p) 101 | """ 102 | (mu, log_var) = q_param 103 | 104 | if self.flow is not None: 105 | f_z, log_det_z = self.flow(z) 106 | qz = log_gaussian(z, mu, log_var) - sum(log_det_z) 107 | z = f_z 108 | else: 109 | qz = log_gaussian(z, mu, log_var) 110 | 111 | if p_param is None: 112 | pz = log_standard_gaussian(z) 113 | else: 114 | (mu, log_var) = p_param 115 | pz = log_gaussian(z, mu, log_var) 116 | 117 | kl = qz - pz 118 | 119 | return kl 120 | 121 | def add_flow(self, flow): 122 | self.flow = flow 123 | 124 | def forward(self, x, y=None): 125 | """ 126 | Runs a forward pass on a data point through the VAE model to provide its reconstruction and the parameters of 127 | the variational approximate distribution q. 128 | 129 | :param x: input data 130 | :return: reconstructed input 131 | """ 132 | z, q_mu, q_log_var = self.encoder(x) 133 | self.kl_divergence = self._kld(z, (q_mu, q_log_var)) 134 | rec = self.decoder(z) 135 | 136 | return rec 137 | 138 | def sample(self, z): 139 | """ 140 | Given z ~ N(0, I) generates a sample from the learned distribution based on p_θ(x|z).
141 | 142 | :param z: (torch.autograd.Variable) latent normal variable 143 | :return: (torch.autograd.Variable) generated sample 144 | """ 145 | return self.decoder(z) 146 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/optim/vae_trainer.py: -------------------------------------------------------------------------------- 1 | from baseline.DeepSAD.src.base.base_trainer import BaseTrainer 2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset 3 | from baseline.DeepSAD.src.base.base_net import BaseNet 4 | from baseline.DeepSAD.src.utils.misc import binary_cross_entropy 5 | from sklearn.metrics import roc_auc_score 6 | 7 | import logging 8 | import time 9 | import torch 10 | import torch.optim as optim 11 | import numpy as np 12 | 13 | 14 | class VAETrainer(BaseTrainer): 15 | 16 | def __init__(self, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, lr_milestones: tuple = (), 17 | batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', n_jobs_dataloader: int = 0): 18 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device, 19 | n_jobs_dataloader) 20 | 21 | # Results 22 | self.train_time = None 23 | self.test_auc = None 24 | self.test_time = None 25 | 26 | def train(self, dataset: BaseADDataset, vae: BaseNet): 27 | logger = logging.getLogger() 28 | 29 | # Get train data loader 30 | train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 31 | 32 | # Set device 33 | vae = vae.to(self.device) 34 | 35 | # Set optimizer (Adam optimizer for now) 36 | optimizer = optim.Adam(vae.parameters(), lr=self.lr, weight_decay=self.weight_decay) 37 | 38 | # Set learning rate scheduler 39 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1) 40 | 41 | # Training 42 | logger.info('Starting pretraining...') 43 | start_time = time.time() 44 | vae.train() 45 | for epoch in range(self.n_epochs): 46 | 47 | scheduler.step() 48 | if epoch in self.lr_milestones: 49 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0])) 50 | 51 | epoch_loss = 0.0 52 | n_batches = 0 53 | epoch_start_time = time.time() 54 | for data in train_loader: 55 | inputs, _, _, _ = data 56 | inputs = inputs.to(self.device) 57 | inputs = inputs.view(inputs.size(0), -1) 58 | 59 | # Zero the network parameter gradients 60 | optimizer.zero_grad() 61 | 62 | # Update network parameters via backpropagation: forward + backward + optimize 63 | rec = vae(inputs) 64 | 65 | likelihood = -binary_cross_entropy(rec, inputs) 66 | elbo = likelihood - vae.kl_divergence 67 | 68 | # Overall loss 69 | loss = -torch.mean(elbo) 70 | 71 | loss.backward() 72 | optimizer.step() 73 | 74 | epoch_loss += loss.item() 75 | n_batches += 1 76 | 77 | # log epoch statistics 78 | epoch_train_time = time.time() - epoch_start_time 79 | logger.info(f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s ' 80 | f'| Train Loss: {epoch_loss / n_batches:.6f} |') 81 | 82 | self.train_time = time.time() - start_time 83 | logger.info('Pretraining Time: {:.3f}s'.format(self.train_time)) 84 | logger.info('Finished pretraining.') 85 | 86 | return vae 87 | 88 | def test(self, dataset: BaseADDataset, vae: BaseNet): 89 | logger = logging.getLogger() 90 | 91 | # Get test data loader 92 | _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 93 | 94 | # Set device 95 | vae = 
vae.to(self.device) 96 | 97 | # Testing 98 | logger.info('Starting testing...') 99 | epoch_loss = 0.0 100 | n_batches = 0 101 | start_time = time.time() 102 | idx_label_score = [] 103 | vae.eval() 104 | with torch.no_grad(): 105 | for data in test_loader: 106 | inputs, labels, _, idx = data 107 | inputs, labels, idx = inputs.to(self.device), labels.to(self.device), idx.to(self.device) 108 | 109 | inputs = inputs.view(inputs.size(0), -1) 110 | 111 | rec = vae(inputs) 112 | likelihood = -binary_cross_entropy(rec, inputs) 113 | scores = -likelihood # negative likelihood as anomaly score 114 | 115 | # Save triple of (idx, label, score) in a list 116 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(), 117 | labels.cpu().data.numpy().tolist(), 118 | scores.cpu().data.numpy().tolist())) 119 | 120 | # Overall loss 121 | elbo = likelihood - vae.kl_divergence 122 | loss = -torch.mean(elbo) 123 | 124 | epoch_loss += loss.item() 125 | n_batches += 1 126 | 127 | self.test_time = time.time() - start_time 128 | 129 | # Compute AUC 130 | _, labels, scores = zip(*idx_label_score) 131 | labels = np.array(labels) 132 | scores = np.array(scores) 133 | self.test_auc = roc_auc_score(labels, scores) 134 | 135 | # Log results 136 | logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches)) 137 | logger.info('Test AUC: {:.2f}%'.format(100. * self.test_auc)) 138 | logger.info('Test Time: {:.3f}s'.format(self.test_time)) 139 | logger.info('Finished testing variational autoencoder.') 140 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/optim/ae_trainer.py: -------------------------------------------------------------------------------- 1 | from baseline.DeepSAD.src.base.base_trainer import BaseTrainer 2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset 3 | from baseline.DeepSAD.src.base.base_net import BaseNet 4 | from sklearn.metrics import roc_auc_score, average_precision_score 5 | 6 | import logging 7 | import time 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | import numpy as np 12 | 13 | 14 | class AETrainer(BaseTrainer): 15 | 16 | def __init__(self, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, lr_milestones: tuple = (), 17 | batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', n_jobs_dataloader: int = 0): 18 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device, 19 | n_jobs_dataloader) 20 | 21 | # Results 22 | self.train_time = None 23 | self.test_aucroc = None; self.test_aucpr = None 24 | self.test_time = None 25 | 26 | def train(self, dataset: BaseADDataset, ae_net: BaseNet): 27 | logger = logging.getLogger() 28 | 29 | # Get train data loader 30 | train_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 31 | 32 | # Set loss 33 | criterion = nn.MSELoss(reduction='none') 34 | 35 | # Set device 36 | ae_net = ae_net.to(self.device) 37 | criterion = criterion.to(self.device) 38 | 39 | # Set optimizer (Adam optimizer for now) 40 | optimizer = optim.Adam(ae_net.parameters(), lr=self.lr, weight_decay=self.weight_decay) 41 | 42 | # Set learning rate scheduler 43 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1) 44 | 45 | # Training 46 | logger.info('Starting pretraining...') 47 | start_time = time.time() 48 | ae_net.train() 49 | for epoch in range(self.n_epochs): 50 | 51 | epoch_loss = 0.0 52 | n_batches = 0 53 | 
epoch_start_time = time.time() 54 | for data in train_loader: 55 | inputs, _, _, _ = data 56 | inputs = inputs.to(self.device) 57 | 58 | # Zero the network parameter gradients 59 | optimizer.zero_grad() 60 | 61 | # Update network parameters via backpropagation: forward + backward + optimize 62 | rec = ae_net(inputs) 63 | rec_loss = criterion(rec, inputs) 64 | loss = torch.mean(rec_loss) 65 | loss.backward() 66 | optimizer.step() 67 | scheduler.step() 68 | if epoch in self.lr_milestones: 69 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0])) 70 | 71 | epoch_loss += loss.item() 72 | n_batches += 1 73 | 74 | # log epoch statistics 75 | epoch_train_time = time.time() - epoch_start_time 76 | logger.info(f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s ' 77 | f'| Train Loss: {epoch_loss / n_batches:.6f} |') 78 | 79 | self.train_time = time.time() - start_time 80 | logger.info('Pretraining Time: {:.3f}s'.format(self.train_time)) 81 | logger.info('Finished pretraining.') 82 | 83 | return ae_net 84 | 85 | def test(self, dataset: BaseADDataset, ae_net: BaseNet): 86 | logger = logging.getLogger() 87 | 88 | # Get test data loader 89 | test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 90 | 91 | # Set loss 92 | criterion = nn.MSELoss(reduction='none') 93 | 94 | # Set device for network 95 | ae_net = ae_net.to(self.device) 96 | criterion = criterion.to(self.device) 97 | 98 | # Testing 99 | logger.info('Testing autoencoder...') 100 | epoch_loss = 0.0 101 | n_batches = 0 102 | start_time = time.time() 103 | idx_label_score = [] 104 | ae_net.eval() 105 | with torch.no_grad(): 106 | for data in test_loader: 107 | inputs, labels, _, idx = data 108 | inputs, labels, idx = inputs.to(self.device), labels.to(self.device), idx.to(self.device) 109 | 110 | rec = ae_net(inputs) 111 | rec_loss = criterion(rec, inputs) 112 | scores = torch.mean(rec_loss, dim=tuple(range(1, rec.dim()))) 113 | 114 | # Save triple of (idx, label, score) in a list 115 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(), 116 | labels.cpu().data.numpy().tolist(), 117 | scores.cpu().data.numpy().tolist())) 118 | 119 | loss = torch.mean(rec_loss) 120 | epoch_loss += loss.item() 121 | n_batches += 1 122 | 123 | self.test_time = time.time() - start_time 124 | 125 | # Compute AUC 126 | # _, labels, scores = zip(*idx_label_score) 127 | # labels = np.array(labels) 128 | # scores = np.array(scores) 129 | # self.test_aucroc = roc_auc_score(labels, scores) 130 | # self.test_aucpr = average_precision_score(labels, scores, pos_label=1) 131 | 132 | # Log results 133 | # logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches)) 134 | # logger.info('Test AUCROC: {:.2f}%'.format(100. * self.test_aucroc)) 135 | # logger.info('Test AUCPR: {:.2f}%'.format(100. * self.test_aucpr)) 136 | # logger.info('Test Time: {:.3f}s'.format(self.test_time)) 137 | # logger.info('Finished testing autoencoder.') 138 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/baselines/SemiDGM.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | 4 | from base.base_dataset import BaseADDataset 5 | from networks.main import build_network, build_autoencoder 6 | from optim import SemiDeepGenerativeTrainer, VAETrainer 7 | 8 | 9 | class SemiDeepGenerativeModel(object): 10 | """A class for the Semi-Supervised Deep Generative model (M1+M2 model). 
11 | 12 | Paper: Kingma et al. (2014). Semi-supervised learning with deep generative models. In NIPS (pp. 3581-3589). 13 | Link: https://papers.nips.cc/paper/5352-semi-supervised-learning-with-deep-generative-models.pdf 14 | 15 | Attributes: 16 | net_name: A string indicating the name of the neural network to use. 17 | net: The neural network. 18 | trainer: SemiDeepGenerativeTrainer to train a Semi-Supervised Deep Generative model. 19 | optimizer_name: A string indicating the optimizer to use for training. 20 | results: A dictionary to save the results. 21 | """ 22 | 23 | def __init__(self, alpha: float = 0.1): 24 | """Inits SemiDeepGenerativeModel.""" 25 | 26 | self.alpha = alpha 27 | 28 | self.net_name = None 29 | self.net = None 30 | 31 | self.trainer = None 32 | self.optimizer_name = None 33 | 34 | self.vae_net = None # variational autoencoder network for pretraining 35 | self.vae_trainer = None 36 | self.vae_optimizer_name = None 37 | 38 | self.results = { 39 | 'train_time': None, 40 | 'test_auc': None, 41 | 'test_time': None, 42 | 'test_scores': None, 43 | } 44 | 45 | self.vae_results = { 46 | 'train_time': None, 47 | 'test_auc': None, 48 | 'test_time': None 49 | } 50 | 51 | def set_vae(self, net_name): 52 | """Builds the variational autoencoder network for pretraining.""" 53 | self.net_name = net_name 54 | self.vae_net = build_autoencoder(self.net_name) # VAE for pretraining 55 | 56 | def set_network(self, net_name): 57 | """Builds the neural network.""" 58 | self.net_name = net_name 59 | self.net = build_network(net_name, ae_net=self.vae_net) # full M1+M2 model 60 | 61 | def train(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 50, 62 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 63 | n_jobs_dataloader: int = 0): 64 | """Trains the Semi-Supervised Deep Generative model on the training data.""" 65 | 66 | self.optimizer_name = optimizer_name 67 | 68 | self.trainer = SemiDeepGenerativeTrainer(alpha=self.alpha, optimizer_name=optimizer_name, lr=lr, 69 | n_epochs=n_epochs, lr_milestones=lr_milestones, batch_size=batch_size, 70 | weight_decay=weight_decay, device=device, 71 | n_jobs_dataloader=n_jobs_dataloader) 72 | self.net = self.trainer.train(dataset, self.net) 73 | self.results['train_time'] = self.trainer.train_time 74 | 75 | def test(self, dataset: BaseADDataset, device: str = 'cuda', n_jobs_dataloader: int = 0): 76 | """Tests the Semi-Supervised Deep Generative model on the test data.""" 77 | 78 | if self.trainer is None: 79 | self.trainer = SemiDeepGenerativeTrainer(alpha=self.alpha, device=device, 80 | n_jobs_dataloader=n_jobs_dataloader) 81 | 82 | self.trainer.test(dataset, self.net) 83 | # Get results 84 | self.results['test_auc'] = self.trainer.test_auc 85 | self.results['test_time'] = self.trainer.test_time 86 | self.results['test_scores'] = self.trainer.test_scores 87 | 88 | def pretrain(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 100, 89 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 90 | n_jobs_dataloader: int = 0): 91 | """Pretrains a variational autoencoder (M1) for the Semi-Supervised Deep Generative model.""" 92 | 93 | # Train 94 | self.vae_optimizer_name = optimizer_name 95 | self.vae_trainer = VAETrainer(optimizer_name=optimizer_name, lr=lr, n_epochs=n_epochs, 96 | lr_milestones=lr_milestones, batch_size=batch_size, weight_decay=weight_decay, 97 | 
device=device, n_jobs_dataloader=n_jobs_dataloader) 98 | self.vae_net = self.vae_trainer.train(dataset, self.vae_net) 99 | # Get train results 100 | self.vae_results['train_time'] = self.vae_trainer.train_time 101 | 102 | # Test 103 | self.vae_trainer.test(dataset, self.vae_net) 104 | # Get test results 105 | self.vae_results['test_auc'] = self.vae_trainer.test_auc 106 | self.vae_results['test_time'] = self.vae_trainer.test_time 107 | 108 | def save_model(self, export_model): 109 | """Save a Semi-Supervised Deep Generative model to export_model.""" 110 | 111 | net_dict = self.net.state_dict() 112 | torch.save({'net_dict': net_dict}, export_model) 113 | 114 | def load_model(self, model_path): 115 | """Load a Semi-Supervised Deep Generative model from model_path.""" 116 | 117 | model_dict = torch.load(model_path) 118 | self.net.load_state_dict(model_dict['net_dict']) 119 | 120 | def save_results(self, export_json): 121 | """Save results dict to a JSON-file.""" 122 | with open(export_json, 'w') as fp: 123 | json.dump(self.results, fp) 124 | 125 | def save_vae_results(self, export_json): 126 | """Save variational autoencoder results dict to a JSON-file.""" 127 | with open(export_json, 'w') as fp: 128 | json.dump(self.vae_results, fp) 129 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/baselines/isoforest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import time 4 | import torch 5 | import numpy as np 6 | 7 | from torch.utils.data import DataLoader 8 | from sklearn.ensemble import IsolationForest 9 | from sklearn.metrics import roc_auc_score 10 | from base.base_dataset import BaseADDataset 11 | from networks.main import build_autoencoder 12 | 13 | 14 | class IsoForest(object): 15 | """A class for Isolation Forest models.""" 16 | 17 | def __init__(self, hybrid=False, n_estimators=100, max_samples='auto', contamination=0.1, n_jobs=-1, seed=None, 18 | **kwargs): 19 | """Init Isolation Forest instance.""" 20 | self.n_estimators = n_estimators 21 | self.max_samples = max_samples 22 | self.contamination = contamination 23 | self.n_jobs = n_jobs 24 | self.seed = seed 25 | 26 | self.model = IsolationForest(n_estimators=n_estimators, max_samples=max_samples, contamination=contamination, 27 | n_jobs=n_jobs, random_state=seed, **kwargs) 28 | 29 | self.hybrid = hybrid 30 | self.ae_net = None # autoencoder network for the case of a hybrid model 31 | 32 | self.results = { 33 | 'train_time': None, 34 | 'test_time': None, 35 | 'test_auc': None, 36 | 'test_scores': None 37 | } 38 | 39 | def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0): 40 | """Trains the Isolation Forest model on the training data.""" 41 | logger = logging.getLogger() 42 | 43 | # do not drop last batch for non-SGD optimization shallow_ssad 44 | train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True, 45 | num_workers=n_jobs_dataloader, drop_last=False) 46 | 47 | # Get data from loader 48 | X = () 49 | for data in train_loader: 50 | inputs, _, _, _ = data 51 | inputs = inputs.to(device) 52 | if self.hybrid: 53 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features 54 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width) 55 | X += (X_batch.cpu().data.numpy(),) 56 | X = np.concatenate(X) 57 | 58 | # Training 59 | logger.info('Starting training...') 60 | 
start_time = time.time() 61 | self.model.fit(X) 62 | train_time = time.time() - start_time 63 | self.results['train_time'] = train_time 64 | 65 | logger.info('Training Time: {:.3f}s'.format(self.results['train_time'])) 66 | logger.info('Finished training.') 67 | 68 | def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0): 69 | """Tests the Isolation Forest model on the test data.""" 70 | logger = logging.getLogger() 71 | 72 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader) 73 | 74 | # Get data from loader 75 | idx_label_score = [] 76 | X = () 77 | idxs = [] 78 | labels = [] 79 | for data in test_loader: 80 | inputs, label_batch, _, idx = data 81 | inputs, label_batch, idx = inputs.to(device), label_batch.to(device), idx.to(device) 82 | if self.hybrid: 83 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features 84 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width) 85 | X += (X_batch.cpu().data.numpy(),) 86 | idxs += idx.cpu().data.numpy().astype(np.int64).tolist() 87 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist() 88 | X = np.concatenate(X) 89 | 90 | # Testing 91 | logger.info('Starting testing...') 92 | start_time = time.time() 93 | scores = (-1.0) * self.model.decision_function(X) 94 | self.results['test_time'] = time.time() - start_time 95 | scores = scores.flatten() 96 | 97 | # Save triples of (idx, label, score) in a list 98 | idx_label_score += list(zip(idxs, labels, scores.tolist())) 99 | self.results['test_scores'] = idx_label_score 100 | 101 | # Compute AUC 102 | _, labels, scores = zip(*idx_label_score) 103 | labels = np.array(labels) 104 | scores = np.array(scores) 105 | self.results['test_auc'] = roc_auc_score(labels, scores) 106 | 107 | # Log results 108 | logger.info('Test AUC: {:.2f}%'.format(100. 
* self.results['test_auc'])) 109 | logger.info('Test Time: {:.3f}s'.format(self.results['test_time'])) 110 | logger.info('Finished testing.') 111 | 112 | def load_ae(self, dataset_name, model_path): 113 | """Load pretrained autoencoder from model_path for feature extraction in a hybrid Isolation Forest model.""" 114 | 115 | model_dict = torch.load(model_path, map_location='cpu') 116 | ae_net_dict = model_dict['ae_net_dict'] 117 | if dataset_name in ['mnist', 'fmnist', 'cifar10']: 118 | net_name = dataset_name + '_LeNet' 119 | else: 120 | net_name = dataset_name + '_mlp' 121 | 122 | if self.ae_net is None: 123 | self.ae_net = build_autoencoder(net_name) 124 | 125 | # update keys (since there was a change in network definition) 126 | ae_keys = list(self.ae_net.state_dict().keys()) 127 | for i in range(len(ae_net_dict)): 128 | k, v = ae_net_dict.popitem(False) 129 | new_key = ae_keys[i] 130 | ae_net_dict[new_key] = v 131 | i += 1 132 | 133 | self.ae_net.load_state_dict(ae_net_dict) 134 | self.ae_net.eval() 135 | 136 | def save_model(self, export_path): 137 | """Save Isolation Forest model to export_path.""" 138 | pass 139 | 140 | def load_model(self, import_path, device: str = 'cpu'): 141 | """Load Isolation Forest model from import_path.""" 142 | pass 143 | 144 | def save_results(self, export_json): 145 | """Save results dict to a JSON-file.""" 146 | with open(export_json, 'w') as fp: 147 | json.dump(self.results, fp) 148 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/baselines/kde.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import time 4 | import torch 5 | import numpy as np 6 | 7 | from torch.utils.data import DataLoader 8 | from sklearn.neighbors import KernelDensity 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.metrics.pairwise import pairwise_distances 11 | from sklearn.model_selection import GridSearchCV 12 | from base.base_dataset import BaseADDataset 13 | from networks.main import build_autoencoder 14 | 15 | 16 | class KDE(object): 17 | """A class for Kernel Density Estimation models.""" 18 | 19 | def __init__(self, hybrid=False, kernel='gaussian', n_jobs=-1, seed=None, **kwargs): 20 | """Init Kernel Density Estimation instance.""" 21 | self.kernel = kernel 22 | self.n_jobs = n_jobs 23 | self.seed = seed 24 | 25 | self.model = KernelDensity(kernel=kernel, **kwargs) 26 | self.bandwidth = self.model.bandwidth 27 | 28 | self.hybrid = hybrid 29 | self.ae_net = None # autoencoder network for the case of a hybrid model 30 | 31 | self.results = { 32 | 'train_time': None, 33 | 'test_time': None, 34 | 'test_auc': None, 35 | 'test_scores': None 36 | } 37 | 38 | def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0, 39 | bandwidth_GridSearchCV: bool = True): 40 | """Trains the Kernel Density Estimation model on the training data.""" 41 | logger = logging.getLogger() 42 | 43 | # do not drop last batch for non-SGD optimization shallow_ssad 44 | train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True, 45 | num_workers=n_jobs_dataloader, drop_last=False) 46 | 47 | # Get data from loader 48 | X = () 49 | for data in train_loader: 50 | inputs, _, _, _ = data 51 | inputs = inputs.to(device) 52 | if self.hybrid: 53 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features 54 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = 
(batch_size, n_channels * height * width) 55 | X += (X_batch.cpu().data.numpy(),) 56 | X = np.concatenate(X) 57 | 58 | # Training 59 | logger.info('Starting training...') 60 | start_time = time.time() 61 | 62 | if bandwidth_GridSearchCV: 63 | # use grid search cross-validation to select bandwidth 64 | logger.info('Using GridSearchCV for bandwidth selection...') 65 | params = {'bandwidth': np.logspace(0.5, 5, num=10, base=2)} 66 | hyper_kde = GridSearchCV(KernelDensity(kernel=self.kernel), params, n_jobs=self.n_jobs, cv=5, verbose=0) 67 | hyper_kde.fit(X) 68 | self.bandwidth = hyper_kde.best_estimator_.bandwidth 69 | logger.info('Best bandwidth: {:.8f}'.format(self.bandwidth)) 70 | self.model = hyper_kde.best_estimator_ 71 | else: 72 | # if exponential kernel, re-initialize kde with bandwidth minimizing the numerical error 73 | if self.kernel == 'exponential': 74 | self.bandwidth = np.max(pairwise_distances(X)) ** 2 75 | self.model = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth) 76 | 77 | self.model.fit(X) 78 | 79 | train_time = time.time() - start_time 80 | self.results['train_time'] = train_time 81 | 82 | logger.info('Training Time: {:.3f}s'.format(self.results['train_time'])) 83 | logger.info('Finished training.') 84 | 85 | def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0): 86 | """Tests the Kernel Density Estimation model on the test data.""" 87 | logger = logging.getLogger() 88 | 89 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader) 90 | 91 | # Get data from loader 92 | idx_label_score = [] 93 | X = () 94 | idxs = [] 95 | labels = [] 96 | for data in test_loader: 97 | inputs, label_batch, _, idx = data 98 | inputs, label_batch, idx = inputs.to(device), label_batch.to(device), idx.to(device) 99 | if self.hybrid: 100 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features 101 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width) 102 | X += (X_batch.cpu().data.numpy(),) 103 | idxs += idx.cpu().data.numpy().astype(np.int64).tolist() 104 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist() 105 | X = np.concatenate(X) 106 | 107 | # Testing 108 | logger.info('Starting testing...') 109 | start_time = time.time() 110 | scores = (-1.0) * self.model.score_samples(X) 111 | self.results['test_time'] = time.time() - start_time 112 | scores = scores.flatten() 113 | 114 | # Save triples of (idx, label, score) in a list 115 | idx_label_score += list(zip(idxs, labels, scores.tolist())) 116 | self.results['test_scores'] = idx_label_score 117 | 118 | # Compute AUC 119 | _, labels, scores = zip(*idx_label_score) 120 | labels = np.array(labels) 121 | scores = np.array(scores) 122 | self.results['test_auc'] = roc_auc_score(labels, scores) 123 | 124 | # Log results 125 | logger.info('Test AUC: {:.2f}%'.format(100. 
* self.results['test_auc'])) 126 | logger.info('Test Time: {:.3f}s'.format(self.results['test_time'])) 127 | logger.info('Finished testing.') 128 | 129 | def load_ae(self, dataset_name, model_path): 130 | """Load pretrained autoencoder from model_path for feature extraction in a hybrid KDE model.""" 131 | 132 | model_dict = torch.load(model_path, map_location='cpu') 133 | ae_net_dict = model_dict['ae_net_dict'] 134 | if dataset_name in ['mnist', 'fmnist', 'cifar10']: 135 | net_name = dataset_name + '_LeNet' 136 | else: 137 | net_name = dataset_name + '_mlp' 138 | 139 | if self.ae_net is None: 140 | self.ae_net = build_autoencoder(net_name) 141 | 142 | # update keys (since there was a change in network definition) 143 | ae_keys = list(self.ae_net.state_dict().keys()) 144 | for i in range(len(ae_net_dict)): 145 | k, v = ae_net_dict.popitem(False) 146 | new_key = ae_keys[i] 147 | ae_net_dict[new_key] = v 148 | i += 1 149 | 150 | self.ae_net.load_state_dict(ae_net_dict) 151 | self.ae_net.eval() 152 | 153 | def save_model(self, export_path): 154 | """Save KDE model to export_path.""" 155 | pass 156 | 157 | def load_model(self, import_path, device: str = 'cpu'): 158 | """Load KDE model from import_path.""" 159 | pass 160 | 161 | def save_results(self, export_json): 162 | """Save results dict to a JSON-file.""" 163 | with open(export_json, 'w') as fp: 164 | json.dump(self.results, fp) 165 | -------------------------------------------------------------------------------- /baseline/DeepSAD/README.md: -------------------------------------------------------------------------------- 1 | # Deep SAD: A Method for Deep Semi-Supervised Anomaly Detection 2 | This repository provides a [PyTorch](https://pytorch.org/) implementation of the *Deep SAD* method presented in our ICLR 2020 paper “Deep Semi-Supervised Anomaly Detection”. 3 | 4 | 5 | ## Citation and Contact 6 | You can find a PDF of the Deep Semi-Supervised Anomaly Detection ICLR 2020 paper on arXiv 7 | [https://arxiv.org/abs/1906.02694](https://arxiv.org/abs/1906.02694). 8 | 9 | If you find our work useful, please also cite the paper: 10 | ``` 11 | @InProceedings{ruff2020deep, 12 | title = {Deep Semi-Supervised Anomaly Detection}, 13 | author = {Ruff, Lukas and Vandermeulen, Robert A. and G{\"o}rnitz, Nico and Binder, Alexander and M{\"u}ller, Emmanuel and M{\"u}ller, Klaus-Robert and Kloft, Marius}, 14 | booktitle = {International Conference on Learning Representations}, 15 | year = {2020}, 16 | url = {https://openreview.net/forum?id=HkgH0TEYwH} 17 | } 18 | ``` 19 | 20 | If you would like to get in touch, just drop us an email at [contact@lukasruff.com](mailto:contact@lukasruff.com). 21 | 22 | 23 | ## Abstract 24 | > Deep approaches to anomaly detection have recently shown promising results over shallow methods on large and complex datasets. Typically anomaly detection is treated as an unsupervised learning problem. In practice however, one may have---in addition to a large set of unlabeled samples---access to a small pool of labeled samples, e.g. a subset verified by some domain expert as being normal or anomalous. Semi-supervised approaches to anomaly detection aim to utilize such labeled samples, but most proposed methods are limited to merely including labeled normal samples. Only a few methods take advantage of labeled anomalies, with existing deep approaches being domain-specific. In this work we present Deep SAD, an end-to-end deep methodology for general semi-supervised anomaly detection.
We further introduce an information-theoretic framework for deep anomaly detection based on the idea that the entropy of the latent distribution for normal data should be lower than the entropy of the anomalous distribution, which can serve as a theoretical interpretation for our method. In extensive experiments on MNIST, Fashion-MNIST, and CIFAR-10, along with other anomaly detection benchmark datasets, we demonstrate that our method is on par or outperforms shallow, hybrid, and deep competitors, yielding appreciable performance improvements even when provided with only little labeled data. 25 | 26 | ## The need for semi-supervised anomaly detection 27 | 28 | ![fig1](imgs/fig1.png?raw=true "fig1") 29 | 30 | 31 | ## Installation 32 | This code is written in `Python 3.7` and requires the packages listed in `requirements.txt`. 33 | 34 | Clone the repository to your machine and directory of choice: 35 | ``` 36 | git clone https://github.com/lukasruff/Deep-SAD-PyTorch.git 37 | ``` 38 | 39 | To run the code, we recommend setting up a virtual environment, e.g. using `virtualenv` or `conda`: 40 | 41 | ### `virtualenv` 42 | ``` 43 | # pip install virtualenv 44 | cd <path-to-Deep-SAD-PyTorch-directory> 45 | virtualenv myenv 46 | source myenv/bin/activate 47 | pip install -r requirements.txt 48 | ``` 49 | 50 | ### `conda` 51 | ``` 52 | cd <path-to-Deep-SAD-PyTorch-directory> 53 | conda create --name myenv 54 | source activate myenv 55 | while read requirement; do conda install -n myenv --yes $requirement; done < requirements.txt 56 | ``` 57 | 58 | 59 | ## Running experiments 60 | We have implemented the [`MNIST`](http://yann.lecun.com/exdb/mnist/), 61 | [`Fashion-MNIST`](https://research.zalando.com/welcome/mission/research-projects/fashion-mnist/), and 62 | [`CIFAR-10`](https://www.cs.toronto.edu/~kriz/cifar.html) datasets as well as the classic anomaly detection 63 | benchmark datasets `arrhythmia`, `cardio`, `satellite`, `satimage-2`, `shuttle`, and `thyroid` from the 64 | Outlier Detection DataSets (ODDS) repository ([http://odds.cs.stonybrook.edu/](http://odds.cs.stonybrook.edu/)) 65 | as reported in the paper. 66 | 67 | The implemented network architectures are as reported in the appendix of the paper. 68 | 69 | ### Deep SAD 70 | You can run Deep SAD experiments using the `main.py` script. 71 | 72 | Here's an example on `MNIST` with `0` considered to be the normal class and having 1% labeled (known) training samples 73 | from anomaly class `1` with a pollution ratio of 10% of the unlabeled training data (with unknown anomalies from all 74 | anomaly classes `1`-`9`): 75 | ``` 76 | cd <path-to-Deep-SAD-PyTorch-directory> 77 | 78 | # activate virtual environment 79 | source myenv/bin/activate # or 'source activate myenv' for conda 80 | 81 | # create folders for experimental output 82 | mkdir log/DeepSAD 83 | mkdir log/DeepSAD/mnist_test 84 | 85 | # change to source directory 86 | cd src 87 | 88 | # run experiment 89 | python main.py mnist mnist_LeNet ../log/DeepSAD/mnist_test ../data --ratio_known_outlier 0.01 --ratio_pollution 0.1 --lr 0.0001 --n_epochs 150 --lr_milestone 50 --batch_size 128 --weight_decay 0.5e-6 --pretrain True --ae_lr 0.0001 --ae_n_epochs 150 --ae_batch_size 128 --ae_weight_decay 0.5e-3 --normal_class 0 --known_outlier_class 1 --n_known_outlier_classes 1; 90 | ``` 91 | Have a look into `main.py` for all possible arguments and options.
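In this repository, the Deep SAD implementation above is also wrapped by the `DeepSAD` class in `src/run.py` (shown earlier in this document), which exposes a scikit-learn-style `fit`/`predict_score` interface. Below is a minimal usage sketch with invented toy data, assuming the NNG-Mix repository root is on `PYTHONPATH`; it is an illustration, not part of the official experiment scripts:

```
import numpy as np
from baseline.DeepSAD.src.run import DeepSAD

# Toy data: 1000 samples with 16 features; the first 20 are labeled anomalies
X_train = np.random.randn(1000, 16).astype(np.float32)
y_train = np.zeros(1000, dtype=np.int64)
y_train[:20] = 1

model = DeepSAD(seed=0)
model.fit(X_train, y_train)   # pretrains an autoencoder, then trains Deep SAD
scores = model.predict_score(np.random.randn(100, 16).astype(np.float32))
```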
92 | 93 | ### Baselines 94 | We also provide an implementation of the following baselines via the respective `baseline_<method_name>.py` scripts: 95 | OC-SVM (`ocsvm`), Isolation Forest (`isoforest`), Kernel Density Estimation (`kde`), kernel Semi-Supervised Anomaly 96 | Detection (`ssad`), and Semi-Supervised Deep Generative Model (`SemiDGM`). 97 | 98 | Here's how to run SSAD for example on the same experimental setup as above: 99 | ``` 100 | cd <path-to-Deep-SAD-PyTorch-directory> 101 | 102 | # activate virtual environment 103 | source myenv/bin/activate # or 'source activate myenv' for conda 104 | 105 | # create folder for experimental output 106 | mkdir log/ssad 107 | mkdir log/ssad/mnist_test 108 | 109 | # change to source directory 110 | cd src 111 | 112 | # run experiment 113 | python baseline_ssad.py mnist ../log/ssad/mnist_test ../data --ratio_known_outlier 0.01 --ratio_pollution 0.1 --kernel rbf --kappa 1.0 --normal_class 0 --known_outlier_class 1 --n_known_outlier_classes 1; 114 | ``` 115 | 116 | The autoencoder is provided through Deep SAD pre-training using `--pretrain True` with `main.py`. 117 | To then run a hybrid approach using one of the classic methods on top of autoencoder features, simply point to the saved 118 | autoencoder model using `--load_ae ../log/DeepSAD/mnist_test/model.tar` and set `--hybrid True`. 119 | 120 | To run hybrid SSAD for example on the same experimental setup as above: 121 | ``` 122 | cd <path-to-Deep-SAD-PyTorch-directory> 123 | 124 | # activate virtual environment 125 | source myenv/bin/activate # or 'source activate myenv' for conda 126 | 127 | # create folder for experimental output 128 | mkdir log/hybrid_ssad 129 | mkdir log/hybrid_ssad/mnist_test 130 | 131 | # change to source directory 132 | cd src 133 | 134 | # run experiment 135 | python baseline_ssad.py mnist ../log/hybrid_ssad/mnist_test ../data --ratio_known_outlier 0.01 --ratio_pollution 0.1 --kernel rbf --kappa 1.0 --hybrid True --load_ae ../log/DeepSAD/mnist_test/model.tar --normal_class 0 --known_outlier_class 1 --n_known_outlier_classes 1; 136 | ``` 137 | 138 | ## License 139 | MIT 140 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/deepsad.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | 4 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset 5 | from baseline.DeepSAD.src.networks.main import build_network, build_autoencoder 6 | from baseline.DeepSAD.src.optim.DeepSAD_trainer import DeepSADTrainer 7 | from baseline.DeepSAD.src.optim.ae_trainer import AETrainer 8 | 9 | 10 | class deepsad(object): 11 | """A class for the Deep SAD method. 12 | 13 | Attributes: 14 | eta: Deep SAD hyperparameter eta (must be 0 < eta). 15 | c: Hypersphere center c. 16 | net_name: A string indicating the name of the neural network to use. 17 | net: The neural network phi. 18 | trainer: DeepSADTrainer to train a Deep SAD model. 19 | optimizer_name: A string indicating the optimizer to use for training the Deep SAD network. 20 | ae_net: The autoencoder network corresponding to phi for network weights pretraining. 21 | ae_trainer: AETrainer to train an autoencoder in pretraining. 22 | ae_optimizer_name: A string indicating the optimizer to use for pretraining the autoencoder. 23 | results: A dictionary to save the results. 24 | ae_results: A dictionary to save the autoencoder results.
25 | """ 26 | 27 | def __init__(self, eta: float = 1.0): 28 | """Inits DeepSAD with hyperparameter eta.""" 29 | 30 | self.eta = eta 31 | self.c = None # hypersphere center c 32 | 33 | self.net_name = None 34 | self.net = None # neural network phi 35 | 36 | self.trainer = None 37 | self.optimizer_name = None 38 | 39 | self.ae_net = None # autoencoder network for pretraining 40 | self.ae_trainer = None 41 | self.ae_optimizer_name = None 42 | 43 | self.results = { 44 | 'train_time': None, 45 | 'test_aucroc': None, 46 | 'test_aucpr': None, 47 | 'test_time': None, 48 | 'test_scores': None, 49 | } 50 | 51 | self.ae_results = { 52 | 'train_time': None, 53 | 'test_aucroc': None, 54 | 'test_aucpr': None, 55 | 'test_time': None 56 | } 57 | 58 | def set_network(self, net_name, input_size): 59 | """Builds the neural network phi.""" 60 | self.net_name = net_name 61 | self.net = build_network(net_name, input_size) 62 | 63 | def train(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 50, 64 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 65 | n_jobs_dataloader: int = 0): 66 | """Trains the Deep SAD model on the training data.""" 67 | 68 | self.optimizer_name = optimizer_name 69 | self.trainer = DeepSADTrainer(self.c, self.eta, optimizer_name=optimizer_name, lr=lr, n_epochs=n_epochs, 70 | lr_milestones=lr_milestones, batch_size=batch_size, weight_decay=weight_decay, 71 | device=device, n_jobs_dataloader=n_jobs_dataloader) 72 | # Get the model 73 | self.net = self.trainer.train(dataset, self.net) 74 | self.results['train_time'] = self.trainer.train_time 75 | self.c = self.trainer.c.cpu().data.numpy().tolist() # get as list 76 | 77 | def test(self, dataset: BaseADDataset, device: str = 'cuda', n_jobs_dataloader: int = 0): 78 | """Tests the Deep SAD model on the test data.""" 79 | 80 | if self.trainer is None: 81 | self.trainer = DeepSADTrainer(self.c, self.eta, device=device, n_jobs_dataloader=n_jobs_dataloader) 82 | 83 | score = self.trainer.test(dataset, self.net) 84 | 85 | # Get results 86 | # self.results['test_aucroc'] = self.trainer.test_aucroc 87 | # self.results['test_aucpr'] = self.trainer.test_aucpr 88 | self.results['test_time'] = self.trainer.test_time 89 | self.results['test_scores'] = self.trainer.test_scores 90 | 91 | return score 92 | 93 | def pretrain(self, dataset: BaseADDataset, input_size ,optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 100, 94 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 95 | n_jobs_dataloader: int = 0): 96 | """Pretrains the weights for the Deep SAD network phi via autoencoder.""" 97 | 98 | # Set autoencoder network 99 | self.ae_net = build_autoencoder(self.net_name, input_size) 100 | 101 | # Train 102 | self.ae_optimizer_name = optimizer_name 103 | self.ae_trainer = AETrainer(optimizer_name, lr=lr, n_epochs=n_epochs, lr_milestones=lr_milestones, 104 | batch_size=batch_size, weight_decay=weight_decay, device=device, 105 | n_jobs_dataloader=n_jobs_dataloader) 106 | self.ae_net = self.ae_trainer.train(dataset, self.ae_net) 107 | 108 | # Get train results 109 | self.ae_results['train_time'] = self.ae_trainer.train_time 110 | 111 | # Test 112 | self.ae_trainer.test(dataset, self.ae_net) 113 | 114 | # Get test results 115 | self.ae_results['test_aucroc'] = self.ae_trainer.test_aucroc 116 | self.ae_results['test_aucpr'] = self.ae_trainer.test_aucpr 117 | self.ae_results['test_time'] = 
self.ae_trainer.test_time 118 | 119 | # Initialize Deep SAD network weights from pre-trained encoder 120 | self.init_network_weights_from_pretraining() 121 | 122 | def init_network_weights_from_pretraining(self): 123 | """Initialize the Deep SAD network weights from the encoder weights of the pretraining autoencoder.""" 124 | 125 | net_dict = self.net.state_dict() 126 | ae_net_dict = self.ae_net.state_dict() 127 | 128 | # Filter out decoder network keys 129 | ae_net_dict = {k: v for k, v in ae_net_dict.items() if k in net_dict} 130 | # Overwrite values in the existing state_dict 131 | net_dict.update(ae_net_dict) 132 | # Load the new state_dict 133 | self.net.load_state_dict(net_dict) 134 | 135 | def save_model(self, export_model, save_ae=True): 136 | """Save Deep SAD model to export_model.""" 137 | 138 | net_dict = self.net.state_dict() 139 | ae_net_dict = self.ae_net.state_dict() if save_ae else None 140 | 141 | torch.save({'c': self.c, 142 | 'net_dict': net_dict, 143 | 'ae_net_dict': ae_net_dict}, export_model) 144 | 145 | def load_model(self, model_path, load_ae=False, map_location='cpu'): 146 | """Load Deep SAD model from model_path.""" 147 | 148 | model_dict = torch.load(model_path, map_location=map_location) 149 | 150 | self.c = model_dict['c'] 151 | self.net.load_state_dict(model_dict['net_dict']) 152 | 153 | # load autoencoder parameters if specified 154 | if load_ae: 155 | if self.ae_net is None: 156 | self.ae_net = build_autoencoder(self.net_name) 157 | self.ae_net.load_state_dict(model_dict['ae_net_dict']) 158 | 159 | def save_results(self, export_json): 160 | """Save results dict to a JSON-file.""" 161 | with open(export_json, 'w') as fp: 162 | json.dump(self.results, fp) 163 | 164 | def save_ae_results(self, export_json): 165 | """Save autoencoder results dict to a JSON-file.""" 166 | with open(export_json, 'w') as fp: 167 | json.dump(self.ae_results, fp) 168 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/optim/DeepSAD_trainer.py: -------------------------------------------------------------------------------- 1 | from baseline.DeepSAD.src.base.base_trainer import BaseTrainer 2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset 3 | from baseline.DeepSAD.src.base.base_net import BaseNet 4 | from torch.utils.data.dataloader import DataLoader 5 | from sklearn.metrics import roc_auc_score, average_precision_score 6 | 7 | import logging 8 | import time 9 | import torch 10 | import torch.optim as optim 11 | import numpy as np 12 | 13 | 14 | class DeepSADTrainer(BaseTrainer): 15 | 16 | def __init__(self, c, eta: float, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, 17 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 18 | n_jobs_dataloader: int = 0): 19 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device, 20 | n_jobs_dataloader) 21 | 22 | # Deep SAD parameters 23 | self.c = torch.tensor(c, device=self.device) if c is not None else None 24 | self.eta = eta 25 | 26 | # Optimization parameters 27 | self.eps = 1e-6 28 | 29 | # Results 30 | self.train_time = None 31 | self.test_aucroc = None; self.test_aucpr = None 32 | self.test_time = None 33 | self.test_scores = None 34 | 35 | def train(self, dataset: BaseADDataset, net: BaseNet): 36 | logger = logging.getLogger() 37 | 38 | # Get train data loader 39 | train_loader = dataset.loaders(batch_size=self.batch_size, 
num_workers=self.n_jobs_dataloader)
40 | 
41 |         # Set device for network
42 |         net = net.to(self.device)
43 | 
44 |         # Set optimizer (Adam optimizer for now)
45 |         optimizer = optim.Adam(net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
46 | 
47 |         # Set learning rate scheduler
48 |         scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1)
49 | 
50 |         # Initialize hypersphere center c (if c not loaded)
51 |         if self.c is None:
52 |             logger.info('Initializing center c...')
53 |             self.c = self.init_center_c(train_loader, net)
54 |             logger.info('Center c initialized.')
55 | 
56 |         # Training
57 |         logger.info('Starting training...')
58 |         start_time = time.time()
59 |         net.train()
60 |         for epoch in range(self.n_epochs):
61 | 
62 |             epoch_loss = 0.0
63 |             n_batches = 0
64 |             epoch_start_time = time.time()
65 |             for data in train_loader:
66 |                 inputs, _, semi_targets, _ = data
67 |                 inputs, semi_targets = inputs.to(self.device), semi_targets.to(self.device)
68 | 
69 |                 # map labeled anomalies (semi_target = 1) to -1 so their loss term below becomes the inverse distance eta * 1 / (dist + eps), pushing them away from the center
70 |                 semi_targets[semi_targets == 1] = -1
71 | 
72 |                 # Zero the network parameter gradients
73 |                 optimizer.zero_grad()
74 | 
75 |                 # Update network parameters via backpropagation: forward + backward + optimize
76 |                 outputs = net(inputs)
77 |                 dist = torch.sum((outputs - self.c) ** 2, dim=1)
78 |                 losses = torch.where(semi_targets == 0, dist, self.eta * ((dist + self.eps) ** semi_targets.float()))
79 |                 loss = torch.mean(losses)
80 |                 loss.backward()
81 |                 optimizer.step()
82 |                 epoch_loss += loss.item()
83 |                 n_batches += 1
84 | 
85 |             # step the learning rate scheduler once per epoch (lr_milestones are epoch indices)
86 |             scheduler.step()
87 |             if epoch in self.lr_milestones:
88 |                 logger.info('  LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0]))
89 | 
90 |             # log epoch statistics
91 |             epoch_train_time = time.time() - epoch_start_time
92 |             logger.info(f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s | Train Loss: {epoch_loss / n_batches:.6f} |')
93 | 
94 |         self.train_time = time.time() - start_time
95 |         logger.info('Training Time: {:.3f}s'.format(self.train_time))
96 |         logger.info('Finished training.')
97 | 
98 |         return net
99 | 
100 |     def test(self, dataset: BaseADDataset, net: BaseNet):
101 |         logger = logging.getLogger()
102 | 
103 |         # Get test data loader
104 |         test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
105 | 
106 |         # Set device for network
107 |         net = net.to(self.device)
108 | 
109 |         # Testing
110 |         logger.info('Starting testing...')
111 |         epoch_loss = 0.0
112 |         n_batches = 0
113 |         start_time = time.time()
114 |         idx_label_score = []
115 |         net.eval()
116 |         with torch.no_grad():
117 |             for data in test_loader:
118 |                 inputs, labels, semi_targets, idx = data
119 | 
120 |                 inputs = inputs.to(self.device)
121 |                 labels = labels.to(self.device)
122 |                 semi_targets = semi_targets.to(self.device)
123 |                 idx = idx.to(self.device)
124 | 
125 |                 outputs = net(inputs)
126 |                 dist = torch.sum((outputs - self.c) ** 2, dim=1)
127 |                 losses = torch.where(semi_targets == 0, dist, self.eta * ((dist + self.eps) ** semi_targets.float()))
128 |                 loss = torch.mean(losses)
129 |                 scores = dist
130 | 
131 |                 # Save triples of (idx, label, score) in a list
132 |                 idx_label_score += list(zip(idx.cpu().data.numpy().tolist(),
133 |                                             labels.cpu().data.numpy().tolist(),
134 |                                             scores.cpu().data.numpy().tolist()))
135 | 
136 |                 epoch_loss += loss.item()
137 |                 n_batches += 1
138 | 
139 |         self.test_time = time.time() - start_time
140 |         self.test_scores = idx_label_score
141 | 
142 |         # Compute AUC
143 | 
_, labels, scores = zip(*idx_label_score) 144 | # labels = np.array(labels) 145 | scores = np.array(scores) 146 | # self.test_aucroc = roc_auc_score(labels, scores) 147 | # self.test_aucpr = average_precision_score(labels, scores, pos_label = 1) 148 | 149 | # Log results 150 | logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches)) 151 | # logger.info('Test AUCROC: {:.2f}%'.format(100. * self.test_aucroc)) 152 | # logger.info('Test AUCPR: {:.2f}%'.format(100. * self.test_aucpr)) 153 | logger.info('Test Time: {:.3f}s'.format(self.test_time)) 154 | logger.info('Finished testing.') 155 | 156 | return scores 157 | 158 | def init_center_c(self, train_loader: DataLoader, net: BaseNet, eps=0.1): 159 | """Initialize hypersphere center c as the mean from an initial forward pass on the data.""" 160 | n_samples = 0 161 | c = torch.zeros(net.rep_dim, device=self.device) 162 | 163 | net.eval() 164 | with torch.no_grad(): 165 | for data in train_loader: 166 | # get the inputs of the batch 167 | inputs, _, _, _ = data 168 | inputs = inputs.to(self.device) 169 | outputs = net(inputs) 170 | n_samples += outputs.shape[0] 171 | c += torch.sum(outputs, dim=0) 172 | 173 | c /= n_samples 174 | 175 | # If c_i is too close to 0, set to +-eps. Reason: a zero unit can be trivially matched with zero weights. 176 | c[(abs(c) < eps) & (c < 0)] = -eps 177 | c[(abs(c) < eps) & (c > 0)] = eps 178 | 179 | return c 180 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/optim/SemiDGM_trainer.py: -------------------------------------------------------------------------------- 1 | from baseline.DeepSAD.src.base.base_trainer import BaseTrainer 2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset 3 | from baseline.DeepSAD.src.base.base_net import BaseNet 4 | from baseline.DeepSAD.src.optim.variational import SVI, ImportanceWeightedSampler 5 | from baseline.DeepSAD.src.utils.misc import binary_cross_entropy 6 | from sklearn.metrics import roc_auc_score 7 | 8 | import logging 9 | import time 10 | import torch 11 | import torch.optim as optim 12 | import numpy as np 13 | 14 | 15 | class SemiDeepGenerativeTrainer(BaseTrainer): 16 | 17 | def __init__(self, alpha: float = 0.1, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, 18 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 19 | n_jobs_dataloader: int = 0): 20 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device, 21 | n_jobs_dataloader) 22 | 23 | self.alpha = alpha 24 | 25 | # Results 26 | self.train_time = None 27 | self.test_auc = None 28 | self.test_time = None 29 | self.test_scores = None 30 | 31 | def train(self, dataset: BaseADDataset, net: BaseNet): 32 | logger = logging.getLogger() 33 | 34 | # Get train data loader 35 | train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 36 | 37 | # Set device 38 | net = net.to(self.device) 39 | 40 | # Use importance weighted sampler (Burda et al., 2015) to get a better estimate on the log-likelihood. 
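# Annotation on the sampler call below (an assumption based on Burda et al., 2015): mc is the number
# of plain Monte Carlo samples and iw the number of importance-weighted samples per input; with
# mc=1 and iw=1 the estimator reduces to the standard single-sample ELBO.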
41 |         sampler = ImportanceWeightedSampler(mc=1, iw=1)
42 |         elbo = SVI(net, likelihood=binary_cross_entropy, sampler=sampler)
43 | 
44 |         # Set optimizer (Adam optimizer for now)
45 |         optimizer = optim.Adam(net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
46 | 
47 |         # Set learning rate scheduler
48 |         scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1)
49 | 
50 |         # Training
51 |         logger.info('Starting training...')
52 |         start_time = time.time()
53 |         net.train()
54 |         for epoch in range(self.n_epochs):
55 | 
56 |             scheduler.step()
57 |             if epoch in self.lr_milestones:
58 |                 logger.info('  LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0]))
59 | 
60 |             epoch_loss = 0.0
61 |             n_batches = 0
62 |             epoch_start_time = time.time()
63 |             for data in train_loader:
64 |                 inputs, labels, semi_targets, _ = data
65 | 
66 |                 inputs = inputs.to(self.device)
67 |                 labels = labels.to(self.device)
68 |                 semi_targets = semi_targets.to(self.device)
69 | 
70 |                 # Get labeled and unlabeled data and make labels one-hot
71 |                 inputs = inputs.view(inputs.size(0), -1)
72 |                 x = inputs[semi_targets != 0]
73 |                 u = inputs[semi_targets == 0]
74 |                 y = labels[semi_targets != 0]
75 |                 if y.nelement() > 1:
76 |                     y_onehot = torch.Tensor(y.size(0), 2).to(self.device)  # two labels: 0: normal, 1: outlier
77 |                     y_onehot.zero_()
78 |                     y_onehot.scatter_(1, y.view(-1, 1), 1)
79 | 
80 |                 # Zero the network parameter gradients
81 |                 optimizer.zero_grad()
82 | 
83 |                 # Update network parameters via backpropagation: forward + backward + optimize
84 |                 if y.nelement() < 2:
85 |                     L = torch.tensor(0.0).to(self.device)
86 |                 else:
87 |                     L = -elbo(x, y_onehot)
88 |                 U = -elbo(u)
89 | 
90 |                 # Regular cross entropy
91 |                 if y.nelement() < 2:
92 |                     classification_loss = torch.tensor(0.0).to(self.device)
93 |                 else:
94 |                     # Add auxiliary classification loss q(y|x)
95 |                     logits = net.classify(x)
96 |                     eps = 1e-8
97 |                     classification_loss = torch.sum(y_onehot * torch.log(logits + eps), dim=1).mean()
98 | 
99 |                 # Overall loss
100 |                 loss = L - self.alpha * classification_loss + U  # J_alpha
101 | 
102 |                 loss.backward()
103 |                 optimizer.step()
104 | 
105 |                 epoch_loss += loss.item()
106 |                 n_batches += 1
107 | 
108 |             # log epoch statistics
109 |             epoch_train_time = time.time() - epoch_start_time
110 |             logger.info(f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s '
111 |                         f'| Train Loss: {epoch_loss / n_batches:.6f} |')
112 | 
113 |         self.train_time = time.time() - start_time
114 |         logger.info('Training Time: {:.3f}s'.format(self.train_time))
115 |         logger.info('Finished training.')
116 | 
117 |         return net
118 | 
119 |     def test(self, dataset: BaseADDataset, net: BaseNet):
120 |         logger = logging.getLogger()
121 | 
122 |         # Get test data loader
123 |         _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
124 | 
125 |         # Set device
126 |         net = net.to(self.device)
127 | 
128 |         # Use importance weighted sampler (Burda et al., 2015) to get a better estimate on the log-likelihood.
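# Note (annotation): at test time the ELBO below is recomputed only for loss logging; the anomaly
# score itself is taken from the classifier head q(y|x) further down (scores = logits[:, 1]).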
129 |         sampler = ImportanceWeightedSampler(mc=1, iw=1)
130 |         elbo = SVI(net, likelihood=binary_cross_entropy, sampler=sampler)
131 | 
132 |         # Testing
133 |         logger.info('Starting testing...')
134 |         epoch_loss = 0.0
135 |         n_batches = 0
136 |         start_time = time.time()
137 |         idx_label_score = []
138 |         net.eval()
139 |         with torch.no_grad():
140 |             for data in test_loader:
141 |                 inputs, labels, _, idx = data
142 |                 inputs = inputs.to(self.device)
143 |                 labels = labels.to(self.device)
144 |                 idx = idx.to(self.device)
145 | 
146 |                 # All test data is considered unlabeled
147 |                 inputs = inputs.view(inputs.size(0), -1)
148 |                 u = inputs
149 |                 y = labels
150 |                 y_onehot = torch.Tensor(y.size(0), 2).to(self.device)  # two labels: 0: normal, 1: outlier
151 |                 y_onehot.zero_()
152 |                 y_onehot.scatter_(1, y.view(-1, 1), 1)
153 | 
154 |                 # Compute loss
155 |                 L = -elbo(u, y_onehot)
156 |                 U = -elbo(u)
157 | 
158 |                 logits = net.classify(u)
159 |                 eps = 1e-8
160 |                 classification_loss = -torch.sum(y_onehot * torch.log(logits + eps), dim=1).mean()
161 | 
162 |                 loss = L + self.alpha * classification_loss + U  # J_alpha
163 | 
164 |                 # Compute scores
165 |                 scores = logits[:, 1]  # likelihood/confidence for anomalous class as anomaly score
166 | 
167 |                 # Save triple of (idx, label, score) in a list
168 |                 idx_label_score += list(zip(idx.cpu().data.numpy().tolist(),
169 |                                             labels.cpu().data.numpy().tolist(),
170 |                                             scores.cpu().data.numpy().tolist()))
171 | 
172 |                 epoch_loss += loss.item()
173 |                 n_batches += 1
174 | 
175 |         self.test_time = time.time() - start_time
176 |         self.test_scores = idx_label_score
177 | 
178 |         # Compute AUC
179 |         _, labels, scores = zip(*idx_label_score)
180 |         labels = np.array(labels)
181 |         scores = np.array(scores)
182 |         self.test_auc = roc_auc_score(labels, scores)
183 | 
184 |         # Log results
185 |         logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches))
186 |         logger.info('Test AUC: {:.2f}%'.format(100. * self.test_auc))
187 |         logger.info('Test Time: {:.3f}s'.format(self.test_time))
188 |         logger.info('Finished testing.')
189 | 
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baselines/shallow_ssad/ssad_convex.py:
--------------------------------------------------------------------------------
1 | ########################################################################################################################
2 | # Acknowledgements: https://github.com/nicococo/tilitools
3 | ########################################################################################################################
4 | import numpy as np
5 | 
6 | from cvxopt import matrix, spmatrix, sparse, spdiag
7 | from cvxopt.solvers import qp
8 | 
9 | 
10 | class ConvexSSAD:
11 |     """ Convex semi-supervised anomaly detection with hinge-loss and L2 regularizer
12 |     as described in Goernitz et al., Towards Supervised Anomaly Detection, JAIR, 2013
13 | 
14 |         minimize  0.5 ||w||^2_2 - rho - kappa*gamma + eta_u sum_i xi_i + eta_l sum_j xi_j
15 |         {w,rho,gamma>=0,xi>=0}
16 |         subject to  <w,phi(x_i)> >= rho - xi_i
17 |                     y_j <w,phi(x_j)> >= y_j*rho + gamma - xi_j
18 | 
19 |     And the corresponding dual optimization problem:
20 | 
21 |         maximize  -0.5 sum_(i,j) alpha_i alpha_j y_i y_j k(x_i,x_j)
22 |         {0<=alpha_i<=eta_i}
23 |         subject to  kappa <= sum_j alpha_j  (for all labeled examples)
24 |                     1 = sum_j y_j alpha_j  (for all examples)
25 | 
26 |     We introduce labels y_i = +1 for all unlabeled examples, which enables us to combine sums.
27 | 
28 |     Note: Only dual solution is supported.
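    (Added note, derived from apply() below: the decision value is f(x) = sum_i alpha_i cy_i k(x_i, x) - rho,
    so larger values indicate more nominal samples.)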
29 | 
30 |     Written by: Nico Goernitz, TU Berlin, 2013/14
31 |     """
32 |     PRECISION = 1e-9  # important: affects the threshold, support vectors and speed!
33 | 
34 |     def __init__(self, kernel, y, kappa=1.0, Cp=1.0, Cu=1.0, Cn=1.0):
35 |         assert(len(y.shape) == 1)
36 |         self.kernel = kernel
37 |         self.y = y  # (vector) corresponding labels (+1,-1 and 0 for unlabeled)
38 |         self.kappa = kappa  # (scalar) regularizer for importance of the margin
39 |         self.Cp = Cp  # (scalar) the regularization constant for positively labeled samples > 0
40 |         self.Cu = Cu  # (scalar) the regularization constant for unlabeled samples > 0
41 |         self.Cn = Cn  # (scalar) the regularization constant for outliers > 0
42 |         self.samples = y.size
43 |         self.labeled = np.sum(np.abs(y))
44 | 
45 |         # cy: (vector) converted label vector (+1 for pos and unlabeled, -1 for outliers)
46 |         self.cy = y.copy().reshape((y.size, 1))
47 |         self.cy[y == 0] = 1  # cy=+1.0 (unlabeled,pos) & cy=-1.0 (neg)
48 | 
49 |         # cl: (vector) converted label vector (+1 for labeled examples, 0.0 for unlabeled)
50 |         self.cl = np.abs(y.copy())  # cl=+1.0 (labeled) cl=0.0 (unlabeled)
51 | 
52 |         # (vector) converted upper bound box constraint for each example
53 |         self.cC = np.zeros(y.size)  # cC=Cu (unlabeled) cC=Cp (pos) cC=Cn (neg)
54 |         self.cC[y == 0] = Cu
55 |         self.cC[y == 1] = Cp
56 |         self.cC[y == -1] = Cn
57 | 
58 |         self.alphas = None
59 |         self.svs = None  # (vector) indices of the support vectors
60 |         self.threshold = 0.0  # (scalar) the optimized threshold (rho)
61 | 
62 |         # if there are no labeled examples, set kappa to 0.0; otherwise
63 |         # the dual constraint kappa <= sum_{i \in labeled} alpha_i = 0.0 would
64 |         # prohibit a solution
65 |         if self.labeled == 0:
66 |             print('There are no labeled examples, hence setting kappa=0.0')
67 |             self.kappa = 0.0
68 |         print('Convex semi-supervised anomaly detection with {0} samples ({1} labeled).'.format(self.samples, self.labeled))
69 | 
70 |     def set_train_kernel(self, kernel):
71 |         dim1, dim2 = kernel.shape
72 |         print([dim1, dim2])
73 |         assert(dim1 == dim2 and dim1 == self.samples)
74 |         self.kernel = kernel
75 | 
76 |     def fit(self, check_psd_eigs=False):
77 |         # number of training examples
78 |         N = self.samples
79 | 
80 |         # generate the label kernel
81 |         Y = self.cy.dot(self.cy.T)
82 | 
83 |         # generate the final PSD kernel
84 |         P = matrix(self.kernel*Y)
85 | 
86 |         # check for PSD
87 |         if check_psd_eigs:
88 |             eigs = np.linalg.eigvalsh(np.array(P))
89 |             if eigs[0] < 0.0:
90 |                 print('Smallest eigenvalue is {0}'.format(eigs[0]))
91 |                 P += spdiag([-eigs[0] for i in range(N)])
92 | 
93 |         # there is no linear part of the objective
94 |         q = matrix(0.0, (N, 1))
95 | 
96 |         # sum_i y_i alpha_i = A alpha = b = 1.0
97 |         A = matrix(self.cy, (1, self.samples), 'd')
98 |         b = matrix(1.0, (1, 1))
99 | 
100 |         # inequality constraints: G alpha <= h
101 |         # 1) alpha_i <= C_i
102 |         # 2) -alpha_i <= 0
103 |         G12 = spmatrix(1.0, range(N), range(N))
104 |         h1 = matrix(self.cC)
105 |         h2 = matrix(0.0, (N, 1))
106 |         G = sparse([G12, -G12])
107 |         h = matrix([h1, h2])
108 |         if self.labeled > 0:
109 |             # 3) kappa <= \sum_i labeled_i alpha_i  ->  -cl' alpha <= -kappa
110 |             print('Labeled data found.')
111 |             G3 = -matrix(self.cl, (1, self.cl.size), 'd')
112 |             h3 = -matrix(self.kappa, (1, 1))
113 |             G = sparse([G12, -G12, G3])
114 |             h = matrix([h1, h2, h3])
115 | 
116 |         # solve the quadratic program
117 |         sol = qp(P, -q, G, h, A, b)
118 | 
119 |         # store solution
120 |         self.alphas = np.array(sol['x'])
121 | 
122 |         # 1. find all support vectors, i.e. 
0 < alpha_i <= C
123 |         # 2. store all support vectors with alpha_i < C in 'margins'
124 |         self.svs = np.where(self.alphas >= ConvexSSAD.PRECISION)[0]
125 | 
126 |         # these should sum to one
127 |         print('Validate solution:')
128 |         print('- found {0} support vectors'.format(len(self.svs)))
129 |         print('0 <= alpha_i : {0} of {1}'.format(np.sum(0. <= self.alphas), N))
130 |         print('- sum_(i) alpha_i cy_i = {0} = 1.0'.format(np.sum(self.alphas*self.cy)))
131 |         print('- sum_(i in sv) alpha_i cy_i = {0} ~ 1.0 (approx error)'.format(np.sum(self.alphas[self.svs]*self.cy[self.svs])))
132 |         print('- sum_(i in labeled) alpha_i = {0} >= {1} = kappa'.format(np.sum(self.alphas[self.cl == 1]), self.kappa))
133 |         print('- sum_(i in unlabeled) alpha_i = {0}'.format(np.sum(self.alphas[self.y == 0])))
134 |         print('- sum_(i in positives) alpha_i = {0}'.format(np.sum(self.alphas[self.y == 1])))
135 |         print('- sum_(i in negatives) alpha_i = {0}'.format(np.sum(self.alphas[self.y == -1])))
136 | 
137 |         # infer threshold (rho)
138 |         psvs = np.where(self.y[self.svs] == 0)[0]
139 |         # case 1: unlabeled support vectors available
140 |         self.threshold = 0.
141 |         unl_threshold = -1e12
142 |         lbl_threshold = -1e12
143 |         if psvs.size > 0:
144 |             k = self.kernel[:, self.svs]
145 |             k = k[self.svs[psvs], :]
146 |             unl_threshold = np.max(self.apply(k))
147 | 
148 |         if np.sum(self.cl) > 1e-12:
149 |             # case 2: only labeled examples available
150 |             k = self.kernel[:, self.svs]
151 |             k = k[self.svs, :]
152 |             thres = self.apply(k)
153 |             pinds = np.where(self.y[self.svs] == +1)[0]
154 |             ninds = np.where(self.y[self.svs] == -1)[0]
155 |             # having only negative support vectors should not be possible
156 |             if ninds.size > 0 and pinds.size == 0:
157 |                 print('ERROR: Check pre-defined PRECISION.')
158 |                 lbl_threshold = np.max(thres[ninds])
159 |             elif ninds.size == 0:
160 |                 lbl_threshold = np.max(thres[pinds])
161 |             else:
162 |                 # smallest negative + largest positive
163 |                 p = np.max(thres[pinds])
164 |                 n = np.min(thres[ninds])
165 |                 lbl_threshold = (n+p)/2.
166 |         self.threshold = np.max((unl_threshold, lbl_threshold))
167 | 
168 |     def get_threshold(self):
169 |         return self.threshold
170 | 
171 |     def get_support_dual(self):
172 |         return self.svs
173 | 
174 |     def get_alphas(self):
175 |         return self.alphas
176 | 
177 |     def apply(self, kernel):
178 |         """ Application of the dual trained SSAD.
179 |             kernel = get_kernel(Y, X[:, cssad.svs], kernel_type, kernel_param)
180 |         """
181 |         if kernel.shape[1] == self.samples:
182 |             # if kernel is not restricted to support vectors
183 |             ay = self.alphas * self.cy
184 |         else:
185 |             ay = self.alphas[self.svs] * self.cy[self.svs]
186 |         return ay.T.dot(kernel.T).T - self.threshold
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baseline_ocsvm.py:
--------------------------------------------------------------------------------
1 | import click
2 | import torch
3 | import logging
4 | import random
5 | import numpy as np
6 | 
7 | from utils.config import Config
8 | from utils.visualization.plot_images_grid import plot_images_grid
9 | from baselines.ocsvm import OCSVM
10 | from datasets.main import load_dataset
11 | 
12 | 
13 | ################################################################################
14 | # Settings
15 | ################################################################################
16 | @click.command()
17 | @click.argument('dataset_name', type=click.Choice(['mnist', 'fmnist', 'cifar10', 'arrhythmia', 'cardio', 'satellite',
18 |                                                    'satimage-2', 'shuttle', 'thyroid']))
19 | @click.argument('xp_path', type=click.Path(exists=True))
20 | @click.argument('data_path', type=click.Path(exists=True))
21 | @click.option('--load_config', type=click.Path(exists=True), default=None,
22 |               help='Config JSON-file path (default: None).')
23 | @click.option('--load_model', type=click.Path(exists=True), default=None,
24 |               help='Model file path (default: None).')
25 | @click.option('--ratio_known_normal', type=float, default=0.0,
26 |               help='Ratio of known (labeled) normal training examples.')
27 | @click.option('--ratio_known_outlier', type=float, default=0.0,
28 |               help='Ratio of known (labeled) anomalous training examples.')
29 | @click.option('--ratio_pollution', type=float, default=0.0,
30 |               help='Pollution ratio of unlabeled training data with unknown (unlabeled) anomalies.')
31 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.')
32 | @click.option('--kernel', type=click.Choice(['rbf', 'linear', 'poly']), default='rbf', help='Kernel for the OC-SVM')
33 | @click.option('--nu', type=float, default=0.1, help='OC-SVM hyperparameter nu (must be 0 < nu <= 1).')
34 | @click.option('--hybrid', type=bool, default=False,
35 |               help='Train OC-SVM on features extracted from an autoencoder. If True, load_ae must be specified.')
36 | @click.option('--load_ae', type=click.Path(exists=True), default=None,
37 |               help='Model file path to load autoencoder weights (default: None).')
38 | @click.option('--n_jobs_dataloader', type=int, default=0,
39 |               help='Number of workers for data loading. 0 means that the data will be loaded in the main process.')
40 | @click.option('--normal_class', type=int, default=0,
41 |               help='Specify the normal class of the dataset (all other classes are considered anomalous).')
42 | @click.option('--known_outlier_class', type=int, default=1,
43 |               help='Specify the known outlier class of the dataset for semi-supervised anomaly detection.')
44 | @click.option('--n_known_outlier_classes', type=int, default=0,
45 |               help='Number of known outlier classes.'
46 |                    ' If 0, no anomalies are known.'
47 |                    ' If 1, outlier class as specified in --known_outlier_class option.'
48 |                    ' If > 1, the specified number of outlier classes will be sampled at random.')
49 | def main(dataset_name, xp_path, data_path, load_config, load_model, ratio_known_normal, ratio_known_outlier,
50 |          ratio_pollution, seed, kernel, nu, hybrid, load_ae, n_jobs_dataloader, normal_class, known_outlier_class,
51 |          n_known_outlier_classes):
52 |     """
53 |     (Hybrid) One-Class SVM for anomaly detection.
54 | 
55 |     :arg DATASET_NAME: Name of the dataset to load.
56 |     :arg XP_PATH: Export path for logging the experiment.
57 |     :arg DATA_PATH: Root path of data.
58 |     """
59 | 
60 |     # Get configuration
61 |     cfg = Config(locals().copy())
62 | 
63 |     # Set up logging
64 |     logging.basicConfig(level=logging.INFO)
65 |     logger = logging.getLogger()
66 |     logger.setLevel(logging.INFO)
67 |     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
68 |     log_file = xp_path + '/log.txt'
69 |     file_handler = logging.FileHandler(log_file)
70 |     file_handler.setLevel(logging.INFO)
71 |     file_handler.setFormatter(formatter)
72 |     logger.addHandler(file_handler)
73 | 
74 |     # Print paths
75 |     logger.info('Log file is %s.' % log_file)
76 |     logger.info('Data path is %s.' % data_path)
77 |     logger.info('Export path is %s.' % xp_path)
78 | 
79 |     # Print experimental setup
80 |     logger.info('Dataset: %s' % dataset_name)
81 |     logger.info('Normal class: %d' % normal_class)
82 |     logger.info('Ratio of labeled normal train samples: %.2f' % ratio_known_normal)
83 |     logger.info('Ratio of labeled anomalous samples: %.2f' % ratio_known_outlier)
84 |     logger.info('Pollution ratio of unlabeled train data: %.2f' % ratio_pollution)
85 |     if n_known_outlier_classes == 1:
86 |         logger.info('Known anomaly class: %d' % known_outlier_class)
87 |     else:
88 |         logger.info('Number of known anomaly classes: %d' % n_known_outlier_classes)
89 | 
90 |     # If specified, load experiment config from JSON-file
91 |     if load_config:
92 |         cfg.load_config(import_json=load_config)
93 |         logger.info('Loaded configuration from %s.' % load_config)
94 | 
95 |     # Print OC-SVM configuration
96 |     logger.info('OC-SVM kernel: %s' % cfg.settings['kernel'])
97 |     logger.info('Nu-parameter: %.2f' % cfg.settings['nu'])
98 |     logger.info('Hybrid model: %s' % cfg.settings['hybrid'])
99 | 
100 |     # Set seed
101 |     if cfg.settings['seed'] != -1:
102 |         random.seed(cfg.settings['seed'])
103 |         np.random.seed(cfg.settings['seed'])
104 |         torch.manual_seed(cfg.settings['seed'])
105 |         torch.cuda.manual_seed(cfg.settings['seed'])
106 |         torch.backends.cudnn.deterministic = True
107 |         logger.info('Set seed to %d.' 
% cfg.settings['seed']) 108 | 109 | # Use 'cpu' as device for OC-SVM 110 | device = 'cpu' 111 | torch.multiprocessing.set_sharing_strategy('file_system') # fix multiprocessing issue for ubuntu 112 | logger.info('Computation device: %s' % device) 113 | logger.info('Number of dataloader workers: %d' % n_jobs_dataloader) 114 | 115 | # Load data 116 | dataset = load_dataset(dataset_name, data_path, normal_class, known_outlier_class, n_known_outlier_classes, 117 | ratio_known_normal, ratio_known_outlier, ratio_pollution, 118 | random_state=np.random.RandomState(cfg.settings['seed'])) 119 | # Log random sample of known anomaly classes if more than 1 class 120 | if n_known_outlier_classes > 1: 121 | logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes,)) 122 | 123 | # Initialize OC-SVM model 124 | ocsvm = OCSVM(cfg.settings['kernel'], cfg.settings['nu'], cfg.settings['hybrid']) 125 | 126 | # If specified, load model parameters from already trained model 127 | if load_model: 128 | ocsvm.load_model(import_path=load_model, device=device) 129 | logger.info('Loading model from %s.' % load_model) 130 | 131 | # If specified, load model autoencoder weights for a hybrid approach 132 | if hybrid and load_ae is not None: 133 | ocsvm.load_ae(dataset_name, model_path=load_ae) 134 | logger.info('Loaded pretrained autoencoder for features from %s.' % load_ae) 135 | 136 | # Train model on dataset 137 | ocsvm.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) 138 | 139 | # Test model 140 | ocsvm.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) 141 | 142 | # Save results and configuration 143 | ocsvm.save_results(export_json=xp_path + '/results.json') 144 | cfg.save_config(export_json=xp_path + '/config.json') 145 | 146 | # Plot most anomalous and most normal test samples 147 | indices, labels, scores = zip(*ocsvm.results['test_scores']) 148 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores) 149 | idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score 150 | idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score 151 | 152 | if dataset_name in ('mnist', 'fmnist', 'cifar10'): 153 | 154 | if dataset_name in ('mnist', 'fmnist'): 155 | X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1) 156 | X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1) 157 | X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1) 158 | X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1) 159 | 160 | if dataset_name == 'cifar10': 161 | X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2))) 162 | X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2))) 163 | X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2))) 164 | X_normal_high = torch.tensor( 165 | np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0, 3, 1, 2))) 166 | 167 | plot_images_grid(X_all_low, export_img=xp_path + '/all_low', padding=2) 168 | plot_images_grid(X_all_high, export_img=xp_path + '/all_high', padding=2) 169 | plot_images_grid(X_normal_low, export_img=xp_path + '/normals_low', padding=2) 170 | plot_images_grid(X_normal_high, export_img=xp_path + '/normals_high', padding=2) 171 | 172 | 173 | if __name__ == '__main__': 174 | main() 175 | 
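# Usage sketch for the OC-SVM baseline above (hypothetical paths and option values; the three
# positional arguments DATASET_NAME XP_PATH DATA_PATH follow the click declarations):
#
#   python baseline_ocsvm.py arrhythmia ../log/ocsvm_exp ../data --seed 42 --kernel rbf --nu 0.1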
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baseline_ssad.py:
--------------------------------------------------------------------------------
1 | import click
2 | import torch
3 | import logging
4 | import random
5 | import numpy as np
6 | import cvxopt as co
7 | 
8 | from utils.config import Config
9 | from utils.visualization.plot_images_grid import plot_images_grid
10 | from baselines.ssad import SSAD
11 | from datasets.main import load_dataset
12 | 
13 | 
14 | ################################################################################
15 | # Settings
16 | ################################################################################
17 | @click.command()
18 | @click.argument('dataset_name', type=click.Choice(['mnist', 'fmnist', 'cifar10', 'arrhythmia', 'cardio', 'satellite',
19 |                                                    'satimage-2', 'shuttle', 'thyroid']))
20 | @click.argument('xp_path', type=click.Path(exists=True))
21 | @click.argument('data_path', type=click.Path(exists=True))
22 | @click.option('--load_config', type=click.Path(exists=True), default=None,
23 |               help='Config JSON-file path (default: None).')
24 | @click.option('--load_model', type=click.Path(exists=True), default=None,
25 |               help='Model file path (default: None).')
26 | @click.option('--ratio_known_normal', type=float, default=0.0,
27 |               help='Ratio of known (labeled) normal training examples.')
28 | @click.option('--ratio_known_outlier', type=float, default=0.0,
29 |               help='Ratio of known (labeled) anomalous training examples.')
30 | @click.option('--ratio_pollution', type=float, default=0.0,
31 |               help='Pollution ratio of unlabeled training data with unknown (unlabeled) anomalies.')
32 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.')
33 | @click.option('--kernel', type=click.Choice(['rbf']), default='rbf', help='Kernel for SSAD')
34 | @click.option('--kappa', type=float, default=1.0, help='SSAD hyperparameter kappa.')
35 | @click.option('--hybrid', type=bool, default=False,
36 |               help='Train SSAD on features extracted from an autoencoder. If True, load_ae must be specified.')
37 | @click.option('--load_ae', type=click.Path(exists=True), default=None,
38 |               help='Model file path to load autoencoder weights (default: None).')
39 | @click.option('--n_jobs_dataloader', type=int, default=0,
40 |               help='Number of workers for data loading. 0 means that the data will be loaded in the main process.')
41 | @click.option('--normal_class', type=int, default=0,
42 |               help='Specify the normal class of the dataset (all other classes are considered anomalous).')
43 | @click.option('--known_outlier_class', type=int, default=1,
44 |               help='Specify the known outlier class of the dataset for semi-supervised anomaly detection.')
45 | @click.option('--n_known_outlier_classes', type=int, default=0,
46 |               help='Number of known outlier classes.'
47 |                    ' If 0, no anomalies are known.'
48 |                    ' If 1, outlier class as specified in --known_outlier_class option.'
49 |                    ' If > 1, the specified number of outlier classes will be sampled at random.')
50 | def main(dataset_name, xp_path, data_path, load_config, load_model, ratio_known_normal, ratio_known_outlier,
51 |          ratio_pollution, seed, kernel, kappa, hybrid, load_ae, n_jobs_dataloader, normal_class, known_outlier_class,
52 |          n_known_outlier_classes):
53 |     """
54 |     (Hybrid) SSAD for anomaly detection as in Goernitz et al., Towards Supervised Anomaly Detection, JAIR, 2013.
55 | 
56 |     :arg DATASET_NAME: Name of the dataset to load.
57 |     :arg XP_PATH: Export path for logging the experiment.
58 |     :arg DATA_PATH: Root path of data.
59 |     """
60 | 
61 |     # Get configuration
62 |     cfg = Config(locals().copy())
63 | 
64 |     # Set up logging
65 |     logging.basicConfig(level=logging.INFO)
66 |     logger = logging.getLogger()
67 |     logger.setLevel(logging.INFO)
68 |     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
69 |     log_file = xp_path + '/log.txt'
70 |     file_handler = logging.FileHandler(log_file)
71 |     file_handler.setLevel(logging.INFO)
72 |     file_handler.setFormatter(formatter)
73 |     logger.addHandler(file_handler)
74 | 
75 |     # Print paths
76 |     logger.info('Log file is %s.' % log_file)
77 |     logger.info('Data path is %s.' % data_path)
78 |     logger.info('Export path is %s.' % xp_path)
79 | 
80 |     # Print experimental setup
81 |     logger.info('Dataset: %s' % dataset_name)
82 |     logger.info('Normal class: %d' % normal_class)
83 |     logger.info('Ratio of labeled normal train samples: %.2f' % ratio_known_normal)
84 |     logger.info('Ratio of labeled anomalous samples: %.2f' % ratio_known_outlier)
85 |     logger.info('Pollution ratio of unlabeled train data: %.2f' % ratio_pollution)
86 |     if n_known_outlier_classes == 1:
87 |         logger.info('Known anomaly class: %d' % known_outlier_class)
88 |     else:
89 |         logger.info('Number of known anomaly classes: %d' % n_known_outlier_classes)
90 | 
91 |     # If specified, load experiment config from JSON-file
92 |     if load_config:
93 |         cfg.load_config(import_json=load_config)
94 |         logger.info('Loaded configuration from %s.' % load_config)
95 | 
96 |     # Print SSAD configuration
97 |     logger.info('SSAD kernel: %s' % cfg.settings['kernel'])
98 |     logger.info('Kappa-parameter: %.2f' % cfg.settings['kappa'])
99 |     logger.info('Hybrid model: %s' % cfg.settings['hybrid'])
100 | 
101 |     # Set seed
102 |     if cfg.settings['seed'] != -1:
103 |         random.seed(cfg.settings['seed'])
104 |         np.random.seed(cfg.settings['seed'])
105 |         co.setseed(cfg.settings['seed'])
106 |         torch.manual_seed(cfg.settings['seed'])
107 |         torch.cuda.manual_seed(cfg.settings['seed'])
108 |         torch.backends.cudnn.deterministic = True
109 |         logger.info('Set seed to %d.' % cfg.settings['seed'])
110 | 
111 |     # Use 'cpu' as device for SSAD
112 |     device = 'cpu'
113 |     torch.multiprocessing.set_sharing_strategy('file_system')  # fix multiprocessing issue for ubuntu
114 |     logger.info('Computation device: %s' % device)
115 |     logger.info('Number of dataloader workers: %d' % n_jobs_dataloader)
116 | 
117 |     # Load data
118 |     dataset = load_dataset(dataset_name, data_path, normal_class, known_outlier_class, n_known_outlier_classes,
119 |                            ratio_known_normal, ratio_known_outlier, ratio_pollution,
120 |                            random_state=np.random.RandomState(cfg.settings['seed']))
121 |     # Log random sample of known anomaly classes if more than 1 class
122 |     if n_known_outlier_classes > 1:
123 |         logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes,))
124 | 
125 |     # Initialize SSAD model
126 |     ssad = SSAD(kernel=cfg.settings['kernel'], kappa=cfg.settings['kappa'], hybrid=cfg.settings['hybrid'])
127 | 
128 |     # If specified, load model parameters from already trained model
129 |     if load_model:
130 |         ssad.load_model(import_path=load_model, device=device)
131 |         logger.info('Loading model from %s.' % load_model)
132 | 
133 |     # If specified, load model autoencoder weights for a hybrid approach
134 |     if hybrid and load_ae is not None:
135 |         ssad.load_ae(dataset_name, model_path=load_ae)
136 |         logger.info('Loaded pretrained autoencoder for features from %s.' 
% load_ae) 137 | 138 | # Train model on dataset 139 | ssad.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) 140 | 141 | # Test model 142 | ssad.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) 143 | 144 | # Save results and configuration 145 | ssad.save_results(export_json=xp_path + '/results.json') 146 | cfg.save_config(export_json=xp_path + '/config.json') 147 | 148 | # Plot most anomalous and most normal test samples 149 | indices, labels, scores = zip(*ssad.results['test_scores']) 150 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores) 151 | idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score 152 | idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score 153 | 154 | if dataset_name in ('mnist', 'fmnist', 'cifar10'): 155 | 156 | if dataset_name in ('mnist', 'fmnist'): 157 | X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1) 158 | X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1) 159 | X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1) 160 | X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1) 161 | 162 | if dataset_name == 'cifar10': 163 | X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2))) 164 | X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2))) 165 | X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2))) 166 | X_normal_high = torch.tensor( 167 | np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0, 3, 1, 2))) 168 | 169 | plot_images_grid(X_all_low, export_img=xp_path + '/all_low', padding=2) 170 | plot_images_grid(X_all_high, export_img=xp_path + '/all_high', padding=2) 171 | plot_images_grid(X_normal_low, export_img=xp_path + '/normals_low', padding=2) 172 | plot_images_grid(X_normal_high, export_img=xp_path + '/normals_high', padding=2) 173 | 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/baselines/ocsvm.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import time 4 | import torch 5 | import numpy as np 6 | 7 | from torch.utils.data import DataLoader 8 | from sklearn.svm import OneClassSVM 9 | from sklearn.metrics import roc_auc_score 10 | from base.base_dataset import BaseADDataset 11 | from networks.main import build_autoencoder 12 | 13 | 14 | class OCSVM(object): 15 | """A class for One-Class SVM models.""" 16 | 17 | def __init__(self, kernel='rbf', nu=0.1, hybrid=False): 18 | """Init OCSVM instance.""" 19 | self.kernel = kernel 20 | self.nu = nu 21 | self.rho = None 22 | self.gamma = None 23 | 24 | self.model = OneClassSVM(kernel=kernel, nu=nu) 25 | 26 | self.hybrid = hybrid 27 | self.ae_net = None # autoencoder network for the case of a hybrid model 28 | self.linear_model = None # also init a model with linear kernel if hybrid approach 29 | 30 | self.results = { 31 | 'train_time': None, 32 | 'test_time': None, 33 | 'test_auc': None, 34 | 'test_scores': None, 35 | 'train_time_linear': None, 36 | 'test_time_linear': None, 37 | 'test_auc_linear': None 38 | } 39 | 40 | def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0): 41 | """Trains the OC-SVM model on 
the training data."""
42 |         logger = logging.getLogger()
43 | 
44 |         # do not drop the last batch; there is no SGD-style optimization here, so uneven batch sizes are fine
45 |         train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True,
46 |                                   num_workers=n_jobs_dataloader, drop_last=False)
47 | 
48 |         # Get data from loader
49 |         X = ()
50 |         for data in train_loader:
51 |             inputs, _, _, _ = data
52 |             inputs = inputs.to(device)
53 |             if self.hybrid:
54 |                 inputs = self.ae_net.encoder(inputs)  # in hybrid approach, take code representation of AE as features
55 |             X_batch = inputs.view(inputs.size(0), -1)  # X_batch.shape = (batch_size, n_channels * height * width)
56 |             X += (X_batch.cpu().data.numpy(),)
57 |         X = np.concatenate(X)
58 | 
59 |         # Training
60 |         logger.info('Starting training...')
61 | 
62 |         # Select the RBF gamma via a small hold-out set (10% of the test set)
63 |         gammas = np.logspace(-7, 2, num=10, base=2)
64 |         best_auc = 0.0
65 | 
66 |         # Sample hold-out set from test set
67 |         _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
68 | 
69 |         X_test = ()
70 |         labels = []
71 |         for data in test_loader:
72 |             inputs, label_batch, _, _ = data
73 |             inputs, label_batch = inputs.to(device), label_batch.to(device)
74 |             if self.hybrid:
75 |                 inputs = self.ae_net.encoder(inputs)  # in hybrid approach, take code representation of AE as features
76 |             X_batch = inputs.view(inputs.size(0), -1)  # X_batch.shape = (batch_size, n_channels * height * width)
77 |             X_test += (X_batch.cpu().data.numpy(),)
78 |             labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
79 |         X_test, labels = np.concatenate(X_test), np.array(labels)
80 |         n_test, n_normal, n_outlier = len(X_test), np.sum(labels == 0), np.sum(labels == 1)
81 |         n_val = int(0.1 * n_test)
82 |         n_val_normal, n_val_outlier = int(n_val * (n_normal/n_test)), int(n_val * (n_outlier/n_test))
83 |         perm = np.random.permutation(n_test)
84 |         X_val = np.concatenate((X_test[perm][labels[perm] == 0][:n_val_normal],
85 |                                 X_test[perm][labels[perm] == 1][:n_val_outlier]))
86 |         labels = np.array([0] * n_val_normal + [1] * n_val_outlier)
87 | 
88 |         i = 1
89 |         for gamma in gammas:
90 | 
91 |             # Model candidate
92 |             model = OneClassSVM(kernel=self.kernel, nu=self.nu, gamma=gamma)
93 | 
94 |             # Train
95 |             start_time = time.time()
96 |             model.fit(X)
97 |             train_time = time.time() - start_time
98 | 
99 |             # Test on small hold-out set from test set
100 |             scores = (-1.0) * model.decision_function(X_val)
101 |             scores = scores.flatten()
102 | 
103 |             # Compute AUC
104 |             auc = roc_auc_score(labels, scores)
105 | 
106 |             logger.info(f'  | Model {i:02}/{len(gammas):02} | Gamma: {gamma:.8f} | Train Time: {train_time:.3f}s '
107 |                         f'| Val AUC: {100. * auc:.2f} |')
108 | 
109 |             if auc > best_auc:
110 |                 best_auc = auc
111 |                 self.model = model
112 |                 self.gamma = gamma
113 |                 self.results['train_time'] = train_time
114 | 
115 |             i += 1
116 | 
117 |         # If hybrid, also train a model with linear kernel
118 |         if self.hybrid:
119 |             self.linear_model = OneClassSVM(kernel='linear', nu=self.nu)
120 |             start_time = time.time()
121 |             self.linear_model.fit(X)
122 |             train_time = time.time() - start_time
123 |             self.results['train_time_linear'] = train_time
124 | 
125 |         logger.info(f'Best Model: | Gamma: {self.gamma:.8f} | AUC: {100. 
* best_auc:.2f}') 126 | logger.info('Training Time: {:.3f}s'.format(self.results['train_time'])) 127 | logger.info('Finished training.') 128 | 129 | def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0): 130 | """Tests the OC-SVM model on the test data.""" 131 | logger = logging.getLogger() 132 | 133 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader) 134 | 135 | # Get data from loader 136 | idx_label_score = [] 137 | X = () 138 | idxs = [] 139 | labels = [] 140 | for data in test_loader: 141 | inputs, label_batch, _, idx = data 142 | inputs, label_batch, idx = inputs.to(device), label_batch.to(device), idx.to(device) 143 | if self.hybrid: 144 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features 145 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width) 146 | X += (X_batch.cpu().data.numpy(),) 147 | idxs += idx.cpu().data.numpy().astype(np.int64).tolist() 148 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist() 149 | X = np.concatenate(X) 150 | 151 | # Testing 152 | logger.info('Starting testing...') 153 | start_time = time.time() 154 | 155 | scores = (-1.0) * self.model.decision_function(X) 156 | 157 | self.results['test_time'] = time.time() - start_time 158 | scores = scores.flatten() 159 | self.rho = -self.model.intercept_[0] 160 | 161 | # Save triples of (idx, label, score) in a list 162 | idx_label_score += list(zip(idxs, labels, scores.tolist())) 163 | self.results['test_scores'] = idx_label_score 164 | 165 | # Compute AUC 166 | _, labels, scores = zip(*idx_label_score) 167 | labels = np.array(labels) 168 | scores = np.array(scores) 169 | self.results['test_auc'] = roc_auc_score(labels, scores) 170 | 171 | # If hybrid, also test model with linear kernel 172 | if self.hybrid: 173 | start_time = time.time() 174 | scores_linear = (-1.0) * self.linear_model.decision_function(X) 175 | self.results['test_time_linear'] = time.time() - start_time 176 | scores_linear = scores_linear.flatten() 177 | self.results['test_auc_linear'] = roc_auc_score(labels, scores_linear) 178 | logger.info('Test AUC linear model: {:.2f}%'.format(100. * self.results['test_auc_linear'])) 179 | logger.info('Test Time linear model: {:.3f}s'.format(self.results['test_time_linear'])) 180 | 181 | # Log results 182 | logger.info('Test AUC: {:.2f}%'.format(100. 
* self.results['test_auc']))
183 |         logger.info('Test Time: {:.3f}s'.format(self.results['test_time']))
184 |         logger.info('Finished testing.')
185 | 
186 |     def load_ae(self, dataset_name, model_path):
187 |         """Load pretrained autoencoder from model_path for feature extraction in a hybrid OC-SVM model."""
188 | 
189 |         model_dict = torch.load(model_path, map_location='cpu')
190 |         ae_net_dict = model_dict['ae_net_dict']
191 |         if dataset_name in ['mnist', 'fmnist', 'cifar10']:
192 |             net_name = dataset_name + '_LeNet'
193 |         else:
194 |             net_name = dataset_name + '_mlp'
195 | 
196 |         if self.ae_net is None:
197 |             self.ae_net = build_autoencoder(net_name)
198 | 
199 |         # update keys (since there was a change in network definition)
200 |         ae_keys = list(self.ae_net.state_dict().keys())
201 |         for i in range(len(ae_net_dict)):
202 |             k, v = ae_net_dict.popitem(False)
203 |             new_key = ae_keys[i]
204 |             ae_net_dict[new_key] = v
205 | 
206 |         self.ae_net.load_state_dict(ae_net_dict)
207 |         self.ae_net.eval()
208 | 
209 |     def save_model(self, export_path):
210 |         """Save OC-SVM model to export_path."""
211 |         pass
212 | 
213 |     def load_model(self, import_path, device: str = 'cpu'):
214 |         """Load OC-SVM model from import_path."""
215 |         pass
216 | 
217 |     def save_results(self, export_json):
218 |         """Save results dict to a JSON-file."""
219 |         with open(export_json, 'w') as fp:
220 |             json.dump(self.results, fp)
221 | 
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baseline_kde.py:
--------------------------------------------------------------------------------
1 | import click
2 | import torch
3 | import logging
4 | import random
5 | import numpy as np
6 | 
7 | from utils.config import Config
8 | from utils.visualization.plot_images_grid import plot_images_grid
9 | from baselines.kde import KDE
10 | from datasets.main import load_dataset
11 | 
12 | 
13 | ################################################################################
14 | # Settings
15 | ################################################################################
16 | @click.command()
17 | @click.argument('dataset_name', type=click.Choice(['mnist', 'fmnist', 'cifar10', 'arrhythmia', 'cardio', 'satellite',
18 |                                                    'satimage-2', 'shuttle', 'thyroid']))
19 | @click.argument('xp_path', type=click.Path(exists=True))
20 | @click.argument('data_path', type=click.Path(exists=True))
21 | @click.option('--load_config', type=click.Path(exists=True), default=None,
22 |               help='Config JSON-file path (default: None).')
23 | @click.option('--load_model', type=click.Path(exists=True), default=None,
24 |               help='Model file path (default: None).')
25 | @click.option('--ratio_known_normal', type=float, default=0.0,
26 |               help='Ratio of known (labeled) normal training examples.')
27 | @click.option('--ratio_known_outlier', type=float, default=0.0,
28 |               help='Ratio of known (labeled) anomalous training examples.')
29 | @click.option('--ratio_pollution', type=float, default=0.0,
30 |               help='Pollution ratio of unlabeled training data with unknown (unlabeled) anomalies.')
31 | @click.option('--seed', type=int, default=-1, help='Set seed. 
If -1, use randomization.')
32 | @click.option('--kernel', type=click.Choice(['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']),
33 |               default='gaussian', help='Kernel for the KDE')
34 | @click.option('--grid_search_cv', type=bool, default=True,
35 |               help='Use sklearn GridSearchCV to determine optimal bandwidth')
36 | @click.option('--n_jobs_model', type=int, default=-1, help='Number of jobs for model training.')
37 | @click.option('--hybrid', type=bool, default=False,
38 |               help='Train KDE on features extracted from an autoencoder. If True, load_ae must be specified.')
39 | @click.option('--load_ae', type=click.Path(exists=True), default=None,
40 |               help='Model file path to load autoencoder weights (default: None).')
41 | @click.option('--n_jobs_dataloader', type=int, default=0,
42 |               help='Number of workers for data loading. 0 means that the data will be loaded in the main process.')
43 | @click.option('--normal_class', type=int, default=0,
44 |               help='Specify the normal class of the dataset (all other classes are considered anomalous).')
45 | @click.option('--known_outlier_class', type=int, default=1,
46 |               help='Specify the known outlier class of the dataset for semi-supervised anomaly detection.')
47 | @click.option('--n_known_outlier_classes', type=int, default=0,
48 |               help='Number of known outlier classes.'
49 |                    ' If 0, no anomalies are known.'
50 |                    ' If 1, outlier class as specified in --known_outlier_class option.'
51 |                    ' If > 1, the specified number of outlier classes will be sampled at random.')
52 | def main(dataset_name, xp_path, data_path, load_config, load_model, ratio_known_normal, ratio_known_outlier,
53 |          ratio_pollution, seed, kernel, grid_search_cv, n_jobs_model, hybrid, load_ae, n_jobs_dataloader, normal_class,
54 |          known_outlier_class, n_known_outlier_classes):
55 |     """
56 |     (Hybrid) KDE for anomaly detection.
57 | 
58 |     :arg DATASET_NAME: Name of the dataset to load.
59 |     :arg XP_PATH: Export path for logging the experiment.
60 |     :arg DATA_PATH: Root path of data.
61 |     """
62 | 
63 |     # Get configuration
64 |     cfg = Config(locals().copy())
65 | 
66 |     # Set up logging
67 |     logging.basicConfig(level=logging.INFO)
68 |     logger = logging.getLogger()
69 |     logger.setLevel(logging.INFO)
70 |     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
71 |     log_file = xp_path + '/log.txt'
72 |     file_handler = logging.FileHandler(log_file)
73 |     file_handler.setLevel(logging.INFO)
74 |     file_handler.setFormatter(formatter)
75 |     logger.addHandler(file_handler)
76 | 
77 |     # Print paths
78 |     logger.info('Log file is %s.' % log_file)
79 |     logger.info('Data path is %s.' % data_path)
80 |     logger.info('Export path is %s.' % xp_path)
81 | 
82 |     # Print experimental setup
83 |     logger.info('Dataset: %s' % dataset_name)
84 |     logger.info('Normal class: %d' % normal_class)
85 |     logger.info('Ratio of labeled normal train samples: %.2f' % ratio_known_normal)
86 |     logger.info('Ratio of labeled anomalous samples: %.2f' % ratio_known_outlier)
87 |     logger.info('Pollution ratio of unlabeled train data: %.2f' % ratio_pollution)
88 |     if n_known_outlier_classes == 1:
89 |         logger.info('Known anomaly class: %d' % known_outlier_class)
90 |     else:
91 |         logger.info('Number of known anomaly classes: %d' % n_known_outlier_classes)
92 | 
93 |     # If specified, load experiment config from JSON-file
94 |     if load_config:
95 |         cfg.load_config(import_json=load_config)
96 |         logger.info('Loaded configuration from %s.' 
% load_config) 97 | 98 | # Print KDE configuration 99 | logger.info('KDE kernel: %s' % cfg.settings['kernel']) 100 | logger.info('Use GridSearchCV for bandwidth selection: %s' % cfg.settings['grid_search_cv']) 101 | logger.info('Number of jobs for model training: %d' % n_jobs_model) 102 | logger.info('Hybrid model: %s' % cfg.settings['hybrid']) 103 | 104 | # Set seed 105 | if cfg.settings['seed'] != -1: 106 | random.seed(cfg.settings['seed']) 107 | np.random.seed(cfg.settings['seed']) 108 | torch.manual_seed(cfg.settings['seed']) 109 | torch.cuda.manual_seed(cfg.settings['seed']) 110 | torch.backends.cudnn.deterministic = True 111 | logger.info('Set seed to %d.' % cfg.settings['seed']) 112 | 113 | # Use 'cpu' as device for KDE 114 | device = 'cpu' 115 | torch.multiprocessing.set_sharing_strategy('file_system') # fix multiprocessing issue for ubuntu 116 | logger.info('Computation device: %s' % device) 117 | logger.info('Number of dataloader workers: %d' % n_jobs_dataloader) 118 | 119 | # Load data 120 | dataset = load_dataset(dataset_name, data_path, normal_class, known_outlier_class, n_known_outlier_classes, 121 | ratio_known_normal, ratio_known_outlier, ratio_pollution, 122 | random_state=np.random.RandomState(cfg.settings['seed'])) 123 | # Log random sample of known anomaly classes if more than 1 class 124 | if n_known_outlier_classes > 1: 125 | logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes,)) 126 | 127 | # Initialize KDE model 128 | kde = KDE(hybrid=cfg.settings['hybrid'], kernel=cfg.settings['kernel'], n_jobs=n_jobs_model, 129 | seed=cfg.settings['seed']) 130 | 131 | # If specified, load model parameters from already trained model 132 | if load_model: 133 | kde.load_model(import_path=load_model, device=device) 134 | logger.info('Loading model from %s.' % load_model) 135 | 136 | # If specified, load model autoencoder weights for a hybrid approach 137 | if hybrid and load_ae is not None: 138 | kde.load_ae(dataset_name, model_path=load_ae) 139 | logger.info('Loaded pretrained autoencoder for features from %s.' 
% load_ae) 140 | 141 | # Train model on dataset 142 | kde.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader, 143 | bandwidth_GridSearchCV=cfg.settings['grid_search_cv']) 144 | 145 | # Test model 146 | kde.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) 147 | 148 | # Save results and configuration 149 | kde.save_results(export_json=xp_path + '/results.json') 150 | cfg.save_config(export_json=xp_path + '/config.json') 151 | 152 | # Plot most anomalous and most normal test samples 153 | indices, labels, scores = zip(*kde.results['test_scores']) 154 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores) 155 | idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score 156 | idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score 157 | 158 | if dataset_name in ('mnist', 'fmnist', 'cifar10'): 159 | 160 | if dataset_name in ('mnist', 'fmnist'): 161 | X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1) 162 | X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1) 163 | X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1) 164 | X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1) 165 | 166 | if dataset_name == 'cifar10': 167 | X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2))) 168 | X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2))) 169 | X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2))) 170 | X_normal_high = torch.tensor( 171 | np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0, 3, 1, 2))) 172 | 173 | plot_images_grid(X_all_low, export_img=xp_path + '/all_low', padding=2) 174 | plot_images_grid(X_all_high, export_img=xp_path + '/all_high', padding=2) 175 | plot_images_grid(X_normal_low, export_img=xp_path + '/normals_low', padding=2) 176 | plot_images_grid(X_normal_high, export_img=xp_path + '/normals_high', padding=2) 177 | 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /baseline/DeepSAD/src/baseline_isoforest.py: -------------------------------------------------------------------------------- 1 | import click 2 | import torch 3 | import logging 4 | import random 5 | import numpy as np 6 | 7 | from utils.config import Config 8 | from utils.visualization.plot_images_grid import plot_images_grid 9 | from baselines.isoforest import IsoForest 10 | from datasets.main import load_dataset 11 | 12 | 13 | ################################################################################ 14 | # Settings 15 | ################################################################################ 16 | @click.command() 17 | @click.argument('dataset_name', type=click.Choice(['mnist', 'fmnist', 'cifar10', 'arrhythmia', 'cardio', 'satellite', 18 | 'satimage-2', 'shuttle', 'thyroid'])) 19 | @click.argument('xp_path', type=click.Path(exists=True)) 20 | @click.argument('data_path', type=click.Path(exists=True)) 21 | @click.option('--load_config', type=click.Path(exists=True), default=None, 22 | help='Config JSON-file path (default: None).') 23 | @click.option('--load_model', type=click.Path(exists=True), default=None, 24 | help='Model file path (default: None).') 25 | @click.option('--ratio_known_normal', type=float, default=0.0, 26 | 
26 |               help='Ratio of known (labeled) normal training examples.')
27 | @click.option('--ratio_known_outlier', type=float, default=0.0,
28 |               help='Ratio of known (labeled) anomalous training examples.')
29 | @click.option('--ratio_pollution', type=float, default=0.0,
30 |               help='Pollution ratio of unlabeled training data with unknown (unlabeled) anomalies.')
31 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.')
32 | @click.option('--n_estimators', type=int, default=100,
33 |               help='Set the number of base estimators in the ensemble (default: 100).')
34 | @click.option('--max_samples', type=int, default=256,
35 |               help='Set the number of samples drawn to train each base estimator (default: 256).')
36 | @click.option('--contamination', type=float, default=0.1,
37 |               help='Expected fraction of anomalies in the training set (default: 0.1).')
38 | @click.option('--n_jobs_model', type=int, default=-1, help='Number of jobs for model training.')
39 | @click.option('--hybrid', type=bool, default=False,
40 |               help='Train model on features extracted from an autoencoder. If True, load_ae must be specified.')
41 | @click.option('--load_ae', type=click.Path(exists=True), default=None,
42 |               help='Model file path to load autoencoder weights (default: None).')
43 | @click.option('--n_jobs_dataloader', type=int, default=0,
44 |               help='Number of workers for data loading. 0 means that the data will be loaded in the main process.')
45 | @click.option('--normal_class', type=int, default=0,
46 |               help='Specify the normal class of the dataset (all other classes are considered anomalous).')
47 | @click.option('--known_outlier_class', type=int, default=1,
48 |               help='Specify the known outlier class of the dataset for semi-supervised anomaly detection.')
49 | @click.option('--n_known_outlier_classes', type=int, default=0,
50 |               help='Number of known outlier classes. '
51 |                    'If 0, no anomalies are known. '
52 |                    'If 1, the outlier class is as specified by the --known_outlier_class option. '
53 |                    'If > 1, the specified number of outlier classes will be sampled at random.')
54 | def main(dataset_name, xp_path, data_path, load_config, load_model, ratio_known_normal, ratio_known_outlier,
55 |          ratio_pollution, seed, n_estimators, max_samples, contamination, n_jobs_model, hybrid, load_ae,
56 |          n_jobs_dataloader, normal_class, known_outlier_class, n_known_outlier_classes):
57 |     """
58 |     (Hybrid) Isolation Forest model for anomaly detection.
59 | 
60 |     :arg DATASET_NAME: Name of the dataset to load.
61 |     :arg XP_PATH: Export path for logging the experiment.
62 |     :arg DATA_PATH: Root path of data.
63 |     """
64 | 
65 |     # Get configuration
66 |     cfg = Config(locals().copy())
67 | 
68 |     # Set up logging
69 |     logging.basicConfig(level=logging.INFO)
70 |     logger = logging.getLogger()
71 |     logger.setLevel(logging.INFO)
72 |     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
73 |     log_file = xp_path + '/log.txt'
74 |     file_handler = logging.FileHandler(log_file)
75 |     file_handler.setLevel(logging.INFO)
76 |     file_handler.setFormatter(formatter)
77 |     logger.addHandler(file_handler)
78 | 
79 |     # Print paths
80 |     logger.info('Log file is %s.' % log_file)
81 |     logger.info('Data path is %s.' % data_path)
82 |     logger.info('Export path is %s.' % xp_path)
83 | 
84 |     # Print experimental setup
85 |     logger.info('Dataset: %s' % dataset_name)
86 |     logger.info('Normal class: %d' % normal_class)
87 |     logger.info('Ratio of labeled normal train samples: %.2f' % ratio_known_normal)
88 |     logger.info('Ratio of labeled anomalous samples: %.2f' % ratio_known_outlier)
89 |     logger.info('Pollution ratio of unlabeled train data: %.2f' % ratio_pollution)
90 |     if n_known_outlier_classes == 1:
91 |         logger.info('Known anomaly class: %d' % known_outlier_class)
92 |     else:
93 |         logger.info('Number of known anomaly classes: %d' % n_known_outlier_classes)
94 | 
95 |     # If specified, load experiment config from JSON-file
96 |     if load_config:
97 |         cfg.load_config(import_json=load_config)
98 |         logger.info('Loaded configuration from %s.' % load_config)
99 | 
100 |     # Print Isolation Forest configuration
101 |     logger.info('Number of base estimators in the ensemble: %d' % cfg.settings['n_estimators'])
102 |     logger.info('Number of samples for training each base estimator: %d' % cfg.settings['max_samples'])
103 |     logger.info('Contamination parameter: %.2f' % cfg.settings['contamination'])
104 |     logger.info('Number of jobs for model training: %d' % n_jobs_model)
105 |     logger.info('Hybrid model: %s' % cfg.settings['hybrid'])
106 | 
107 |     # Set seed
108 |     if cfg.settings['seed'] != -1:
109 |         random.seed(cfg.settings['seed'])
110 |         np.random.seed(cfg.settings['seed'])
111 |         torch.manual_seed(cfg.settings['seed'])
112 |         torch.cuda.manual_seed(cfg.settings['seed'])
113 |         torch.backends.cudnn.deterministic = True
114 |         logger.info('Set seed to %d.' % cfg.settings['seed'])
115 | 
116 |     # Use 'cpu' as device for Isolation Forest
117 |     device = 'cpu'
118 |     torch.multiprocessing.set_sharing_strategy('file_system')  # fix multiprocessing issue for ubuntu
119 |     logger.info('Computation device: %s' % device)
120 |     logger.info('Number of dataloader workers: %d' % n_jobs_dataloader)
121 | 
122 |     # Load data
123 |     dataset = load_dataset(dataset_name, data_path, normal_class, known_outlier_class, n_known_outlier_classes,
124 |                            ratio_known_normal, ratio_known_outlier, ratio_pollution,
125 |                            random_state=np.random.RandomState(cfg.settings['seed']))
126 |     # Log random sample of known anomaly classes if more than 1 class
127 |     if n_known_outlier_classes > 1:
128 |         logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes,))
129 | 
130 |     # Initialize Isolation Forest model
131 |     Isoforest = IsoForest(hybrid=cfg.settings['hybrid'], n_estimators=cfg.settings['n_estimators'],
132 |                           max_samples=cfg.settings['max_samples'], contamination=cfg.settings['contamination'],
133 |                           n_jobs=n_jobs_model, seed=cfg.settings['seed'])
134 | 
135 |     # If specified, load model parameters from already trained model
136 |     if load_model:
137 |         Isoforest.load_model(import_path=load_model, device=device)
138 |         logger.info('Loaded model from %s.' % load_model)
139 | 
140 |     # If specified, load model autoencoder weights for a hybrid approach
141 |     if hybrid and load_ae is not None:
142 |         Isoforest.load_ae(dataset_name, model_path=load_ae)
143 |         logger.info('Loaded pretrained autoencoder for features from %s.' % load_ae)
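    # NOTE (a hedged reading, not stated in this file): IsoForest mirrors the
    # parameters of sklearn.ensemble.IsolationForest (n_estimators, max_samples,
    # contamination, n_jobs), so it is presumably a thin wrapper around it; in
    # hybrid mode the forest is fit on autoencoder code representations instead
    # of raw inputs (see load_ae above).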
144 | 
145 |     # Train model on dataset
146 |     Isoforest.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
147 | 
148 |     # Test model
149 |     Isoforest.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
150 | 
151 |     # Save results and configuration
152 |     Isoforest.save_results(export_json=xp_path + '/results.json')
153 |     cfg.save_config(export_json=xp_path + '/config.json')
154 | 
155 |     # Plot most anomalous and most normal test samples
156 |     indices, labels, scores = zip(*Isoforest.results['test_scores'])
157 |     indices, labels, scores = np.array(indices), np.array(labels), np.array(scores)
158 |     idx_all_sorted = indices[np.argsort(scores)]  # from lowest to highest score
159 |     idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])]  # from lowest to highest score
160 | 
161 |     if dataset_name in ('mnist', 'fmnist', 'cifar10'):
162 | 
163 |         if dataset_name in ('mnist', 'fmnist'):
164 |             X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1)
165 |             X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1)
166 |             X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1)
167 |             X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1)
168 | 
169 |         if dataset_name == 'cifar10':
170 |             X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2)))
171 |             X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2)))
172 |             X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2)))
173 |             X_normal_high = torch.tensor(
174 |                 np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0, 3, 1, 2)))
175 | 
176 |         plot_images_grid(X_all_low, export_img=xp_path + '/all_low', padding=2)
177 |         plot_images_grid(X_all_high, export_img=xp_path + '/all_high', padding=2)
178 |         plot_images_grid(X_normal_low, export_img=xp_path + '/normals_low', padding=2)
179 |         plot_images_grid(X_normal_high, export_img=xp_path + '/normals_high', padding=2)
180 | 
181 | 
182 | if __name__ == '__main__':
183 |     main()
184 | 
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baselines/ssad.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import time
4 | import torch
5 | import numpy as np
6 | 
7 | from torch.utils.data import DataLoader
8 | from .shallow_ssad.ssad_convex import ConvexSSAD
9 | from sklearn.metrics import roc_auc_score
10 | from sklearn.metrics.pairwise import pairwise_kernels
11 | from base.base_dataset import BaseADDataset
12 | from networks.main import build_autoencoder
13 | 
14 | 
15 | class SSAD(object):
16 |     """
17 |     A class for kernel SSAD models as described in Goernitz et al., Toward Supervised Anomaly Detection, JAIR, 2013.
18 |     """
19 | 
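    # A minimal usage sketch (hedged: it mirrors how the baseline_* scripts in
    # this repo drive their models, and assumes `dataset` is a BaseADDataset
    # returned by datasets.main.load_dataset):
    #
    #     ssad = SSAD(kernel='rbf', kappa=1.0, Cp=1.0, Cu=1.0, Cn=1.0, hybrid=False)
    #     ssad.train(dataset, device='cpu', n_jobs_dataloader=0)
    #     ssad.test(dataset, device='cpu', n_jobs_dataloader=0)
    #     ssad.save_results(export_json='results.json')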
18 | """ 19 | 20 | def __init__(self, kernel='rbf', kappa=1.0, Cp=1.0, Cu=1.0, Cn=1.0, hybrid=False): 21 | """Init SSAD instance.""" 22 | self.kernel = kernel 23 | self.kappa = kappa 24 | self.Cp = Cp 25 | self.Cu = Cu 26 | self.Cn = Cn 27 | self.rho = None 28 | self.gamma = None 29 | 30 | self.model = None 31 | self.X_svs = None 32 | 33 | self.hybrid = hybrid 34 | self.ae_net = None # autoencoder network for the case of a hybrid model 35 | self.linear_model = None # also init a model with linear kernel if hybrid approach 36 | self.linear_X_svs = None 37 | 38 | self.results = { 39 | 'train_time': None, 40 | 'test_time': None, 41 | 'test_auc': None, 42 | 'test_scores': None, 43 | 'train_time_linear': None, 44 | 'test_time_linear': None, 45 | 'test_auc_linear': None 46 | } 47 | 48 | def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0): 49 | """Trains the SSAD model on the training data.""" 50 | logger = logging.getLogger() 51 | 52 | # do not drop last batch for non-SGD optimization shallow_ssad 53 | train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True, 54 | num_workers=n_jobs_dataloader, drop_last=False) 55 | 56 | # Get data from loader 57 | X = () 58 | semi_targets = [] 59 | for data in train_loader: 60 | inputs, _, semi_targets_batch, _ = data 61 | inputs, semi_targets_batch = inputs.to(device), semi_targets_batch.to(device) 62 | if self.hybrid: 63 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features 64 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width) 65 | X += (X_batch.cpu().data.numpy(),) 66 | semi_targets += semi_targets_batch.cpu().data.numpy().astype(np.int).tolist() 67 | X, semi_targets = np.concatenate(X), np.array(semi_targets) 68 | 69 | # Training 70 | logger.info('Starting training...') 71 | 72 | # Select model via hold-out test set of 1000 samples 73 | gammas = np.logspace(-7, 2, num=10, base=2) 74 | best_auc = 0.0 75 | 76 | # Sample hold-out set from test set 77 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader) 78 | 79 | X_test = () 80 | labels = [] 81 | for data in test_loader: 82 | inputs, label_batch, _, _ = data 83 | inputs, label_batch = inputs.to(device), label_batch.to(device) 84 | if self.hybrid: 85 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features 86 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width) 87 | X_test += (X_batch.cpu().data.numpy(),) 88 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist() 89 | X_test, labels = np.concatenate(X_test), np.array(labels) 90 | n_test, n_normal, n_outlier = len(X_test), np.sum(labels == 0), np.sum(labels == 1) 91 | n_val = int(0.1 * n_test) 92 | n_val_normal, n_val_outlier = int(n_val * (n_normal/n_test)), int(n_val * (n_outlier/n_test)) 93 | perm = np.random.permutation(n_test) 94 | X_val = np.concatenate((X_test[perm][labels[perm] == 0][:n_val_normal], 95 | X_test[perm][labels[perm] == 1][:n_val_outlier])) 96 | labels = np.array([0] * n_val_normal + [1] * n_val_outlier) 97 | 98 | i = 1 99 | for gamma in gammas: 100 | 101 | # Build the training kernel 102 | kernel = pairwise_kernels(X, X, metric=self.kernel, gamma=gamma) 103 | 104 | # Model candidate 105 | model = ConvexSSAD(kernel, semi_targets, Cp=self.Cp, Cu=self.Cu, Cn=self.Cn) 106 | 107 | # Train 108 | start_time = time.time() 109 | model.fit() 110 | 
110 |             train_time = time.time() - start_time
111 | 
112 |             # Test on small hold-out set from test set
113 |             kernel_val = pairwise_kernels(X_val, X[model.svs, :], metric=self.kernel, gamma=gamma)
114 |             scores = (-1.0) * model.apply(kernel_val)
115 |             scores = scores.flatten()
116 | 
117 |             # Compute AUC
118 |             auc = roc_auc_score(labels, scores)
119 | 
120 |             logger.info(f' | Model {i:02}/{len(gammas):02} | Gamma: {gamma:.8f} | Train Time: {train_time:.3f}s '
121 |                         f'| Val AUC: {100. * auc:.2f} |')
122 | 
123 |             if auc > best_auc:
124 |                 best_auc = auc
125 |                 self.model = model
126 |                 self.gamma = gamma
127 |                 self.results['train_time'] = train_time
128 | 
129 |             i += 1
130 | 
131 |         # Get support vectors for testing
132 |         self.X_svs = X[self.model.svs, :]
133 | 
134 |         # If hybrid, also train a model with linear kernel
135 |         if self.hybrid:
136 |             linear_kernel = pairwise_kernels(X, X, metric='linear')
137 |             self.linear_model = ConvexSSAD(linear_kernel, semi_targets, Cp=self.Cp, Cu=self.Cu, Cn=self.Cn)
138 |             start_time = time.time()
139 |             self.linear_model.fit()
140 |             train_time = time.time() - start_time
141 |             self.results['train_time_linear'] = train_time
142 |             self.linear_X_svs = X[self.linear_model.svs, :]
143 | 
144 |         logger.info(f'Best Model: | Gamma: {self.gamma:.8f} | AUC: {100. * best_auc:.2f}')
145 |         logger.info('Training Time: {:.3f}s'.format(self.results['train_time']))
146 |         logger.info('Finished training.')
147 | 
148 |     def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
149 |         """Tests the SSAD model on the test data."""
150 |         logger = logging.getLogger()
151 | 
152 |         _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
153 | 
154 |         # Get data from loader
155 |         idx_label_score = []
156 |         X = ()
157 |         idxs = []
158 |         labels = []
159 |         for data in test_loader:
160 |             inputs, label_batch, _, idx = data
161 |             inputs, label_batch, idx = inputs.to(device), label_batch.to(device), idx.to(device)
162 |             if self.hybrid:
163 |                 inputs = self.ae_net.encoder(inputs)  # in hybrid approach, take code representation of AE as features
164 |             X_batch = inputs.view(inputs.size(0), -1)  # X_batch.shape = (batch_size, n_channels * height * width)
165 |             X += (X_batch.cpu().data.numpy(),)
166 |             idxs += idx.cpu().data.numpy().astype(np.int64).tolist()
167 |             labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
168 |         X = np.concatenate(X)
169 | 
170 |         # Testing
171 |         logger.info('Starting testing...')
172 |         start_time = time.time()
173 | 
174 |         # Build kernel
175 |         kernel = pairwise_kernels(X, self.X_svs, metric=self.kernel, gamma=self.gamma)
176 | 
177 |         scores = (-1.0) * self.model.apply(kernel)
178 | 
179 |         self.results['test_time'] = time.time() - start_time
180 |         scores = scores.flatten()
181 |         self.rho = -self.model.threshold
182 | 
183 |         # Save triples of (idx, label, score) in a list
184 |         idx_label_score += list(zip(idxs, labels, scores.tolist()))
185 |         self.results['test_scores'] = idx_label_score
186 | 
187 |         # Compute AUC
188 |         _, labels, scores = zip(*idx_label_score)
189 |         labels = np.array(labels)
190 |         scores = np.array(scores)
191 |         self.results['test_auc'] = roc_auc_score(labels, scores)
192 | 
193 |         # If hybrid, also test model with linear kernel
194 |         if self.hybrid:
195 |             start_time = time.time()
196 |             linear_kernel = pairwise_kernels(X, self.linear_X_svs, metric='linear')
197 |             scores_linear = (-1.0) * self.linear_model.apply(linear_kernel)
198 |             self.results['test_time_linear'] = time.time() - start_time
199 |             scores_linear = scores_linear.flatten()
200 |             self.results['test_auc_linear'] = roc_auc_score(labels, scores_linear)
201 |             logger.info('Test AUC linear model: {:.2f}%'.format(100. * self.results['test_auc_linear']))
202 |             logger.info('Test Time linear model: {:.3f}s'.format(self.results['test_time_linear']))
203 | 
204 |         # Log results
205 |         logger.info('Test AUC: {:.2f}%'.format(100. * self.results['test_auc']))
206 |         logger.info('Test Time: {:.3f}s'.format(self.results['test_time']))
207 |         logger.info('Finished testing.')
208 | 
209 |     def load_ae(self, dataset_name, model_path):
210 |         """Load pretrained autoencoder from model_path for feature extraction in a hybrid SSAD model."""
211 | 
212 |         model_dict = torch.load(model_path, map_location='cpu')
213 |         ae_net_dict = model_dict['ae_net_dict']
214 |         if dataset_name in ['mnist', 'fmnist', 'cifar10']:
215 |             net_name = dataset_name + '_LeNet'
216 |         else:
217 |             net_name = dataset_name + '_mlp'
218 | 
219 |         if self.ae_net is None:
220 |             self.ae_net = build_autoencoder(net_name)
221 | 
222 |         # update keys (since there was a change in network definition)
223 |         ae_keys = list(self.ae_net.state_dict().keys())
224 |         for i in range(len(ae_net_dict)):
225 |             _, v = ae_net_dict.popitem(last=False)  # pop entries in insertion order
226 |             new_key = ae_keys[i]
227 |             ae_net_dict[new_key] = v
228 | 
229 |         self.ae_net.load_state_dict(ae_net_dict)
230 |         self.ae_net.eval()
231 | 
232 |     def save_model(self, export_path):
233 |         """Save SSAD model to export_path."""
234 |         pass
235 | 
236 |     def load_model(self, import_path, device: str = 'cpu'):
237 |         """Load SSAD model from import_path."""
238 |         pass
239 | 
240 |     def save_results(self, export_json):
241 |         """Save results dict to a JSON-file."""
242 |         with open(export_json, 'w') as fp:
243 |             json.dump(self.results, fp)
244 | 
--------------------------------------------------------------------------------
/myutils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import random
5 | import torch
6 | 
7 | # metric
8 | from sklearn.metrics import roc_auc_score, average_precision_score
9 | 
10 | # plot
11 | import matplotlib.pyplot as plt
12 | 
13 | # statistical analysis
14 | from scipy.stats import wilcoxon
15 | 
16 | class Utils():
17 |     def __init__(self):
18 |         pass
19 | 
20 |     # generate reproducible randomness
21 |     def set_seed(self, seed):
22 |         # os.environ['PYTHONHASHSEED'] = str(seed)
23 |         # os.environ['TF_CUDNN_DETERMINISTIC'] = 'true'
24 |         # os.environ['TF_DETERMINISTIC_OPS'] = 'true'
25 | 
26 |         # basic seed
27 |         np.random.seed(seed)
28 |         random.seed(seed)
29 | 
30 |         # pytorch seed
31 |         torch.manual_seed(seed)
32 |         torch.backends.cudnn.deterministic = True
33 |         torch.backends.cudnn.benchmark = False
34 | 
35 |     def get_device(self, gpu_specific=False):
36 |         if gpu_specific:
37 |             if torch.cuda.is_available():
38 |                 n_gpu = torch.cuda.device_count()
39 |                 print(f'number of gpu: {n_gpu}')
40 |                 print(f'cuda name: {torch.cuda.get_device_name(0)}')
41 |                 print('GPU is on')
42 |             else:
43 |                 print('GPU is off')
44 | 
45 |             device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
46 |         else:
47 |             device = torch.device("cpu")
48 |         return device
49 | 
50 |     # generate a unique value via the Cantor pairing function, e.g. unique(2, 3) = 0.5 * 5 * 6 + 3 = 18
51 |     def unique(self, a, b):
52 |         u = 0.5 * (a + b) * (a + b + 1) + b
53 |         return int(u)
54 | 
55 |     def data_description(self, X, y):
56 |         des_dict = {}
57 |         des_dict['Samples'] = X.shape[0]
58 |         des_dict['Features'] = X.shape[1]
59 |         des_dict['Anomalies'] = sum(y)
60 |         des_dict['Anomalies Ratio(%)'] = round((sum(y) / len(y)) * 100, 2)
61 | 
62 |         print(des_dict)
63 | 
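    # A worked example for metric() below (exact for this toy input, since the
    # single anomaly receives the highest score, giving a perfect ranking):
    #
    #     Utils().metric(y_true=np.array([0, 0, 1]), y_score=np.array([0.1, 0.2, 0.9]))
    #     -> {'aucroc': 1.0, 'aucpr': 1.0}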
64 |     # metric
65 |     def metric(self, y_true, y_score, pos_label=1):
66 |         aucroc = roc_auc_score(y_true=y_true, y_score=y_score)
67 |         aucpr = average_precision_score(y_true=y_true, y_score=y_score, pos_label=pos_label)
68 | 
69 |         return {'aucroc': aucroc, 'aucpr': aucpr}
70 | 
71 |     # resampling function
72 |     def sampler(self, X_train, y_train, batch_size):
73 |         index_u = np.where(y_train == 0)[0]
74 |         index_a = np.where(y_train == 1)[0]
75 | 
76 |         n = 0
77 |         while len(index_u) >= batch_size:
78 |             self.set_seed(n)
79 |             index_u_batch = np.random.choice(index_u, batch_size // 2, replace=False)
80 |             index_u = np.setdiff1d(index_u, index_u_batch)
81 | 
82 |             index_a_batch = np.random.choice(index_a, batch_size // 2, replace=True)
83 | 
84 |             # batch index
85 |             index_batch = np.append(index_u_batch, index_a_batch)
86 |             # shuffle
87 |             np.random.shuffle(index_batch)
88 | 
89 |             if n == 0:
90 |                 X_train_new = X_train[index_batch]
91 |                 y_train_new = y_train[index_batch]
92 |             else:
93 |                 X_train_new = np.append(X_train_new, X_train[index_batch], axis=0)
94 |                 y_train_new = np.append(y_train_new, y_train[index_batch])
95 |             n += 1
96 | 
97 |         return X_train_new, y_train_new
98 | 
99 |     def sampler_2(self, X_train, y_train, step, batch_size=512):
100 |         index_u = np.where(y_train == 0)[0]
101 |         index_a = np.where(y_train == 1)[0]
102 | 
103 |         for i in range(step):
104 |             index_u_batch = np.random.choice(index_u, batch_size // 2, replace=True)
105 |             index_a_batch = np.random.choice(index_a, batch_size // 2, replace=True)
106 | 
107 |             # batch index
108 |             index_batch = np.append(index_u_batch, index_a_batch)
109 |             # shuffle
110 |             np.random.shuffle(index_batch)
111 | 
112 |             if i == 0:
113 |                 X_train_new = X_train[index_batch]
114 |                 y_train_new = y_train[index_batch]
115 |             else:
116 |                 X_train_new = np.append(X_train_new, X_train[index_batch], axis=0)
117 |                 y_train_new = np.append(y_train_new, y_train[index_batch])
118 | 
119 |         return X_train_new, y_train_new
120 | 
121 |     # for PReNet
122 |     def sampler_pairs(self, X_train_tensor, y_train, epoch, batch_num, batch_size, s_a_a, s_a_u, s_u_u):
123 |         '''
124 |         X_train_tensor: the input X in the torch.tensor form
125 |         y_train: label in the numpy.array form
126 | 
127 |         batch_num: how many batches to generate in one epoch
128 |         batch_size: the batch size
129 |         '''
130 |         data_loader_X = []
131 |         data_loader_y = []
132 | 
133 |         index_a = np.where(y_train == 1)[0]
134 |         index_u = np.where(y_train == 0)[0]
135 | 
136 |         for i in range(batch_num):  # i.e., drop_last = True
137 |             index = []
138 | 
139 |             # pairs of (a,a); (a,u); (u,u)
140 |             for j in range(6):
141 |                 # generate unique seed and set seed
142 |                 # seed = self.unique(epoch, i)
143 |                 # seed = self.unique(seed, j)
144 |                 # self.set_seed(seed)
145 | 
146 |                 if j < 3:
147 |                     index_sub = np.random.choice(index_a, batch_size // 4, replace=True)
148 |                     index.append(list(index_sub))
149 | 
150 |                 if j == 3:
151 |                     index_sub = np.random.choice(index_u, batch_size // 4, replace=True)
152 |                     index.append(list(index_sub))
153 | 
154 |                 if j > 3:
155 |                     index_sub = np.random.choice(index_u, batch_size // 2, replace=True)
156 |                     index.append(list(index_sub))
157 | 
158 |             # index[0] + index[1] = (a,a), batch / 4
159 |             # index[2] + index[3] = (a,u), batch / 4
160 |             # index[4] + index[5] = (u,u), batch / 2
161 |             index_left = index[0] + index[2] + index[4]
162 |             index_right = index[1] + index[3] + index[5]
163 | 
164 |             X_train_tensor_left = X_train_tensor[index_left]
165 |             X_train_tensor_right = X_train_tensor[index_right]
166 | 
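            # s_a_a, s_a_u and s_u_u are the pairwise regression targets assigned
            # to (anomaly, anomaly), (anomaly, unlabeled) and (unlabeled, unlabeled)
            # pairs for PReNet-style pairwise relation learning, so each batch holds
            # batch_size//4 + batch_size//4 + batch_size//2 = batch_size labeled pairs.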
167 |             # generate labels
168 |             y_train_new = np.append(np.repeat(s_a_a, batch_size // 4), np.repeat(s_a_u, batch_size // 4))
169 |             y_train_new = np.append(y_train_new, np.repeat(s_u_u, batch_size // 2))
170 |             y_train_new = torch.from_numpy(y_train_new).float()
171 | 
172 |             # shuffle
173 |             index_shuffle = np.arange(len(y_train_new))
174 |             index_shuffle = np.random.choice(index_shuffle, len(index_shuffle), replace=False)
175 | 
176 |             X_train_tensor_left = X_train_tensor_left[index_shuffle]
177 |             X_train_tensor_right = X_train_tensor_right[index_shuffle]
178 |             y_train_new = y_train_new[index_shuffle]
179 | 
180 |             # save
181 |             data_loader_X.append([X_train_tensor_left, X_train_tensor_right])
182 |             data_loader_y.append(y_train_new)
183 | 
184 |         return data_loader_X, data_loader_y
185 | 
186 |     # gradient norm
187 |     def grad_norm(self, grad_tuple):
188 | 
189 |         grad = torch.tensor([0.0])
190 |         for i in range(len(grad_tuple)):
191 |             grad += torch.norm(grad_tuple[i])
192 | 
193 |         return grad
194 | 
195 |     # visualize the gradient flow in the network
196 |     def plot_grad_flow(self, named_parameters):
197 |         ave_grads = []
198 |         layers = []
199 |         for n, p in named_parameters:
200 |             if (p.requires_grad) and ("bias" not in n):
201 |                 layers.append(n)
202 |                 ave_grads.append(p.grad.abs().mean())
203 |         plt.plot(ave_grads, alpha=0.3, color="b")
204 |         plt.hlines(0, 0, len(ave_grads) + 1, linewidth=1, color="k")
205 |         plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
206 |         plt.xlim(xmin=0, xmax=len(ave_grads))
207 |         plt.xlabel("Layers")
208 |         plt.ylabel("average gradient")
209 |         plt.title("Gradient flow")
210 |         plt.grid(True)
211 | 
212 |     # def torch_wasserstein_loss(tensor_a, tensor_b):
213 |     #     # Compute the first Wasserstein distance between two 1D distributions.
214 |     #     return (torch_cdf_loss(tensor_a, tensor_b, p=1))
215 | 
216 |     # compute the first Wasserstein distance (W1) between two 1-D distributions via their CDFs
217 |     def torch_cdf_loss(self, tensor_a, tensor_b, p=1):
218 |         # the last dimension is the weight distribution
219 |         # p is the norm of the distance; p=1 --> first Wasserstein distance
220 |         # to get a positive weight with our normalized distribution,
221 |         # we recommend combining this loss with other difference-based losses like L1
222 | 
223 |         # normalize distributions; add 1e-14 to the divisor to avoid 0/0
224 |         tensor_a = tensor_a / (torch.sum(tensor_a, dim=-1, keepdim=True) + 1e-14)
225 |         tensor_b = tensor_b / (torch.sum(tensor_b, dim=-1, keepdim=True) + 1e-14)
226 |         # build CDFs with cumsum
227 |         cdf_tensor_a = torch.cumsum(tensor_a, dim=-1)
228 |         cdf_tensor_b = torch.cumsum(tensor_b, dim=-1)
229 | 
230 |         # choose different formulas for different norms
231 |         if p == 1:
232 |             cdf_distance = torch.sum(torch.abs((cdf_tensor_a - cdf_tensor_b)), dim=-1)
233 |         elif p == 2:
234 |             cdf_distance = torch.sqrt(torch.sum(torch.pow((cdf_tensor_a - cdf_tensor_b), 2), dim=-1))
235 |         else:
236 |             cdf_distance = torch.pow(torch.sum(torch.pow(torch.abs(cdf_tensor_a - cdf_tensor_b), p), dim=-1), 1 / p)
237 | 
238 |         cdf_loss = cdf_distance.mean()
239 |         return cdf_loss
240 | 
241 |     # compute the DevNet-style deviation loss in PyTorch
242 |     def cal_loss(self, y, y_pred, mode='devnet'):
243 |         if mode == 'devnet':
244 |             y_pred.squeeze_()
245 | 
246 |             ref = torch.randn(5000)  # sample reference scores from the standard normal distribution
247 |             dev = (y_pred - torch.mean(ref)) / torch.std(ref)
248 |             # print(f'mean:{torch.mean(ref)}, std:{torch.std(ref)}')
249 |             inlier_loss = torch.abs(dev)
250 |             outlier_loss = torch.max(5.0 - dev, torch.zeros_like(5.0 - dev))
251 | 
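            # This is the deviation loss of DevNet (Pang et al., KDD 2019):
            # scores are standardized against a N(0, 1) reference sample, inliers
            # (y == 0) are pulled toward the reference mean via |dev|, and
            # anomalies (y == 1) are pushed at least 5 reference standard
            # deviations above it (the hinge term vanishes once dev >= 5).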
252 |             loss = torch.mean((1 - y) * inlier_loss + y * outlier_loss)
253 |         else:
254 |             raise NotImplementedError
255 | 
256 |         return loss
257 | 
258 |     def result_process(self, result_show, name, std=False):
259 |         # average performance
260 |         ave_metric = np.mean(result_show, axis=0).values
261 |         std_metric = np.std(result_show, axis=0).values
262 | 
263 |         # statistical test
264 |         wilcoxon_df = pd.DataFrame(data=None, index=result_show.columns, columns=result_show.columns)
265 | 
266 |         for i in range(wilcoxon_df.shape[0]):
267 |             for j in range(wilcoxon_df.shape[1]):
268 |                 if i != j:
269 |                     wilcoxon_df.iloc[i, j] = \
270 |                         wilcoxon(result_show.iloc[:, i] - result_show.iloc[:, j], alternative='greater')[1]
271 | 
272 |         # average rank
273 |         result_show.loc['Ave.rank'] = np.mean(result_show.rank(ascending=False, method='dense', axis=1), axis=0)
274 | 
275 |         # average metric
276 |         if std:
277 |             result_show.loc['Ave.metric'] = [str(format(round(a, 3), '.3f')) + '±' + str(format(round(s, 3), '.3f'))
278 |                                              for a, s in zip(ave_metric, std_metric)]
279 |         else:
280 |             result_show.loc['Ave.metric'] = [str(format(round(a, 3), '.3f')) for a in ave_metric]
281 | 
282 | 
283 |         # the p-value of the Wilcoxon statistical test
284 |         result_show.loc['p-value'] = wilcoxon_df.loc[name].values
285 | 
286 | 
287 |         for row in result_show.index:
288 |             if row in ['Ave.rank', 'p-value']:
289 |                 result_show.loc[row, :] = [format(round(v, 2), '.2f') for v in result_show.loc[row, :].values]
290 | 
291 |         # result_show = result_show.astype('float')
292 |         # result_show = result_show.round(2)
293 | 
294 |         return result_show
--------------------------------------------------------------------------------