├── logs
│   └── .gitignore
├── datasets
│   └── .gitignore
├── pics
│   └── NNG-Mix.png
├── baseline
│   ├── DeepSAD
│   │   ├── src
│   │   │   ├── baselines
│   │   │   │   ├── shallow_ssad
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── ssad_convex.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── SemiDGM.py
│   │   │   │   ├── isoforest.py
│   │   │   │   ├── kde.py
│   │   │   │   ├── ocsvm.py
│   │   │   │   └── ssad.py
│   │   │   ├── base
│   │   │   │   ├── __init__.py
│   │   │   │   ├── torchvision_dataset.py
│   │   │   │   ├── base_net.py
│   │   │   │   ├── base_dataset.py
│   │   │   │   ├── base_trainer.py
│   │   │   │   └── odds_dataset.py
│   │   │   ├── utils
│   │   │   │   ├── __init__.py
│   │   │   │   ├── config.py
│   │   │   │   ├── visualization
│   │   │   │   │   └── plot_images_grid.py
│   │   │   │   └── misc.py
│   │   │   ├── datasets
│   │   │   │   ├── __init__.py
│   │   │   │   ├── main.py
│   │   │   │   ├── odds.py
│   │   │   │   ├── preprocessing.py
│   │   │   │   ├── mnist.py
│   │   │   │   ├── cifar10.py
│   │   │   │   └── fmnist.py
│   │   │   ├── optim
│   │   │   │   ├── __init__.py
│   │   │   │   ├── variational.py
│   │   │   │   ├── vae_trainer.py
│   │   │   │   ├── ae_trainer.py
│   │   │   │   ├── DeepSAD_trainer.py
│   │   │   │   └── SemiDGM_trainer.py
│   │   │   ├── networks
│   │   │   │   ├── __init__.py
│   │   │   │   ├── inference
│   │   │   │   │   └── distributions.py
│   │   │   │   ├── main.py
│   │   │   │   ├── layers
│   │   │   │   │   ├── stochastic.py
│   │   │   │   │   └── standard.py
│   │   │   │   ├── mnist_LeNet.py
│   │   │   │   ├── mlp.py
│   │   │   │   ├── fmnist_LeNet.py
│   │   │   │   ├── cifar10_LeNet.py
│   │   │   │   ├── dgm.py
│   │   │   │   └── vae.py
│   │   │   ├── run.py
│   │   │   ├── deepsad.py
│   │   │   ├── baseline_ocsvm.py
│   │   │   ├── baseline_ssad.py
│   │   │   ├── baseline_kde.py
│   │   │   └── baseline_isoforest.py
│   │   ├── imgs
│   │   │   └── fig1.png
│   │   ├── ae_results.json
│   │   ├── requirements.txt
│   │   ├── LICENSE
│   │   └── README.md
│   └── Supervised.py
├── requirement.txt
├── README.md
└── myutils.py
/logs/.gitignore:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/.gitignore:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pics/NNG-Mix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donghao51/NNG-Mix/HEAD/pics/NNG-Mix.png
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baselines/shallow_ssad/__init__.py:
--------------------------------------------------------------------------------
1 | from .ssad_convex import ConvexSSAD
2 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/imgs/fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donghao51/NNG-Mix/HEAD/baseline/DeepSAD/imgs/fig1.png
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | torch==1.11.0+cu113
2 | torchvision==0.12.0+cu113
3 | numpy==1.23.5
4 | pandas==1.4.2
5 | scipy==1.10.1
6 | copulas
7 | scikit-learn
8 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/ae_results.json:
--------------------------------------------------------------------------------
1 | {"train_time": 49.85617208480835, "test_aucroc": 0.45984687500000004, "test_aucpr": 0.13354349144694128, "test_time": 0.12167739868164062}
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_dataset import *
2 | from .torchvision_dataset import *
3 | from .odds_dataset import *
4 | from .base_net import *
5 | from .base_trainer import *
6 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import Config
2 | from .visualization.plot_images_grid import plot_images_grid
3 | from .misc import enumerate_discrete, log_sum_exp, binary_cross_entropy
4 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baselines/__init__.py:
--------------------------------------------------------------------------------
1 | from .SemiDGM import SemiDeepGenerativeModel
2 | from .ocsvm import OCSVM
3 | from .kde import KDE
4 | from .isoforest import IsoForest
5 | from .ssad import SSAD
6 | from .shallow_ssad.ssad_convex import ConvexSSAD
7 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import load_dataset
2 | # from .mnist import MNIST_Dataset
3 | # from .fmnist import FashionMNIST_Dataset
4 | # from .cifar10 import CIFAR10_Dataset
5 | from .odds import ODDSADDataset
6 | from .preprocessing import *
7 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/optim/__init__.py:
--------------------------------------------------------------------------------
1 | from .DeepSAD_trainer import DeepSADTrainer
2 | from .ae_trainer import AETrainer
3 | from .SemiDGM_trainer import SemiDeepGenerativeTrainer
4 | from .vae_trainer import VAETrainer
5 | from .variational import SVI, ImportanceWeightedSampler
6 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/requirements.txt:
--------------------------------------------------------------------------------
1 | Click==7.0
2 | cvxopt==1.2.3
3 | cycler==0.10.0
4 | joblib==0.13.2
5 | kiwisolver==1.1.0
6 | matplotlib==3.1.0
7 | numpy==1.16.4
8 | pandas==0.24.2
9 | Pillow==6.0.0
10 | pyparsing==2.4.0
11 | python-dateutil==2.8.0
12 | pytz==2019.1
13 | scikit-learn==0.21.2
14 | scipy==1.3.0
15 | seaborn==0.9.0
16 | six==1.12.0
17 | torch==1.1.0
18 | torchvision==0.3.0
19 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/main.py:
--------------------------------------------------------------------------------
1 | # from .mnist import MNIST_Dataset
2 | # from .fmnist import FashionMNIST_Dataset
3 | # from .cifar10 import CIFAR10_Dataset
4 | from .odds import ODDSADDataset
5 |
6 |
7 | def load_dataset(data, train=True):
8 | """Loads the dataset."""
9 |
10 | # for tabular data
11 | dataset = ODDSADDataset(data=data, train=train)
12 |
13 | return dataset
14 |
--------------------------------------------------------------------------------
/baseline/Supervised.py:
--------------------------------------------------------------------------------
1 | from sklearn.neural_network import MLPClassifier
2 |
3 | from myutils import Utils
4 |
5 | class supervised():
6 | def __init__(self, seed:int, model_name:str=None):
7 | self.seed = seed
8 | self.utils = Utils()
9 |
10 | self.model_name = model_name
11 | self.model_dict = {'MLP':MLPClassifier}
12 |
13 | def fit(self, X_train, y_train, ratio=None):
14 | self.model = self.model_dict[self.model_name](random_state=self.seed)
15 |
16 | # fitting
17 | self.model.fit(X_train, y_train)
18 |
19 | return self
20 |
21 | def predict_score(self, X):
22 | score = self.model.predict_proba(X)[:, 1]
23 | return score
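24 | 
25 | # Example usage (hypothetical; X_train, y_train, X_test are array-likes):
26 | #   clf = supervised(seed=42, model_name='MLP').fit(X_train, y_train)
27 | #   scores = clf.predict_score(X_test)  # probability of the anomaly class, in [0, 1]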
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import build_network, build_autoencoder
2 | # from .mnist_LeNet import MNIST_LeNet, MNIST_LeNet_Decoder, MNIST_LeNet_Autoencoder
3 | # from .fmnist_LeNet import FashionMNIST_LeNet, FashionMNIST_LeNet_Decoder, FashionMNIST_LeNet_Autoencoder
4 | # from .cifar10_LeNet import CIFAR10_LeNet, CIFAR10_LeNet_Decoder, CIFAR10_LeNet_Autoencoder
5 | from .mlp import MLP, MLP_Decoder, MLP_Autoencoder
6 | from .layers.stochastic import GaussianSample
7 | from .layers.standard import Standardize
8 | from .inference.distributions import log_standard_gaussian, log_gaussian, log_standard_categorical
9 | from .vae import VariationalAutoencoder, Encoder, Decoder
10 | from .dgm import DeepGenerativeModel, StackedDeepGenerativeModel
11 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/utils/config.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | class Config(object):
5 | """Base class for experimental setting/configuration."""
6 |
7 | def __init__(self, settings):
8 | self.settings = settings
9 |
10 | def load_config(self, import_json):
11 | """Load settings dict from import_json (path/filename.json) JSON-file."""
12 |
13 | with open(import_json, 'r') as fp:
14 | settings = json.load(fp)
15 |
16 | for key, value in settings.items():
17 | self.settings[key] = value
18 |
19 | def save_config(self, export_json):
20 | """Save settings dict to export_json (path/filename.json) JSON-file."""
21 |
22 | with open(export_json, 'w') as fp:
23 | json.dump(self.settings, fp)
24 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/torchvision_dataset.py:
--------------------------------------------------------------------------------
1 | from .base_dataset import BaseADDataset
2 | from torch.utils.data import DataLoader
3 |
4 |
5 | class TorchvisionDataset(BaseADDataset):
6 | """TorchvisionDataset class for datasets_cc already implemented in torchvision.datasets_cc."""
7 |
8 | def __init__(self, root: str):
9 | super().__init__(root)
10 |
11 | def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> (
12 | DataLoader, DataLoader):
13 | train_loader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=shuffle_train,
14 | num_workers=num_workers, drop_last=True)
15 | test_loader = DataLoader(dataset=self.test_set, batch_size=batch_size, shuffle=shuffle_test,
16 | num_workers=num_workers, drop_last=False)
17 | return train_loader, test_loader
18 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/base_net.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch.nn as nn
3 | import numpy as np
4 |
5 |
6 | class BaseNet(nn.Module):
7 | """Base class for all neural networks."""
8 |
9 | def __init__(self):
10 | super().__init__()
11 | self.logger = logging.getLogger(self.__class__.__name__)
12 | self.rep_dim = None # representation dimensionality, i.e. dim of the code layer or last layer
13 |
14 | def forward(self, *input):
15 | """
16 | Forward pass logic
17 | :return: Network output
18 | """
19 | raise NotImplementedError
20 |
21 | def summary(self):
22 | """Network summary."""
23 | net_parameters = filter(lambda p: p.requires_grad, self.parameters())
24 | params = sum([np.prod(p.size()) for p in net_parameters])
25 | self.logger.info('Trainable parameters: {}'.format(params))
26 | self.logger.info(self)
27 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/utils/visualization/plot_images_grid.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import matplotlib
3 | matplotlib.use('Agg') # or 'PS', 'PDF', 'SVG'
4 |
5 | import matplotlib.pyplot as plt
6 | import numpy as np
7 | from torchvision.utils import make_grid
8 |
9 |
10 | def plot_images_grid(x: torch.Tensor, export_img, title: str = '', nrow=8, padding=2, normalize=False, pad_value=0):
11 | """Plot 4D Tensor of images of shape (B x C x H x W) as a grid."""
12 |
13 | grid = make_grid(x, nrow=nrow, padding=padding, normalize=normalize, pad_value=pad_value)
14 | npgrid = grid.cpu().numpy()
15 |
16 | plt.imshow(np.transpose(npgrid, (1, 2, 0)), interpolation='nearest')
17 |
18 | ax = plt.gca()
19 | ax.xaxis.set_visible(False)
20 | ax.yaxis.set_visible(False)
21 |
22 | if not (title == ''):
23 | plt.title(title)
24 |
25 | plt.savefig(export_img, bbox_inches='tight', pad_inches=0.1)
26 | plt.clf()
27 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/base_dataset.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from torch.utils.data import DataLoader
3 |
4 |
5 | class BaseADDataset(ABC):
6 | """Anomaly detection dataset base class."""
7 |
8 | def __init__(self, root: str):
9 | super().__init__()
10 | self.root = root # root path to data
11 |
12 | self.n_classes = 2 # 0: normal, 1: outlier
13 | self.normal_classes = None # tuple with original class labels that define the normal class
14 | self.outlier_classes = None # tuple with original class labels that define the outlier class
15 |
16 | self.train_set = None # must be of type torch.utils.data.Dataset
17 | self.test_set = None # must be of type torch.utils.data.Dataset
18 |
19 | @abstractmethod
20 | def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> (
21 | DataLoader, DataLoader):
22 | """Implement data loaders of type torch.utils.data.DataLoader for train_set and test_set."""
23 | pass
24 |
25 | def __repr__(self):
26 | return self.__class__.__name__
27 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 lukasruff
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/base_trainer.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from .base_dataset import BaseADDataset
3 | from .base_net import BaseNet
4 |
5 |
6 | class BaseTrainer(ABC):
7 | """Trainer base class."""
8 |
9 | def __init__(self, optimizer_name: str, lr: float, n_epochs: int, lr_milestones: tuple, batch_size: int,
10 | weight_decay: float, device: str, n_jobs_dataloader: int):
11 | super().__init__()
12 | self.optimizer_name = optimizer_name
13 | self.lr = lr
14 | self.n_epochs = n_epochs
15 | self.lr_milestones = lr_milestones
16 | self.batch_size = batch_size
17 | self.weight_decay = weight_decay
18 | self.device = device
19 | self.n_jobs_dataloader = n_jobs_dataloader
20 |
21 | @abstractmethod
22 | def train(self, dataset: BaseADDataset, net: BaseNet) -> BaseNet:
23 | """
24 | Implement train method that trains the given network using the train_set of dataset.
25 | :return: Trained net
26 | """
27 | pass
28 |
29 | @abstractmethod
30 | def test(self, dataset: BaseADDataset, net: BaseNet):
31 | """
32 | Implement test method that evaluates the test_set of dataset on the given network.
33 | """
34 | pass
35 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/inference/distributions.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn.functional as F
4 |
5 |
6 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch
7 | def log_standard_gaussian(x):
8 | """
9 | Evaluates the log pdf of a standard normal distribution at x.
10 |
11 | :param x: point to evaluate
12 | :return: log N(x|0,I)
13 | """
14 | return torch.sum(-0.5 * math.log(2 * math.pi) - x ** 2 / 2, dim=-1)
15 |
16 |
17 | def log_gaussian(x, mu, log_var):
18 | """
19 | Evaluates the log pdf of a normal distribution parametrized by mu and log_var at x.
20 |
21 | :param x: point to evaluate
22 | :param mu: mean
23 | :param log_var: log variance
24 | :return: log N(x|µ,σI)
25 | """
26 | log_pdf = -0.5 * math.log(2 * math.pi) - log_var / 2 - (x - mu)**2 / (2 * torch.exp(log_var))
27 | return torch.sum(log_pdf, dim=-1)
28 |
29 |
30 | def log_standard_categorical(p):
31 | """
32 | Computes the cross-entropy between a (one-hot) categorical vector and a standard (uniform) categorical distribution.
33 | :param p: one-hot categorical distribution
34 | :return: H(p,u)
35 | """
36 | eps = 1e-8
37 | prior = F.softmax(torch.ones_like(p), dim=1) # Uniform prior over y
38 | prior.requires_grad = False
39 | cross_entropy = -torch.sum(p * torch.log(prior + eps), dim=1)
40 |
41 | return cross_entropy
42 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/main.py:
--------------------------------------------------------------------------------
1 | # from .mnist_LeNet import MNIST_LeNet, MNIST_LeNet_Autoencoder
2 | # from .fmnist_LeNet import FashionMNIST_LeNet, FashionMNIST_LeNet_Autoencoder
3 | # from .cifar10_LeNet import CIFAR10_LeNet, CIFAR10_LeNet_Autoencoder
4 | from .mlp import MLP, MLP_Autoencoder
5 | from .vae import VariationalAutoencoder
6 | from .dgm import DeepGenerativeModel, StackedDeepGenerativeModel
7 |
8 |
9 | # Note: this deviates from the original DeepSAD code, where each dataset has its own network architecture (which is weird).
10 | # The LeNet branches below require the imports commented out above; only the MLP branch is used for tabular data here.
11 | # Note: bias must be set to False, otherwise DeepSAD may suffer from mode collapse (as also mentioned in the original paper).
12 | def build_network(net_name, input_size, ae_net=None):
13 | """Builds the neural network."""
14 | net = None
15 |
16 | if net_name == 'mnist_LeNet':
17 | net = MNIST_LeNet()
18 |
19 | elif net_name == 'fmnist_LeNet':
20 | net = FashionMNIST_LeNet()
21 |
22 | elif net_name == 'cifar10_LeNet':
23 | net = CIFAR10_LeNet()
24 |
25 | else:
26 | net = MLP(x_dim=input_size, h_dims=[100, 20], rep_dim=10, bias=False)
27 |
28 | return net
29 |
30 | def build_autoencoder(net_name, input_size):
31 | """Builds the corresponding autoencoder network."""
32 | ae_net = None
33 |
34 | if net_name == 'mnist_LeNet':
35 | ae_net = MNIST_LeNet_Autoencoder()
36 |
37 | elif net_name == 'fmnist_LeNet':
38 | ae_net = FashionMNIST_LeNet_Autoencoder()
39 |
40 | elif net_name == 'cifar10_LeNet':
41 | ae_net = CIFAR10_LeNet_Autoencoder()
42 |
43 | else:
44 | ae_net = MLP_Autoencoder(x_dim=input_size, h_dims=[100, 20], rep_dim=10, bias=False)
45 |
46 | return ae_net
47 |
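48 | # Example (hypothetical call): build_network('mlp', input_size=6) returns the default
49 | # MLP with hidden dims [100, 20] and a 10-dimensional representation, since any
50 | # net_name other than the LeNet variants falls through to the MLP branch.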
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/odds.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import DataLoader, Subset
2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset
3 | from baseline.DeepSAD.src.base.odds_dataset import ODDSDataset
4 | from .preprocessing import create_semisupervised_setting
5 |
6 | import torch
7 |
8 |
9 | class ODDSADDataset(BaseADDataset):
10 |
11 | def __init__(self, data, train):
12 | super().__init__(self)
13 |
14 | # Define normal and outlier classes
15 | self.n_classes = 2 # 0: normal, 1: outlier
16 | self.normal_classes = (0,)
17 | self.outlier_classes = (1,)
18 |
19 | # training or testing dataset
20 | self.train = train
21 |
22 | if self.train:
23 | # Get training set
24 | self.train_set = ODDSDataset(data=data, train=True)
25 | else:
26 | # Get testing set
27 | self.test_set = ODDSDataset(data=data, train=False)
28 |
29 |     def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> DataLoader:
30 |         # Note: unlike the BaseADDataset signature, a single loader is returned depending on self.train.
31 |
32 | if self.train:
33 | train_loader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=shuffle_train,
34 | num_workers=num_workers, drop_last=True)
35 | return train_loader
36 | else:
37 | test_loader = DataLoader(dataset=self.test_set, batch_size=batch_size, shuffle=shuffle_test,
38 | num_workers=num_workers, drop_last=False)
39 | return test_loader
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/utils/misc.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from torch.autograd import Variable
4 |
5 |
6 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch
7 | def enumerate_discrete(x, y_dim):
8 | """
9 | Generates a 'torch.Tensor' of size batch_size x n_labels of the given label.
10 |
11 | :param x: tensor with batch size to mimic
12 | :param y_dim: number of total labels
13 |     :return: variable
14 | """
15 |
16 | def batch(batch_size, label):
17 | labels = (torch.ones(batch_size, 1) * label).type(torch.LongTensor)
18 | y = torch.zeros((batch_size, y_dim))
19 | y.scatter_(1, labels, 1)
20 | return y.type(torch.LongTensor)
21 |
22 | batch_size = x.size(0)
23 | generated = torch.cat([batch(batch_size, i) for i in range(y_dim)])
24 |
25 | if x.is_cuda:
26 | generated = generated.to(x.device)
27 |
28 | return Variable(generated.float())
29 |
30 |
31 | def log_sum_exp(tensor, dim=-1, sum_op=torch.sum):
32 | """
33 | Uses the LogSumExp (LSE) as an approximation for the sum in a log-domain.
34 |
35 | :param tensor: Tensor to compute LSE over
36 | :param dim: dimension to perform operation over
37 | :param sum_op: reductive operation to be applied, e.g. torch.sum or torch.mean
38 | :return: LSE
39 | """
40 | max, _ = torch.max(tensor, dim=dim, keepdim=True)
41 | return torch.log(sum_op(torch.exp(tensor - max), dim=dim, keepdim=True) + 1e-8) + max
42 |
43 |
44 | def binary_cross_entropy(x, y):
45 | eps = 1e-8
46 | return -torch.sum(y * torch.log(x + eps) + (1 - y) * torch.log(1 - x + eps), dim=-1)
47 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/layers/stochastic.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from torch.autograd import Variable
6 |
7 |
8 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch
9 | class Stochastic(nn.Module):
10 | """
11 | Base stochastic layer that uses the reparametrization trick (Kingma and Welling, 2013) to draw a sample from a
12 | distribution parametrized by mu and log_var.
13 | """
14 |
15 | def __init__(self):
16 | super(Stochastic, self).__init__()
17 |
18 | def reparametrize(self, mu, log_var):
19 | epsilon = Variable(torch.randn(mu.size()), requires_grad=False)
20 |
21 | if mu.is_cuda:
22 | epsilon = epsilon.to(mu.device)
23 |
24 | # log_std = 0.5 * log_var
25 | # std = exp(log_std)
26 | std = log_var.mul(0.5).exp_()
27 |
28 | # z = std * epsilon + mu
29 | z = mu.addcmul(std, epsilon)
30 |
31 | return z
32 |
33 | def forward(self, x):
34 | raise NotImplementedError
35 |
36 |
37 | class GaussianSample(Stochastic):
38 | """
39 | Layer that represents a sample from a Gaussian distribution.
40 | """
41 |
42 | def __init__(self, in_features, out_features):
43 | super(GaussianSample, self).__init__()
44 | self.in_features = in_features
45 | self.out_features = out_features
46 |
47 | self.mu = nn.Linear(in_features, out_features)
48 | self.log_var = nn.Linear(in_features, out_features)
49 |
50 | def forward(self, x):
51 | mu = self.mu(x)
52 | log_var = F.softplus(self.log_var(x))
53 | return self.reparametrize(mu, log_var), mu, log_var
54 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/base/odds_dataset.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from torch.utils.data import Dataset
3 | from scipy.io import loadmat
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import StandardScaler, MinMaxScaler
6 | from torchvision.datasets.utils import download_url
7 |
8 | import os
9 | import torch
10 | import pandas as pd
11 | import numpy as np
12 | 
13 |
14 |
15 | class ODDSDataset(Dataset):
16 | """
17 |     ODDSDataset class for datasets from Outlier Detection DataSets (ODDS): http://odds.cs.stonybrook.edu/
18 |
19 | Dataset class with additional targets for the semi-supervised setting and modification of __getitem__ method
20 | to also return the semi-supervised target as well as the index of a data sample.
21 | """
22 |
23 | def __init__(self, data, train=True):
24 | super(Dataset, self).__init__()
25 | self.train = train
26 |
27 | if self.train:
28 | self.data = torch.tensor(data['X_train'], dtype=torch.float32)
29 | self.targets = torch.tensor(data['y_train'], dtype=torch.int64)
30 | else:
31 | self.data = torch.tensor(data['X_test'], dtype=torch.float32)
32 | self.targets = torch.tensor(data['y_test'], dtype=torch.int64)
33 |
34 | # self.semi_targets = torch.zeros_like(self.targets)
35 | self.semi_targets = self.targets
36 |
37 | def __getitem__(self, index):
38 | """
39 | Args:
40 | index (int): Index
41 |
42 | Returns:
43 | tuple: (sample, target, semi_target, index)
44 | """
45 | sample, target, semi_target = self.data[index], int(self.targets[index]), int(self.semi_targets[index])
46 |
47 | return sample, target, semi_target, index
48 |
49 | def __len__(self):
50 | return len(self.data)
51 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/layers/standard.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from torch.nn import Module
4 | from torch.nn import init
5 | from torch.nn.parameter import Parameter
6 |
7 |
8 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch
9 | class Standardize(Module):
10 | """
11 | Applies (element-wise) standardization with trainable translation parameter μ and scale parameter σ, i.e. computes
12 | (x - μ) / σ where '/' is applied element-wise.
13 |
14 | Args:
15 | in_features: size of each input sample
16 | out_features: size of each output sample
17 | bias: If set to False, the layer will not learn a translation parameter μ.
18 | Default: ``True``
19 |
20 | Attributes:
21 | mu: the learnable translation parameter μ.
22 | std: the learnable scale parameter σ.
23 | """
24 | __constants__ = ['mu']
25 |
26 | def __init__(self, in_features, bias=True, eps=1e-6):
27 | super(Standardize, self).__init__()
28 | self.in_features = in_features
29 | self.out_features = in_features
30 | self.eps = eps
31 | self.std = Parameter(torch.Tensor(in_features))
32 | if bias:
33 | self.mu = Parameter(torch.Tensor(in_features))
34 | else:
35 | self.register_parameter('mu', None)
36 | self.reset_parameters()
37 |
38 | def reset_parameters(self):
39 | init.constant_(self.std, 1)
40 | if self.mu is not None:
41 | init.constant_(self.mu, 0)
42 |
43 | def forward(self, x):
44 | if self.mu is not None:
45 |             x = x - self.mu  # out-of-place subtraction to avoid mutating the caller's tensor
46 | x = torch.div(x, self.std + self.eps)
47 | return x
48 |
49 | def extra_repr(self):
50 | return 'in_features={}, out_features={}, bias={}'.format(
51 | self.in_features, self.out_features, self.mu is not None
52 | )
53 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/mnist_LeNet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from base.base_net import BaseNet
6 |
7 |
8 | class MNIST_LeNet(BaseNet):
9 |
10 | def __init__(self, rep_dim=32):
11 | super().__init__()
12 |
13 | self.rep_dim = rep_dim
14 | self.pool = nn.MaxPool2d(2, 2)
15 |
16 | self.conv1 = nn.Conv2d(1, 8, 5, bias=False, padding=2)
17 | self.bn1 = nn.BatchNorm2d(8, eps=1e-04, affine=False)
18 | self.conv2 = nn.Conv2d(8, 4, 5, bias=False, padding=2)
19 | self.bn2 = nn.BatchNorm2d(4, eps=1e-04, affine=False)
20 | self.fc1 = nn.Linear(4 * 7 * 7, self.rep_dim, bias=False)
21 |
22 | def forward(self, x):
23 | x = x.view(-1, 1, 28, 28)
24 | x = self.conv1(x)
25 | x = self.pool(F.leaky_relu(self.bn1(x)))
26 | x = self.conv2(x)
27 | x = self.pool(F.leaky_relu(self.bn2(x)))
28 | x = x.view(int(x.size(0)), -1)
29 | x = self.fc1(x)
30 | return x
31 |
32 |
33 | class MNIST_LeNet_Decoder(BaseNet):
34 |
35 | def __init__(self, rep_dim=32):
36 | super().__init__()
37 |
38 | self.rep_dim = rep_dim
39 |
40 | # Decoder network
41 | self.deconv1 = nn.ConvTranspose2d(2, 4, 5, bias=False, padding=2)
42 | self.bn3 = nn.BatchNorm2d(4, eps=1e-04, affine=False)
43 | self.deconv2 = nn.ConvTranspose2d(4, 8, 5, bias=False, padding=3)
44 | self.bn4 = nn.BatchNorm2d(8, eps=1e-04, affine=False)
45 | self.deconv3 = nn.ConvTranspose2d(8, 1, 5, bias=False, padding=2)
46 |
47 | def forward(self, x):
48 | x = x.view(int(x.size(0)), int(self.rep_dim / 16), 4, 4)
49 | x = F.interpolate(F.leaky_relu(x), scale_factor=2)
50 | x = self.deconv1(x)
51 | x = F.interpolate(F.leaky_relu(self.bn3(x)), scale_factor=2)
52 | x = self.deconv2(x)
53 | x = F.interpolate(F.leaky_relu(self.bn4(x)), scale_factor=2)
54 | x = self.deconv3(x)
55 | x = torch.sigmoid(x)
56 | return x
57 |
58 |
59 | class MNIST_LeNet_Autoencoder(BaseNet):
60 |
61 | def __init__(self, rep_dim=32):
62 | super().__init__()
63 |
64 | self.rep_dim = rep_dim
65 | self.encoder = MNIST_LeNet(rep_dim=rep_dim)
66 | self.decoder = MNIST_LeNet_Decoder(rep_dim=rep_dim)
67 |
68 | def forward(self, x):
69 | x = self.encoder(x)
70 | x = self.decoder(x)
71 | return x
72 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/mlp.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 |
4 | from baseline.DeepSAD.src.base.base_net import BaseNet
5 |
6 |
7 | class MLP(BaseNet):
8 |
9 | def __init__(self, x_dim, h_dims=[128, 64], rep_dim=32, bias=False):
10 | super().__init__()
11 |
12 | self.rep_dim = rep_dim
13 |
14 | neurons = [x_dim, *h_dims]
15 | layers = [Linear_BN_leakyReLU(neurons[i - 1], neurons[i], bias=bias) for i in range(1, len(neurons))]
16 |
17 | self.hidden = nn.ModuleList(layers)
18 | self.code = nn.Linear(h_dims[-1], rep_dim, bias=bias)
19 |
20 | def forward(self, x):
21 | x = x.view(int(x.size(0)), -1)
22 | for layer in self.hidden:
23 | x = layer(x)
24 | return self.code(x)
25 |
26 |
27 | class MLP_Decoder(BaseNet):
28 |
29 | def __init__(self, x_dim, h_dims=[64, 128], rep_dim=32, bias=False):
30 | super().__init__()
31 |
32 | self.rep_dim = rep_dim
33 |
34 | neurons = [rep_dim, *h_dims]
35 | layers = [Linear_BN_leakyReLU(neurons[i - 1], neurons[i], bias=bias) for i in range(1, len(neurons))]
36 |
37 | self.hidden = nn.ModuleList(layers)
38 | self.reconstruction = nn.Linear(h_dims[-1], x_dim, bias=bias)
39 | self.output_activation = nn.Sigmoid()
40 |
41 | def forward(self, x):
42 | x = x.view(int(x.size(0)), -1)
43 | for layer in self.hidden:
44 | x = layer(x)
45 | x = self.reconstruction(x)
46 | return self.output_activation(x)
47 |
48 |
49 | class MLP_Autoencoder(BaseNet):
50 |
51 | def __init__(self, x_dim, h_dims=[128, 64], rep_dim=32, bias=False):
52 | super().__init__()
53 |
54 | self.rep_dim = rep_dim
55 | self.encoder = MLP(x_dim, h_dims, rep_dim, bias)
56 | self.decoder = MLP_Decoder(x_dim, list(reversed(h_dims)), rep_dim, bias)
57 |
58 | def forward(self, x):
59 | x = self.encoder(x)
60 | x = self.decoder(x)
61 | return x
62 |
63 |
64 | class Linear_BN_leakyReLU(nn.Module):
65 | """
66 | A nn.Module that consists of a Linear layer followed by BatchNorm1d and a leaky ReLu activation
67 | """
68 |
69 | def __init__(self, in_features, out_features, bias=False, eps=1e-04):
70 | super(Linear_BN_leakyReLU, self).__init__()
71 |
72 | self.linear = nn.Linear(in_features, out_features, bias=bias)
73 | self.bn = nn.BatchNorm1d(out_features, eps=eps, affine=bias)
74 |
75 | def forward(self, x):
76 | return F.leaky_relu(self.bn(self.linear(x)))
77 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/fmnist_LeNet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from base.base_net import BaseNet
6 |
7 |
8 | class FashionMNIST_LeNet(BaseNet):
9 |
10 | def __init__(self, rep_dim=64):
11 | super().__init__()
12 |
13 | self.rep_dim = rep_dim
14 | self.pool = nn.MaxPool2d(2, 2)
15 |
16 | self.conv1 = nn.Conv2d(1, 16, 5, bias=False, padding=2)
17 | self.bn2d1 = nn.BatchNorm2d(16, eps=1e-04, affine=False)
18 | self.conv2 = nn.Conv2d(16, 32, 5, bias=False, padding=2)
19 | self.bn2d2 = nn.BatchNorm2d(32, eps=1e-04, affine=False)
20 | self.fc1 = nn.Linear(32 * 7 * 7, 128, bias=False)
21 | self.bn1d1 = nn.BatchNorm1d(128, eps=1e-04, affine=False)
22 | self.fc2 = nn.Linear(128, self.rep_dim, bias=False)
23 |
24 | def forward(self, x):
25 | x = x.view(-1, 1, 28, 28)
26 | x = self.conv1(x)
27 | x = self.pool(F.leaky_relu(self.bn2d1(x)))
28 | x = self.conv2(x)
29 | x = self.pool(F.leaky_relu(self.bn2d2(x)))
30 | x = x.view(int(x.size(0)), -1)
31 | x = F.leaky_relu(self.bn1d1(self.fc1(x)))
32 | x = self.fc2(x)
33 | return x
34 |
35 |
36 | class FashionMNIST_LeNet_Decoder(BaseNet):
37 |
38 | def __init__(self, rep_dim=64):
39 | super().__init__()
40 |
41 | self.rep_dim = rep_dim
42 |
43 | self.fc3 = nn.Linear(self.rep_dim, 128, bias=False)
44 | self.bn1d2 = nn.BatchNorm1d(128, eps=1e-04, affine=False)
45 | self.deconv1 = nn.ConvTranspose2d(8, 32, 5, bias=False, padding=2)
46 | self.bn2d3 = nn.BatchNorm2d(32, eps=1e-04, affine=False)
47 | self.deconv2 = nn.ConvTranspose2d(32, 16, 5, bias=False, padding=3)
48 | self.bn2d4 = nn.BatchNorm2d(16, eps=1e-04, affine=False)
49 | self.deconv3 = nn.ConvTranspose2d(16, 1, 5, bias=False, padding=2)
50 |
51 | def forward(self, x):
52 | x = self.bn1d2(self.fc3(x))
53 | x = x.view(int(x.size(0)), int(128 / 16), 4, 4)
54 | x = F.interpolate(F.leaky_relu(x), scale_factor=2)
55 | x = self.deconv1(x)
56 | x = F.interpolate(F.leaky_relu(self.bn2d3(x)), scale_factor=2)
57 | x = self.deconv2(x)
58 | x = F.interpolate(F.leaky_relu(self.bn2d4(x)), scale_factor=2)
59 | x = self.deconv3(x)
60 | x = torch.sigmoid(x)
61 | return x
62 |
63 |
64 | class FashionMNIST_LeNet_Autoencoder(BaseNet):
65 |
66 | def __init__(self, rep_dim=64):
67 | super().__init__()
68 |
69 | self.rep_dim = rep_dim
70 | self.encoder = FashionMNIST_LeNet(rep_dim=rep_dim)
71 | self.decoder = FashionMNIST_LeNet_Decoder(rep_dim=rep_dim)
72 |
73 | def forward(self, x):
74 | x = self.encoder(x)
75 | x = self.decoder(x)
76 | return x
77 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/optim/variational.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 | from torch import nn
5 | from itertools import repeat
6 | from baseline.DeepSAD.src.utils import enumerate_discrete, log_sum_exp
7 | from baseline.DeepSAD.src.networks import log_standard_categorical
8 |
9 |
10 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch
11 | class ImportanceWeightedSampler(object):
12 | """
13 | Importance weighted sampler (Burda et al., 2015) to be used together with SVI.
14 |
15 | :param mc: number of Monte Carlo samples
16 | :param iw: number of Importance Weighted samples
17 | """
18 |
19 | def __init__(self, mc=1, iw=1):
20 | self.mc = mc
21 | self.iw = iw
22 |
23 | def resample(self, x):
24 | return x.repeat(self.mc * self.iw, 1)
25 |
26 | def __call__(self, elbo):
27 | elbo = elbo.view(self.mc, self.iw, -1)
28 | elbo = torch.mean(log_sum_exp(elbo, dim=1, sum_op=torch.mean), dim=0)
29 | return elbo.view(-1)
30 |
31 |
32 | class SVI(nn.Module):
33 | """
34 | Stochastic variational inference (SVI) optimizer for semi-supervised learning.
35 |
36 | :param model: semi-supervised model to evaluate
37 | :param likelihood: p(x|y,z) for example BCE or MSE
38 | :param beta: warm-up/scaling of KL-term
39 | :param sampler: sampler for x and y, e.g. for Monte Carlo
40 | """
41 |
42 | base_sampler = ImportanceWeightedSampler(mc=1, iw=1)
43 |
44 | def __init__(self, model, likelihood=F.binary_cross_entropy, beta=repeat(1), sampler=base_sampler):
45 | super(SVI, self).__init__()
46 | self.model = model
47 | self.likelihood = likelihood
48 | self.sampler = sampler
49 | self.beta = beta
50 |
51 | def forward(self, x, y=None):
52 | is_labeled = False if y is None else True
53 |
54 | # Prepare for sampling
55 | xs, ys = (x, y)
56 |
57 | # Enumerate choices of label
58 | if not is_labeled:
59 | ys = enumerate_discrete(xs, self.model.y_dim)
60 | xs = xs.repeat(self.model.y_dim, 1)
61 |
62 | # Increase sampling dimension
63 | xs = self.sampler.resample(xs)
64 | ys = self.sampler.resample(ys)
65 |
66 | reconstruction = self.model(xs, ys)
67 |
68 | # p(x|y,z)
69 | likelihood = -self.likelihood(reconstruction, xs)
70 |
71 | # p(y)
72 | prior = -log_standard_categorical(ys)
73 |
74 | # Equivalent to -L(x, y)
75 | elbo = likelihood + prior - next(self.beta) * self.model.kl_divergence
76 | L = self.sampler(elbo)
77 |
78 | if is_labeled:
79 | return torch.mean(L)
80 |
81 | logits = self.model.classify(x)
82 |
83 | L = L.view_as(logits.t()).t()
84 |
85 | # Calculate entropy H(q(y|x)) and sum over all labels
86 | eps = 1e-8
87 | H = -torch.sum(torch.mul(logits, torch.log(logits + eps)), dim=-1)
88 | L = torch.sum(torch.mul(logits, L), dim=-1)
89 |
90 | # Equivalent to -U(x)
91 | U = L + H
92 |
93 | return torch.mean(U)
94 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/cifar10_LeNet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from base.base_net import BaseNet
6 |
7 |
8 | class CIFAR10_LeNet(BaseNet):
9 |
10 | def __init__(self, rep_dim=128):
11 | super().__init__()
12 |
13 | self.rep_dim = rep_dim
14 | self.pool = nn.MaxPool2d(2, 2)
15 |
16 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2)
17 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False)
18 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2)
19 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False)
20 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2)
21 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False)
22 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False)
23 |
24 | def forward(self, x):
25 | x = x.view(-1, 3, 32, 32)
26 | x = self.conv1(x)
27 | x = self.pool(F.leaky_relu(self.bn2d1(x)))
28 | x = self.conv2(x)
29 | x = self.pool(F.leaky_relu(self.bn2d2(x)))
30 | x = self.conv3(x)
31 | x = self.pool(F.leaky_relu(self.bn2d3(x)))
32 | x = x.view(int(x.size(0)), -1)
33 | x = self.fc1(x)
34 | return x
35 |
36 |
37 | class CIFAR10_LeNet_Decoder(BaseNet):
38 |
39 | def __init__(self, rep_dim=128):
40 | super().__init__()
41 |
42 | self.rep_dim = rep_dim
43 |
44 | self.deconv1 = nn.ConvTranspose2d(int(self.rep_dim / (4 * 4)), 128, 5, bias=False, padding=2)
45 | nn.init.xavier_uniform_(self.deconv1.weight, gain=nn.init.calculate_gain('leaky_relu'))
46 | self.bn2d4 = nn.BatchNorm2d(128, eps=1e-04, affine=False)
47 | self.deconv2 = nn.ConvTranspose2d(128, 64, 5, bias=False, padding=2)
48 | nn.init.xavier_uniform_(self.deconv2.weight, gain=nn.init.calculate_gain('leaky_relu'))
49 | self.bn2d5 = nn.BatchNorm2d(64, eps=1e-04, affine=False)
50 | self.deconv3 = nn.ConvTranspose2d(64, 32, 5, bias=False, padding=2)
51 | nn.init.xavier_uniform_(self.deconv3.weight, gain=nn.init.calculate_gain('leaky_relu'))
52 | self.bn2d6 = nn.BatchNorm2d(32, eps=1e-04, affine=False)
53 | self.deconv4 = nn.ConvTranspose2d(32, 3, 5, bias=False, padding=2)
54 | nn.init.xavier_uniform_(self.deconv4.weight, gain=nn.init.calculate_gain('leaky_relu'))
55 |
56 | def forward(self, x):
57 | x = x.view(int(x.size(0)), int(self.rep_dim / (4 * 4)), 4, 4)
58 | x = F.leaky_relu(x)
59 | x = self.deconv1(x)
60 | x = F.interpolate(F.leaky_relu(self.bn2d4(x)), scale_factor=2)
61 | x = self.deconv2(x)
62 | x = F.interpolate(F.leaky_relu(self.bn2d5(x)), scale_factor=2)
63 | x = self.deconv3(x)
64 | x = F.interpolate(F.leaky_relu(self.bn2d6(x)), scale_factor=2)
65 | x = self.deconv4(x)
66 | x = torch.sigmoid(x)
67 | return x
68 |
69 |
70 | class CIFAR10_LeNet_Autoencoder(BaseNet):
71 |
72 | def __init__(self, rep_dim=128):
73 | super().__init__()
74 |
75 | self.rep_dim = rep_dim
76 | self.encoder = CIFAR10_LeNet(rep_dim=rep_dim)
77 | self.decoder = CIFAR10_LeNet_Decoder(rep_dim=rep_dim)
78 |
79 | def forward(self, x):
80 | x = self.encoder(x)
81 | x = self.decoder(x)
82 | return x
83 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/preprocessing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 |
5 | def create_semisupervised_setting(labels, normal_classes, outlier_classes, known_outlier_classes,
6 | ratio_known_normal, ratio_known_outlier, ratio_pollution):
7 | """
8 | Create a semi-supervised data setting.
9 | :param labels: np.array with labels of all dataset samples
10 | :param normal_classes: tuple with normal class labels
11 | :param outlier_classes: tuple with anomaly class labels
12 | :param known_outlier_classes: tuple with known (labeled) anomaly class labels
13 | :param ratio_known_normal: the desired ratio of known (labeled) normal samples
14 | :param ratio_known_outlier: the desired ratio of known (labeled) anomalous samples
15 | :param ratio_pollution: the desired pollution ratio of the unlabeled data with unknown (unlabeled) anomalies.
16 | :return: tuple with list of sample indices, list of original labels, and list of semi-supervised labels
17 | """
18 | idx_normal = np.argwhere(np.isin(labels, normal_classes)).flatten()
19 | idx_outlier = np.argwhere(np.isin(labels, outlier_classes)).flatten()
20 | idx_known_outlier_candidates = np.argwhere(np.isin(labels, known_outlier_classes)).flatten()
21 |
22 | n_normal = len(idx_normal)
23 |
24 | # Solve system of linear equations to obtain respective number of samples
25 | a = np.array([[1, 1, 0, 0],
26 | [(1-ratio_known_normal), -ratio_known_normal, -ratio_known_normal, -ratio_known_normal],
27 | [-ratio_known_outlier, -ratio_known_outlier, -ratio_known_outlier, (1-ratio_known_outlier)],
28 | [0, -ratio_pollution, (1-ratio_pollution), 0]])
29 | b = np.array([n_normal, 0, 0, 0])
30 | x = np.linalg.solve(a, b)
31 |
32 | # Get number of samples
33 | n_known_normal = int(x[0])
34 | n_unlabeled_normal = int(x[1])
35 | n_unlabeled_outlier = int(x[2])
36 | n_known_outlier = int(x[3])
37 |
38 | # Sample indices
39 | perm_normal = np.random.permutation(n_normal)
40 | perm_outlier = np.random.permutation(len(idx_outlier))
41 | perm_known_outlier = np.random.permutation(len(idx_known_outlier_candidates))
42 |
43 | idx_known_normal = idx_normal[perm_normal[:n_known_normal]].tolist()
44 | idx_unlabeled_normal = idx_normal[perm_normal[n_known_normal:n_known_normal+n_unlabeled_normal]].tolist()
45 | idx_unlabeled_outlier = idx_outlier[perm_outlier[:n_unlabeled_outlier]].tolist()
46 | idx_known_outlier = idx_known_outlier_candidates[perm_known_outlier[:n_known_outlier]].tolist()
47 |
48 | # Get original class labels
49 | labels_known_normal = labels[idx_known_normal].tolist()
50 | labels_unlabeled_normal = labels[idx_unlabeled_normal].tolist()
51 | labels_unlabeled_outlier = labels[idx_unlabeled_outlier].tolist()
52 | labels_known_outlier = labels[idx_known_outlier].tolist()
53 |
54 | # Get semi-supervised setting labels
55 | semi_labels_known_normal = np.ones(n_known_normal).astype(np.int32).tolist()
56 | semi_labels_unlabeled_normal = np.zeros(n_unlabeled_normal).astype(np.int32).tolist()
57 | semi_labels_unlabeled_outlier = np.zeros(n_unlabeled_outlier).astype(np.int32).tolist()
58 | semi_labels_known_outlier = (-np.ones(n_known_outlier).astype(np.int32)).tolist()
59 |
60 | # Create final lists
61 | list_idx = idx_known_normal + idx_unlabeled_normal + idx_unlabeled_outlier + idx_known_outlier
62 | list_labels = labels_known_normal + labels_unlabeled_normal + labels_unlabeled_outlier + labels_known_outlier
63 | list_semi_labels = (semi_labels_known_normal + semi_labels_unlabeled_normal + semi_labels_unlabeled_outlier
64 | + semi_labels_known_outlier)
65 |
66 | return list_idx, list_labels, list_semi_labels
67 |
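68 | # Note on the linear system above: with x = (n_known_normal, n_unlabeled_normal,
69 | # n_unlabeled_outlier, n_known_outlier) and n_total = sum(x), the rows encode
70 | #   (1) n_known_normal + n_unlabeled_normal = n_normal,
71 | #   (2) n_known_normal = ratio_known_normal * n_total,
72 | #   (3) n_known_outlier = ratio_known_outlier * n_total,
73 | #   (4) n_unlabeled_outlier = ratio_pollution * (n_unlabeled_normal + n_unlabeled_outlier).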
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/mnist.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Subset
2 | from PIL import Image
3 | from torchvision.datasets import MNIST
4 | from baseline.DeepSAD.src.base.torchvision_dataset import TorchvisionDataset
5 | from .preprocessing import create_semisupervised_setting
6 |
7 | import torch
8 | import torchvision.transforms as transforms
9 | import random
10 |
11 |
12 | class MNIST_Dataset(TorchvisionDataset):
13 |
14 | def __init__(self, root: str, normal_class: int = 0, known_outlier_class: int = 1, n_known_outlier_classes: int = 0,
15 | ratio_known_normal: float = 0.0, ratio_known_outlier: float = 0.0, ratio_pollution: float = 0.0):
16 | super().__init__(root)
17 |
18 | # Define normal and outlier classes
19 | self.n_classes = 2 # 0: normal, 1: outlier
20 | self.normal_classes = tuple([normal_class])
21 | self.outlier_classes = list(range(0, 10))
22 | self.outlier_classes.remove(normal_class)
23 | self.outlier_classes = tuple(self.outlier_classes)
24 |
25 | if n_known_outlier_classes == 0:
26 | self.known_outlier_classes = ()
27 | elif n_known_outlier_classes == 1:
28 | self.known_outlier_classes = tuple([known_outlier_class])
29 | else:
30 | self.known_outlier_classes = tuple(random.sample(self.outlier_classes, n_known_outlier_classes))
31 |
32 | # MNIST preprocessing: feature scaling to [0, 1]
33 | transform = transforms.ToTensor()
34 | target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes))
35 |
36 | # Get train set
37 | train_set = MyMNIST(root=self.root, train=True, transform=transform, target_transform=target_transform,
38 | download=True)
39 |
40 | # Create semi-supervised setting
41 | idx, _, semi_targets = create_semisupervised_setting(train_set.targets.cpu().data.numpy(), self.normal_classes,
42 | self.outlier_classes, self.known_outlier_classes,
43 | ratio_known_normal, ratio_known_outlier, ratio_pollution)
44 | train_set.semi_targets[idx] = torch.tensor(semi_targets) # set respective semi-supervised labels
45 |
46 | # Subset train_set to semi-supervised setup
47 | self.train_set = Subset(train_set, idx)
48 |
49 | # Get test set
50 | self.test_set = MyMNIST(root=self.root, train=False, transform=transform, target_transform=target_transform,
51 | download=True)
52 |
53 |
54 | class MyMNIST(MNIST):
55 | """
56 | Torchvision MNIST class with additional targets for the semi-supervised setting and patch of __getitem__ method
57 | to also return the semi-supervised target as well as the index of a data sample.
58 | """
59 |
60 | def __init__(self, *args, **kwargs):
61 | super(MyMNIST, self).__init__(*args, **kwargs)
62 |
63 | self.semi_targets = torch.zeros_like(self.targets)
64 |
65 | def __getitem__(self, index):
66 | """Override the original method of the MNIST class.
67 | Args:
68 | index (int): Index
69 |
70 | Returns:
71 | tuple: (image, target, semi_target, index)
72 | """
73 | img, target, semi_target = self.data[index], int(self.targets[index]), int(self.semi_targets[index])
74 |
75 |         # doing this so that it is consistent with all other datasets
76 | # to return a PIL Image
77 | img = Image.fromarray(img.numpy(), mode='L')
78 |
79 | if self.transform is not None:
80 | img = self.transform(img)
81 |
82 | if self.target_transform is not None:
83 | target = self.target_transform(target)
84 |
85 | return img, target, semi_target, index
86 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/cifar10.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Subset
2 | from PIL import Image
3 | from torchvision.datasets import CIFAR10
4 | from base.torchvision_dataset import TorchvisionDataset
5 | from .preprocessing import create_semisupervised_setting
6 |
7 | import torch
8 | import torchvision.transforms as transforms
9 | import random
10 | import numpy as np
11 |
12 |
13 | class CIFAR10_Dataset(TorchvisionDataset):
14 |
15 | def __init__(self, root: str, normal_class: int = 5, known_outlier_class: int = 3, n_known_outlier_classes: int = 0,
16 | ratio_known_normal: float = 0.0, ratio_known_outlier: float = 0.0, ratio_pollution: float = 0.0):
17 | super().__init__(root)
18 |
19 | # Define normal and outlier classes
20 | self.n_classes = 2 # 0: normal, 1: outlier
21 | self.normal_classes = tuple([normal_class])
22 | self.outlier_classes = list(range(0, 10))
23 | self.outlier_classes.remove(normal_class)
24 | self.outlier_classes = tuple(self.outlier_classes)
25 |
26 | if n_known_outlier_classes == 0:
27 | self.known_outlier_classes = ()
28 | elif n_known_outlier_classes == 1:
29 | self.known_outlier_classes = tuple([known_outlier_class])
30 | else:
31 | self.known_outlier_classes = tuple(random.sample(self.outlier_classes, n_known_outlier_classes))
32 |
33 | # CIFAR-10 preprocessing: feature scaling to [0, 1]
34 | transform = transforms.ToTensor()
35 | target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes))
36 |
37 | # Get train set
38 | train_set = MyCIFAR10(root=self.root, train=True, transform=transform, target_transform=target_transform,
39 | download=True)
40 |
41 | # Create semi-supervised setting
42 | idx, _, semi_targets = create_semisupervised_setting(np.array(train_set.targets), self.normal_classes,
43 | self.outlier_classes, self.known_outlier_classes,
44 | ratio_known_normal, ratio_known_outlier, ratio_pollution)
45 | train_set.semi_targets[idx] = torch.tensor(semi_targets) # set respective semi-supervised labels
46 |
47 | # Subset train_set to semi-supervised setup
48 | self.train_set = Subset(train_set, idx)
49 |
50 | # Get test set
51 | self.test_set = MyCIFAR10(root=self.root, train=False, transform=transform, target_transform=target_transform,
52 | download=True)
53 |
54 |
55 | class MyCIFAR10(CIFAR10):
56 | """
57 | Torchvision CIFAR10 class with additional targets for the semi-supervised setting and patch of __getitem__ method
58 | to also return the semi-supervised target as well as the index of a data sample.
59 | """
60 |
61 | def __init__(self, *args, **kwargs):
62 | super(MyCIFAR10, self).__init__(*args, **kwargs)
63 |
64 | self.semi_targets = torch.zeros(len(self.targets), dtype=torch.int64)
65 |
66 | def __getitem__(self, index):
67 | """Override the original method of the CIFAR10 class.
68 | Args:
69 | index (int): Index
70 |
71 | Returns:
72 | tuple: (image, target, semi_target, index)
73 | """
74 | img, target, semi_target = self.data[index], self.targets[index], int(self.semi_targets[index])
75 |
76 |         # doing this so that it is consistent with all other datasets
77 | # to return a PIL Image
78 | img = Image.fromarray(img)
79 |
80 | if self.transform is not None:
81 | img = self.transform(img)
82 |
83 | if self.target_transform is not None:
84 | target = self.target_transform(target)
85 |
86 | return img, target, semi_target, index
87 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/datasets/fmnist.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Subset
2 | from PIL import Image
3 | from torchvision.datasets import FashionMNIST
4 | from base.torchvision_dataset import TorchvisionDataset
5 | from .preprocessing import create_semisupervised_setting
6 |
7 | import torch
8 | import torchvision.transforms as transforms
9 | import random
10 |
11 |
12 | class FashionMNIST_Dataset(TorchvisionDataset):
13 |
14 | def __init__(self, root: str, normal_class: int = 0, known_outlier_class: int = 1, n_known_outlier_classes: int = 0,
15 | ratio_known_normal: float = 0.0, ratio_known_outlier: float = 0.0, ratio_pollution: float = 0.0):
16 | super().__init__(root)
17 |
18 | # Define normal and outlier classes
19 | self.n_classes = 2 # 0: normal, 1: outlier
20 | self.normal_classes = tuple([normal_class])
21 | self.outlier_classes = list(range(0, 10))
22 | self.outlier_classes.remove(normal_class)
23 | self.outlier_classes = tuple(self.outlier_classes)
24 |
25 | if n_known_outlier_classes == 0:
26 | self.known_outlier_classes = ()
27 | elif n_known_outlier_classes == 1:
28 | self.known_outlier_classes = tuple([known_outlier_class])
29 | else:
30 | self.known_outlier_classes = tuple(random.sample(self.outlier_classes, n_known_outlier_classes))
31 |
32 | # FashionMNIST preprocessing: feature scaling to [0, 1]
33 | transform = transforms.ToTensor()
34 | target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes))
35 |
36 | # Get train set
37 | train_set = MyFashionMNIST(root=self.root, train=True, transform=transform, target_transform=target_transform,
38 | download=True)
39 |
40 | # Create semi-supervised setting
41 | idx, _, semi_targets = create_semisupervised_setting(train_set.targets.cpu().data.numpy(), self.normal_classes,
42 | self.outlier_classes, self.known_outlier_classes,
43 | ratio_known_normal, ratio_known_outlier, ratio_pollution)
44 | train_set.semi_targets[idx] = torch.tensor(semi_targets) # set respective semi-supervised labels
45 |
46 | # Subset train_set to semi-supervised setup
47 | self.train_set = Subset(train_set, idx)
48 |
49 | # Get test set
50 | self.test_set = MyFashionMNIST(root=self.root, train=False, transform=transform,
51 | target_transform=target_transform, download=True)
52 |
53 |
54 | class MyFashionMNIST(FashionMNIST):
55 | """
56 | Torchvision FashionMNIST class with additional targets for the semi-supervised setting and patch of __getitem__
57 | method to also return the semi-supervised target as well as the index of a data sample.
58 | """
59 |
60 | def __init__(self, *args, **kwargs):
61 | super(MyFashionMNIST, self).__init__(*args, **kwargs)
62 |
63 | self.semi_targets = torch.zeros_like(self.targets)
64 |
65 | def __getitem__(self, index):
66 | """Override the original method of the MyFashionMNIST class.
67 | Args:
68 | index (int): Index
69 |
70 | Returns:
71 | tuple: (image, target, semi_target, index)
72 | """
73 | img, target, semi_target = self.data[index], int(self.targets[index]), int(self.semi_targets[index])
74 |
75 |         # doing this so that it is consistent with all other datasets
76 | # to return a PIL Image
77 | img = Image.fromarray(img.numpy(), mode='L')
78 |
79 | if self.transform is not None:
80 | img = self.transform(img)
81 |
82 | if self.target_transform is not None:
83 | target = self.target_transform(target)
84 |
85 | return img, target, semi_target, index
86 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NNG-Mix
2 |
3 | This repository contains the implementation of the paper:
4 |
5 | **NNG-Mix: Improving Semi-supervised Anomaly Detection with Pseudo-anomaly Generation**
6 | [Hao Dong](https://sites.google.com/view/dong-hao/), [Gaëtan Frusque](https://frusquegaetan.github.io/), [Yue Zhao](https://viterbi-web.usc.edu/~yzhao010/), [Eleni Chatzi](https://chatzi.ibk.ethz.ch/about-us/people/prof-dr-eleni-chatzi.html) and [Olga Fink](https://people.epfl.ch/olga.fink?lang=en)
7 | The arXiv version of the paper is available [here](https://arxiv.org/abs/2311.11961).
8 |
9 | We investigate improving semi-supervised anomaly detection performance from a novel viewpoint: generating additional pseudo-anomalies based on the limited labeled anomalies and a large amount of unlabeled data. We introduce NNG-Mix, a simple and effective pseudo-anomaly generation algorithm that optimally utilizes information from both labeled anomalies and unlabeled data.
10 |
11 |
12 | Nearest Neighbor Gaussian Mixup (NNG-Mix) makes good use of information from both labeled anomalies and unlabeled data to generate pseudo-anomalies effectively.
13 |
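A minimal illustrative sketch of this idea, with hyperparameter names that mirror the CLI flags used below (`nn_k`, `mixup_alpha`, `mixup_beta`, `nn_mix_gaussian_std`); this is a sketch only, not the repository's implementation, which lives in `NNG_Mix.py`:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def generate_pseudo_anomalies(X_anomaly, X_unlabeled, n_gen=100, nn_k=10,
                              mixup_alpha=0.2, mixup_beta=0.2, gaussian_std=0.01, seed=0):
    """Sketch only: mix each labeled anomaly with a nearby unlabeled sample plus noise."""
    rng = np.random.default_rng(seed)
    nbrs = NearestNeighbors(n_neighbors=nn_k).fit(X_unlabeled)
    pseudo = []
    for _ in range(n_gen):
        a = X_anomaly[rng.integers(len(X_anomaly))]          # pick a labeled anomaly
        _, idx = nbrs.kneighbors(a[None, :])                 # its nn_k nearest unlabeled points
        u = X_unlabeled[rng.choice(idx[0])]                  # pick one of those neighbors
        lam = rng.beta(mixup_alpha, mixup_beta)              # mixup interpolation coefficient
        x = lam * a + (1.0 - lam) * u                        # mix anomaly and neighbor
        x = x + rng.normal(0.0, gaussian_std, size=x.shape)  # Gaussian perturbation
        pseudo.append(x)
    return np.stack(pseudo)
```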
14 | ## Dataset
15 | Download `Classical`, `CV_by_ResNet18`, and `NLP_by_BERT` from [ADBench](https://github.com/Minqi824/ADBench/tree/main/adbench/datasets) and put them under the `datasets/` folder.
16 |
17 | ## Code
18 |
19 | Change `--ratio 1.0` to `--ratio 0.5` or `--ratio 0.1` for training with 5% or 1% available labeled anomalies.
20 | ### Classical Dataset
21 |
22 |
23 |
24 |
25 | #### Train on Classical datasets with 10% available labeled anomalies using DeepSAD
26 | ```
27 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg DeepSAD --dataset Classical --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2
28 | ```
29 |
30 | #### Train on Classical datasets with 10% available labeled anomalies using MLP
31 | ```
32 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg MLP --dataset Classical --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2
33 | ```
34 |
35 |
36 |
37 | ### CV Dataset
38 |
39 |
40 |
41 |
42 | #### Train on CV with 10% available labeled anomalies using DeepSAD
43 | ```
44 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg DeepSAD --dataset CV --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2
45 | ```
46 |
47 | #### Train on CV with 10% available labeled anomalies using MLP
48 | ```
49 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg MLP --dataset CV --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.3 --mixup_alpha 0.2 --mixup_beta 0.2
50 | ```
51 |
52 |
53 |
54 |
55 | ### NLP Dataset
56 |
57 |
58 |
59 |
60 | #### Train on NLP with 10% available labeled anomalies using DeepSAD
61 | ```
62 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg DeepSAD --dataset NLP --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.01 --mixup_alpha 0.2 --mixup_beta 0.2
63 | ```
64 |
65 | #### Train on NLP with 10% available labeled anomalies using MLP
66 | ```
67 | python NNG_Mix.py --ratio 1.0 --method nng_mix --seed 0 --alg MLP --dataset NLP --nn_k 10 --nn_k_anomaly 10 --nn_mix_gaussian --nn_mix_gaussian_std 0.3 --mixup_alpha 0.2 --mixup_beta 0.2
68 | ```
69 |
70 |
71 |
72 | ## Contact
73 | If you have any questions, please send an email to donghaospurs@gmail.com.
74 |
75 | ## Citation
76 |
77 | If you find our work useful in your research please consider citing our paper:
78 |
79 | ```
80 | @article{dong2023nngmix,
81 | author = {Hao Dong and Ga{\"e}tan Frusque and Yue Zhao and Eleni Chatzi and Olga Fink},
82 | title = {{NNG-Mix: Improving Semi-supervised Anomaly Detection with Pseudo-anomaly Generation}},
83 | journal = {arXiv preprint arXiv:2311.11961},
84 | year = {2023},
85 | }
86 | ```
87 |
88 | ## Related Projects
89 |
90 | [MultiOOD](https://github.com/donghao51/MultiOOD): Scaling Out-of-Distribution Detection for Multiple Modalities
91 |
92 | ## Acknowledgement
93 |
94 | Many thanks to the excellent open-source project [ADBench](https://github.com/Minqi824/ADBench).
95 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/networks/dgm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from torch.nn import init
6 | from .vae import VariationalAutoencoder, Encoder, Decoder
7 |
8 |
9 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch
10 | class Classifier(nn.Module):
11 | """
12 | Classifier network, i.e. q(y|x), for two classes (0: normal, 1: outlier)
13 |
14 | :param net: neural network class to use (as parameter to use the same network over different methods)
15 | """
16 |
17 | def __init__(self, net, dims=None):
18 | super(Classifier, self).__init__()
19 | self.dims = dims
20 | if dims is None:
21 | self.net = net()
22 | self.logits = nn.Linear(self.net.rep_dim, 2)
23 | else:
24 | [x_dim, h_dim, y_dim] = dims
25 | self.dense = nn.Linear(x_dim, h_dim)
26 | self.logits = nn.Linear(h_dim, y_dim)
27 |
28 | def forward(self, x):
29 | if self.dims is None:
30 | x = self.net(x)
31 | else:
32 | x = F.relu(self.dense(x))
33 | x = F.softmax(self.logits(x), dim=-1)
34 | return x
35 |
36 |
37 | class DeepGenerativeModel(VariationalAutoencoder):
38 | """
39 | M2 model from the paper 'Semi-Supervised Learning with Deep Generative Models' (Kingma et al., 2014).
40 |
41 | The 'Generative semi-supervised model' (M2) is a probabilistic model that incorporates label information in both
42 | inference and generation.
43 |
44 | :param dims: dimensions of the model given by [input_dim, label_dim, latent_dim, [hidden_dims]].
45 | :param classifier_net: classifier network class to use.
46 | """
47 |
48 | def __init__(self, dims, classifier_net=None):
49 | [x_dim, self.y_dim, z_dim, h_dim] = dims
50 | super(DeepGenerativeModel, self).__init__([x_dim, z_dim, h_dim])
51 |
52 | self.encoder = Encoder([x_dim + self.y_dim, h_dim, z_dim])
53 | self.decoder = Decoder([z_dim + self.y_dim, list(reversed(h_dim)), x_dim])
54 | if classifier_net is None:
55 | self.classifier = Classifier(net=None, dims=[x_dim, h_dim[0], self.y_dim])
56 | else:
57 | self.classifier = Classifier(classifier_net)
58 |
59 | # Init linear layers
60 | for m in self.modules():
61 | if isinstance(m, nn.Linear):
62 | init.xavier_normal_(m.weight.data)
63 | if m.bias is not None:
64 | m.bias.data.zero_()
65 |
66 | def forward(self, x, y):
67 | z, q_mu, q_log_var = self.encoder(torch.cat((x, y), dim=1))
68 | self.kl_divergence = self._kld(z, (q_mu, q_log_var))
69 | rec = self.decoder(torch.cat((z, y), dim=1))
70 |
71 | return rec
72 |
73 | def classify(self, x):
74 | logits = self.classifier(x)
75 | return logits
76 |
77 | def sample(self, z, y):
78 | """
79 | Samples from the Decoder to generate an x.
80 |
81 | :param z: latent normal variable
82 | :param y: label (one-hot encoded)
83 | :return: x
84 | """
85 | y = y.float()
86 | x = self.decoder(torch.cat((z, y), dim=1))
87 | return x
88 |
89 |
90 | class StackedDeepGenerativeModel(DeepGenerativeModel):
91 | def __init__(self, dims, features):
92 | """
93 | M1+M2 model as described in (Kingma et al., 2014).
94 |
95 | :param dims: dimensions of the model given by [input_dim, label_dim, latent_dim, [hidden_dims]].
96 | :param classifier_net: classifier network class to use.
97 | :param features: a pre-trained M1 model of class 'VariationalAutoencoder' trained on the same dataset.
98 | """
99 | [x_dim, y_dim, z_dim, h_dim] = dims
100 | super(StackedDeepGenerativeModel, self).__init__([features.z_dim, y_dim, z_dim, h_dim])
101 |
102 | # Be sure to reconstruct with the same dimensions
103 | in_features = self.decoder.reconstruction.in_features
104 | self.decoder.reconstruction = nn.Linear(in_features, x_dim)
105 |
106 | # Make vae feature model untrainable by freezing parameters
107 | self.features = features
108 | self.features.train(False)
109 |
110 | for param in self.features.parameters():
111 | param.requires_grad = False
112 |
113 | def forward(self, x, y):
114 | # Sample a new latent x from the M1 model
115 | x_sample, _, _ = self.features.encoder(x)
116 |
117 | # Use the sample as new input to M2
118 | return super(StackedDeepGenerativeModel, self).forward(x_sample, y)
119 |
120 | def classify(self, x):
121 | _, x, _ = self.features.encoder(x)
122 | logits = self.classifier(x)
123 | return logits
124 |
--------------------------------------------------------------------------------
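A brief usage sketch for the M2 model above, with hypothetical dimensions and synthetic data (the import path assumes running from the repository root):

```python
import torch
from baseline.DeepSAD.src.networks.dgm import DeepGenerativeModel

# dims = [input_dim, label_dim, latent_dim, [hidden_dims]], as documented above
model = DeepGenerativeModel([784, 2, 32, [256, 128]])

x = torch.rand(16, 784)                       # flattened inputs in [0, 1]
y = torch.eye(2)[torch.randint(0, 2, (16,))]  # one-hot labels for the two classes
rec = model(x, y)                             # reconstruction; also sets model.kl_divergence
probs = model.classify(x)                     # q(y|x), shape (16, 2), rows sum to 1
```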
/baseline/DeepSAD/src/run.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import logging
3 | import random
4 | import numpy as np
5 | import pandas as pd
6 | import os
7 | from .utils.config import Config
8 | from .utils.visualization.plot_images_grid import plot_images_grid
9 | from .deepsad import deepsad
10 | from .datasets.main import load_dataset
11 | from myutils import Utils
12 |
13 | class DeepSAD():
14 | def __init__(self, seed, model_name='DeepSAD'):
15 | self.utils = Utils()
16 | self.device = self.utils.get_device() # get device
17 | self.seed = seed
18 |
19 | self.net_name = 'dense'
20 | self.xp_path = None
21 | self.load_config = None
22 | self.load_model = None
23 | self.eta = 1.0 # eta in the loss function
24 | self.optimizer_name = 'adam'
25 | self.lr = 0.001
26 | self.n_epochs = 50
27 | self.lr_milestone = [0]
28 | self.batch_size = 128
29 | self.weight_decay = 1e-6
30 | self.pretrain = True # whether to use auto-encoder for pretraining
31 | self.ae_optimizer_name = 'adam'
32 | self.ae_lr = 0.001
33 | self.ae_n_epochs = 100
34 | self.ae_lr_milestone = [0]
35 | self.ae_batch_size = 128
36 | self.ae_weight_decay = 1e-6
37 | self.num_threads = 0
38 | self.n_jobs_dataloader = 0
39 |
40 | def fit(self, X_train, y_train, ratio=None):
41 | """
42 | Deep SAD, a method for deep semi-supervised anomaly detection.
43 |
44 | :arg DATASET_NAME: Name of the dataset to load.
45 | :arg NET_NAME: Name of the neural network to use.
46 | :arg XP_PATH: Export path for logging the experiment.
47 | """
48 |
49 | # Set seed (using myutils)
50 | self.utils.set_seed(self.seed)
51 |
52 | # Set the number of threads used for parallelizing CPU operations
53 | if self.num_threads > 0:
54 | torch.set_num_threads(self.num_threads)
55 | logging.info('Computation device: %s' % self.device)
56 | logging.info('Number of threads: %d' % self.num_threads)
57 | logging.info('Number of dataloader workers: %d' % self.n_jobs_dataloader)
58 |
59 | # Load data
60 | data = {'X_train': X_train, 'y_train': y_train}
61 | dataset = load_dataset(data=data, train=True)
62 | input_size = dataset.train_set.data.size(1) #input size
63 |
64 | # Initialize DeepSAD model and set neural network phi
65 | self.deepSAD = deepsad(self.eta)
66 | self.deepSAD.set_network(self.net_name, input_size)
67 |
68 | # If specified, load Deep SAD model (center c, network weights, and possibly autoencoder weights)
69 | if self.load_model:
70 | self.deepSAD.load_model(model_path=self.load_model, load_ae=True, map_location=self.device)
71 | logging.info('Loading model from %s.' % self.load_model)
72 |
73 | logging.info('Pretraining: %s' % self.pretrain)
74 | if self.pretrain:
75 | # Pretrain model on dataset (via autoencoder)
76 | self.deepSAD.pretrain(dataset,
77 | input_size,
78 | optimizer_name=self.ae_optimizer_name,
79 | lr=self.ae_lr,
80 | n_epochs=self.ae_n_epochs,
81 | lr_milestones=self.ae_lr_milestone,
82 | batch_size=self.ae_batch_size,
83 | weight_decay=self.ae_weight_decay,
84 | device=self.device,
85 | n_jobs_dataloader=self.n_jobs_dataloader)
86 |
87 | # Train model on dataset
88 | self.deepSAD.train(dataset,
89 | optimizer_name=self.optimizer_name,
90 | lr=self.lr,
91 | n_epochs=self.n_epochs,
92 | lr_milestones=self.lr_milestone,
93 | batch_size=self.batch_size,
94 | weight_decay=self.weight_decay,
95 | device=self.device,
96 | n_jobs_dataloader=self.n_jobs_dataloader)
97 |
98 | # Save results, model, and configuration
99 | # deepSAD.save_results(export_json=xp_path + '/results.json')
100 | # deepSAD.save_model(export_model=xp_path + '/model.tar')
101 | # cfg.save_config(export_json=xp_path + '/config.json')
102 |
103 | # Plot most anomalous and most normal test samples
104 | # indices, labels, scores = zip(*deepSAD.results['test_scores'])
105 | # indices, labels, scores = np.array(indices), np.array(labels), np.array(scores)
106 | # idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score
107 | # idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score
108 |
109 | return self
110 |
111 | def predict_score(self, X):
112 | # pass randomly generated y labels for consistency
113 | dataset = load_dataset(data={'X_test': X, 'y_test': np.random.choice([0, 1], X.shape[0])}, train=False)
114 | score = self.deepSAD.test(dataset, device=self.device, n_jobs_dataloader=self.n_jobs_dataloader)
115 |
116 | return score
--------------------------------------------------------------------------------
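A minimal usage sketch for this wrapper on synthetic tabular data; the shapes and the convention that `y_train` marks labeled anomalies with 1 are assumptions based on how `load_dataset` is called above:

```python
import numpy as np
from baseline.DeepSAD.src.run import DeepSAD

X_train = np.random.randn(1000, 64).astype(np.float32)
y_train = np.zeros(1000, dtype=np.int64)
y_train[:50] = 1                      # a small set of labeled anomalies

model = DeepSAD(seed=0)
model.fit(X_train, y_train)           # AE pretraining, then Deep SAD training
scores = model.predict_score(np.random.randn(200, 64).astype(np.float32))
```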
/baseline/DeepSAD/src/networks/vae.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | from torch.nn import init
4 |
5 | from .layers.stochastic import GaussianSample
6 | from .inference.distributions import log_standard_gaussian, log_gaussian
7 |
8 |
9 | # Acknowledgements: https://github.com/wohlert/semi-supervised-pytorch
10 | class Encoder(nn.Module):
11 | """
12 | Encoder, i.e. the inference network.
13 |
14 | Attempts to infer the latent probability distribution p(z|x) from the data x by fitting a
15 | variational distribution q_φ(z|x). Returns the two parameters of the distribution (µ, log σ²).
16 |
17 | :param dims: dimensions of the network given by [input_dim, [hidden_dims], latent_dim].
18 | """
19 |
20 | def __init__(self, dims, sample_layer=GaussianSample):
21 | super(Encoder, self).__init__()
22 |
23 | [x_dim, h_dim, z_dim] = dims
24 | neurons = [x_dim, *h_dim]
25 | linear_layers = [nn.Linear(neurons[i-1], neurons[i]) for i in range(1, len(neurons))]
26 |
27 | self.hidden = nn.ModuleList(linear_layers)
28 | self.sample = sample_layer(h_dim[-1], z_dim)
29 |
30 | def forward(self, x):
31 | for layer in self.hidden:
32 | x = F.relu(layer(x))
33 | return self.sample(x)
34 |
35 |
36 | class Decoder(nn.Module):
37 | """
38 | Decoder, i.e. the generative network.
39 |
40 | Generates samples from an approximation p_θ(x|z) of the original distribution p(x)
41 | by transforming a latent representation z.
42 |
43 | :param dims: dimensions of the network given by [latent_dim, [hidden_dims], input_dim].
44 | """
45 |
46 | def __init__(self, dims):
47 | super(Decoder, self).__init__()
48 |
49 | [z_dim, h_dim, x_dim] = dims
50 | neurons = [z_dim, *h_dim]
51 | linear_layers = [nn.Linear(neurons[i-1], neurons[i]) for i in range(1, len(neurons))]
52 |
53 | self.hidden = nn.ModuleList(linear_layers)
54 | self.reconstruction = nn.Linear(h_dim[-1], x_dim)
55 | self.output_activation = nn.Sigmoid()
56 |
57 | def forward(self, x):
58 | for layer in self.hidden:
59 | x = F.relu(layer(x))
60 | return self.output_activation(self.reconstruction(x))
61 |
62 |
63 | class VariationalAutoencoder(nn.Module):
64 | """
65 | Variational Autoencoder (VAE) (Kingma and Welling, 2013) model consisting of an encoder-decoder pair for which
66 | a variational distribution is fitted to the encoder.
67 | Also known as the M1 model in (Kingma et al., 2014)
68 |
69 | :param dims: dimensions of the networks given by [input_dim, latent_dim, [hidden_dims]]. Encoder and decoder
70 | are built symmetrically.
71 | """
72 |
73 | def __init__(self, dims):
74 | super(VariationalAutoencoder, self).__init__()
75 |
76 | [x_dim, z_dim, h_dim] = dims
77 | self.z_dim = z_dim
78 | self.flow = None
79 |
80 | self.encoder = Encoder([x_dim, h_dim, z_dim])
81 | self.decoder = Decoder([z_dim, list(reversed(h_dim)), x_dim])
82 | self.kl_divergence = 0
83 |
84 | # Init linear layers
85 | for m in self.modules():
86 | if isinstance(m, nn.Linear):
87 | init.xavier_normal_(m.weight.data)
88 | if m.bias is not None:
89 | m.bias.data.zero_()
90 |
91 | def _kld(self, z, q_param, p_param=None):
92 | """
93 | Computes the KL-divergence of some latent variable z.
94 |
95 | KL(q||p) = - ∫ q(z) log [ p(z) / q(z) ] = - E_q[ log p(z) - log q(z) ]
96 |
97 | :param z: sample from q-distribution
98 | :param q_param: (mu, log_var) of the q-distribution
99 | :param p_param: (mu, log_var) of the p-distribution
100 | :return: KL(q||p)
101 | """
102 | (mu, log_var) = q_param
103 |
104 | if self.flow is not None:
105 | f_z, log_det_z = self.flow(z)
106 | qz = log_gaussian(z, mu, log_var) - sum(log_det_z)
107 | z = f_z
108 | else:
109 | qz = log_gaussian(z, mu, log_var)
110 |
111 | if p_param is None:
112 | pz = log_standard_gaussian(z)
113 | else:
114 | (mu, log_var) = p_param
115 | pz = log_gaussian(z, mu, log_var)
116 |
117 | kl = qz - pz
118 |
119 | return kl
120 |
121 | def add_flow(self, flow):
122 | self.flow = flow
123 |
124 | def forward(self, x, y=None):
125 | """
126 | Runs a forward pass on a data point through the VAE model to provide its reconstruction and the parameters of
127 | the variational approximate distribution q.
128 |
129 | :param x: input data
130 | :return: reconstructed input
131 | """
132 | z, q_mu, q_log_var = self.encoder(x)
133 | self.kl_divergence = self._kld(z, (q_mu, q_log_var))
134 | rec = self.decoder(z)
135 |
136 | return rec
137 |
138 | def sample(self, z):
139 | """
140 | Given z ~ N(0, I) generates a sample from the learned distribution based on p_θ(x|z).
141 |
142 | :param z: (torch.autograd.Variable) latent normal variable
143 | :return: (torch.autograd.Variable) generated sample
144 | """
145 | return self.decoder(z)
146 |
--------------------------------------------------------------------------------
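A short usage sketch for the VAE above, with hypothetical dimensions:

```python
import torch
from baseline.DeepSAD.src.networks.vae import VariationalAutoencoder

# dims = [input_dim, latent_dim, [hidden_dims]], as documented above
vae = VariationalAutoencoder([784, 32, [256, 128]])

x = torch.rand(8, 784)                   # inputs in [0, 1] (decoder ends in a sigmoid)
rec = vae(x)                             # reconstruction; vae.kl_divergence holds KL(q||p)
x_gen = vae.sample(torch.randn(8, 32))   # decode z ~ N(0, I) into new samples
```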
/baseline/DeepSAD/src/optim/vae_trainer.py:
--------------------------------------------------------------------------------
1 | from baseline.DeepSAD.src.base.base_trainer import BaseTrainer
2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset
3 | from baseline.DeepSAD.src.base.base_net import BaseNet
4 | from baseline.DeepSAD.src.utils.misc import binary_cross_entropy
5 | from sklearn.metrics import roc_auc_score
6 |
7 | import logging
8 | import time
9 | import torch
10 | import torch.optim as optim
11 | import numpy as np
12 |
13 |
14 | class VAETrainer(BaseTrainer):
15 |
16 | def __init__(self, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, lr_milestones: tuple = (),
17 | batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', n_jobs_dataloader: int = 0):
18 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device,
19 | n_jobs_dataloader)
20 |
21 | # Results
22 | self.train_time = None
23 | self.test_auc = None
24 | self.test_time = None
25 |
26 | def train(self, dataset: BaseADDataset, vae: BaseNet):
27 | logger = logging.getLogger()
28 |
29 | # Get train data loader
30 | train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
31 |
32 | # Set device
33 | vae = vae.to(self.device)
34 |
35 | # Set optimizer (Adam optimizer for now)
36 | optimizer = optim.Adam(vae.parameters(), lr=self.lr, weight_decay=self.weight_decay)
37 |
38 | # Set learning rate scheduler
39 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1)
40 |
41 | # Training
42 | logger.info('Starting pretraining...')
43 | start_time = time.time()
44 | vae.train()
45 | for epoch in range(self.n_epochs):
46 |
47 | scheduler.step()
48 | if epoch in self.lr_milestones:
49 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0]))
50 |
51 | epoch_loss = 0.0
52 | n_batches = 0
53 | epoch_start_time = time.time()
54 | for data in train_loader:
55 | inputs, _, _, _ = data
56 | inputs = inputs.to(self.device)
57 | inputs = inputs.view(inputs.size(0), -1)
58 |
59 | # Zero the network parameter gradients
60 | optimizer.zero_grad()
61 |
62 | # Update network parameters via backpropagation: forward + backward + optimize
63 | rec = vae(inputs)
64 |
65 | likelihood = -binary_cross_entropy(rec, inputs)
66 | elbo = likelihood - vae.kl_divergence
67 |
68 | # Overall loss
69 | loss = -torch.mean(elbo)
70 |
71 | loss.backward()
72 | optimizer.step()
73 |
74 | epoch_loss += loss.item()
75 | n_batches += 1
76 |
77 | # log epoch statistics
78 | epoch_train_time = time.time() - epoch_start_time
79 | logger.info(f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s '
80 | f'| Train Loss: {epoch_loss / n_batches:.6f} |')
81 |
82 | self.train_time = time.time() - start_time
83 | logger.info('Pretraining Time: {:.3f}s'.format(self.train_time))
84 | logger.info('Finished pretraining.')
85 |
86 | return vae
87 |
88 | def test(self, dataset: BaseADDataset, vae: BaseNet):
89 | logger = logging.getLogger()
90 |
91 | # Get test data loader
92 | _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
93 |
94 | # Set device
95 | vae = vae.to(self.device)
96 |
97 | # Testing
98 | logger.info('Starting testing...')
99 | epoch_loss = 0.0
100 | n_batches = 0
101 | start_time = time.time()
102 | idx_label_score = []
103 | vae.eval()
104 | with torch.no_grad():
105 | for data in test_loader:
106 | inputs, labels, _, idx = data
107 | inputs, labels, idx = inputs.to(self.device), labels.to(self.device), idx.to(self.device)
108 |
109 | inputs = inputs.view(inputs.size(0), -1)
110 |
111 | rec = vae(inputs)
112 | likelihood = -binary_cross_entropy(rec, inputs)
113 | scores = -likelihood # negative likelihood as anomaly score
114 |
115 | # Save triple of (idx, label, score) in a list
116 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(),
117 | labels.cpu().data.numpy().tolist(),
118 | scores.cpu().data.numpy().tolist()))
119 |
120 | # Overall loss
121 | elbo = likelihood - vae.kl_divergence
122 | loss = -torch.mean(elbo)
123 |
124 | epoch_loss += loss.item()
125 | n_batches += 1
126 |
127 | self.test_time = time.time() - start_time
128 |
129 | # Compute AUC
130 | _, labels, scores = zip(*idx_label_score)
131 | labels = np.array(labels)
132 | scores = np.array(scores)
133 | self.test_auc = roc_auc_score(labels, scores)
134 |
135 | # Log results
136 | logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches))
137 | logger.info('Test AUC: {:.2f}%'.format(100. * self.test_auc))
138 | logger.info('Test Time: {:.3f}s'.format(self.test_time))
139 | logger.info('Finished testing variational autoencoder.')
140 |
--------------------------------------------------------------------------------
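For reference, the loss assembled in `train()` above is the negative ELBO: with `likelihood = -BCE` and the KL term stored in `vae.kl_divergence`, the minimized quantity per sample is

$\mathcal{L}(x) = -\mathrm{ELBO}(x) = \mathrm{BCE}(\hat{x}, x) + \mathrm{KL}\big(q(z \mid x)\,\|\,p(z)\big),$

i.e. reconstruction error plus the divergence of the approximate posterior from the prior.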
/baseline/DeepSAD/src/optim/ae_trainer.py:
--------------------------------------------------------------------------------
1 | from baseline.DeepSAD.src.base.base_trainer import BaseTrainer
2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset
3 | from baseline.DeepSAD.src.base.base_net import BaseNet
4 | from sklearn.metrics import roc_auc_score, average_precision_score
5 |
6 | import logging
7 | import time
8 | import torch
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | import numpy as np
12 |
13 |
14 | class AETrainer(BaseTrainer):
15 |
16 | def __init__(self, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, lr_milestones: tuple = (),
17 | batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', n_jobs_dataloader: int = 0):
18 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device,
19 | n_jobs_dataloader)
20 |
21 | # Results
22 | self.train_time = None
23 | self.test_aucroc = None; self.test_aucpr = None
24 | self.test_time = None
25 |
26 | def train(self, dataset: BaseADDataset, ae_net: BaseNet):
27 | logger = logging.getLogger()
28 |
29 | # Get train data loader
30 | train_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
31 |
32 | # Set loss
33 | criterion = nn.MSELoss(reduction='none')
34 |
35 | # Set device
36 | ae_net = ae_net.to(self.device)
37 | criterion = criterion.to(self.device)
38 |
39 | # Set optimizer (Adam optimizer for now)
40 | optimizer = optim.Adam(ae_net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
41 |
42 | # Set learning rate scheduler
43 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1)
44 |
45 | # Training
46 | logger.info('Starting pretraining...')
47 | start_time = time.time()
48 | ae_net.train()
49 | for epoch in range(self.n_epochs):
50 |
51 | epoch_loss = 0.0
52 | n_batches = 0
53 | epoch_start_time = time.time()
54 | for data in train_loader:
55 | inputs, _, _, _ = data
56 | inputs = inputs.to(self.device)
57 |
58 | # Zero the network parameter gradients
59 | optimizer.zero_grad()
60 |
61 | # Update network parameters via backpropagation: forward + backward + optimize
62 | rec = ae_net(inputs)
63 | rec_loss = criterion(rec, inputs)
64 | loss = torch.mean(rec_loss)
65 | loss.backward()
66 | optimizer.step()
67 | scheduler.step()
68 | if epoch in self.lr_milestones:
69 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0]))
70 |
71 | epoch_loss += loss.item()
72 | n_batches += 1
73 |
74 | # log epoch statistics
75 | epoch_train_time = time.time() - epoch_start_time
76 | logger.info(f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s '
77 | f'| Train Loss: {epoch_loss / n_batches:.6f} |')
78 |
79 | self.train_time = time.time() - start_time
80 | logger.info('Pretraining Time: {:.3f}s'.format(self.train_time))
81 | logger.info('Finished pretraining.')
82 |
83 | return ae_net
84 |
85 | def test(self, dataset: BaseADDataset, ae_net: BaseNet):
86 | logger = logging.getLogger()
87 |
88 | # Get test data loader
89 | test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
90 |
91 | # Set loss
92 | criterion = nn.MSELoss(reduction='none')
93 |
94 | # Set device for network
95 | ae_net = ae_net.to(self.device)
96 | criterion = criterion.to(self.device)
97 |
98 | # Testing
99 | logger.info('Testing autoencoder...')
100 | epoch_loss = 0.0
101 | n_batches = 0
102 | start_time = time.time()
103 | idx_label_score = []
104 | ae_net.eval()
105 | with torch.no_grad():
106 | for data in test_loader:
107 | inputs, labels, _, idx = data
108 | inputs, labels, idx = inputs.to(self.device), labels.to(self.device), idx.to(self.device)
109 |
110 | rec = ae_net(inputs)
111 | rec_loss = criterion(rec, inputs)
112 | scores = torch.mean(rec_loss, dim=tuple(range(1, rec.dim())))
113 |
114 | # Save triple of (idx, label, score) in a list
115 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(),
116 | labels.cpu().data.numpy().tolist(),
117 | scores.cpu().data.numpy().tolist()))
118 |
119 | loss = torch.mean(rec_loss)
120 | epoch_loss += loss.item()
121 | n_batches += 1
122 |
123 | self.test_time = time.time() - start_time
124 |
125 | # Compute AUC
126 | # _, labels, scores = zip(*idx_label_score)
127 | # labels = np.array(labels)
128 | # scores = np.array(scores)
129 | # self.test_aucroc = roc_auc_score(labels, scores)
130 | # self.test_aucpr = average_precision_score(labels, scores, pos_label=1)
131 |
132 | # Log results
133 | # logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches))
134 | # logger.info('Test AUCROC: {:.2f}%'.format(100. * self.test_aucroc))
135 | # logger.info('Test AUCPR: {:.2f}%'.format(100. * self.test_aucpr))
136 | # logger.info('Test Time: {:.3f}s'.format(self.test_time))
137 | # logger.info('Finished testing autoencoder.')
138 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baselines/SemiDGM.py:
--------------------------------------------------------------------------------
1 | import json
2 | import torch
3 |
4 | from base.base_dataset import BaseADDataset
5 | from networks.main import build_network, build_autoencoder
6 | from optim import SemiDeepGenerativeTrainer, VAETrainer
7 |
8 |
9 | class SemiDeepGenerativeModel(object):
10 | """A class for the Semi-Supervised Deep Generative model (M1+M2 model).
11 |
12 | Paper: Kingma et al. (2014). Semi-supervised learning with deep generative models. In NIPS (pp. 3581-3589).
13 | Link: https://papers.nips.cc/paper/5352-semi-supervised-learning-with-deep-generative-models.pdf
14 |
15 | Attributes:
16 | net_name: A string indicating the name of the neural network to use.
17 | net: The neural network.
18 | trainer: SemiDeepGenerativeTrainer to train a Semi-Supervised Deep Generative model.
19 | optimizer_name: A string indicating the optimizer to use for training.
20 | results: A dictionary to save the results.
21 | """
22 |
23 | def __init__(self, alpha: float = 0.1):
24 | """Inits SemiDeepGenerativeModel."""
25 |
26 | self.alpha = alpha
27 |
28 | self.net_name = None
29 | self.net = None
30 |
31 | self.trainer = None
32 | self.optimizer_name = None
33 |
34 | self.vae_net = None # variational autoencoder network for pretraining
35 | self.vae_trainer = None
36 | self.vae_optimizer_name = None
37 |
38 | self.results = {
39 | 'train_time': None,
40 | 'test_auc': None,
41 | 'test_time': None,
42 | 'test_scores': None,
43 | }
44 |
45 | self.vae_results = {
46 | 'train_time': None,
47 | 'test_auc': None,
48 | 'test_time': None
49 | }
50 |
51 | def set_vae(self, net_name):
52 | """Builds the variational autoencoder network for pretraining."""
53 | self.net_name = net_name
54 | self.vae_net = build_autoencoder(self.net_name) # VAE for pretraining
55 |
56 | def set_network(self, net_name):
57 | """Builds the neural network."""
58 | self.net_name = net_name
59 | self.net = build_network(net_name, ae_net=self.vae_net) # full M1+M2 model
60 |
61 | def train(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 50,
62 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda',
63 | n_jobs_dataloader: int = 0):
64 | """Trains the Semi-Supervised Deep Generative model on the training data."""
65 |
66 | self.optimizer_name = optimizer_name
67 |
68 | self.trainer = SemiDeepGenerativeTrainer(alpha=self.alpha, optimizer_name=optimizer_name, lr=lr,
69 | n_epochs=n_epochs, lr_milestones=lr_milestones, batch_size=batch_size,
70 | weight_decay=weight_decay, device=device,
71 | n_jobs_dataloader=n_jobs_dataloader)
72 | self.net = self.trainer.train(dataset, self.net)
73 | self.results['train_time'] = self.trainer.train_time
74 |
75 | def test(self, dataset: BaseADDataset, device: str = 'cuda', n_jobs_dataloader: int = 0):
76 | """Tests the Semi-Supervised Deep Generative model on the test data."""
77 |
78 | if self.trainer is None:
79 | self.trainer = SemiDeepGenerativeTrainer(alpha=self.alpha, device=device,
80 | n_jobs_dataloader=n_jobs_dataloader)
81 |
82 | self.trainer.test(dataset, self.net)
83 | # Get results
84 | self.results['test_auc'] = self.trainer.test_auc
85 | self.results['test_time'] = self.trainer.test_time
86 | self.results['test_scores'] = self.trainer.test_scores
87 |
88 | def pretrain(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 100,
89 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda',
90 | n_jobs_dataloader: int = 0):
91 | """Pretrains a variational autoencoder (M1) for the Semi-Supervised Deep Generative model."""
92 |
93 | # Train
94 | self.vae_optimizer_name = optimizer_name
95 | self.vae_trainer = VAETrainer(optimizer_name=optimizer_name, lr=lr, n_epochs=n_epochs,
96 | lr_milestones=lr_milestones, batch_size=batch_size, weight_decay=weight_decay,
97 | device=device, n_jobs_dataloader=n_jobs_dataloader)
98 | self.vae_net = self.vae_trainer.train(dataset, self.vae_net)
99 | # Get train results
100 | self.vae_results['train_time'] = self.vae_trainer.train_time
101 |
102 | # Test
103 | self.vae_trainer.test(dataset, self.vae_net)
104 | # Get test results
105 | self.vae_results['test_auc'] = self.vae_trainer.test_auc
106 | self.vae_results['test_time'] = self.vae_trainer.test_time
107 |
108 | def save_model(self, export_model):
109 | """Save a Semi-Supervised Deep Generative model to export_model."""
110 |
111 | net_dict = self.net.state_dict()
112 | torch.save({'net_dict': net_dict}, export_model)
113 |
114 | def load_model(self, model_path):
115 | """Load a Semi-Supervised Deep Generative model from model_path."""
116 |
117 | model_dict = torch.load(model_path)
118 | self.net.load_state_dict(model_dict['net_dict'])
119 |
120 | def save_results(self, export_json):
121 | """Save results dict to a JSON-file."""
122 | with open(export_json, 'w') as fp:
123 | json.dump(self.results, fp)
124 |
125 | def save_vae_results(self, export_json):
126 | """Save variational autoencoder results dict to a JSON-file."""
127 | with open(export_json, 'w') as fp:
128 | json.dump(self.vae_results, fp)
129 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baselines/isoforest.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import time
4 | import torch
5 | import numpy as np
6 |
7 | from torch.utils.data import DataLoader
8 | from sklearn.ensemble import IsolationForest
9 | from sklearn.metrics import roc_auc_score
10 | from base.base_dataset import BaseADDataset
11 | from networks.main import build_autoencoder
12 |
13 |
14 | class IsoForest(object):
15 | """A class for Isolation Forest models."""
16 |
17 | def __init__(self, hybrid=False, n_estimators=100, max_samples='auto', contamination=0.1, n_jobs=-1, seed=None,
18 | **kwargs):
19 | """Init Isolation Forest instance."""
20 | self.n_estimators = n_estimators
21 | self.max_samples = max_samples
22 | self.contamination = contamination
23 | self.n_jobs = n_jobs
24 | self.seed = seed
25 |
26 | self.model = IsolationForest(n_estimators=n_estimators, max_samples=max_samples, contamination=contamination,
27 | n_jobs=n_jobs, random_state=seed, **kwargs)
28 |
29 | self.hybrid = hybrid
30 | self.ae_net = None # autoencoder network for the case of a hybrid model
31 |
32 | self.results = {
33 | 'train_time': None,
34 | 'test_time': None,
35 | 'test_auc': None,
36 | 'test_scores': None
37 | }
38 |
39 | def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
40 | """Trains the Isolation Forest model on the training data."""
41 | logger = logging.getLogger()
42 |
43 | # do not drop last batch for non-SGD optimization methods
44 | train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True,
45 | num_workers=n_jobs_dataloader, drop_last=False)
46 |
47 | # Get data from loader
48 | X = ()
49 | for data in train_loader:
50 | inputs, _, _, _ = data
51 | inputs = inputs.to(device)
52 | if self.hybrid:
53 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
54 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
55 | X += (X_batch.cpu().data.numpy(),)
56 | X = np.concatenate(X)
57 |
58 | # Training
59 | logger.info('Starting training...')
60 | start_time = time.time()
61 | self.model.fit(X)
62 | train_time = time.time() - start_time
63 | self.results['train_time'] = train_time
64 |
65 | logger.info('Training Time: {:.3f}s'.format(self.results['train_time']))
66 | logger.info('Finished training.')
67 |
68 | def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
69 | """Tests the Isolation Forest model on the test data."""
70 | logger = logging.getLogger()
71 |
72 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
73 |
74 | # Get data from loader
75 | idx_label_score = []
76 | X = ()
77 | idxs = []
78 | labels = []
79 | for data in test_loader:
80 | inputs, label_batch, _, idx = data
81 | inputs, label_batch, idx = inputs.to(device), label_batch.to(device), idx.to(device)
82 | if self.hybrid:
83 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
84 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
85 | X += (X_batch.cpu().data.numpy(),)
86 | idxs += idx.cpu().data.numpy().astype(np.int64).tolist()
87 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
88 | X = np.concatenate(X)
89 |
90 | # Testing
91 | logger.info('Starting testing...')
92 | start_time = time.time()
93 | scores = (-1.0) * self.model.decision_function(X)
94 | self.results['test_time'] = time.time() - start_time
95 | scores = scores.flatten()
96 |
97 | # Save triples of (idx, label, score) in a list
98 | idx_label_score += list(zip(idxs, labels, scores.tolist()))
99 | self.results['test_scores'] = idx_label_score
100 |
101 | # Compute AUC
102 | _, labels, scores = zip(*idx_label_score)
103 | labels = np.array(labels)
104 | scores = np.array(scores)
105 | self.results['test_auc'] = roc_auc_score(labels, scores)
106 |
107 | # Log results
108 | logger.info('Test AUC: {:.2f}%'.format(100. * self.results['test_auc']))
109 | logger.info('Test Time: {:.3f}s'.format(self.results['test_time']))
110 | logger.info('Finished testing.')
111 |
112 | def load_ae(self, dataset_name, model_path):
113 | """Load pretrained autoencoder from model_path for feature extraction in a hybrid Isolation Forest model."""
114 |
115 | model_dict = torch.load(model_path, map_location='cpu')
116 | ae_net_dict = model_dict['ae_net_dict']
117 | if dataset_name in ['mnist', 'fmnist', 'cifar10']:
118 | net_name = dataset_name + '_LeNet'
119 | else:
120 | net_name = dataset_name + '_mlp'
121 |
122 | if self.ae_net is None:
123 | self.ae_net = build_autoencoder(net_name)
124 |
125 | # update keys (since there was a change in network definition)
126 | ae_keys = list(self.ae_net.state_dict().keys())
127 | for i in range(len(ae_net_dict)):
128 | k, v = ae_net_dict.popitem(False)
129 | new_key = ae_keys[i]
130 | ae_net_dict[new_key] = v
131 | i += 1
132 |
133 | self.ae_net.load_state_dict(ae_net_dict)
134 | self.ae_net.eval()
135 |
136 | def save_model(self, export_path):
137 | """Save Isolation Forest model to export_path."""
138 | pass
139 |
140 | def load_model(self, import_path, device: str = 'cpu'):
141 | """Load Isolation Forest model from import_path."""
142 | pass
143 |
144 | def save_results(self, export_json):
145 | """Save results dict to a JSON-file."""
146 | with open(export_json, 'w') as fp:
147 | json.dump(self.results, fp)
148 |
--------------------------------------------------------------------------------
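The score convention used in `test()` above (`(-1.0) * decision_function`), shown standalone with plain scikit-learn on synthetic data:

```python
import numpy as np
from sklearn.ensemble import IsolationForest

X = np.random.randn(500, 8)
clf = IsolationForest(n_estimators=100, random_state=0).fit(X)
scores = -clf.decision_function(X)   # flip sign so that higher score = more anomalous
ranking = np.argsort(scores)[::-1]   # indices of the most anomalous samples first
```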
/baseline/DeepSAD/src/baselines/kde.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import time
4 | import torch
5 | import numpy as np
6 |
7 | from torch.utils.data import DataLoader
8 | from sklearn.neighbors import KernelDensity
9 | from sklearn.metrics import roc_auc_score
10 | from sklearn.metrics.pairwise import pairwise_distances
11 | from sklearn.model_selection import GridSearchCV
12 | from base.base_dataset import BaseADDataset
13 | from networks.main import build_autoencoder
14 |
15 |
16 | class KDE(object):
17 | """A class for Kernel Density Estimation models."""
18 |
19 | def __init__(self, hybrid=False, kernel='gaussian', n_jobs=-1, seed=None, **kwargs):
20 | """Init Kernel Density Estimation instance."""
21 | self.kernel = kernel
22 | self.n_jobs = n_jobs
23 | self.seed = seed
24 |
25 | self.model = KernelDensity(kernel=kernel, **kwargs)
26 | self.bandwidth = self.model.bandwidth
27 |
28 | self.hybrid = hybrid
29 | self.ae_net = None # autoencoder network for the case of a hybrid model
30 |
31 | self.results = {
32 | 'train_time': None,
33 | 'test_time': None,
34 | 'test_auc': None,
35 | 'test_scores': None
36 | }
37 |
38 | def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0,
39 | bandwidth_GridSearchCV: bool = True):
40 | """Trains the Kernel Density Estimation model on the training data."""
41 | logger = logging.getLogger()
42 |
43 | # do not drop last batch for non-SGD optimization methods
44 | train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True,
45 | num_workers=n_jobs_dataloader, drop_last=False)
46 |
47 | # Get data from loader
48 | X = ()
49 | for data in train_loader:
50 | inputs, _, _, _ = data
51 | inputs = inputs.to(device)
52 | if self.hybrid:
53 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
54 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
55 | X += (X_batch.cpu().data.numpy(),)
56 | X = np.concatenate(X)
57 |
58 | # Training
59 | logger.info('Starting training...')
60 | start_time = time.time()
61 |
62 | if bandwidth_GridSearchCV:
63 | # use grid search cross-validation to select bandwidth
64 | logger.info('Using GridSearchCV for bandwidth selection...')
65 | params = {'bandwidth': np.logspace(0.5, 5, num=10, base=2)}
66 | hyper_kde = GridSearchCV(KernelDensity(kernel=self.kernel), params, n_jobs=self.n_jobs, cv=5, verbose=0)
67 | hyper_kde.fit(X)
68 | self.bandwidth = hyper_kde.best_estimator_.bandwidth
69 | logger.info('Best bandwidth: {:.8f}'.format(self.bandwidth))
70 | self.model = hyper_kde.best_estimator_
71 | else:
72 | # if exponential kernel, re-initialize kde with bandwidth minimizing the numerical error
73 | if self.kernel == 'exponential':
74 | self.bandwidth = np.max(pairwise_distances(X)) ** 2
75 | self.model = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)
76 |
77 | self.model.fit(X)
78 |
79 | train_time = time.time() - start_time
80 | self.results['train_time'] = train_time
81 |
82 | logger.info('Training Time: {:.3f}s'.format(self.results['train_time']))
83 | logger.info('Finished training.')
84 |
85 | def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
86 | """Tests the Kernel Density Estimation model on the test data."""
87 | logger = logging.getLogger()
88 |
89 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
90 |
91 | # Get data from loader
92 | idx_label_score = []
93 | X = ()
94 | idxs = []
95 | labels = []
96 | for data in test_loader:
97 | inputs, label_batch, _, idx = data
98 | inputs, label_batch, idx = inputs.to(device), label_batch.to(device), idx.to(device)
99 | if self.hybrid:
100 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
101 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
102 | X += (X_batch.cpu().data.numpy(),)
103 | idxs += idx.cpu().data.numpy().astype(np.int64).tolist()
104 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
105 | X = np.concatenate(X)
106 |
107 | # Testing
108 | logger.info('Starting testing...')
109 | start_time = time.time()
110 | scores = (-1.0) * self.model.score_samples(X)
111 | self.results['test_time'] = time.time() - start_time
112 | scores = scores.flatten()
113 |
114 | # Save triples of (idx, label, score) in a list
115 | idx_label_score += list(zip(idxs, labels, scores.tolist()))
116 | self.results['test_scores'] = idx_label_score
117 |
118 | # Compute AUC
119 | _, labels, scores = zip(*idx_label_score)
120 | labels = np.array(labels)
121 | scores = np.array(scores)
122 | self.results['test_auc'] = roc_auc_score(labels, scores)
123 |
124 | # Log results
125 | logger.info('Test AUC: {:.2f}%'.format(100. * self.results['test_auc']))
126 | logger.info('Test Time: {:.3f}s'.format(self.results['test_time']))
127 | logger.info('Finished testing.')
128 |
129 | def load_ae(self, dataset_name, model_path):
130 | """Load pretrained autoencoder from model_path for feature extraction in a hybrid KDE model."""
131 |
132 | model_dict = torch.load(model_path, map_location='cpu')
133 | ae_net_dict = model_dict['ae_net_dict']
134 | if dataset_name in ['mnist', 'fmnist', 'cifar10']:
135 | net_name = dataset_name + '_LeNet'
136 | else:
137 | net_name = dataset_name + '_mlp'
138 |
139 | if self.ae_net is None:
140 | self.ae_net = build_autoencoder(net_name)
141 |
142 | # update keys (since there was a change in network definition)
143 | ae_keys = list(self.ae_net.state_dict().keys())
144 | for i in range(len(ae_net_dict)):
145 | k, v = ae_net_dict.popitem(False)
146 | new_key = ae_keys[i]
147 | ae_net_dict[new_key] = v
148 | i += 1
149 |
150 | self.ae_net.load_state_dict(ae_net_dict)
151 | self.ae_net.eval()
152 |
153 | def save_model(self, export_path):
154 | """Save KDE model to export_path."""
155 | pass
156 |
157 | def load_model(self, import_path, device: str = 'cpu'):
158 | """Load KDE model from import_path."""
159 | pass
160 |
161 | def save_results(self, export_json):
162 | """Save results dict to a JSON-file."""
163 | with open(export_json, 'w') as fp:
164 | json.dump(self.results, fp)
165 |
--------------------------------------------------------------------------------
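The bandwidth-selection path of `train()` above, reduced to a standalone sketch on synthetic data:

```python
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

X = np.random.randn(300, 4)
params = {'bandwidth': np.logspace(0.5, 5, num=10, base=2)}  # same grid as above
search = GridSearchCV(KernelDensity(kernel='gaussian'), params, cv=5).fit(X)
scores = -search.best_estimator_.score_samples(X)  # negative log-density as anomaly score
```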
/baseline/DeepSAD/README.md:
--------------------------------------------------------------------------------
1 | # Deep SAD: A Method for Deep Semi-Supervised Anomaly Detection
2 | This repository provides a [PyTorch](https://pytorch.org/) implementation of the *Deep SAD* method presented in our ICLR 2020 paper "Deep Semi-Supervised Anomaly Detection".
3 |
4 |
5 | ## Citation and Contact
6 | You find a PDF of the Deep Semi-Supervised Anomaly Detection ICLR 2020 paper on arXiv
7 | [https://arxiv.org/abs/1906.02694](https://arxiv.org/abs/1906.02694).
8 |
9 | If you find our work useful, please also cite the paper:
10 | ```
11 | @InProceedings{ruff2020deep,
12 | title = {Deep Semi-Supervised Anomaly Detection},
13 | author = {Ruff, Lukas and Vandermeulen, Robert A. and G{\"o}rnitz, Nico and Binder, Alexander and M{\"u}ller, Emmanuel and M{\"u}ller, Klaus-Robert and Kloft, Marius},
14 | booktitle = {International Conference on Learning Representations},
15 | year = {2020},
16 | url = {https://openreview.net/forum?id=HkgH0TEYwH}
17 | }
18 | ```
19 |
20 | If you would like to get in touch, just drop us an email at [contact@lukasruff.com](mailto:contact@lukasruff.com).
21 |
22 |
23 | ## Abstract
24 | > Deep approaches to anomaly detection have recently shown promising results over shallow methods on large and complex datasets. Typically anomaly detection is treated as an unsupervised learning problem. In practice however, one may have---in addition to a large set of unlabeled samples---access to a small pool of labeled samples, e.g. a subset verified by some domain expert as being normal or anomalous. Semi-supervised approaches to anomaly detection aim to utilize such labeled samples, but most proposed methods are limited to merely including labeled normal samples. Only a few methods take advantage of labeled anomalies, with existing deep approaches being domain-specific. In this work we present Deep SAD, an end-to-end deep methodology for general semi-supervised anomaly detection. We further introduce an information-theoretic framework for deep anomaly detection based on the idea that the entropy of the latent distribution for normal data should be lower than the entropy of the anomalous distribution, which can serve as a theoretical interpretation for our method. In extensive experiments on MNIST, Fashion-MNIST, and CIFAR-10, along with other anomaly detection benchmark datasets, we demonstrate that our method is on par or outperforms shallow, hybrid, and deep competitors, yielding appreciable performance improvements even when provided with only little labeled data.
25 |
26 | ## The need for semi-supervised anomaly detection
27 |
28 | 
29 |
30 |
31 | ## Installation
32 | This code is written in `Python 3.7` and requires the packages listed in `requirements.txt`.
33 |
34 | Clone the repository to your machine and directory of choice:
35 | ```
36 | git clone https://github.com/lukasruff/Deep-SAD-PyTorch.git
37 | ```
38 |
39 | To run the code, we recommend setting up a virtual environment, e.g. using `virtualenv` or `conda`:
40 |
41 | ### `virtualenv`
42 | ```
43 | # pip install virtualenv
44 | cd
45 | virtualenv myenv
46 | source myenv/bin/activate
47 | pip install -r requirements.txt
48 | ```
49 |
50 | ### `conda`
51 | ```
52 | cd
53 | conda create --name myenv
54 | source activate myenv
55 | while read requirement; do conda install -n myenv --yes $requirement; done < requirements.txt
56 | ```
57 |
58 |
59 | ## Running experiments
60 | We have implemented the [`MNIST`](http://yann.lecun.com/exdb/mnist/),
61 | [`Fashion-MNIST`](https://research.zalando.com/welcome/mission/research-projects/fashion-mnist/), and
62 | [`CIFAR-10`](https://www.cs.toronto.edu/~kriz/cifar.html) datasets as well as the classic anomaly detection
63 | benchmark datasets `arrhythmia`, `cardio`, `satellite`, `satimage-2`, `shuttle`, and `thyroid` from the
64 | Outlier Detection DataSets (ODDS) repository ([http://odds.cs.stonybrook.edu/](http://odds.cs.stonybrook.edu/))
65 | as reported in the paper.
66 |
67 | The implemented network architectures are as reported in the appendix of the paper.
68 |
69 | ### Deep SAD
70 | You can run Deep SAD experiments using the `main.py` script.
71 |
72 | Here's an example on `MNIST` with `0` considered to be the normal class and having 1% labeled (known) training samples
73 | from anomaly class `1` with a pollution ratio of 10% of the unlabeled training data (with unknown anomalies from all
74 | anomaly classes `1`-`9`):
75 | ```
76 | cd
77 |
78 | # activate virtual environment
79 | source myenv/bin/activate # or 'source activate myenv' for conda
80 |
81 | # create folders for experimental output
82 | mkdir log/DeepSAD
83 | mkdir log/DeepSAD/mnist_test
84 |
85 | # change to source directory
86 | cd src
87 |
88 | # run experiment
89 | python main.py mnist mnist_LeNet ../log/DeepSAD/mnist_test ../data --ratio_known_outlier 0.01 --ratio_pollution 0.1 --lr 0.0001 --n_epochs 150 --lr_milestone 50 --batch_size 128 --weight_decay 0.5e-6 --pretrain True --ae_lr 0.0001 --ae_n_epochs 150 --ae_batch_size 128 --ae_weight_decay 0.5e-3 --normal_class 0 --known_outlier_class 1 --n_known_outlier_classes 1;
90 | ```
91 | Have a look into `main.py` for all possible arguments and options.
92 |
93 | ### Baselines
94 | We also provide an implementation of the following baselines via the respective `baseline_<method_name>.py` scripts:
95 | OC-SVM (`ocsvm`), Isolation Forest (`isoforest`), Kernel Density Estimation (`kde`), kernel Semi-Supervised Anomaly
96 | Detection (`ssad`), and Semi-Supervised Deep Generative Model (`SemiDGM`).
97 |
98 | Here's how to run SSAD for example on the same experimental setup as above:
99 | ```
100 | cd
101 |
102 | # activate virtual environment
103 | source myenv/bin/activate # or 'source activate myenv' for conda
104 |
105 | # create folder for experimental output
106 | mkdir log/ssad
107 | mkdir log/ssad/mnist_test
108 |
109 | # change to source directory
110 | cd src
111 |
112 | # run experiment
113 | python baseline_ssad.py mnist ../log/ssad/mnist_test ../data --ratio_known_outlier 0.01 --ratio_pollution 0.1 --kernel rbf --kappa 1.0 --normal_class 0 --known_outlier_class 1 --n_known_outlier_classes 1;
114 | ```
115 |
116 | The autoencoder is provided through Deep SAD pre-training using `--pretrain True` with `main.py`.
117 | To then run a hybrid approach using one of the classic methods on top of autoencoder features, simply point to the saved
118 | autoencoder model using `--load_ae ../log/DeepSAD/mnist_test/model.tar` and set `--hybrid True`.
119 |
120 | To run hybrid SSAD for example on the same experimental setup as above:
121 | ```
122 | cd
123 |
124 | # activate virtual environment
125 | source myenv/bin/activate # or 'source activate myenv' for conda
126 |
127 | # create folder for experimental output
128 | mkdir log/hybrid_ssad
129 | mkdir log/hybrid_ssad/mnist_test
130 |
131 | # change to source directory
132 | cd src
133 |
134 | # run experiment
135 | python baseline_ssad.py mnist ../log/hybrid_ssad/mnist_test ../data --ratio_known_outlier 0.01 --ratio_pollution 0.1 --kernel rbf --kappa 1.0 --hybrid True --load_ae ../log/DeepSAD/mnist_test/model.tar --normal_class 0 --known_outlier_class 1 --n_known_outlier_classes 1;
136 | ```
137 |
138 | ## License
139 | MIT
140 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/deepsad.py:
--------------------------------------------------------------------------------
1 | import json
2 | import torch
3 |
4 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset
5 | from baseline.DeepSAD.src.networks.main import build_network, build_autoencoder
6 | from baseline.DeepSAD.src.optim.DeepSAD_trainer import DeepSADTrainer
7 | from baseline.DeepSAD.src.optim.ae_trainer import AETrainer
8 |
9 |
10 | class deepsad(object):
11 | """A class for the Deep SAD method.
12 |
13 | Attributes:
14 | eta: Deep SAD hyperparameter eta (must be greater than 0).
15 | c: Hypersphere center c.
16 | net_name: A string indicating the name of the neural network to use.
17 | net: The neural network phi.
18 | trainer: DeepSADTrainer to train a Deep SAD model.
19 | optimizer_name: A string indicating the optimizer to use for training the Deep SAD network.
20 | ae_net: The autoencoder network corresponding to phi for network weights pretraining.
21 | ae_trainer: AETrainer to train an autoencoder in pretraining.
22 | ae_optimizer_name: A string indicating the optimizer to use for pretraining the autoencoder.
23 | results: A dictionary to save the results.
24 | ae_results: A dictionary to save the autoencoder results.
25 | """
26 |
27 | def __init__(self, eta: float = 1.0):
28 | """Inits DeepSAD with hyperparameter eta."""
29 |
30 | self.eta = eta
31 | self.c = None # hypersphere center c
32 |
33 | self.net_name = None
34 | self.net = None # neural network phi
35 |
36 | self.trainer = None
37 | self.optimizer_name = None
38 |
39 | self.ae_net = None # autoencoder network for pretraining
40 | self.ae_trainer = None
41 | self.ae_optimizer_name = None
42 |
43 | self.results = {
44 | 'train_time': None,
45 | 'test_aucroc': None,
46 | 'test_aucpr': None,
47 | 'test_time': None,
48 | 'test_scores': None,
49 | }
50 |
51 | self.ae_results = {
52 | 'train_time': None,
53 | 'test_aucroc': None,
54 | 'test_aucpr': None,
55 | 'test_time': None
56 | }
57 |
58 | def set_network(self, net_name, input_size):
59 | """Builds the neural network phi."""
60 | self.net_name = net_name
61 | self.net = build_network(net_name, input_size)
62 |
63 | def train(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 50,
64 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda',
65 | n_jobs_dataloader: int = 0):
66 | """Trains the Deep SAD model on the training data."""
67 |
68 | self.optimizer_name = optimizer_name
69 | self.trainer = DeepSADTrainer(self.c, self.eta, optimizer_name=optimizer_name, lr=lr, n_epochs=n_epochs,
70 | lr_milestones=lr_milestones, batch_size=batch_size, weight_decay=weight_decay,
71 | device=device, n_jobs_dataloader=n_jobs_dataloader)
72 | # Get the model
73 | self.net = self.trainer.train(dataset, self.net)
74 | self.results['train_time'] = self.trainer.train_time
75 | self.c = self.trainer.c.cpu().data.numpy().tolist() # get as list
76 |
77 | def test(self, dataset: BaseADDataset, device: str = 'cuda', n_jobs_dataloader: int = 0):
78 | """Tests the Deep SAD model on the test data."""
79 |
80 | if self.trainer is None:
81 | self.trainer = DeepSADTrainer(self.c, self.eta, device=device, n_jobs_dataloader=n_jobs_dataloader)
82 |
83 | score = self.trainer.test(dataset, self.net)
84 |
85 | # Get results
86 | # self.results['test_aucroc'] = self.trainer.test_aucroc
87 | # self.results['test_aucpr'] = self.trainer.test_aucpr
88 | self.results['test_time'] = self.trainer.test_time
89 | self.results['test_scores'] = self.trainer.test_scores
90 |
91 | return score
92 |
93 | def pretrain(self, dataset: BaseADDataset, input_size, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 100,
94 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda',
95 | n_jobs_dataloader: int = 0):
96 | """Pretrains the weights for the Deep SAD network phi via autoencoder."""
97 |
98 | # Set autoencoder network
99 | self.ae_net = build_autoencoder(self.net_name, input_size)
100 |
101 | # Train
102 | self.ae_optimizer_name = optimizer_name
103 | self.ae_trainer = AETrainer(optimizer_name, lr=lr, n_epochs=n_epochs, lr_milestones=lr_milestones,
104 | batch_size=batch_size, weight_decay=weight_decay, device=device,
105 | n_jobs_dataloader=n_jobs_dataloader)
106 | self.ae_net = self.ae_trainer.train(dataset, self.ae_net)
107 |
108 | # Get train results
109 | self.ae_results['train_time'] = self.ae_trainer.train_time
110 |
111 | # Test
112 | self.ae_trainer.test(dataset, self.ae_net)
113 |
114 | # Get test results
115 | self.ae_results['test_aucroc'] = self.ae_trainer.test_aucroc
116 | self.ae_results['test_aucpr'] = self.ae_trainer.test_aucpr
117 | self.ae_results['test_time'] = self.ae_trainer.test_time
118 |
119 | # Initialize Deep SAD network weights from pre-trained encoder
120 | self.init_network_weights_from_pretraining()
121 |
122 | def init_network_weights_from_pretraining(self):
123 | """Initialize the Deep SAD network weights from the encoder weights of the pretraining autoencoder."""
124 |
125 | net_dict = self.net.state_dict()
126 | ae_net_dict = self.ae_net.state_dict()
127 |
128 | # Filter out decoder network keys
129 | ae_net_dict = {k: v for k, v in ae_net_dict.items() if k in net_dict}
130 | # Overwrite values in the existing state_dict
131 | net_dict.update(ae_net_dict)
132 | # Load the new state_dict
133 | self.net.load_state_dict(net_dict)
134 |
135 | def save_model(self, export_model, save_ae=True):
136 | """Save Deep SAD model to export_model."""
137 |
138 | net_dict = self.net.state_dict()
139 |         ae_net_dict = self.ae_net.state_dict() if (save_ae and self.ae_net is not None) else None
140 |
141 | torch.save({'c': self.c,
142 | 'net_dict': net_dict,
143 | 'ae_net_dict': ae_net_dict}, export_model)
144 |
145 | def load_model(self, model_path, load_ae=False, map_location='cpu'):
146 | """Load Deep SAD model from model_path."""
147 |
148 | model_dict = torch.load(model_path, map_location=map_location)
149 |
150 | self.c = model_dict['c']
151 | self.net.load_state_dict(model_dict['net_dict'])
152 |
153 | # load autoencoder parameters if specified
154 | if load_ae:
155 | if self.ae_net is None:
156 | self.ae_net = build_autoencoder(self.net_name)
157 | self.ae_net.load_state_dict(model_dict['ae_net_dict'])
158 |
159 | def save_results(self, export_json):
160 | """Save results dict to a JSON-file."""
161 | with open(export_json, 'w') as fp:
162 | json.dump(self.results, fp)
163 |
164 | def save_ae_results(self, export_json):
165 | """Save autoencoder results dict to a JSON-file."""
166 | with open(export_json, 'w') as fp:
167 | json.dump(self.ae_results, fp)
168 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/optim/DeepSAD_trainer.py:
--------------------------------------------------------------------------------
1 | from baseline.DeepSAD.src.base.base_trainer import BaseTrainer
2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset
3 | from baseline.DeepSAD.src.base.base_net import BaseNet
4 | from torch.utils.data.dataloader import DataLoader
5 | from sklearn.metrics import roc_auc_score, average_precision_score
6 |
7 | import logging
8 | import time
9 | import torch
10 | import torch.optim as optim
11 | import numpy as np
12 |
13 |
14 | class DeepSADTrainer(BaseTrainer):
15 |
16 | def __init__(self, c, eta: float, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150,
17 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda',
18 | n_jobs_dataloader: int = 0):
19 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device,
20 | n_jobs_dataloader)
21 |
22 | # Deep SAD parameters
23 | self.c = torch.tensor(c, device=self.device) if c is not None else None
24 | self.eta = eta
25 |
26 | # Optimization parameters
27 | self.eps = 1e-6
28 |
29 | # Results
30 | self.train_time = None
31 |         self.test_aucroc, self.test_aucpr = None, None
32 | self.test_time = None
33 | self.test_scores = None
34 |
35 | def train(self, dataset: BaseADDataset, net: BaseNet):
36 | logger = logging.getLogger()
37 |
38 | # Get train data loader
39 | train_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
40 |
41 | # Set device for network
42 | net = net.to(self.device)
43 |
44 | # Set optimizer (Adam optimizer for now)
45 | optimizer = optim.Adam(net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
46 |
47 | # Set learning rate scheduler
48 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1)
49 |
50 | # Initialize hypersphere center c (if c not loaded)
51 | if self.c is None:
52 | logger.info('Initializing center c...')
53 | self.c = self.init_center_c(train_loader, net)
54 | logger.info('Center c initialized.')
55 |
56 | # Training
57 | logger.info('Starting training...')
58 | start_time = time.time()
59 | net.train()
60 | for epoch in range(self.n_epochs):
61 |
62 | epoch_loss = 0.0
63 | n_batches = 0
64 | epoch_start_time = time.time()
65 | for data in train_loader:
66 | inputs, _, semi_targets, _ = data
67 | inputs, semi_targets = inputs.to(self.device), semi_targets.to(self.device)
68 |
69 |                 # map labeled anomalies (semi_target = 1) to -1 for the inverse distance term in the loss
70 |                 semi_targets[semi_targets == 1] = -1
71 |
72 | # Zero the network parameter gradients
73 | optimizer.zero_grad()
74 |
75 | # Update network parameters via backpropagation: forward + backward + optimize
76 | outputs = net(inputs)
77 | dist = torch.sum((outputs - self.c) ** 2, dim=1)
78 | losses = torch.where(semi_targets == 0, dist, self.eta * ((dist + self.eps) ** semi_targets.float()))
79 | loss = torch.mean(losses)
80 |                 loss.backward()
81 |                 optimizer.step()
82 | 
83 |                 epoch_loss += loss.item()
84 |                 n_batches += 1
85 | 
86 |             scheduler.step()  # step once per epoch (not per batch) so lr_milestones count epochs
87 |             if epoch in self.lr_milestones:
88 |                 logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_last_lr()[0]))
89 | # log epoch statistics
90 | epoch_train_time = time.time() - epoch_start_time
91 | logger.info(f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s '
92 | f'| Train Loss: {epoch_loss / n_batches:.6f} |')
93 |
94 | self.train_time = time.time() - start_time
95 | logger.info('Training Time: {:.3f}s'.format(self.train_time))
96 | logger.info('Finished training.')
97 |
98 | return net
99 |
100 | def test(self, dataset: BaseADDataset, net: BaseNet):
101 | logger = logging.getLogger()
102 |
103 | # Get test data loader
104 | test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
105 |
106 | # Set device for network
107 | net = net.to(self.device)
108 |
109 | # Testing
110 | logger.info('Starting testing...')
111 | epoch_loss = 0.0
112 | n_batches = 0
113 | start_time = time.time()
114 | idx_label_score = []
115 | net.eval()
116 | with torch.no_grad():
117 | for data in test_loader:
118 | inputs, labels, semi_targets, idx = data
119 |
120 | inputs = inputs.to(self.device)
121 | labels = labels.to(self.device)
122 | semi_targets = semi_targets.to(self.device)
123 | idx = idx.to(self.device)
124 |
125 | outputs = net(inputs)
126 | dist = torch.sum((outputs - self.c) ** 2, dim=1)
127 | losses = torch.where(semi_targets == 0, dist, self.eta * ((dist + self.eps) ** semi_targets.float()))
128 | loss = torch.mean(losses)
129 | scores = dist
130 |
131 | # Save triples of (idx, label, score) in a list
132 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(),
133 | labels.cpu().data.numpy().tolist(),
134 | scores.cpu().data.numpy().tolist()))
135 |
136 | epoch_loss += loss.item()
137 | n_batches += 1
138 |
139 | self.test_time = time.time() - start_time
140 | self.test_scores = idx_label_score
141 |
142 | # Compute AUC
143 | _, labels, scores = zip(*idx_label_score)
144 | # labels = np.array(labels)
145 | scores = np.array(scores)
146 | # self.test_aucroc = roc_auc_score(labels, scores)
147 | # self.test_aucpr = average_precision_score(labels, scores, pos_label = 1)
148 |
149 | # Log results
150 | logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches))
151 | # logger.info('Test AUCROC: {:.2f}%'.format(100. * self.test_aucroc))
152 | # logger.info('Test AUCPR: {:.2f}%'.format(100. * self.test_aucpr))
153 | logger.info('Test Time: {:.3f}s'.format(self.test_time))
154 | logger.info('Finished testing.')
155 |
156 | return scores
157 |
158 | def init_center_c(self, train_loader: DataLoader, net: BaseNet, eps=0.1):
159 | """Initialize hypersphere center c as the mean from an initial forward pass on the data."""
160 | n_samples = 0
161 | c = torch.zeros(net.rep_dim, device=self.device)
162 |
163 | net.eval()
164 | with torch.no_grad():
165 | for data in train_loader:
166 | # get the inputs of the batch
167 | inputs, _, _, _ = data
168 | inputs = inputs.to(self.device)
169 | outputs = net(inputs)
170 | n_samples += outputs.shape[0]
171 | c += torch.sum(outputs, dim=0)
172 |
173 | c /= n_samples
174 |
175 | # If c_i is too close to 0, set to +-eps. Reason: a zero unit can be trivially matched with zero weights.
176 | c[(abs(c) < eps) & (c < 0)] = -eps
177 | c[(abs(c) < eps) & (c > 0)] = eps
178 |
179 | return c
180 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/optim/SemiDGM_trainer.py:
--------------------------------------------------------------------------------
1 | from baseline.DeepSAD.src.base.base_trainer import BaseTrainer
2 | from baseline.DeepSAD.src.base.base_dataset import BaseADDataset
3 | from baseline.DeepSAD.src.base.base_net import BaseNet
4 | from baseline.DeepSAD.src.optim.variational import SVI, ImportanceWeightedSampler
5 | from baseline.DeepSAD.src.utils.misc import binary_cross_entropy
6 | from sklearn.metrics import roc_auc_score
7 |
8 | import logging
9 | import time
10 | import torch
11 | import torch.optim as optim
12 | import numpy as np
13 |
14 |
15 | class SemiDeepGenerativeTrainer(BaseTrainer):
16 |
17 | def __init__(self, alpha: float = 0.1, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150,
18 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda',
19 | n_jobs_dataloader: int = 0):
20 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device,
21 | n_jobs_dataloader)
22 |
23 | self.alpha = alpha
24 |
25 | # Results
26 | self.train_time = None
27 | self.test_auc = None
28 | self.test_time = None
29 | self.test_scores = None
30 |
31 | def train(self, dataset: BaseADDataset, net: BaseNet):
32 | logger = logging.getLogger()
33 |
34 | # Get train data loader
35 | train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
36 |
37 | # Set device
38 | net = net.to(self.device)
39 |
40 | # Use importance weighted sampler (Burda et al., 2015) to get a better estimate on the log-likelihood.
41 | sampler = ImportanceWeightedSampler(mc=1, iw=1)
42 | elbo = SVI(net, likelihood=binary_cross_entropy, sampler=sampler)
43 |
44 | # Set optimizer (Adam optimizer for now)
45 | optimizer = optim.Adam(net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
46 |
47 | # Set learning rate scheduler
48 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1)
49 |
50 | # Training
51 | logger.info('Starting training...')
52 | start_time = time.time()
53 | net.train()
54 | for epoch in range(self.n_epochs):
55 |
56 | scheduler.step()
57 | if epoch in self.lr_milestones:
58 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0]))
59 |
60 | epoch_loss = 0.0
61 | n_batches = 0
62 | epoch_start_time = time.time()
63 | for data in train_loader:
64 | inputs, labels, semi_targets, _ = data
65 |
66 | inputs = inputs.to(self.device)
67 | labels = labels.to(self.device)
68 | semi_targets = semi_targets.to(self.device)
69 |
70 | # Get labeled and unlabeled data and make labels one-hot
71 | inputs = inputs.view(inputs.size(0), -1)
72 | x = inputs[semi_targets != 0]
73 | u = inputs[semi_targets == 0]
74 | y = labels[semi_targets != 0]
75 | if y.nelement() > 1:
76 | y_onehot = torch.Tensor(y.size(0), 2).to(self.device) # two labels: 0: normal, 1: outlier
77 | y_onehot.zero_()
78 | y_onehot.scatter_(1, y.view(-1, 1), 1)
79 |
80 | # Zero the network parameter gradients
81 | optimizer.zero_grad()
82 |
83 | # Update network parameters via backpropagation: forward + backward + optimize
84 | if y.nelement() < 2:
85 | L = torch.tensor(0.0).to(self.device)
86 | else:
87 | L = -elbo(x, y_onehot)
88 | U = -elbo(u)
89 |
90 | # Regular cross entropy
91 | if y.nelement() < 2:
92 |                     classification_loss = torch.tensor(0.0).to(self.device)
93 | else:
94 | # Add auxiliary classification loss q(y|x)
95 | logits = net.classify(x)
96 | eps = 1e-8
97 |                     classification_loss = torch.sum(y_onehot * torch.log(logits + eps), dim=1).mean()
98 |
99 | # Overall loss
100 |                 loss = L - self.alpha * classification_loss + U  # J_alpha
101 |
102 | loss.backward()
103 | optimizer.step()
104 |
105 | epoch_loss += loss.item()
106 | n_batches += 1
107 |
108 | # log epoch statistics
109 | epoch_train_time = time.time() - epoch_start_time
110 | logger.info(f'| Epoch: {epoch + 1:03}/{self.n_epochs:03} | Train Time: {epoch_train_time:.3f}s '
111 | f'| Train Loss: {epoch_loss / n_batches:.6f} |')
112 |
113 | self.train_time = time.time() - start_time
114 | logger.info('Training Time: {:.3f}s'.format(self.train_time))
115 | logger.info('Finished training.')
116 |
117 | return net
118 |
119 | def test(self, dataset: BaseADDataset, net: BaseNet):
120 | logger = logging.getLogger()
121 |
122 | # Get test data loader
123 | _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader)
124 |
125 | # Set device
126 | net = net.to(self.device)
127 |
128 | # Use importance weighted sampler (Burda et al., 2015) to get a better estimate on the log-likelihood.
129 | sampler = ImportanceWeightedSampler(mc=1, iw=1)
130 | elbo = SVI(net, likelihood=binary_cross_entropy, sampler=sampler)
131 |
132 | # Testing
133 | logger.info('Starting testing...')
134 | epoch_loss = 0.0
135 | n_batches = 0
136 | start_time = time.time()
137 | idx_label_score = []
138 | net.eval()
139 | with torch.no_grad():
140 | for data in test_loader:
141 | inputs, labels, _, idx = data
142 | inputs = inputs.to(self.device)
143 | labels = labels.to(self.device)
144 | idx = idx.to(self.device)
145 |
146 | # All test data is considered unlabeled
147 | inputs = inputs.view(inputs.size(0), -1)
148 | u = inputs
149 | y = labels
150 | y_onehot = torch.Tensor(y.size(0), 2).to(self.device) # two labels: 0: normal, 1: outlier
151 | y_onehot.zero_()
152 | y_onehot.scatter_(1, y.view(-1, 1), 1)
153 |
154 | # Compute loss
155 | L = -elbo(u, y_onehot)
156 | U = -elbo(u)
157 |
158 | logits = net.classify(u)
159 | eps = 1e-8
160 |                 classification_loss = -torch.sum(y_onehot * torch.log(logits + eps), dim=1).mean()
161 |
162 |                 loss = L + self.alpha * classification_loss + U  # J_alpha
163 |
164 | # Compute scores
165 | scores = logits[:, 1] # likelihood/confidence for anomalous class as anomaly score
166 |
167 | # Save triple of (idx, label, score) in a list
168 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(),
169 | labels.cpu().data.numpy().tolist(),
170 | scores.cpu().data.numpy().tolist()))
171 |
172 | epoch_loss += loss.item()
173 | n_batches += 1
174 |
175 | self.test_time = time.time() - start_time
176 | self.test_scores = idx_label_score
177 |
178 | # Compute AUC
179 | _, labels, scores = zip(*idx_label_score)
180 | labels = np.array(labels)
181 | scores = np.array(scores)
182 | self.test_auc = roc_auc_score(labels, scores)
183 |
184 | # Log results
185 | logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches))
186 | logger.info('Test AUC: {:.2f}%'.format(100. * self.test_auc))
187 | logger.info('Test Time: {:.3f}s'.format(self.test_time))
188 | logger.info('Finished testing.')
189 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baselines/shallow_ssad/ssad_convex.py:
--------------------------------------------------------------------------------
1 | ########################################################################################################################
2 | # Acknowledgements: https://github.com/nicococo/tilitools
3 | ########################################################################################################################
4 | import numpy as np
5 |
6 | from cvxopt import matrix, spmatrix, sparse, spdiag
7 | from cvxopt.solvers import qp
8 |
9 |
10 | class ConvexSSAD:
11 | """ Convex semi-supervised anomaly detection with hinge-loss and L2 regularizer
12 | as described in Goernitz et al., Towards Supervised Anomaly Detection, JAIR, 2013
13 |
14 | minimize 0.5 ||w||^2_2 - rho - kappa*gamma + eta_u sum_i xi_i + eta_l sum_j xi_j
15 | {w,rho,gamma>=0,xi>=0}
16 |         subject to <w,phi(x_i)> >= rho - xi_i
17 |                    y_j <w,phi(x_j)> >= y_j*rho + gamma - xi_j
18 |
19 | And the corresponding dual optimization problem:
20 |
21 | maximize -0.5 sum_(i,j) alpha_i alpha_j y_i y_j k(x_i,x_j)
22 | {0<=alpha_i<=eta_i}
23 | subject to kappa <= sum_j alpha_j (for all labeled examples)
24 |                    1 = sum_i y_i alpha_i (for all examples)
25 |
26 | We introduce labels y_i = +1 for all unlabeled examples which enables us to combine sums.
27 |
28 | Note: Only dual solution is supported.
29 |
30 | Written by: Nico Goernitz, TU Berlin, 2013/14
31 | """
32 |     PRECISION = 1e-9 # important: affects the threshold, support vectors and speed!
33 |
34 | def __init__(self, kernel, y, kappa=1.0, Cp=1.0, Cu=1.0, Cn=1.0):
35 | assert(len(y.shape) == 1)
36 | self.kernel = kernel
37 | self.y = y # (vector) corresponding labels (+1,-1 and 0 for unlabeled)
38 | self.kappa = kappa # (scalar) regularizer for importance of the margin
39 | self.Cp = Cp # (scalar) the regularization constant for positively labeled samples > 0
40 | self.Cu = Cu # (scalar) the regularization constant for unlabeled samples > 0
41 | self.Cn = Cn # (scalar) the regularization constant for outliers > 0
42 | self.samples = y.size
43 | self.labeled = np.sum(np.abs(y))
44 |
45 | # cy: (vector) converted label vector (+1 for pos and unlabeled, -1 for outliers)
46 | self.cy = y.copy().reshape((y.size, 1))
47 | self.cy[y == 0] = 1 # cy=+1.0 (unlabeled,pos) & cy=-1.0 (neg)
48 |
49 | # cl: (vector) converted label vector (+1 for labeled examples, 0.0 for unlabeled)
50 | self.cl = np.abs(y.copy()) # cl=+1.0 (labeled) cl=0.0 (unlabeled)
51 |
52 | # (vector) converted upper bound box constraint for each example
53 | self.cC = np.zeros(y.size) # cC=Cu (unlabeled) cC=Cp (pos) cC=Cn (neg)
54 | self.cC[y == 0] = Cu
55 | self.cC[y == 1] = Cp
56 | self.cC[y ==-1] = Cn
57 |
58 | self.alphas = None
59 | self.svs = None # (vector) list of support vector (contains indices)
60 | self.threshold = 0.0 # (scalar) the optimized threshold (rho)
61 |
62 | # if there are no labeled examples, then set kappa to 0.0 otherwise
63 | # the dual constraint kappa <= sum_{i \in labeled} alpha_i = 0.0 will
64 | # prohibit a solution
65 | if self.labeled == 0:
66 |             print('There are no labeled examples, hence setting kappa=0.0')
67 | self.kappa = 0.0
68 | print('Convex semi-supervised anomaly detection with {0} samples ({1} labeled).'.format(self.samples, self.labeled))
69 |
70 | def set_train_kernel(self, kernel):
71 | dim1, dim2 = kernel.shape
72 | print([dim1, dim2])
73 | assert(dim1 == dim2 and dim1 == self.samples)
74 | self.kernel = kernel
75 |
76 | def fit(self, check_psd_eigs=False):
77 | # number of training examples
78 | N = self.samples
79 |
80 | # generate the label kernel
81 | Y = self.cy.dot(self.cy.T)
82 |
83 | # generate the final PDS kernel
84 | P = matrix(self.kernel*Y)
85 |
86 | # check for PSD
87 | if check_psd_eigs:
88 | eigs = np.linalg.eigvalsh(np.array(P))
89 | if eigs[0] < 0.0:
90 | print('Smallest eigenvalue is {0}'.format(eigs[0]))
91 | P += spdiag([-eigs[0] for i in range(N)])
92 |
93 | # there is no linear part of the objective
94 | q = matrix(0.0, (N, 1))
95 |
96 | # sum_i y_i alpha_i = A alpha = b = 1.0
97 | A = matrix(self.cy, (1, self.samples), 'd')
98 | b = matrix(1.0, (1, 1))
99 |
100 | # inequality constraints: G alpha <= h
101 | # 1) alpha_i <= C_i
102 | # 2) -alpha_i <= 0
103 | G12 = spmatrix(1.0, range(N), range(N))
104 | h1 = matrix(self.cC)
105 | h2 = matrix(0.0, (N, 1))
106 | G = sparse([G12, -G12])
107 | h = matrix([h1, h2])
108 | if self.labeled > 0:
109 | # 3) kappa <= \sum_i labeled_i alpha_i -> -cl' alpha <= -kappa
110 | print('Labeled data found.')
111 | G3 = -matrix(self.cl, (1, self.cl.size), 'd')
112 | h3 = -matrix(self.kappa, (1, 1))
113 | G = sparse([G12, -G12, G3])
114 | h = matrix([h1, h2, h3])
115 |
116 |         # solve the quadratic program
117 | sol = qp(P, -q, G, h, A, b)
118 |
119 | # store solution
120 | self.alphas = np.array(sol['x'])
121 |
122 | # 1. find all support vectors, i.e. 0 < alpha_i <= C
123 |         # 2. store all support vectors with alpha_i < C in 'margins'
124 | self.svs = np.where(self.alphas >= ConvexSSAD.PRECISION)[0]
125 |
126 | # these should sum to one
127 | print('Validate solution:')
128 | print('- found {0} support vectors'.format(len(self.svs)))
129 | print('0 <= alpha_i : {0} of {1}'.format(np.sum(0. <= self.alphas), N))
130 | print('- sum_(i) alpha_i cy_i = {0} = 1.0'.format(np.sum(self.alphas*self.cy)))
131 | print('- sum_(i in sv) alpha_i cy_i = {0} ~ 1.0 (approx error)'.format(np.sum(self.alphas[self.svs]*self.cy[self.svs])))
132 | print('- sum_(i in labeled) alpha_i = {0} >= {1} = kappa'.format(np.sum(self.alphas[self.cl == 1]), self.kappa))
133 | print('- sum_(i in unlabeled) alpha_i = {0}'.format(np.sum(self.alphas[self.y == 0])))
134 | print('- sum_(i in positives) alpha_i = {0}'.format(np.sum(self.alphas[self.y == 1])))
135 | print('- sum_(i in negatives) alpha_i = {0}'.format(np.sum(self.alphas[self.y ==-1])))
136 |
137 | # infer threshold (rho)
138 | psvs = np.where(self.y[self.svs] == 0)[0]
139 | # case 1: unlabeled support vectors available
140 | self.threshold = 0.
141 | unl_threshold = -1e12
142 | lbl_threshold = -1e12
143 | if psvs.size > 0:
144 | k = self.kernel[:, self.svs]
145 | k = k[self.svs[psvs], :]
146 | unl_threshold = np.max(self.apply(k))
147 |
148 | if np.sum(self.cl) > 1e-12:
149 | # case 2: only labeled examples available
150 | k = self.kernel[:, self.svs]
151 | k = k[self.svs, :]
152 | thres = self.apply(k)
153 | pinds = np.where(self.y[self.svs] == +1)[0]
154 | ninds = np.where(self.y[self.svs] == -1)[0]
155 | # only negatives is not possible
156 | if ninds.size > 0 and pinds.size == 0:
157 | print('ERROR: Check pre-defined PRECISION.')
158 | lbl_threshold = np.max(thres[ninds])
159 | elif ninds.size == 0:
160 | lbl_threshold = np.max(thres[pinds])
161 | else:
162 | # smallest negative + largest positive
163 | p = np.max(thres[pinds])
164 | n = np.min(thres[ninds])
165 | lbl_threshold = (n+p)/2.
166 | self.threshold = np.max((unl_threshold, lbl_threshold))
167 |
168 | def get_threshold(self):
169 | return self.threshold
170 |
171 | def get_support_dual(self):
172 | return self.svs
173 |
174 | def get_alphas(self):
175 | return self.alphas
176 |
177 | def apply(self, kernel):
178 | """ Application of dual trained ssad.
179 | kernel = get_kernel(Y, X[:, cssad.svs], kernel_type, kernel_param)
180 | """
181 | if kernel.shape[1] == self.samples:
182 | # if kernel is not restricted to support vectors
183 | ay = self.alphas * self.cy
184 | else:
185 | ay = self.alphas[self.svs] * self.cy[self.svs]
186 | return ay.T.dot(kernel.T).T - self.threshold
187 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baseline_ocsvm.py:
--------------------------------------------------------------------------------
1 | import click
2 | import torch
3 | import logging
4 | import random
5 | import numpy as np
6 |
7 | from utils.config import Config
8 | from utils.visualization.plot_images_grid import plot_images_grid
9 | from baselines.ocsvm import OCSVM
10 | from datasets.main import load_dataset
11 |
12 |
13 | ################################################################################
14 | # Settings
15 | ################################################################################
16 | @click.command()
17 | @click.argument('dataset_name', type=click.Choice(['mnist', 'fmnist', 'cifar10', 'arrhythmia', 'cardio', 'satellite',
18 | 'satimage-2', 'shuttle', 'thyroid']))
19 | @click.argument('xp_path', type=click.Path(exists=True))
20 | @click.argument('data_path', type=click.Path(exists=True))
21 | @click.option('--load_config', type=click.Path(exists=True), default=None,
22 | help='Config JSON-file path (default: None).')
23 | @click.option('--load_model', type=click.Path(exists=True), default=None,
24 | help='Model file path (default: None).')
25 | @click.option('--ratio_known_normal', type=float, default=0.0,
26 | help='Ratio of known (labeled) normal training examples.')
27 | @click.option('--ratio_known_outlier', type=float, default=0.0,
28 | help='Ratio of known (labeled) anomalous training examples.')
29 | @click.option('--ratio_pollution', type=float, default=0.0,
30 | help='Pollution ratio of unlabeled training data with unknown (unlabeled) anomalies.')
31 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.')
32 | @click.option('--kernel', type=click.Choice(['rbf', 'linear', 'poly']), default='rbf', help='Kernel for the OC-SVM')
33 | @click.option('--nu', type=float, default=0.1, help='OC-SVM hyperparameter nu (must be 0 < nu <= 1).')
34 | @click.option('--hybrid', type=bool, default=False,
35 | help='Train OC-SVM on features extracted from an autoencoder. If True, load_ae must be specified.')
36 | @click.option('--load_ae', type=click.Path(exists=True), default=None,
37 | help='Model file path to load autoencoder weights (default: None).')
38 | @click.option('--n_jobs_dataloader', type=int, default=0,
39 | help='Number of workers for data loading. 0 means that the data will be loaded in the main process.')
40 | @click.option('--normal_class', type=int, default=0,
41 | help='Specify the normal class of the dataset (all other classes are considered anomalous).')
42 | @click.option('--known_outlier_class', type=int, default=1,
43 | help='Specify the known outlier class of the dataset for semi-supervised anomaly detection.')
44 | @click.option('--n_known_outlier_classes', type=int, default=0,
45 | help='Number of known outlier classes.'
46 | 'If 0, no anomalies are known.'
47 | 'If 1, outlier class as specified in --known_outlier_class option.'
48 | 'If > 1, the specified number of outlier classes will be sampled at random.')
49 | def main(dataset_name, xp_path, data_path, load_config, load_model, ratio_known_normal, ratio_known_outlier,
50 | ratio_pollution, seed, kernel, nu, hybrid, load_ae, n_jobs_dataloader, normal_class, known_outlier_class,
51 | n_known_outlier_classes):
52 | """
53 | (Hybrid) One-Class SVM for anomaly detection.
54 |
55 | :arg DATASET_NAME: Name of the dataset to load.
56 | :arg XP_PATH: Export path for logging the experiment.
57 | :arg DATA_PATH: Root path of data.
58 | """
59 |
60 | # Get configuration
61 | cfg = Config(locals().copy())
62 |
63 | # Set up logging
64 | logging.basicConfig(level=logging.INFO)
65 | logger = logging.getLogger()
66 | logger.setLevel(logging.INFO)
67 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
68 | log_file = xp_path + '/log.txt'
69 | file_handler = logging.FileHandler(log_file)
70 | file_handler.setLevel(logging.INFO)
71 | file_handler.setFormatter(formatter)
72 | logger.addHandler(file_handler)
73 |
74 | # Print paths
75 | logger.info('Log file is %s.' % log_file)
76 | logger.info('Data path is %s.' % data_path)
77 | logger.info('Export path is %s.' % xp_path)
78 |
79 | # Print experimental setup
80 | logger.info('Dataset: %s' % dataset_name)
81 | logger.info('Normal class: %d' % normal_class)
82 | logger.info('Ratio of labeled normal train samples: %.2f' % ratio_known_normal)
83 | logger.info('Ratio of labeled anomalous samples: %.2f' % ratio_known_outlier)
84 | logger.info('Pollution ratio of unlabeled train data: %.2f' % ratio_pollution)
85 | if n_known_outlier_classes == 1:
86 | logger.info('Known anomaly class: %d' % known_outlier_class)
87 | else:
88 | logger.info('Number of known anomaly classes: %d' % n_known_outlier_classes)
89 |
90 | # If specified, load experiment config from JSON-file
91 | if load_config:
92 | cfg.load_config(import_json=load_config)
93 | logger.info('Loaded configuration from %s.' % load_config)
94 |
95 | # Print OC-SVM configuration
96 | logger.info('OC-SVM kernel: %s' % cfg.settings['kernel'])
97 |     logger.info('Nu-parameter: %.2f' % cfg.settings['nu'])
98 | logger.info('Hybrid model: %s' % cfg.settings['hybrid'])
99 |
100 | # Set seed
101 | if cfg.settings['seed'] != -1:
102 | random.seed(cfg.settings['seed'])
103 | np.random.seed(cfg.settings['seed'])
104 | torch.manual_seed(cfg.settings['seed'])
105 | torch.cuda.manual_seed(cfg.settings['seed'])
106 | torch.backends.cudnn.deterministic = True
107 | logger.info('Set seed to %d.' % cfg.settings['seed'])
108 |
109 | # Use 'cpu' as device for OC-SVM
110 | device = 'cpu'
111 | torch.multiprocessing.set_sharing_strategy('file_system') # fix multiprocessing issue for ubuntu
112 | logger.info('Computation device: %s' % device)
113 | logger.info('Number of dataloader workers: %d' % n_jobs_dataloader)
114 |
115 | # Load data
116 | dataset = load_dataset(dataset_name, data_path, normal_class, known_outlier_class, n_known_outlier_classes,
117 | ratio_known_normal, ratio_known_outlier, ratio_pollution,
118 | random_state=np.random.RandomState(cfg.settings['seed']))
119 | # Log random sample of known anomaly classes if more than 1 class
120 | if n_known_outlier_classes > 1:
121 | logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes,))
122 |
123 | # Initialize OC-SVM model
124 | ocsvm = OCSVM(cfg.settings['kernel'], cfg.settings['nu'], cfg.settings['hybrid'])
125 |
126 | # If specified, load model parameters from already trained model
127 | if load_model:
128 | ocsvm.load_model(import_path=load_model, device=device)
129 |         logger.info('Loaded model from %s.' % load_model)
130 |
131 | # If specified, load model autoencoder weights for a hybrid approach
132 | if hybrid and load_ae is not None:
133 | ocsvm.load_ae(dataset_name, model_path=load_ae)
134 | logger.info('Loaded pretrained autoencoder for features from %s.' % load_ae)
135 |
136 | # Train model on dataset
137 | ocsvm.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
138 |
139 | # Test model
140 | ocsvm.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
141 |
142 | # Save results and configuration
143 | ocsvm.save_results(export_json=xp_path + '/results.json')
144 | cfg.save_config(export_json=xp_path + '/config.json')
145 |
146 | # Plot most anomalous and most normal test samples
147 | indices, labels, scores = zip(*ocsvm.results['test_scores'])
148 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores)
149 | idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score
150 | idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score
151 |
152 | if dataset_name in ('mnist', 'fmnist', 'cifar10'):
153 |
154 | if dataset_name in ('mnist', 'fmnist'):
155 | X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1)
156 | X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1)
157 | X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1)
158 | X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1)
159 |
160 | if dataset_name == 'cifar10':
161 | X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2)))
162 | X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2)))
163 | X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2)))
164 | X_normal_high = torch.tensor(
165 | np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0, 3, 1, 2)))
166 |
167 | plot_images_grid(X_all_low, export_img=xp_path + '/all_low', padding=2)
168 | plot_images_grid(X_all_high, export_img=xp_path + '/all_high', padding=2)
169 | plot_images_grid(X_normal_low, export_img=xp_path + '/normals_low', padding=2)
170 | plot_images_grid(X_normal_high, export_img=xp_path + '/normals_high', padding=2)
171 |
172 |
173 | if __name__ == '__main__':
174 | main()
175 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baseline_ssad.py:
--------------------------------------------------------------------------------
1 | import click
2 | import torch
3 | import logging
4 | import random
5 | import numpy as np
6 | import cvxopt as co
7 |
8 | from utils.config import Config
9 | from utils.visualization.plot_images_grid import plot_images_grid
10 | from baselines.ssad import SSAD
11 | from datasets.main import load_dataset
12 |
13 |
14 | ################################################################################
15 | # Settings
16 | ################################################################################
17 | @click.command()
18 | @click.argument('dataset_name', type=click.Choice(['mnist', 'fmnist', 'cifar10', 'arrhythmia', 'cardio', 'satellite',
19 | 'satimage-2', 'shuttle', 'thyroid']))
20 | @click.argument('xp_path', type=click.Path(exists=True))
21 | @click.argument('data_path', type=click.Path(exists=True))
22 | @click.option('--load_config', type=click.Path(exists=True), default=None,
23 | help='Config JSON-file path (default: None).')
24 | @click.option('--load_model', type=click.Path(exists=True), default=None,
25 | help='Model file path (default: None).')
26 | @click.option('--ratio_known_normal', type=float, default=0.0,
27 | help='Ratio of known (labeled) normal training examples.')
28 | @click.option('--ratio_known_outlier', type=float, default=0.0,
29 | help='Ratio of known (labeled) anomalous training examples.')
30 | @click.option('--ratio_pollution', type=float, default=0.0,
31 | help='Pollution ratio of unlabeled training data with unknown (unlabeled) anomalies.')
32 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.')
33 | @click.option('--kernel', type=click.Choice(['rbf']), default='rbf', help='Kernel for SSAD')
34 | @click.option('--kappa', type=float, default=1.0, help='SSAD hyperparameter kappa.')
35 | @click.option('--hybrid', type=bool, default=False,
36 | help='Train SSAD on features extracted from an autoencoder. If True, load_ae must be specified')
37 | @click.option('--load_ae', type=click.Path(exists=True), default=None,
38 | help='Model file path to load autoencoder weights (default: None).')
39 | @click.option('--n_jobs_dataloader', type=int, default=0,
40 | help='Number of workers for data loading. 0 means that the data will be loaded in the main process.')
41 | @click.option('--normal_class', type=int, default=0,
42 | help='Specify the normal class of the dataset (all other classes are considered anomalous).')
43 | @click.option('--known_outlier_class', type=int, default=1,
44 | help='Specify the known outlier class of the dataset for semi-supervised anomaly detection.')
45 | @click.option('--n_known_outlier_classes', type=int, default=0,
46 | help='Number of known outlier classes.'
47 | 'If 0, no anomalies are known.'
48 | 'If 1, outlier class as specified in --known_outlier_class option.'
49 | 'If > 1, the specified number of outlier classes will be sampled at random.')
50 | def main(dataset_name, xp_path, data_path, load_config, load_model, ratio_known_normal, ratio_known_outlier,
51 | ratio_pollution, seed, kernel, kappa, hybrid, load_ae, n_jobs_dataloader, normal_class, known_outlier_class,
52 | n_known_outlier_classes):
53 | """
54 | (Hybrid) SSAD for anomaly detection as in Goernitz et al., Towards Supervised Anomaly Detection, JAIR, 2013.
55 |
56 | :arg DATASET_NAME: Name of the dataset to load.
57 | :arg XP_PATH: Export path for logging the experiment.
58 | :arg DATA_PATH: Root path of data.
59 | """
60 |
61 | # Get configuration
62 | cfg = Config(locals().copy())
63 |
64 | # Set up logging
65 | logging.basicConfig(level=logging.INFO)
66 | logger = logging.getLogger()
67 | logger.setLevel(logging.INFO)
68 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
69 | log_file = xp_path + '/log.txt'
70 | file_handler = logging.FileHandler(log_file)
71 | file_handler.setLevel(logging.INFO)
72 | file_handler.setFormatter(formatter)
73 | logger.addHandler(file_handler)
74 |
75 | # Print paths
76 | logger.info('Log file is %s.' % log_file)
77 | logger.info('Data path is %s.' % data_path)
78 | logger.info('Export path is %s.' % xp_path)
79 |
80 | # Print experimental setup
81 | logger.info('Dataset: %s' % dataset_name)
82 | logger.info('Normal class: %d' % normal_class)
83 | logger.info('Ratio of labeled normal train samples: %.2f' % ratio_known_normal)
84 | logger.info('Ratio of labeled anomalous samples: %.2f' % ratio_known_outlier)
85 | logger.info('Pollution ratio of unlabeled train data: %.2f' % ratio_pollution)
86 | if n_known_outlier_classes == 1:
87 | logger.info('Known anomaly class: %d' % known_outlier_class)
88 | else:
89 | logger.info('Number of known anomaly classes: %d' % n_known_outlier_classes)
90 |
91 | # If specified, load experiment config from JSON-file
92 | if load_config:
93 | cfg.load_config(import_json=load_config)
94 | logger.info('Loaded configuration from %s.' % load_config)
95 |
96 | # Print SSAD configuration
97 | logger.info('SSAD kernel: %s' % cfg.settings['kernel'])
98 |     logger.info('Kappa-parameter: %.2f' % cfg.settings['kappa'])
99 | logger.info('Hybrid model: %s' % cfg.settings['hybrid'])
100 |
101 | # Set seed
102 | if cfg.settings['seed'] != -1:
103 | random.seed(cfg.settings['seed'])
104 | np.random.seed(cfg.settings['seed'])
105 | co.setseed(cfg.settings['seed'])
106 | torch.manual_seed(cfg.settings['seed'])
107 | torch.cuda.manual_seed(cfg.settings['seed'])
108 | torch.backends.cudnn.deterministic = True
109 | logger.info('Set seed to %d.' % cfg.settings['seed'])
110 |
111 | # Use 'cpu' as device for SSAD
112 | device = 'cpu'
113 | torch.multiprocessing.set_sharing_strategy('file_system') # fix multiprocessing issue for ubuntu
114 | logger.info('Computation device: %s' % device)
115 | logger.info('Number of dataloader workers: %d' % n_jobs_dataloader)
116 |
117 | # Load data
118 | dataset = load_dataset(dataset_name, data_path, normal_class, known_outlier_class, n_known_outlier_classes,
119 | ratio_known_normal, ratio_known_outlier, ratio_pollution,
120 | random_state=np.random.RandomState(cfg.settings['seed']))
121 | # Log random sample of known anomaly classes if more than 1 class
122 | if n_known_outlier_classes > 1:
123 | logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes,))
124 |
125 | # Initialize SSAD model
126 | ssad = SSAD(kernel=cfg.settings['kernel'], kappa=cfg.settings['kappa'], hybrid=cfg.settings['hybrid'])
127 |
128 | # If specified, load model parameters from already trained model
129 | if load_model:
130 | ssad.load_model(import_path=load_model, device=device)
131 |         logger.info('Loaded model from %s.' % load_model)
132 |
133 | # If specified, load model autoencoder weights for a hybrid approach
134 | if hybrid and load_ae is not None:
135 | ssad.load_ae(dataset_name, model_path=load_ae)
136 | logger.info('Loaded pretrained autoencoder for features from %s.' % load_ae)
137 |
138 | # Train model on dataset
139 | ssad.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
140 |
141 | # Test model
142 | ssad.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
143 |
144 | # Save results and configuration
145 | ssad.save_results(export_json=xp_path + '/results.json')
146 | cfg.save_config(export_json=xp_path + '/config.json')
147 |
148 | # Plot most anomalous and most normal test samples
149 | indices, labels, scores = zip(*ssad.results['test_scores'])
150 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores)
151 | idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score
152 | idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score
153 |
154 | if dataset_name in ('mnist', 'fmnist', 'cifar10'):
155 |
156 | if dataset_name in ('mnist', 'fmnist'):
157 | X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1)
158 | X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1)
159 | X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1)
160 | X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1)
161 |
162 | if dataset_name == 'cifar10':
163 | X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2)))
164 | X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2)))
165 | X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2)))
166 | X_normal_high = torch.tensor(
167 | np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0, 3, 1, 2)))
168 |
169 | plot_images_grid(X_all_low, export_img=xp_path + '/all_low', padding=2)
170 | plot_images_grid(X_all_high, export_img=xp_path + '/all_high', padding=2)
171 | plot_images_grid(X_normal_low, export_img=xp_path + '/normals_low', padding=2)
172 | plot_images_grid(X_normal_high, export_img=xp_path + '/normals_high', padding=2)
173 |
174 |
175 | if __name__ == '__main__':
176 | main()
177 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baselines/ocsvm.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import time
4 | import torch
5 | import numpy as np
6 |
7 | from torch.utils.data import DataLoader
8 | from sklearn.svm import OneClassSVM
9 | from sklearn.metrics import roc_auc_score
10 | from base.base_dataset import BaseADDataset
11 | from networks.main import build_autoencoder
12 |
13 |
14 | class OCSVM(object):
15 | """A class for One-Class SVM models."""
16 |
17 | def __init__(self, kernel='rbf', nu=0.1, hybrid=False):
18 | """Init OCSVM instance."""
19 | self.kernel = kernel
20 | self.nu = nu
21 | self.rho = None
22 | self.gamma = None
23 |
24 | self.model = OneClassSVM(kernel=kernel, nu=nu)
25 |
26 | self.hybrid = hybrid
27 | self.ae_net = None # autoencoder network for the case of a hybrid model
28 | self.linear_model = None # also init a model with linear kernel if hybrid approach
29 |
30 | self.results = {
31 | 'train_time': None,
32 | 'test_time': None,
33 | 'test_auc': None,
34 | 'test_scores': None,
35 | 'train_time_linear': None,
36 | 'test_time_linear': None,
37 | 'test_auc_linear': None
38 | }
39 |
40 | def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
41 | """Trains the OC-SVM model on the training data."""
42 | logger = logging.getLogger()
43 |
44 |         # do not drop the last batch, since this is non-SGD (whole-sample) optimization
45 | train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True,
46 | num_workers=n_jobs_dataloader, drop_last=False)
47 |
48 | # Get data from loader
49 | X = ()
50 | for data in train_loader:
51 | inputs, _, _, _ = data
52 | inputs = inputs.to(device)
53 | if self.hybrid:
54 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
55 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
56 | X += (X_batch.cpu().data.numpy(),)
57 | X = np.concatenate(X)
58 |
59 | # Training
60 | logger.info('Starting training...')
61 |
62 |         # Select model via a hold-out validation set sampled from the test set (10% of it)
63 | gammas = np.logspace(-7, 2, num=10, base=2)
64 | best_auc = 0.0
65 |
66 | # Sample hold-out set from test set
67 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
68 |
69 | X_test = ()
70 | labels = []
71 | for data in test_loader:
72 | inputs, label_batch, _, _ = data
73 | inputs, label_batch = inputs.to(device), label_batch.to(device)
74 | if self.hybrid:
75 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
76 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
77 | X_test += (X_batch.cpu().data.numpy(),)
78 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
79 | X_test, labels = np.concatenate(X_test), np.array(labels)
80 | n_test, n_normal, n_outlier = len(X_test), np.sum(labels == 0), np.sum(labels == 1)
81 | n_val = int(0.1 * n_test)
82 | n_val_normal, n_val_outlier = int(n_val * (n_normal/n_test)), int(n_val * (n_outlier/n_test))
83 | perm = np.random.permutation(n_test)
84 | X_val = np.concatenate((X_test[perm][labels[perm] == 0][:n_val_normal],
85 | X_test[perm][labels[perm] == 1][:n_val_outlier]))
86 | labels = np.array([0] * n_val_normal + [1] * n_val_outlier)
87 |
88 | i = 1
89 | for gamma in gammas:
90 |
91 | # Model candidate
92 | model = OneClassSVM(kernel=self.kernel, nu=self.nu, gamma=gamma)
93 |
94 | # Train
95 | start_time = time.time()
96 | model.fit(X)
97 | train_time = time.time() - start_time
98 |
99 | # Test on small hold-out set from test set
100 | scores = (-1.0) * model.decision_function(X_val)
101 | scores = scores.flatten()
102 |
103 | # Compute AUC
104 | auc = roc_auc_score(labels, scores)
105 |
106 | logger.info(f' | Model {i:02}/{len(gammas):02} | Gamma: {gamma:.8f} | Train Time: {train_time:.3f}s '
107 | f'| Val AUC: {100. * auc:.2f} |')
108 |
109 | if auc > best_auc:
110 | best_auc = auc
111 | self.model = model
112 | self.gamma = gamma
113 | self.results['train_time'] = train_time
114 |
115 | i += 1
116 |
117 | # If hybrid, also train a model with linear kernel
118 | if self.hybrid:
119 | self.linear_model = OneClassSVM(kernel='linear', nu=self.nu)
120 | start_time = time.time()
121 | self.linear_model.fit(X)
122 | train_time = time.time() - start_time
123 | self.results['train_time_linear'] = train_time
124 |
125 | logger.info(f'Best Model: | Gamma: {self.gamma:.8f} | AUC: {100. * best_auc:.2f}')
126 | logger.info('Training Time: {:.3f}s'.format(self.results['train_time']))
127 | logger.info('Finished training.')
128 |
129 | def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
130 | """Tests the OC-SVM model on the test data."""
131 | logger = logging.getLogger()
132 |
133 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
134 |
135 | # Get data from loader
136 | idx_label_score = []
137 | X = ()
138 | idxs = []
139 | labels = []
140 | for data in test_loader:
141 | inputs, label_batch, _, idx = data
142 | inputs, label_batch, idx = inputs.to(device), label_batch.to(device), idx.to(device)
143 | if self.hybrid:
144 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
145 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
146 | X += (X_batch.cpu().data.numpy(),)
147 | idxs += idx.cpu().data.numpy().astype(np.int64).tolist()
148 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
149 | X = np.concatenate(X)
150 |
151 | # Testing
152 | logger.info('Starting testing...')
153 | start_time = time.time()
154 |
155 | scores = (-1.0) * self.model.decision_function(X)
156 |
157 | self.results['test_time'] = time.time() - start_time
158 | scores = scores.flatten()
159 | self.rho = -self.model.intercept_[0]
160 |
161 | # Save triples of (idx, label, score) in a list
162 | idx_label_score += list(zip(idxs, labels, scores.tolist()))
163 | self.results['test_scores'] = idx_label_score
164 |
165 | # Compute AUC
166 | _, labels, scores = zip(*idx_label_score)
167 | labels = np.array(labels)
168 | scores = np.array(scores)
169 | self.results['test_auc'] = roc_auc_score(labels, scores)
170 |
171 | # If hybrid, also test model with linear kernel
172 | if self.hybrid:
173 | start_time = time.time()
174 | scores_linear = (-1.0) * self.linear_model.decision_function(X)
175 | self.results['test_time_linear'] = time.time() - start_time
176 | scores_linear = scores_linear.flatten()
177 | self.results['test_auc_linear'] = roc_auc_score(labels, scores_linear)
178 | logger.info('Test AUC linear model: {:.2f}%'.format(100. * self.results['test_auc_linear']))
179 | logger.info('Test Time linear model: {:.3f}s'.format(self.results['test_time_linear']))
180 |
181 | # Log results
182 | logger.info('Test AUC: {:.2f}%'.format(100. * self.results['test_auc']))
183 | logger.info('Test Time: {:.3f}s'.format(self.results['test_time']))
184 | logger.info('Finished testing.')
185 |
186 | def load_ae(self, dataset_name, model_path):
187 | """Load pretrained autoencoder from model_path for feature extraction in a hybrid OC-SVM model."""
188 |
189 | model_dict = torch.load(model_path, map_location='cpu')
190 | ae_net_dict = model_dict['ae_net_dict']
191 | if dataset_name in ['mnist', 'fmnist', 'cifar10']:
192 | net_name = dataset_name + '_LeNet'
193 | else:
194 | net_name = dataset_name + '_mlp'
195 |
196 | if self.ae_net is None:
197 | self.ae_net = build_autoencoder(net_name)
198 |
199 |         # update keys (since there was a change in network definition)
200 |         ae_keys = list(self.ae_net.state_dict().keys())
201 |         for i in range(len(ae_net_dict)):
202 |             # pop entries in their original (insertion) order and re-insert under the current key names
203 |             k, v = ae_net_dict.popitem(last=False)
204 |             new_key = ae_keys[i]
205 |             ae_net_dict[new_key] = v
206 |
207 | self.ae_net.load_state_dict(ae_net_dict)
208 | self.ae_net.eval()
209 |
210 | def save_model(self, export_path):
211 | """Save OC-SVM model to export_path."""
212 | pass
213 |
214 | def load_model(self, import_path, device: str = 'cpu'):
215 | """Load OC-SVM model from import_path."""
216 | pass
217 |
218 | def save_results(self, export_json):
219 | """Save results dict to a JSON-file."""
220 | with open(export_json, 'w') as fp:
221 | json.dump(self.results, fp)
222 |
--------------------------------------------------------------------------------
/baseline/DeepSAD/src/baseline_kde.py:
--------------------------------------------------------------------------------
1 | import click
2 | import torch
3 | import logging
4 | import random
5 | import numpy as np
6 |
7 | from utils.config import Config
8 | from utils.visualization.plot_images_grid import plot_images_grid
9 | from baselines.kde import KDE
10 | from datasets.main import load_dataset
11 |
12 |
13 | ################################################################################
14 | # Settings
15 | ################################################################################
16 | @click.command()
17 | @click.argument('dataset_name', type=click.Choice(['mnist', 'fmnist', 'cifar10', 'arrhythmia', 'cardio', 'satellite',
18 | 'satimage-2', 'shuttle', 'thyroid']))
19 | @click.argument('xp_path', type=click.Path(exists=True))
20 | @click.argument('data_path', type=click.Path(exists=True))
21 | @click.option('--load_config', type=click.Path(exists=True), default=None,
22 | help='Config JSON-file path (default: None).')
23 | @click.option('--load_model', type=click.Path(exists=True), default=None,
24 | help='Model file path (default: None).')
25 | @click.option('--ratio_known_normal', type=float, default=0.0,
26 | help='Ratio of known (labeled) normal training examples.')
27 | @click.option('--ratio_known_outlier', type=float, default=0.0,
28 | help='Ratio of known (labeled) anomalous training examples.')
29 | @click.option('--ratio_pollution', type=float, default=0.0,
30 | help='Pollution ratio of unlabeled training data with unknown (unlabeled) anomalies.')
31 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.')
32 | @click.option('--kernel', type=click.Choice(['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']),
33 | default='gaussian', help='Kernel for the KDE')
34 | @click.option('--grid_search_cv', type=bool, default=True,
35 | help='Use sklearn GridSearchCV to determine optimal bandwidth')
36 | @click.option('--n_jobs_model', type=int, default=-1, help='Number of jobs for model training.')
37 | @click.option('--hybrid', type=bool, default=False,
38 | help='Train KDE on features extracted from an autoencoder. If True, load_ae must be specified.')
39 | @click.option('--load_ae', type=click.Path(exists=True), default=None,
40 | help='Model file path to load autoencoder weights (default: None).')
41 | @click.option('--n_jobs_dataloader', type=int, default=0,
42 | help='Number of workers for data loading. 0 means that the data will be loaded in the main process.')
43 | @click.option('--normal_class', type=int, default=0,
44 | help='Specify the normal class of the dataset (all other classes are considered anomalous).')
45 | @click.option('--known_outlier_class', type=int, default=1,
46 | help='Specify the known outlier class of the dataset for semi-supervised anomaly detection.')
47 | @click.option('--n_known_outlier_classes', type=int, default=0,
48 | help='Number of known outlier classes.'
49 | 'If 0, no anomalies are known.'
50 | 'If 1, outlier class as specified in --known_outlier_class option.'
51 | 'If > 1, the specified number of outlier classes will be sampled at random.')
52 | def main(dataset_name, xp_path, data_path, load_config, load_model, ratio_known_normal, ratio_known_outlier,
53 | ratio_pollution, seed, kernel, grid_search_cv, n_jobs_model, hybrid, load_ae, n_jobs_dataloader, normal_class,
54 | known_outlier_class, n_known_outlier_classes):
55 | """
56 | (Hybrid) KDE for anomaly detection.
57 |
58 | :arg DATASET_NAME: Name of the dataset to load.
59 | :arg XP_PATH: Export path for logging the experiment.
60 | :arg DATA_PATH: Root path of data.
61 | """
62 |
63 | # Get configuration
64 | cfg = Config(locals().copy())
65 |
66 | # Set up logging
67 | logging.basicConfig(level=logging.INFO)
68 | logger = logging.getLogger()
69 | logger.setLevel(logging.INFO)
70 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
71 | log_file = xp_path + '/log.txt'
72 | file_handler = logging.FileHandler(log_file)
73 | file_handler.setLevel(logging.INFO)
74 | file_handler.setFormatter(formatter)
75 | logger.addHandler(file_handler)
76 |
77 | # Print paths
78 | logger.info('Log file is %s.' % log_file)
79 | logger.info('Data path is %s.' % data_path)
80 | logger.info('Export path is %s.' % xp_path)
81 |
82 | # Print experimental setup
83 | logger.info('Dataset: %s' % dataset_name)
84 | logger.info('Normal class: %d' % normal_class)
85 | logger.info('Ratio of labeled normal train samples: %.2f' % ratio_known_normal)
86 | logger.info('Ratio of labeled anomalous samples: %.2f' % ratio_known_outlier)
87 | logger.info('Pollution ratio of unlabeled train data: %.2f' % ratio_pollution)
88 | if n_known_outlier_classes == 1:
89 | logger.info('Known anomaly class: %d' % known_outlier_class)
90 | else:
91 | logger.info('Number of known anomaly classes: %d' % n_known_outlier_classes)
92 |
93 | # If specified, load experiment config from JSON-file
94 | if load_config:
95 | cfg.load_config(import_json=load_config)
96 | logger.info('Loaded configuration from %s.' % load_config)
97 |
98 | # Print KDE configuration
99 | logger.info('KDE kernel: %s' % cfg.settings['kernel'])
100 | logger.info('Use GridSearchCV for bandwidth selection: %s' % cfg.settings['grid_search_cv'])
101 | logger.info('Number of jobs for model training: %d' % n_jobs_model)
102 | logger.info('Hybrid model: %s' % cfg.settings['hybrid'])
103 |
104 | # Set seed
105 | if cfg.settings['seed'] != -1:
106 | random.seed(cfg.settings['seed'])
107 | np.random.seed(cfg.settings['seed'])
108 | torch.manual_seed(cfg.settings['seed'])
109 | torch.cuda.manual_seed(cfg.settings['seed'])
110 | torch.backends.cudnn.deterministic = True
111 | logger.info('Set seed to %d.' % cfg.settings['seed'])
112 |
113 | # Use 'cpu' as device for KDE
114 | device = 'cpu'
115 | torch.multiprocessing.set_sharing_strategy('file_system') # fix multiprocessing issue for ubuntu
116 | logger.info('Computation device: %s' % device)
117 | logger.info('Number of dataloader workers: %d' % n_jobs_dataloader)
118 |
119 | # Load data
120 | dataset = load_dataset(dataset_name, data_path, normal_class, known_outlier_class, n_known_outlier_classes,
121 | ratio_known_normal, ratio_known_outlier, ratio_pollution,
122 | random_state=np.random.RandomState(cfg.settings['seed']))
123 | # Log random sample of known anomaly classes if more than 1 class
124 | if n_known_outlier_classes > 1:
125 | logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes,))
126 |
127 | # Initialize KDE model
128 | kde = KDE(hybrid=cfg.settings['hybrid'], kernel=cfg.settings['kernel'], n_jobs=n_jobs_model,
129 | seed=cfg.settings['seed'])
130 |
131 | # If specified, load model parameters from already trained model
132 | if load_model:
133 | kde.load_model(import_path=load_model, device=device)
134 |         logger.info('Loaded model from %s.' % load_model)
135 |
136 | # If specified, load model autoencoder weights for a hybrid approach
137 | if hybrid and load_ae is not None:
138 | kde.load_ae(dataset_name, model_path=load_ae)
139 | logger.info('Loaded pretrained autoencoder for features from %s.' % load_ae)
140 |
141 | # Train model on dataset
142 | kde.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader,
143 | bandwidth_GridSearchCV=cfg.settings['grid_search_cv'])
144 |
145 | # Test model
146 | kde.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
147 |
148 | # Save results and configuration
149 | kde.save_results(export_json=xp_path + '/results.json')
150 | cfg.save_config(export_json=xp_path + '/config.json')
151 |
152 | # Plot most anomalous and most normal test samples
153 | indices, labels, scores = zip(*kde.results['test_scores'])
154 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores)
155 | idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score
156 | idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score
157 |
158 | if dataset_name in ('mnist', 'fmnist', 'cifar10'):
159 |
160 | if dataset_name in ('mnist', 'fmnist'):
161 | X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1)
162 | X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1)
163 | X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1)
164 | X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1)
165 |
166 | if dataset_name == 'cifar10':
167 | X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2)))
168 | X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2)))
169 | X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2)))
170 | X_normal_high = torch.tensor(
171 | np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0, 3, 1, 2)))
172 |
173 | plot_images_grid(X_all_low, export_img=xp_path + '/all_low', padding=2)
174 | plot_images_grid(X_all_high, export_img=xp_path + '/all_high', padding=2)
175 | plot_images_grid(X_normal_low, export_img=xp_path + '/normals_low', padding=2)
176 | plot_images_grid(X_normal_high, export_img=xp_path + '/normals_high', padding=2)
177 |
178 |
179 | if __name__ == '__main__':
180 | main()
181 |
--------------------------------------------------------------------------------
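A side note on the plotting block shared by these baseline scripts: it unpacks the (index, label, score) triples stored in results['test_scores'], sorts indices by anomaly score, and keeps the 32 lowest- and highest-scoring test samples for the image grids. A minimal, self-contained sketch of that selection logic (synthetic triples stand in for the real test scores):

    import numpy as np

    # synthetic (idx, label, score) triples standing in for kde.results['test_scores']
    rng = np.random.default_rng(0)
    test_scores = [(i, int(rng.random() < 0.1), float(rng.normal())) for i in range(200)]

    indices, labels, scores = map(np.array, zip(*test_scores))
    idx_all_sorted = indices[np.argsort(scores)]  # from lowest to highest score
    idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])]

    most_normal = idx_all_sorted[:32]      # 32 lowest-scoring test samples
    most_anomalous = idx_all_sorted[-32:]  # 32 highest-scoring test samples
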
/baseline/DeepSAD/src/baseline_isoforest.py:
--------------------------------------------------------------------------------
1 | import click
2 | import torch
3 | import logging
4 | import random
5 | import numpy as np
6 |
7 | from utils.config import Config
8 | from utils.visualization.plot_images_grid import plot_images_grid
9 | from baselines.isoforest import IsoForest
10 | from datasets.main import load_dataset
11 |
12 |
13 | ################################################################################
14 | # Settings
15 | ################################################################################
16 | @click.command()
17 | @click.argument('dataset_name', type=click.Choice(['mnist', 'fmnist', 'cifar10', 'arrhythmia', 'cardio', 'satellite',
18 | 'satimage-2', 'shuttle', 'thyroid']))
19 | @click.argument('xp_path', type=click.Path(exists=True))
20 | @click.argument('data_path', type=click.Path(exists=True))
21 | @click.option('--load_config', type=click.Path(exists=True), default=None,
22 | help='Config JSON-file path (default: None).')
23 | @click.option('--load_model', type=click.Path(exists=True), default=None,
24 | help='Model file path (default: None).')
25 | @click.option('--ratio_known_normal', type=float, default=0.0,
26 | help='Ratio of known (labeled) normal training examples.')
27 | @click.option('--ratio_known_outlier', type=float, default=0.0,
28 | help='Ratio of known (labeled) anomalous training examples.')
29 | @click.option('--ratio_pollution', type=float, default=0.0,
30 | help='Pollution ratio of unlabeled training data with unknown (unlabeled) anomalies.')
31 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.')
32 | @click.option('--n_estimators', type=int, default=100,
33 | help='Set the number of base estimators in the ensemble (default: 100).')
34 | @click.option('--max_samples', type=int, default=256,
35 | help='Set the number of samples drawn to train each base estimator (default: 256).')
36 | @click.option('--contamination', type=float, default=0.1,
37 |               help='Expected fraction of anomalies in the training set (default: 0.1).')
38 | @click.option('--n_jobs_model', type=int, default=-1, help='Number of jobs for model training.')
39 | @click.option('--hybrid', type=bool, default=False,
40 | help='Train model on features extracted from an autoencoder. If True, load_ae must be specified.')
41 | @click.option('--load_ae', type=click.Path(exists=True), default=None,
42 | help='Model file path to load autoencoder weights (default: None).')
43 | @click.option('--n_jobs_dataloader', type=int, default=0,
44 | help='Number of workers for data loading. 0 means that the data will be loaded in the main process.')
45 | @click.option('--normal_class', type=int, default=0,
46 | help='Specify the normal class of the dataset (all other classes are considered anomalous).')
47 | @click.option('--known_outlier_class', type=int, default=1,
48 | help='Specify the known outlier class of the dataset for semi-supervised anomaly detection.')
49 | @click.option('--n_known_outlier_classes', type=int, default=0,
50 |               help='Number of known outlier classes. '
51 |                    'If 0, no anomalies are known. '
52 |                    'If 1, the outlier class is as specified in the --known_outlier_class option. '
53 |                    'If > 1, the specified number of outlier classes will be sampled at random.')
54 | def main(dataset_name, xp_path, data_path, load_config, load_model, ratio_known_normal, ratio_known_outlier,
55 | ratio_pollution, seed, n_estimators, max_samples, contamination, n_jobs_model, hybrid, load_ae,
56 | n_jobs_dataloader, normal_class, known_outlier_class, n_known_outlier_classes):
57 | """
58 | (Hybrid) Isolation Forest model for anomaly detection.
59 |
60 | :arg DATASET_NAME: Name of the dataset to load.
61 | :arg XP_PATH: Export path for logging the experiment.
62 | :arg DATA_PATH: Root path of data.
63 | """
64 |
65 | # Get configuration
66 | cfg = Config(locals().copy())
67 |
68 | # Set up logging
69 | logging.basicConfig(level=logging.INFO)
70 | logger = logging.getLogger()
71 | logger.setLevel(logging.INFO)
72 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
73 | log_file = xp_path + '/log.txt'
74 | file_handler = logging.FileHandler(log_file)
75 | file_handler.setLevel(logging.INFO)
76 | file_handler.setFormatter(formatter)
77 | logger.addHandler(file_handler)
78 |
79 | # Print paths
80 | logger.info('Log file is %s.' % log_file)
81 | logger.info('Data path is %s.' % data_path)
82 | logger.info('Export path is %s.' % xp_path)
83 |
84 | # Print experimental setup
85 | logger.info('Dataset: %s' % dataset_name)
86 | logger.info('Normal class: %d' % normal_class)
87 | logger.info('Ratio of labeled normal train samples: %.2f' % ratio_known_normal)
88 | logger.info('Ratio of labeled anomalous samples: %.2f' % ratio_known_outlier)
89 | logger.info('Pollution ratio of unlabeled train data: %.2f' % ratio_pollution)
90 | if n_known_outlier_classes == 1:
91 | logger.info('Known anomaly class: %d' % known_outlier_class)
92 | else:
93 | logger.info('Number of known anomaly classes: %d' % n_known_outlier_classes)
94 |
95 | # If specified, load experiment config from JSON-file
96 | if load_config:
97 | cfg.load_config(import_json=load_config)
98 | logger.info('Loaded configuration from %s.' % load_config)
99 |
100 | # Print Isolation Forest configuration
101 | logger.info('Number of base estimators in the ensemble: %d' % cfg.settings['n_estimators'])
102 | logger.info('Number of samples for training each base estimator: %d' % cfg.settings['max_samples'])
103 | logger.info('Contamination parameter: %.2f' % cfg.settings['contamination'])
104 | logger.info('Number of jobs for model training: %d' % n_jobs_model)
105 | logger.info('Hybrid model: %s' % cfg.settings['hybrid'])
106 |
107 | # Set seed
108 | if cfg.settings['seed'] != -1:
109 | random.seed(cfg.settings['seed'])
110 | np.random.seed(cfg.settings['seed'])
111 | torch.manual_seed(cfg.settings['seed'])
112 | torch.cuda.manual_seed(cfg.settings['seed'])
113 | torch.backends.cudnn.deterministic = True
114 | logger.info('Set seed to %d.' % cfg.settings['seed'])
115 |
116 | # Use 'cpu' as device for Isolation Forest
117 | device = 'cpu'
118 |     torch.multiprocessing.set_sharing_strategy('file_system')  # fix multiprocessing issue on Ubuntu
119 | logger.info('Computation device: %s' % device)
120 | logger.info('Number of dataloader workers: %d' % n_jobs_dataloader)
121 |
122 | # Load data
123 | dataset = load_dataset(dataset_name, data_path, normal_class, known_outlier_class, n_known_outlier_classes,
124 | ratio_known_normal, ratio_known_outlier, ratio_pollution,
125 | random_state=np.random.RandomState(cfg.settings['seed']))
126 | # Log random sample of known anomaly classes if more than 1 class
127 | if n_known_outlier_classes > 1:
128 | logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes,))
129 |
130 | # Initialize Isolation Forest model
131 | Isoforest = IsoForest(hybrid=cfg.settings['hybrid'], n_estimators=cfg.settings['n_estimators'],
132 | max_samples=cfg.settings['max_samples'], contamination=cfg.settings['contamination'],
133 | n_jobs=n_jobs_model, seed=cfg.settings['seed'])
134 |
135 | # If specified, load model parameters from already trained model
136 | if load_model:
137 | Isoforest.load_model(import_path=load_model, device=device)
138 |         logger.info('Loaded model from %s.' % load_model)
139 |
140 | # If specified, load model autoencoder weights for a hybrid approach
141 | if hybrid and load_ae is not None:
142 | Isoforest.load_ae(dataset_name, model_path=load_ae)
143 | logger.info('Loaded pretrained autoencoder for features from %s.' % load_ae)
144 |
145 | # Train model on dataset
146 | Isoforest.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
147 |
148 | # Test model
149 | Isoforest.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
150 |
151 | # Save results and configuration
152 | Isoforest.save_results(export_json=xp_path + '/results.json')
153 | cfg.save_config(export_json=xp_path + '/config.json')
154 |
155 | # Plot most anomalous and most normal test samples
156 | indices, labels, scores = zip(*Isoforest.results['test_scores'])
157 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores)
158 | idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score
159 | idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score
160 |
161 | if dataset_name in ('mnist', 'fmnist', 'cifar10'):
162 |
163 | if dataset_name in ('mnist', 'fmnist'):
164 | X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1)
165 | X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1)
166 | X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1)
167 | X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1)
168 |
169 | if dataset_name == 'cifar10':
170 | X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2)))
171 | X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2)))
172 | X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2)))
173 | X_normal_high = torch.tensor(
174 | np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0, 3, 1, 2)))
175 |
176 | plot_images_grid(X_all_low, export_img=xp_path + '/all_low', padding=2)
177 | plot_images_grid(X_all_high, export_img=xp_path + '/all_high', padding=2)
178 | plot_images_grid(X_normal_low, export_img=xp_path + '/normals_low', padding=2)
179 | plot_images_grid(X_normal_high, export_img=xp_path + '/normals_high', padding=2)
180 |
181 |
182 | if __name__ == '__main__':
183 | main()
184 |
--------------------------------------------------------------------------------
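The IsoForest wrapper used above is defined in baselines/isoforest.py (not shown in this excerpt). Assuming it delegates to scikit-learn's IsolationForest — a plausible reading given the n_estimators/max_samples/contamination options — the core of train/test reduces to roughly the following sketch; it is a stand-in under that assumption, not the repo's exact implementation:

    import numpy as np
    from sklearn.ensemble import IsolationForest

    rng = np.random.default_rng(0)
    X_train = rng.normal(size=(500, 8))
    X_test = rng.normal(size=(100, 8))

    # hyperparameters mirror the CLI defaults above
    model = IsolationForest(n_estimators=100, max_samples=256,
                            contamination=0.1, n_jobs=-1, random_state=0)
    model.fit(X_train)

    # negate score_samples so that higher means more anomalous
    scores = -model.score_samples(X_test)
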
/baseline/DeepSAD/src/baselines/ssad.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import time
4 | import torch
5 | import numpy as np
6 |
7 | from torch.utils.data import DataLoader
8 | from .shallow_ssad.ssad_convex import ConvexSSAD
9 | from sklearn.metrics import roc_auc_score
10 | from sklearn.metrics.pairwise import pairwise_kernels
11 | from base.base_dataset import BaseADDataset
12 | from networks.main import build_autoencoder
13 |
14 |
15 | class SSAD(object):
16 | """
17 |     A class for kernel SSAD models as described in Goernitz et al., Toward Supervised Anomaly Detection, JAIR, 2013.
18 | """
19 |
20 | def __init__(self, kernel='rbf', kappa=1.0, Cp=1.0, Cu=1.0, Cn=1.0, hybrid=False):
21 | """Init SSAD instance."""
22 | self.kernel = kernel
23 | self.kappa = kappa
24 | self.Cp = Cp
25 | self.Cu = Cu
26 | self.Cn = Cn
27 | self.rho = None
28 | self.gamma = None
29 |
30 | self.model = None
31 | self.X_svs = None
32 |
33 | self.hybrid = hybrid
34 | self.ae_net = None # autoencoder network for the case of a hybrid model
35 | self.linear_model = None # also init a model with linear kernel if hybrid approach
36 | self.linear_X_svs = None
37 |
38 | self.results = {
39 | 'train_time': None,
40 | 'test_time': None,
41 | 'test_auc': None,
42 | 'test_scores': None,
43 | 'train_time_linear': None,
44 | 'test_time_linear': None,
45 | 'test_auc_linear': None
46 | }
47 |
48 | def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
49 | """Trains the SSAD model on the training data."""
50 | logger = logging.getLogger()
51 |
52 | # do not drop last batch for non-SGD optimization shallow_ssad
53 | train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True,
54 | num_workers=n_jobs_dataloader, drop_last=False)
55 |
56 | # Get data from loader
57 | X = ()
58 | semi_targets = []
59 | for data in train_loader:
60 | inputs, _, semi_targets_batch, _ = data
61 | inputs, semi_targets_batch = inputs.to(device), semi_targets_batch.to(device)
62 | if self.hybrid:
63 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
64 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
65 | X += (X_batch.cpu().data.numpy(),)
66 |             semi_targets += semi_targets_batch.cpu().data.numpy().astype(np.int64).tolist()  # np.int is removed in NumPy >= 1.24
67 | X, semi_targets = np.concatenate(X), np.array(semi_targets)
68 |
69 | # Training
70 | logger.info('Starting training...')
71 |
72 |         # Select model via a small hold-out set (10% of the test set)
73 | gammas = np.logspace(-7, 2, num=10, base=2)
74 | best_auc = 0.0
75 |
76 | # Sample hold-out set from test set
77 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
78 |
79 | X_test = ()
80 | labels = []
81 | for data in test_loader:
82 | inputs, label_batch, _, _ = data
83 | inputs, label_batch = inputs.to(device), label_batch.to(device)
84 | if self.hybrid:
85 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
86 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
87 | X_test += (X_batch.cpu().data.numpy(),)
88 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
89 | X_test, labels = np.concatenate(X_test), np.array(labels)
90 | n_test, n_normal, n_outlier = len(X_test), np.sum(labels == 0), np.sum(labels == 1)
91 | n_val = int(0.1 * n_test)
92 | n_val_normal, n_val_outlier = int(n_val * (n_normal/n_test)), int(n_val * (n_outlier/n_test))
93 | perm = np.random.permutation(n_test)
94 | X_val = np.concatenate((X_test[perm][labels[perm] == 0][:n_val_normal],
95 | X_test[perm][labels[perm] == 1][:n_val_outlier]))
96 | labels = np.array([0] * n_val_normal + [1] * n_val_outlier)
97 |
98 | i = 1
99 | for gamma in gammas:
100 |
101 | # Build the training kernel
102 | kernel = pairwise_kernels(X, X, metric=self.kernel, gamma=gamma)
103 |
104 | # Model candidate
105 | model = ConvexSSAD(kernel, semi_targets, Cp=self.Cp, Cu=self.Cu, Cn=self.Cn)
106 |
107 | # Train
108 | start_time = time.time()
109 | model.fit()
110 | train_time = time.time() - start_time
111 |
112 | # Test on small hold-out set from test set
113 | kernel_val = pairwise_kernels(X_val, X[model.svs, :], metric=self.kernel, gamma=gamma)
114 | scores = (-1.0) * model.apply(kernel_val)
115 | scores = scores.flatten()
116 |
117 | # Compute AUC
118 | auc = roc_auc_score(labels, scores)
119 |
120 | logger.info(f' | Model {i:02}/{len(gammas):02} | Gamma: {gamma:.8f} | Train Time: {train_time:.3f}s '
121 | f'| Val AUC: {100. * auc:.2f} |')
122 |
123 | if auc > best_auc:
124 | best_auc = auc
125 | self.model = model
126 | self.gamma = gamma
127 | self.results['train_time'] = train_time
128 |
129 | i += 1
130 |
131 | # Get support vectors for testing
132 | self.X_svs = X[self.model.svs, :]
133 |
134 | # If hybrid, also train a model with linear kernel
135 | if self.hybrid:
136 | linear_kernel = pairwise_kernels(X, X, metric='linear')
137 | self.linear_model = ConvexSSAD(linear_kernel, semi_targets, Cp=self.Cp, Cu=self.Cu, Cn=self.Cn)
138 | start_time = time.time()
139 | self.linear_model.fit()
140 | train_time = time.time() - start_time
141 | self.results['train_time_linear'] = train_time
142 | self.linear_X_svs = X[self.linear_model.svs, :]
143 |
144 | logger.info(f'Best Model: | Gamma: {self.gamma:.8f} | AUC: {100. * best_auc:.2f}')
145 | logger.info('Training Time: {:.3f}s'.format(self.results['train_time']))
146 | logger.info('Finished training.')
147 |
148 | def test(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0):
149 | """Tests the SSAD model on the test data."""
150 | logger = logging.getLogger()
151 |
152 | _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)
153 |
154 | # Get data from loader
155 | idx_label_score = []
156 | X = ()
157 | idxs = []
158 | labels = []
159 | for data in test_loader:
160 | inputs, label_batch, _, idx = data
161 | inputs, label_batch, idx = inputs.to(device), label_batch.to(device), idx.to(device)
162 | if self.hybrid:
163 | inputs = self.ae_net.encoder(inputs) # in hybrid approach, take code representation of AE as features
164 | X_batch = inputs.view(inputs.size(0), -1) # X_batch.shape = (batch_size, n_channels * height * width)
165 | X += (X_batch.cpu().data.numpy(),)
166 | idxs += idx.cpu().data.numpy().astype(np.int64).tolist()
167 | labels += label_batch.cpu().data.numpy().astype(np.int64).tolist()
168 | X = np.concatenate(X)
169 |
170 | # Testing
171 | logger.info('Starting testing...')
172 | start_time = time.time()
173 |
174 | # Build kernel
175 | kernel = pairwise_kernels(X, self.X_svs, metric=self.kernel, gamma=self.gamma)
176 |
177 | scores = (-1.0) * self.model.apply(kernel)
178 |
179 | self.results['test_time'] = time.time() - start_time
180 | scores = scores.flatten()
181 | self.rho = -self.model.threshold
182 |
183 | # Save triples of (idx, label, score) in a list
184 | idx_label_score += list(zip(idxs, labels, scores.tolist()))
185 | self.results['test_scores'] = idx_label_score
186 |
187 | # Compute AUC
188 | _, labels, scores = zip(*idx_label_score)
189 | labels = np.array(labels)
190 | scores = np.array(scores)
191 | self.results['test_auc'] = roc_auc_score(labels, scores)
192 |
193 | # If hybrid, also test model with linear kernel
194 | if self.hybrid:
195 | start_time = time.time()
196 | linear_kernel = pairwise_kernels(X, self.linear_X_svs, metric='linear')
197 | scores_linear = (-1.0) * self.linear_model.apply(linear_kernel)
198 | self.results['test_time_linear'] = time.time() - start_time
199 | scores_linear = scores_linear.flatten()
200 | self.results['test_auc_linear'] = roc_auc_score(labels, scores_linear)
201 | logger.info('Test AUC linear model: {:.2f}%'.format(100. * self.results['test_auc_linear']))
202 | logger.info('Test Time linear model: {:.3f}s'.format(self.results['test_time_linear']))
203 |
204 | # Log results
205 | logger.info('Test AUC: {:.2f}%'.format(100. * self.results['test_auc']))
206 | logger.info('Test Time: {:.3f}s'.format(self.results['test_time']))
207 | logger.info('Finished testing.')
208 |
209 | def load_ae(self, dataset_name, model_path):
210 | """Load pretrained autoencoder from model_path for feature extraction in a hybrid SSAD model."""
211 |
212 | model_dict = torch.load(model_path, map_location='cpu')
213 | ae_net_dict = model_dict['ae_net_dict']
214 | if dataset_name in ['mnist', 'fmnist', 'cifar10']:
215 | net_name = dataset_name + '_LeNet'
216 | else:
217 | net_name = dataset_name + '_mlp'
218 |
219 | if self.ae_net is None:
220 | self.ae_net = build_autoencoder(net_name)
221 |
222 | # update keys (since there was a change in network definition)
223 | ae_keys = list(self.ae_net.state_dict().keys())
224 |         for i in range(len(ae_net_dict)):
225 |             k, v = ae_net_dict.popitem(False)  # pop entries front-to-back
226 |             new_key = ae_keys[i]
227 |             ae_net_dict[new_key] = v
228 |
229 |
230 | self.ae_net.load_state_dict(ae_net_dict)
231 | self.ae_net.eval()
232 |
233 | def save_model(self, export_path):
234 | """Save SSAD model to export_path."""
235 | pass
236 |
237 | def load_model(self, import_path, device: str = 'cpu'):
238 | """Load SSAD model from import_path."""
239 | pass
240 |
241 | def save_results(self, export_json):
242 | """Save results dict to a JSON-file."""
243 | with open(export_json, 'w') as fp:
244 | json.dump(self.results, fp)
245 |
--------------------------------------------------------------------------------
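One subtle step in SSAD.train above is the validation split: the gamma grid is scored on a stratified 10% hold-out carved from the test set, preserving its normal/outlier ratio. Isolated from the loader code, that sampling step looks like this (synthetic arrays stand in for the encoded test features):

    import numpy as np

    rng = np.random.default_rng(0)
    X_test = rng.normal(size=(1000, 8))
    labels = (rng.random(1000) < 0.1).astype(int)  # ~10% outliers

    n_test = len(X_test)
    n_normal, n_outlier = np.sum(labels == 0), np.sum(labels == 1)
    n_val = int(0.1 * n_test)
    n_val_normal = int(n_val * (n_normal / n_test))
    n_val_outlier = int(n_val * (n_outlier / n_test))

    perm = np.random.permutation(n_test)
    X_val = np.concatenate((X_test[perm][labels[perm] == 0][:n_val_normal],
                            X_test[perm][labels[perm] == 1][:n_val_outlier]))
    y_val = np.array([0] * n_val_normal + [1] * n_val_outlier)
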
/myutils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import random
5 | import torch
6 |
7 | # metric
8 | from sklearn.metrics import roc_auc_score, average_precision_score
9 |
10 | # plot
11 | import matplotlib.pyplot as plt
12 |
13 | # statistical analysis
14 | from scipy.stats import wilcoxon
15 |
16 | class Utils():
17 | def __init__(self):
18 | pass
19 |
20 | # remove randomness
21 | def set_seed(self, seed):
22 | # os.environ['PYTHONHASHSEED'] = str(seed)
23 | # os.environ['TF_CUDNN_DETERMINISTIC'] = 'true'
24 | # os.environ['TF_DETERMINISTIC_OPS'] = 'true'
25 |
26 | # basic seed
27 | np.random.seed(seed)
28 | random.seed(seed)
29 |
30 | # pytorch seed
31 | torch.manual_seed(seed)
32 | torch.backends.cudnn.deterministic = True
33 | torch.backends.cudnn.benchmark = False
34 |
35 | def get_device(self, gpu_specific=False):
36 | if gpu_specific:
37 | if torch.cuda.is_available():
38 | n_gpu = torch.cuda.device_count()
39 | print(f'number of gpu: {n_gpu}')
40 | print(f'cuda name: {torch.cuda.get_device_name(0)}')
41 | print('GPU is on')
42 | else:
43 | print('GPU is off')
44 |
45 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
46 | else:
47 | device = torch.device("cpu")
48 | return device
49 |
50 |     # Cantor pairing: map two non-negative integers to one unique integer (e.g., to derive seeds)
51 | def unique(self, a, b):
52 | u = 0.5 * (a + b) * (a + b + 1) + b
53 | return int(u)
54 |
55 | def data_description(self, X, y):
56 | des_dict = {}
57 | des_dict['Samples'] = X.shape[0]
58 | des_dict['Features'] = X.shape[1]
59 | des_dict['Anomalies'] = sum(y)
60 | des_dict['Anomalies Ratio(%)'] = round((sum(y) / len(y)) * 100, 2)
61 |
62 | print(des_dict)
63 |
64 | # metric
65 | def metric(self, y_true, y_score, pos_label=1):
66 | aucroc = roc_auc_score(y_true=y_true, y_score=y_score)
67 |         aucpr = average_precision_score(y_true=y_true, y_score=y_score, pos_label=pos_label)
68 |
69 | return {'aucroc':aucroc, 'aucpr':aucpr}
70 |
71 | # resampling function
72 | def sampler(self, X_train, y_train, batch_size):
73 | index_u = np.where(y_train == 0)[0]
74 | index_a = np.where(y_train == 1)[0]
75 |
76 | n = 0
77 | while len(index_u) >= batch_size:
78 | self.set_seed(n)
79 | index_u_batch = np.random.choice(index_u, batch_size // 2, replace=False)
80 | index_u = np.setdiff1d(index_u, index_u_batch)
81 |
82 | index_a_batch = np.random.choice(index_a, batch_size // 2, replace=True)
83 |
84 | # batch index
85 | index_batch = np.append(index_u_batch, index_a_batch)
86 | # shuffle
87 | np.random.shuffle(index_batch)
88 |
89 | if n == 0:
90 | X_train_new = X_train[index_batch]
91 | y_train_new = y_train[index_batch]
92 | else:
93 | X_train_new = np.append(X_train_new, X_train[index_batch], axis=0)
94 | y_train_new = np.append(y_train_new, y_train[index_batch])
95 | n += 1
96 |
97 | return X_train_new, y_train_new
98 |
99 | def sampler_2(self, X_train, y_train, step, batch_size=512):
100 | index_u = np.where(y_train == 0)[0]
101 | index_a = np.where(y_train == 1)[0]
102 |
103 | for i in range(step):
104 | index_u_batch = np.random.choice(index_u, batch_size // 2, replace=True)
105 | index_a_batch = np.random.choice(index_a, batch_size // 2, replace=True)
106 |
107 | # batch index
108 | index_batch = np.append(index_u_batch, index_a_batch)
109 | # shuffle
110 | np.random.shuffle(index_batch)
111 |
112 | if i == 0:
113 | X_train_new = X_train[index_batch]
114 | y_train_new = y_train[index_batch]
115 | else:
116 | X_train_new = np.append(X_train_new, X_train[index_batch], axis=0)
117 | y_train_new = np.append(y_train_new, y_train[index_batch])
118 |
119 | return X_train_new, y_train_new
120 |
121 | # for PReNet
122 | def sampler_pairs(self, X_train_tensor, y_train, epoch, batch_num, batch_size, s_a_a, s_a_u, s_u_u):
123 | '''
124 | X_train_tensor: the input X in the torch.tensor form
125 | y_train: label in the numpy.array form
126 |
127 |         batch_num: how many batches to generate per epoch
128 |         batch_size: the batch size; s_a_a, s_a_u, s_u_u are the target scores for (a,a), (a,u) and (u,u) pairs
129 |         '''
130 | data_loader_X = []
131 | data_loader_y = []
132 |
133 | index_a = np.where(y_train == 1)[0]
134 | index_u = np.where(y_train == 0)[0]
135 |
136 | for i in range(batch_num): # i.e., drop_last = True
137 | index = []
138 |
139 | # pairs of (a,a); (a,u); (u,u)
140 | for j in range(6):
141 | # generate unique seed and set seed
142 | # seed = self.unique(epoch, i)
143 | # seed = self.unique(seed, j)
144 | # self.set_seed(seed)
145 |
146 | if j < 3:
147 | index_sub = np.random.choice(index_a, batch_size // 4, replace=True)
148 | index.append(list(index_sub))
149 |
150 | if j == 3:
151 | index_sub = np.random.choice(index_u, batch_size // 4, replace=True)
152 | index.append(list(index_sub))
153 |
154 | if j > 3:
155 | index_sub = np.random.choice(index_u, batch_size // 2, replace=True)
156 | index.append(list(index_sub))
157 |
158 | # index[0] + index[1] = (a,a), batch / 4
159 |             # index[2] + index[3] = (a,u), batch / 4
160 | # index[4] + index[5] = (u,u), batch / 2
161 | index_left = index[0] + index[2] + index[4]
162 | index_right = index[1] + index[3] + index[5]
163 |
164 | X_train_tensor_left = X_train_tensor[index_left]
165 | X_train_tensor_right = X_train_tensor[index_right]
166 |
167 | # generate label
168 | y_train_new = np.append(np.repeat(s_a_a, batch_size // 4), np.repeat(s_a_u, batch_size // 4))
169 | y_train_new = np.append(y_train_new, np.repeat(s_u_u, batch_size // 2))
170 | y_train_new = torch.from_numpy(y_train_new).float()
171 |
172 | # shuffle
173 | index_shuffle = np.arange(len(y_train_new))
174 | index_shuffle = np.random.choice(index_shuffle, len(index_shuffle), replace=False)
175 |
176 | X_train_tensor_left = X_train_tensor_left[index_shuffle]
177 | X_train_tensor_right = X_train_tensor_right[index_shuffle]
178 | y_train_new = y_train_new[index_shuffle]
179 |
180 | # save
181 | data_loader_X.append([X_train_tensor_left, X_train_tensor_right])
182 | data_loader_y.append(y_train_new)
183 |
184 | return data_loader_X, data_loader_y
185 |
186 | # gradient norm
187 | def grad_norm(self, grad_tuple):
188 |
189 | grad = torch.tensor([0.0])
190 | for i in range(len(grad_tuple)):
191 | grad += torch.norm(grad_tuple[i])
192 |
193 | return grad
194 |
195 | # visualize the gradient flow in network
196 | def plot_grad_flow(self, named_parameters):
197 | ave_grads = []
198 | layers = []
199 | for n, p in named_parameters:
200 | if (p.requires_grad) and ("bias" not in n):
201 | layers.append(n)
202 | ave_grads.append(p.grad.abs().mean())
203 | plt.plot(ave_grads, alpha=0.3, color="b")
204 | plt.hlines(0, 0, len(ave_grads) + 1, linewidth=1, color="k")
205 | plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
206 | plt.xlim(xmin=0, xmax=len(ave_grads))
207 | plt.xlabel("Layers")
208 | plt.ylabel("average gradient")
209 | plt.title("Gradient flow")
210 | plt.grid(True)
211 |
212 | # def torch_wasserstein_loss(tensor_a, tensor_b):
213 | # # Compute the first Wasserstein distance between two 1D distributions.
214 | # return (torch_cdf_loss(tensor_a, tensor_b, p=1))
215 |
216 | # Calculate the First Wasserstein Distance
217 | def torch_cdf_loss(self, tensor_a, tensor_b, p=1):
218 | # last-dimension is weight distribution
219 | # p is the norm of the distance, p=1 --> First Wasserstein Distance
220 | # to get a positive weight with our normalized distribution
221 | # we recommend combining this loss with other difference-based losses like L1
222 |
223 | # normalize distribution, add 1e-14 to divisor to avoid 0/0
224 | tensor_a = tensor_a / (torch.sum(tensor_a, dim=-1, keepdim=True) + 1e-14)
225 | tensor_b = tensor_b / (torch.sum(tensor_b, dim=-1, keepdim=True) + 1e-14)
226 | # make cdf with cumsum
227 | cdf_tensor_a = torch.cumsum(tensor_a, dim=-1)
228 | cdf_tensor_b = torch.cumsum(tensor_b, dim=-1)
229 |
230 | # choose different formulas for different norm situations
231 | if p == 1:
232 | cdf_distance = torch.sum(torch.abs((cdf_tensor_a - cdf_tensor_b)), dim=-1)
233 | elif p == 2:
234 | cdf_distance = torch.sqrt(torch.sum(torch.pow((cdf_tensor_a - cdf_tensor_b), 2), dim=-1))
235 | else:
236 | cdf_distance = torch.pow(torch.sum(torch.pow(torch.abs(cdf_tensor_a - cdf_tensor_b), p), dim=-1), 1 / p)
237 |
238 | cdf_loss = cdf_distance.mean()
239 | return cdf_loss
240 |
241 | # Calculate the loss like devnet in PyTorch
242 | def cal_loss(self, y, y_pred, mode='devnet'):
243 | if mode == 'devnet':
244 | y_pred.squeeze_()
245 |
246 |             ref = torch.randn(5000)  # 5000 reference scores drawn from a standard normal prior (as in DevNet)
247 | dev = (y_pred - torch.mean(ref)) / torch.std(ref)
248 | # print(f'mean:{torch.mean(ref)}, std:{torch.std(ref)}')
249 | inlier_loss = torch.abs(dev)
250 | outlier_loss = torch.max(5.0 - dev, torch.zeros_like(5.0 - dev))
251 |
252 | loss = torch.mean((1 - y) * inlier_loss + y * outlier_loss)
253 | else:
254 | raise NotImplementedError
255 |
256 | return loss
257 |
258 | def result_process(self, result_show, name, std=False):
259 | # average performance
260 | ave_metric = np.mean(result_show, axis=0).values
261 | std_metric = np.std(result_show, axis=0).values
262 |
263 | # statistical test
264 | wilcoxon_df = pd.DataFrame(data=None, index=result_show.columns, columns=result_show.columns)
265 |
266 | for i in range(wilcoxon_df.shape[0]):
267 | for j in range(wilcoxon_df.shape[1]):
268 | if i != j:
269 | wilcoxon_df.iloc[i, j] = \
270 | wilcoxon(result_show.iloc[:, i] - result_show.iloc[:, j], alternative='greater')[1]
271 |
272 | # average rank
273 | result_show.loc['Ave.rank'] = np.mean(result_show.rank(ascending=False, method='dense', axis=1), axis=0)
274 |
275 | # average metric
276 | if std:
277 |             result_show.loc['Ave.metric'] = [format(round(a, 3), '.3f') + '±' + format(round(s, 3), '.3f')
278 |                                              for a, s in zip(ave_metric, std_metric)]
279 |         else:
280 |             result_show.loc['Ave.metric'] = [format(round(a, 3), '.3f') for a in ave_metric]
281 |
282 |
283 | # the p-value of wilcoxon statistical test
284 | result_show.loc['p-value'] = wilcoxon_df.loc[name].values
285 |
286 |
287 |         for row in result_show.index:
288 |             if row in ['Ave.rank', 'p-value']:
289 |                 result_show.loc[row, :] = [format(round(v, 2), '.2f') for v in result_show.loc[row, :].values]
290 |
291 | # result_show = result_show.astype('float')
292 | # result_show = result_show.round(2)
293 |
294 | return result_show
--------------------------------------------------------------------------------
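For orientation, a minimal usage sketch of the Utils loss helpers above; it assumes the repo root is on the import path so that myutils resolves:

    import torch
    from myutils import Utils

    utils = Utils()
    utils.set_seed(42)

    # DevNet-style deviation loss on a toy batch (y=0 unlabeled/normal, y=1 anomaly)
    y = torch.tensor([0., 0., 1., 1.])
    y_pred = torch.randn(4, 1)
    loss = utils.cal_loss(y, y_pred, mode='devnet')

    # first Wasserstein distance between two batches of 1-D distributions
    a, b = torch.rand(2, 10), torch.rand(2, 10)
    w1 = utils.torch_cdf_loss(a, b, p=1)
    print(loss.item(), w1.item())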