├── .gitignore ├── PCA_plot_for_dataset.py ├── README.md ├── additional_methods ├── Deep-SVDD │ ├── LICENSE │ ├── README.md │ ├── data │ │ ├── .gitignore │ │ ├── .gitkeep │ │ └── cardio.npz │ ├── imgs │ │ ├── .gitkeep │ │ ├── cifar10.png │ │ └── mnist.png │ ├── log │ │ └── .gitkeep │ ├── requirements.txt │ ├── src │ │ ├── __init__.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── base_dataset.py │ │ │ ├── base_net.py │ │ │ ├── base_trainer.py │ │ │ └── torchvision_dataset.py │ │ ├── datasets │ │ │ ├── OD_dataset.py │ │ │ ├── __init__.py │ │ │ ├── cifar10.py │ │ │ ├── main.py │ │ │ ├── mnist.py │ │ │ └── preprocessing.py │ │ ├── deepSVDD.py │ │ ├── main.py │ │ ├── networks │ │ │ ├── __init__.py │ │ │ ├── cifar10_LeNet.py │ │ │ ├── cifar10_LeNet_elu.py │ │ │ ├── mnist_LeNet.py │ │ │ └── networks.py │ │ ├── optim │ │ │ ├── __init__.py │ │ │ ├── ae_trainer.py │ │ │ └── deepSVDD_trainer.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── collect_results.py │ │ │ ├── config.py │ │ │ └── visualization │ │ │ └── plot_images_grid.py │ └── test_instruction.txt ├── HBOS │ ├── hbos.py │ └── hbos_LICENSE.txt ├── ODIN.py ├── SVDD │ ├── .gitattributes │ ├── .spyproject │ │ └── config │ │ │ ├── backups │ │ │ ├── codestyle.ini.bak │ │ │ ├── encoding.ini.bak │ │ │ ├── vcs.ini.bak │ │ │ └── workspace.ini.bak │ │ │ ├── codestyle.ini │ │ │ ├── defaults │ │ │ ├── defaults-codestyle-0.2.0.ini │ │ │ ├── defaults-encoding-0.2.0.ini │ │ │ ├── defaults-vcs-0.2.0.ini │ │ │ └── defaults-workspace-0.2.0.ini │ │ │ ├── encoding.ini │ │ │ ├── vcs.ini │ │ │ └── workspace.ini │ ├── LICENSE │ ├── README.md │ ├── SECURITY.md │ ├── examples │ │ ├── svdd_example_KPCA.py │ │ ├── svdd_example_PSO.py │ │ ├── svdd_example_confusion_matrix.py │ │ ├── svdd_example_cross_validation.py │ │ ├── svdd_example_grid_search.py │ │ ├── svdd_example_hybrid_data.py │ │ ├── svdd_example_kernel.py │ │ └── svdd_example_unlabeled_data.py │ ├── requirements.txt │ └── src │ │ └── BaseSVDD.py ├── abod.py ├── cof.py ├── ensemble.py ├── gen2out │ ├── bagging.py │ ├── gen2out.py │ ├── iforest.py │ ├── main.py │ └── utils.py ├── lmdd.py ├── sod.py └── wrappers │ ├── AE.py │ ├── ALAD.py │ ├── AnoGAN.py │ ├── ExtendedIForest.py │ ├── HBOS.py │ ├── VAE.py │ └── rrcf.py ├── environment.yml ├── evaluation_metrics.py ├── figures └── .gitignore ├── formatted_data ├── aloi.npz ├── annthyroid.npz ├── arrhythmia.npz ├── breastw.npz ├── campaign.npz ├── cardio.npz ├── cover.npz ├── donors.npz ├── fault.npz ├── glass.npz ├── hepatitis.npz ├── hrss_anomalous_optimized.npz ├── hrss_anomalous_standard.npz ├── http.npz ├── internetads.npz ├── ionosphere.npz ├── landsat.npz ├── letter.npz ├── magic.gamma.npz ├── mammography.npz ├── mi-f.npz ├── mi-v.npz ├── mnist.npz ├── musk.npz ├── nasa.npz ├── optdigits.npz ├── pageblocks.npz ├── parkinson.npz ├── pen-global.npz ├── pen-local.npz ├── pendigits.npz ├── pima.npz ├── satellite.npz ├── satimage-2.npz ├── seismic-bumps.npz ├── shuttle.npz ├── skin.npz ├── smtp.npz ├── spambase.npz ├── speech.npz ├── stamps.npz ├── thyroid.npz ├── vertebral.npz ├── vowels.npz ├── waveform.npz ├── wbc.npz ├── wbc2.npz ├── wilt.npz ├── wine.npz ├── wpbc.npz ├── yeast.npz └── yeast6.npz ├── generate_and_plot_types_of_anomalies.py ├── invert_labels_calculate_metrics.py ├── method_example.py ├── minimal_environment.yml ├── preprocess_detect_outliers.py ├── produce_figures.py ├── raw_data ├── .gitkeep ├── ADBench_data_raw │ └── .gitkeep ├── ELKI_data_raw │ ├── .gitkeep │ ├── Annthyroid │ │ └── .gitkeep │ ├── Arrhythmia │ │ └── .gitkeep │ ├── 
Cardiotocography │ │ └── .gitkeep │ ├── HeartDisease │ │ └── .gitkeep │ ├── Hepatitis │ │ └── .gitkeep │ ├── InternetAds │ │ └── .gitkeep │ ├── PageBlocks │ │ └── .gitkeep │ ├── Parkinson │ │ └── .gitkeep │ ├── Pima │ │ └── .gitkeep │ ├── SpamBase │ │ └── .gitkeep │ ├── Stamps │ │ └── .gitkeep │ └── Wilt │ │ └── .gitkeep ├── GAAL_data_raw │ └── .gitkeep ├── Goldstein_data_raw │ └── .gitkeep ├── ODDS_data_raw │ ├── .gitkeep │ ├── categorical_variables_per_dataset.json │ ├── matfile_data │ │ └── .gitkeep │ └── other_data │ │ └── .gitkeep └── extended_AE_data_raw │ ├── .gitkeep │ └── CNC-kaggle │ └── .gitkeep ├── read_raw_write_in_format.py ├── run_all_methods.py ├── tables └── .gitignore └── testnewmethods.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | results/ 3 | results_temp/ 4 | logs/ 5 | formatted_data_old/ 6 | 7 | raw_data/ADBench_data_raw/*.npz 8 | 9 | 10 | *.pyc 11 | *.log 12 | *.tex 13 | *.pickle 14 | *.arff 15 | *.mat 16 | *.csv 17 | 18 | additional_methods/Deep-SVDD/log/mnist_test/ 19 | 20 | additional_methods/Deep-SVDD/log/musk/ 21 | /raw_data/GAAL_data_raw/Annthyroid 22 | /raw_data/GAAL_data_raw/SpamBase 23 | /raw_data/GAAL_data_raw/WDBC 24 | /raw_data/GAAL_data_raw/Waveform 25 | /raw_data/ODDS_data_raw/other_data/yeast.data 26 | /raw_data/ODDS_data_raw/other_data/yeast.names 27 | /raw_data/extended_AE_data_raw/CNC-kaggle/README.txt 28 | /raw_data/extended_AE_data_raw/CNC-kaggle/test_artifact.jpg 29 | /results.zip 30 | -------------------------------------------------------------------------------- /PCA_plot_for_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Sep 23 08:56:39 2022 5 | 6 | @author: rbouman 7 | """ 8 | 9 | import os 10 | from numpy.linalg import svd 11 | from sklearn.preprocessing import StandardScaler 12 | import pickle 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | 17 | pickle_dir = "formatted_data" 18 | base_result_dir = "results" 19 | result_dir = "result_dir" 20 | csvresult_dir = "csvresult_dir" 21 | score_dir = "score_dir" 22 | log_dir = "logs" 23 | 24 | method_name = "EIF" 25 | dataset_name = "musk" 26 | 27 | 28 | 29 | picklefile_name = dataset_name + ".pickle" 30 | 31 | 32 | full_path_filename = os.path.join(pickle_dir, picklefile_name) 33 | 34 | data = pickle.load(open(full_path_filename, 'rb')) 35 | X, y = data["X"], np.squeeze(data["y"]) 36 | 37 | 38 | 39 | score_folder_path = os.path.join(base_result_dir, score_dir, dataset_name, method_name) 40 | 41 | hyperparameter_scores = os.listdir(score_folder_path) 42 | 43 | n_scores = len(hyperparameter_scores) 44 | 45 | score_sums = np.zeros(y.shape) 46 | 47 | for hyperparameter_score in hyperparameter_scores: 48 | full_path_filename = os.path.join(score_folder_path, hyperparameter_score) 49 | 50 | score_sums += pd.read_csv(full_path_filename, names=["scores"])["scores"] 51 | 52 | scores = score_sums/n_scores 53 | 54 | 55 | scaler = StandardScaler() 56 | 57 | X_scaled = scaler.fit_transform(X) 58 | 59 | _, S, Vt = svd(X_scaled, full_matrices=False) 60 | V = Vt.T 61 | 62 | var_explained = S**2 / np.sum(S**2) 63 | 64 | X_PCA = X.dot(V) 65 | #%% make plots 66 | plt.figure() 67 | plt.title("class colored plot: ") 68 | 69 | plt.scatter(X_PCA[y==0,0], X_PCA[y==0,1], label="normal") 70 | plt.scatter(X_PCA[y==1,0], X_PCA[y==1,1], label="outlier") 71 | 72 | plt.xlabel("PC1 " + 
str(var_explained[0]*100) + "% var explained")
73 | plt.ylabel("PC2 " + str(var_explained[1]*100) + "% var explained")
74 | 
75 | plt.legend()
76 | 
77 | plt.show()
78 | 
79 | plt.figure()
80 | plt.title("score colored plot")
81 | 
82 | plt.scatter(X_PCA[:,0], X_PCA[:,1], c=scores)
83 | 
84 | plt.xlabel("PC1 " + str(var_explained[0]*100) + "% var explained")
85 | plt.ylabel("PC2 " + str(var_explained[1]*100) + "% var explained")
86 | 
87 | plt.colorbar()
88 | 
89 | plt.show()
90 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Unsupervised anomaly detection algorithms on real-world data: how many do we need?
2 | This is the repository supplementing our [JMLR paper](https://jmlr.org/papers/v25/23-0570.html).
3 | Currently, this is the largest benchmark of unsupervised anomaly detection algorithms, with 33 algorithms applied to 52 datasets.
4 | 
5 | You can cite our paper as follows:
6 | 
7 | ```
8 | @article{Bouman2024UnsupervisedADComparison,
9 | author = {Roel Bouman and Zaharah Bukhsh and Tom Heskes},
10 | title = {Unsupervised Anomaly Detection Algorithms on Real-world Data: How Many Do We Need?},
11 | journal = {Journal of Machine Learning Research},
12 | year = {2024},
13 | volume = {25},
14 | number = {105},
15 | pages = {1--34},
16 | url = {http://jmlr.org/papers/v25/23-0570.html}
17 | }
18 | ```
19 | 
20 | ## Running the full benchmark
21 | In order to run the full benchmark, you will need to install all dependencies. The easiest way to do this is to create an Anaconda environment from the supplied `.yml` file:
22 | ```
23 | conda env create -f environment.yml
24 | ```
25 | 
26 | Then, activate the environment:
27 | ```
28 | conda activate OD_benchmark
29 | ```
30 | 
31 | Due to permission (read/write) errors, the pip packages in the environment.yml file might not install correctly. In that case, activate the OD_benchmark environment and install these packages manually using `pip install` from within the environment.
32 | 
33 | If you want to run the DeepSVDD benchmark, or use the method in any other way, you also need to install a separate environment for DeepSVDD:
34 | 
35 | ```
36 | cd additional_methods/Deep-SVDD
37 | conda create --name myenv
38 | source activate myenv
39 | while read requirement; do conda install -n myenv --yes $requirement; done < requirements.txt
40 | cd ../..
41 | ```
42 | 
43 | You can replace the Conda environment name `myenv` with any name of your choice, but you will then have to change the name accordingly in the `run_all_methods.py` script.
44 | 
45 | The current installation instructions do not include GPU acceleration for the Tensorflow/PyTorch libraries. Should you wish to use it nonetheless, please follow the installation instructions for your specific system. Make sure to install these in the correct OD_benchmark conda environment.
46 | 
47 | When all dependencies are successfully installed, you can either re-run the preprocessing or make use of the existing preprocessed `.npz` files.
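Each preprocessed dataset is a single `.npz` archive in the `formatted_data` folder. As a quick sanity check you can load one directly with NumPy; this is a minimal sketch, assuming the arrays are stored under the `X` (samples × features) and `y` (0/1 labels) keys described under "Adding datasets" below:
```
import numpy as np

data = np.load("formatted_data/cardio.npz")
X, y = data["X"], np.squeeze(data["y"])

print(X.shape)          # (n_samples, n_features)
print((y == 1).mean())  # fraction of anomalies
```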
48 | 49 | If you want to download all raw data, you can download them from the following sources: 50 | 51 | | **Name** | **Source URL** | **Datasets** | 52 | |-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 53 | | ADBench | https://github.com/Minqi824/ADBench/tree/main/datasets | 11_donors.npz, 12_fault.npz, 19_landsat.npz, 22_magic.gamma.npz, 33_skin.npz, 42_WBC.npz, 46_WPBC.npz, 47_yeast.npz, 4_breastw.npz, 5_campaign.npz | 54 | | ELKI | https://www.dbs.ifi.lmu.de/research/outlier-evaluation/DAMI/ | Hepatitis_withoutdupl_norm_16.arff, InternetAds_withoutdupl_norm_19.arff, PageBlocks_withoutdupl_norm_09.arff, Parkinson_withoutdupl_norm_75.arff, Stamps_withoutdupl_norm_09.arff, Wilt_withoutdupl_norm_05.arff | 55 | | extended AE | https://www.kaggle.com/datasets/shasun/tool-wear-detection-in-cnc-mill, https://www.kaggle.com/datasets/inIT-OWL/high-storage-system-data-for-energy-optimization, https://www.kaggle.com/datasets/shrutimehta/nasa-asteroids-classification | HRSS_anomalous_optimized.csv, HRSS_anomalous_standard.csv, nasa.csv, and the entire folder: "CNC-kaggle" | 56 | | GAAL | https://github.com/leibinghe/GAAL-based-outlier-detection/blob/master/Data/ | Spambase, Waveform | 57 | | Goldstein | http://dx.doi.org/10.7910/DVN/OPQMVF | aloi-unsupervised-ad.csv, pen-global-unsupervised-ad.csv, pen-local-unsupervised-ad.csv | 58 | | ODDS | http://odds.cs.stonybrook.edu/ | annthyroid.mat, arrhythmia.mat, cardio.mat, cover.mat, glass.mat, http.mat, ionosphere.mat, letter.mat, mammography.mat, mnist.mat, musk.mat, optdigits.mat, pendigits.mat, pima.mat, satellite.mat, satimage-2.mat, shuttle.mat, smtp.mat, speech.mat, thyroid.mat, vertebral.mat, vowels.mat, wbc.mat, wine.mat, and non ".mat" data: seismic-bumps.arff, yeast.data, yeast.names | 59 | 60 | Ensure each of the datasets is put into the correct folder in the `raw_data` folder. 61 | 62 | The raw data can then be processed using the `read_raw_write_in_format.py` script. 63 | 64 | ``` 65 | python3 read_raw_write_in_format.py 66 | ``` 67 | 68 | All methods can then be run using a nice CLI: 69 | 70 | ``` 71 | python3 run_all_methods.py 72 | ``` 73 | 74 | Or alternatively, you can add additional arguments to run only subsets. For example, you only want to run the kNN method on the wine dataset: 75 | 76 | ``` 77 | python3 run_all_methods.py --method kNN --dataset wine 78 | ``` 79 | 80 | As noted in the paper, we've inverted the labels for the `skin` and `vertebral` datasets post-hoc. This can be reproduced by executing the following script: 81 | 82 | ``` 83 | python3 invert_labels_calculate_metrics.py 84 | ``` 85 | 86 | Finally, reproducing the figures and analysis from the paper is then easily done using the following command: 87 | 88 | 89 | ``` 90 | python3 produce_figures.py 91 | ``` 92 | 93 | ## Extending the benchmark 94 | Extending the benchmark is easy! 
95 | You won't need to install all dependencies for this; a minimal set will do:
96 | ```
97 | conda env create -f minimal_environment.yml
98 | ```
99 | 
100 | Then, activate the environment:
101 | ```
102 | conda activate OD_benchmark_minimal
103 | ```
104 | 
105 | ### Adding datasets
106 | Datasets can be added by placing processed data files in the `formatted_data` folder in either `.npz` or `.pickle` format. You can look at the `read_raw_write_in_format.py` script for inspiration. Most importantly, the data can't contain duplicates, and must include the following attributes: `X`, with samples as rows and features as columns, and `y`, a 1-dimensional array with label `0` for each normal sample and `1` for each anomaly.
107 | 
108 | ### Adding methods
109 | Methods are even easier to add: they only need to produce outlier scores according to the PyOD standard. If your implementation follows the scikit-learn API, you can simply modify the following example (also found in `method_example.py`):
110 | ```
111 | from preprocess_detect_outliers import preprocess_detect_outliers
112 | 
113 | from pyod.models.knn import KNN
114 | 
115 | #uninstantiated method class
116 | methods = {
117 | "kNN":KNN
118 | }
119 | 
120 | #dict of methods and parameters
121 | method_parameters = {
122 | "kNN":{"n_neighbors":range(5,31), "method":["mean"]}
123 | }
124 | 
125 | preprocess_detect_outliers(methods, method_parameters)
126 | ```
127 | If you want to add your own method, the class needs to at least possess a `.fit()` method (like the scikit-learn API), and after fitting it must have a `.decision_scores_` attribute that gives an outlier score for each sample in `X`. According to the PyOD standard, a higher outlier score indicates a higher likelihood that a sample is an outlier.
128 | 
129 | 
-------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Lukas Ruff
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Implementation of Deep SVDD 2 | This repository provides a [PyTorch](https://pytorch.org/) implementation of the *Deep SVDD* method presented in our 3 | ICML 2018 paper ”Deep One-Class Classification”. 4 | 5 | 6 | ## Citation and Contact 7 | You find a PDF of the Deep One-Class Classification ICML 2018 paper at 8 | [http://proceedings.mlr.press/v80/ruff18a.html](http://proceedings.mlr.press/v80/ruff18a.html). 9 | 10 | If you use our work, please also cite the paper: 11 | ``` 12 | @InProceedings{pmlr-v80-ruff18a, 13 | title = {Deep One-Class Classification}, 14 | author = {Ruff, Lukas and Vandermeulen, Robert A. and G{\"o}rnitz, Nico and Deecke, Lucas and Siddiqui, Shoaib A. and Binder, Alexander and M{\"u}ller, Emmanuel and Kloft, Marius}, 15 | booktitle = {Proceedings of the 35th International Conference on Machine Learning}, 16 | pages = {4393--4402}, 17 | year = {2018}, 18 | volume = {80}, 19 | } 20 | ``` 21 | 22 | If you would like to get in touch, please contact [contact@lukasruff.com](mailto:contact@lukasruff.com). 23 | 24 | 25 | ## Abstract 26 | > > Despite the great advances made by deep learning in many machine learning problems, there is a relative dearth of 27 | > > deep learning approaches for anomaly detection. Those approaches which do exist involve networks trained to perform 28 | > > a task other than anomaly detection, namely generative models or compression, which are in turn adapted for use in 29 | > > anomaly detection; they are not trained on an anomaly detection based objective. In this paper we introduce a new 30 | > > anomaly detection method—Deep Support Vector Data Description—, which is trained on an anomaly detection based 31 | > > objective. The adaptation to the deep regime necessitates that our neural network and training procedure satisfy 32 | > > certain properties, which we demonstrate theoretically. We show the effectiveness of our method on MNIST and 33 | > > CIFAR-10 image benchmark datasets as well as on the detection of adversarial examples of GTSRB stop signs. 34 | 35 | 36 | ## Installation 37 | This code is written in `Python 3.7` and requires the packages listed in `requirements.txt`. 38 | 39 | Clone the repository to your local machine and directory of choice: 40 | ``` 41 | git clone https://github.com/lukasruff/Deep-SVDD-PyTorch.git 42 | ``` 43 | 44 | To run the code, we recommend setting up a virtual environment, e.g. using `virtualenv` or `conda`: 45 | 46 | ### `virtualenv` 47 | ``` 48 | # pip install virtualenv 49 | cd 50 | virtualenv myenv 51 | source myenv/bin/activate 52 | pip install -r requirements.txt 53 | ``` 54 | 55 | ### `conda` 56 | ``` 57 | cd 58 | conda create --name myenv 59 | source activate myenv 60 | while read requirement; do conda install -n myenv --yes $requirement; done < requirements.txt 61 | ``` 62 | 63 | 64 | ## Running experiments 65 | 66 | We currently have implemented the MNIST ([http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/)) and 67 | CIFAR-10 ([https://www.cs.toronto.edu/~kriz/cifar.html](https://www.cs.toronto.edu/~kriz/cifar.html)) datasets and 68 | simple LeNet-type networks. 69 | 70 | Have a look into `main.py` for all possible arguments and options. 
71 | 72 | ### MNIST example 73 | ``` 74 | cd 75 | 76 | # activate virtual environment 77 | source myenv/bin/activate # or 'source activate myenv' for conda 78 | 79 | # create folder for experimental output 80 | mkdir log/mnist_test 81 | 82 | # change to source directory 83 | cd src 84 | 85 | # run experiment 86 | python main.py mnist mnist_LeNet ../log/mnist_test ../data --objective one-class --lr 0.0001 --n_epochs 150 --lr_milestone 50 --batch_size 200 --weight_decay 0.5e-6 --pretrain True --ae_lr 0.0001 --ae_n_epochs 150 --ae_lr_milestone 50 --ae_batch_size 200 --ae_weight_decay 0.5e-3 --normal_class 3; 87 | ``` 88 | This example trains a One-Class Deep SVDD model where digit 3 (`--normal_class 3`) is considered to be the normal class. Autoencoder 89 | pretraining is used for parameter initialization. 90 | 91 | ### CIFAR-10 example 92 | ``` 93 | cd 94 | 95 | # activate virtual environment 96 | source myenv/bin/activate # or 'source activate myenv' for conda 97 | 98 | # create folder for experimental output 99 | mkdir log/cifar10_test 100 | 101 | # change to source directory 102 | cd src 103 | 104 | # run experiment 105 | python main.py cifar10 cifar10_LeNet ../log/cifar10_test ../data --objective one-class --lr 0.0001 --n_epochs 150 --lr_milestone 50 --batch_size 200 --weight_decay 0.5e-6 --pretrain True --ae_lr 0.0001 --ae_n_epochs 350 --ae_lr_milestone 250 --ae_batch_size 200 --ae_weight_decay 0.5e-6 --normal_class 3; 106 | ``` 107 | This example trains a One-Class Deep SVDD model where cats (`--normal_class 3`) is considered to be the normal class. 108 | Autoencoder pretraining is used for parameter initialization. 109 | 110 | 111 | ## Examples 112 | 113 | ### MNIST 114 | Example of the 32 most normal (left) and 32 most anomalous (right) test set examples per class on MNIST according to 115 | Deep SVDD anomaly scores. 116 | 117 | ![MNIST](imgs/mnist.png?raw=true "MNIST") 118 | 119 | ### CIFAR-10 120 | Example of the 32 most normal (left) and 32 most anomalous (right) test set examples per class on CIFAR-10 according to 121 | Deep SVDD anomaly scores. 
122 | 123 | ![CIFAR-10](imgs/cifar10.png?raw=true "CIFAR-10") 124 | 125 | 126 | ## License 127 | MIT -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.pickle 2 | *.npz 3 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/data/.gitkeep -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/data/cardio.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/data/cardio.npz -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/imgs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/imgs/.gitkeep -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/imgs/cifar10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/imgs/cifar10.png -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/imgs/mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/imgs/mnist.png -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/log/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/log/.gitkeep -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.20.1 2 | numpy==1.15.2 3 | scipy==1.1.0 4 | scikit-learn==0.20.0 5 | certifi==2018.10.15 6 | chardet==3.0.4 7 | Click==7.0 8 | cloudpickle==0.5.6 9 | cycler==0.10.0 10 | idna==2.7 11 | kiwisolver==1.0.1 12 | matplotlib==3.0.1 13 | pandas==0.23.4 14 | Pillow==5.3.0 15 | pyparsing==2.3.0 16 | python-dateutil==2.7.5 17 | pytz==2018.7 18 | six==1.11.0 19 | torch==0.4.1 20 | torchvision==0.2.1 21 | tqdm==4.28.1 22 | ujson==1.35 23 | urllib3==1.24.1 24 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/src/__init__.py 
-------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import * 2 | from .torchvision_dataset import * 3 | from .base_net import * 4 | from .base_trainer import * 5 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/base_dataset.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from torch.utils.data import DataLoader 3 | 4 | 5 | class BaseADDataset(ABC): 6 | """Anomaly detection dataset base class.""" 7 | 8 | def __init__(self, root: str): 9 | super().__init__() 10 | self.root = root # root path to data 11 | 12 | self.n_classes = 2 # 0: normal, 1: outlier 13 | self.normal_classes = None # tuple with original class labels that define the normal class 14 | self.outlier_classes = None # tuple with original class labels that define the outlier class 15 | 16 | self.train_set = None # must be of type torch.utils.data.Dataset 17 | self.test_set = None # must be of type torch.utils.data.Dataset 18 | 19 | @abstractmethod 20 | def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> ( 21 | DataLoader, DataLoader): 22 | """Implement data loaders of type torch.utils.data.DataLoader for train_set and test_set.""" 23 | pass 24 | 25 | def __repr__(self): 26 | return self.__class__.__name__ 27 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/base_net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | 6 | class BaseNet(nn.Module): 7 | """Base class for all neural networks.""" 8 | 9 | def __init__(self): 10 | super().__init__() 11 | self.logger = logging.getLogger(self.__class__.__name__) 12 | self.rep_dim = None # representation dimensionality, i.e. 
dim of the last layer 13 | 14 | def forward(self, *input): 15 | """ 16 | Forward pass logic 17 | :return: Network output 18 | """ 19 | raise NotImplementedError 20 | 21 | def summary(self): 22 | """Network summary.""" 23 | net_parameters = filter(lambda p: p.requires_grad, self.parameters()) 24 | params = sum([np.prod(p.size()) for p in net_parameters]) 25 | self.logger.info('Trainable parameters: {}'.format(params)) 26 | self.logger.info(self) 27 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/base_trainer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from .base_dataset import BaseADDataset 3 | from .base_net import BaseNet 4 | 5 | 6 | class BaseTrainer(ABC): 7 | """Trainer base class.""" 8 | 9 | def __init__(self, optimizer_name: str, lr: float, n_epochs: int, lr_milestones: tuple, batch_size: int, 10 | weight_decay: float, device: str, n_jobs_dataloader: int): 11 | super().__init__() 12 | self.optimizer_name = optimizer_name 13 | self.lr = lr 14 | self.n_epochs = n_epochs 15 | self.lr_milestones = lr_milestones 16 | self.batch_size = batch_size 17 | self.weight_decay = weight_decay 18 | self.device = device 19 | self.n_jobs_dataloader = n_jobs_dataloader 20 | 21 | @abstractmethod 22 | def train(self, dataset: BaseADDataset, net: BaseNet) -> BaseNet: 23 | """ 24 | Implement train method that trains the given network using the train_set of dataset. 25 | :return: Trained net 26 | """ 27 | pass 28 | 29 | @abstractmethod 30 | def test(self, dataset: BaseADDataset, net: BaseNet): 31 | """ 32 | Implement test method that evaluates the test_set of dataset on the given network. 33 | """ 34 | pass 35 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/torchvision_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseADDataset 2 | from torch.utils.data import DataLoader 3 | 4 | 5 | class TorchvisionDataset(BaseADDataset): 6 | """TorchvisionDataset class for datasets already implemented in torchvision.datasets.""" 7 | 8 | def __init__(self, root: str): 9 | super().__init__(root) 10 | 11 | def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> ( 12 | DataLoader, DataLoader): 13 | train_loader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=shuffle_train, 14 | num_workers=num_workers) 15 | test_loader = DataLoader(dataset=self.test_set, batch_size=batch_size, shuffle=shuffle_test, 16 | num_workers=num_workers) 17 | return train_loader, test_loader 18 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/OD_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import pickle 3 | import numpy as np 4 | from sklearn.preprocessing import RobustScaler 5 | 6 | from base.torchvision_dataset import TorchvisionDataset 7 | 8 | class OD_Dataset(TorchvisionDataset): 9 | 10 | def __init__(self, root: str, normal_class=0): 11 | super().__init__(root) 12 | 13 | 14 | self.root = root 15 | 16 | self.n_classes = 2 # 0: normal, 1: outlier 17 | self.normal_classes = tuple([normal_class]) 18 | self.outlier_classes = [0,1] 19 | self.outlier_classes.remove(normal_class) 20 | 21 | # Subset train set to normal 
class 22 | self.train_set = OD_Base_Dataset(dataset_name=root) 23 | 24 | self.test_set = OD_Base_Dataset(dataset_name=root) 25 | 26 | class OD_Base_Dataset(Dataset): 27 | def __init__(self, dataset_name): 28 | 29 | data = pickle.load(open(dataset_name, 'rb')) 30 | self.X, self.y = data["X"].astype(np.float32), np.squeeze(data["y"]).astype(np.float32) 31 | 32 | scaler = RobustScaler() 33 | 34 | self.X_scaled = scaler.fit_transform(self.X) 35 | 36 | def __len__(self): 37 | return self.X.shape[0] 38 | 39 | def __getitem__(self, idx): 40 | 41 | 42 | return self.X_scaled[idx,:], self.y[idx], idx 43 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import load_dataset 2 | from .mnist import MNIST_Dataset 3 | from .cifar10 import CIFAR10_Dataset 4 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/cifar10.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Subset 2 | from PIL import Image 3 | from torchvision.datasets import CIFAR10 4 | from base.torchvision_dataset import TorchvisionDataset 5 | from .preprocessing import get_target_label_idx, global_contrast_normalization 6 | 7 | import torchvision.transforms as transforms 8 | 9 | 10 | class CIFAR10_Dataset(TorchvisionDataset): 11 | 12 | def __init__(self, root: str, normal_class=5): 13 | super().__init__(root) 14 | 15 | self.n_classes = 2 # 0: normal, 1: outlier 16 | self.normal_classes = tuple([normal_class]) 17 | self.outlier_classes = list(range(0, 10)) 18 | self.outlier_classes.remove(normal_class) 19 | 20 | # Pre-computed min and max values (after applying GCN) from train data per class 21 | min_max = [(-28.94083453598571, 13.802961825439636), 22 | (-6.681770233365245, 9.158067708230273), 23 | (-34.924463588638204, 14.419298165027628), 24 | (-10.599172931391799, 11.093187820377565), 25 | (-11.945022995801637, 10.628045447867583), 26 | (-9.691969487694928, 8.948326776180823), 27 | (-9.174940012342555, 13.847014686472365), 28 | (-6.876682005899029, 12.282371383343161), 29 | (-15.603507135507172, 15.2464923804279), 30 | (-6.132882973622672, 8.046098172351265)] 31 | 32 | # CIFAR-10 preprocessing: GCN (with L1 norm) and min-max feature scaling to [0,1] 33 | transform = transforms.Compose([transforms.ToTensor(), 34 | transforms.Lambda(lambda x: global_contrast_normalization(x, scale='l1')), 35 | transforms.Normalize([min_max[normal_class][0]] * 3, 36 | [min_max[normal_class][1] - min_max[normal_class][0]] * 3)]) 37 | 38 | target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes)) 39 | 40 | train_set = MyCIFAR10(root=self.root, train=True, download=True, 41 | transform=transform, target_transform=target_transform) 42 | # Subset train set to normal class 43 | train_idx_normal = get_target_label_idx(train_set.train_labels, self.normal_classes) 44 | self.train_set = Subset(train_set, train_idx_normal) 45 | 46 | self.test_set = MyCIFAR10(root=self.root, train=False, download=True, 47 | transform=transform, target_transform=target_transform) 48 | 49 | 50 | class MyCIFAR10(CIFAR10): 51 | """Torchvision CIFAR10 class with patch of __getitem__ method to also return the index of a data sample.""" 52 | 53 | def __init__(self, *args, **kwargs): 54 | super(MyCIFAR10, self).__init__(*args, **kwargs) 55 | 56 | def 
__getitem__(self, index): 57 | """Override the original method of the CIFAR10 class. 58 | Args: 59 | index (int): Index 60 | Returns: 61 | triple: (image, target, index) where target is index of the target class. 62 | """ 63 | if self.train: 64 | img, target = self.train_data[index], self.train_labels[index] 65 | else: 66 | img, target = self.test_data[index], self.test_labels[index] 67 | 68 | # doing this so that it is consistent with all other datasets 69 | # to return a PIL Image 70 | img = Image.fromarray(img) 71 | 72 | if self.transform is not None: 73 | img = self.transform(img) 74 | 75 | if self.target_transform is not None: 76 | target = self.target_transform(target) 77 | 78 | return img, target, index # only line changed 79 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/main.py: -------------------------------------------------------------------------------- 1 | from .OD_dataset import OD_Dataset 2 | import os 3 | 4 | def load_dataset(dataset_name, data_path, normal_class): 5 | """Loads the dataset.""" 6 | 7 | dataset_path = os.path.join(data_path, dataset_name) 8 | dataset = OD_Dataset(root=dataset_path, normal_class=normal_class) 9 | 10 | return dataset 11 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/mnist.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Subset 2 | from PIL import Image 3 | from torchvision.datasets import MNIST 4 | from base.torchvision_dataset import TorchvisionDataset 5 | from .preprocessing import get_target_label_idx, global_contrast_normalization 6 | 7 | import torchvision.transforms as transforms 8 | 9 | 10 | class MNIST_Dataset(TorchvisionDataset): 11 | 12 | def __init__(self, root: str, normal_class=0): 13 | super().__init__(root) 14 | 15 | self.n_classes = 2 # 0: normal, 1: outlier 16 | self.normal_classes = tuple([normal_class]) 17 | self.outlier_classes = list(range(0, 10)) 18 | self.outlier_classes.remove(normal_class) 19 | 20 | # Pre-computed min and max values (after applying GCN) from train data per class 21 | min_max = [(-0.8826567065619495, 9.001545489292527), 22 | (-0.6661464580883915, 20.108062262467364), 23 | (-0.7820454743183202, 11.665100841080346), 24 | (-0.7645772083211267, 12.895051191467457), 25 | (-0.7253923114302238, 12.683235701611533), 26 | (-0.7698501867861425, 13.103278415430502), 27 | (-0.778418217980696, 10.457837397569108), 28 | (-0.7129780970522351, 12.057777597673047), 29 | (-0.8280402650205075, 10.581538445782988), 30 | (-0.7369959242164307, 10.697039838804978)] 31 | 32 | # MNIST preprocessing: GCN (with L1 norm) and min-max feature scaling to [0,1] 33 | transform = transforms.Compose([transforms.ToTensor(), 34 | transforms.Lambda(lambda x: global_contrast_normalization(x, scale='l1')), 35 | transforms.Normalize([min_max[normal_class][0]], 36 | [min_max[normal_class][1] - min_max[normal_class][0]])]) 37 | 38 | target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes)) 39 | 40 | train_set = MyMNIST(root=self.root, train=True, download=True, 41 | transform=transform, target_transform=target_transform) 42 | # Subset train_set to normal class 43 | train_idx_normal = get_target_label_idx(train_set.train_labels.clone().data.cpu().numpy(), self.normal_classes) 44 | self.train_set = Subset(train_set, train_idx_normal) 45 | 46 | self.test_set = MyMNIST(root=self.root, train=False, 
download=True, 47 | transform=transform, target_transform=target_transform) 48 | 49 | 50 | class MyMNIST(MNIST): 51 | """Torchvision MNIST class with patch of __getitem__ method to also return the index of a data sample.""" 52 | 53 | def __init__(self, *args, **kwargs): 54 | super(MyMNIST, self).__init__(*args, **kwargs) 55 | 56 | def __getitem__(self, index): 57 | """Override the original method of the MNIST class. 58 | Args: 59 | index (int): Index 60 | Returns: 61 | triple: (image, target, index) where target is index of the target class. 62 | """ 63 | if self.train: 64 | img, target = self.train_data[index], self.train_labels[index] 65 | else: 66 | img, target = self.test_data[index], self.test_labels[index] 67 | 68 | # doing this so that it is consistent with all other datasets 69 | # to return a PIL Image 70 | img = Image.fromarray(img.numpy(), mode='L') 71 | 72 | if self.transform is not None: 73 | img = self.transform(img) 74 | 75 | if self.target_transform is not None: 76 | target = self.target_transform(target) 77 | 78 | return img, target, index # only line changed 79 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/preprocessing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def get_target_label_idx(labels, targets): 6 | """ 7 | Get the indices of labels that are included in targets. 8 | :param labels: array of labels 9 | :param targets: list/tuple of target labels 10 | :return: list with indices of target labels 11 | """ 12 | return np.argwhere(np.isin(labels, targets)).flatten().tolist() 13 | 14 | 15 | def global_contrast_normalization(x: torch.tensor, scale='l2'): 16 | """ 17 | Apply global contrast normalization to tensor, i.e. subtract mean across features (pixels) and normalize by scale, 18 | which is either the standard deviation, L1- or L2-norm across features (pixels). 19 | Note this is a *per sample* normalization globally across features (and not across the dataset). 20 | """ 21 | 22 | assert scale in ('l1', 'l2') 23 | 24 | n_features = int(np.prod(x.shape)) 25 | 26 | mean = torch.mean(x) # mean over all features (pixels) per sample 27 | x -= mean 28 | 29 | if scale == 'l1': 30 | x_scale = torch.mean(torch.abs(x)) 31 | 32 | if scale == 'l2': 33 | x_scale = torch.sqrt(torch.sum(x ** 2)) / n_features 34 | 35 | x /= x_scale 36 | 37 | return x 38 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/deepSVDD.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | 4 | from base.base_dataset import BaseADDataset 5 | from networks.networks import network, auto_encoder 6 | from optim.deepSVDD_trainer import DeepSVDDTrainer 7 | from optim.ae_trainer import AETrainer 8 | 9 | 10 | class DeepSVDD(object): 11 | """A class for the Deep SVDD method. 12 | 13 | Attributes: 14 | objective: A string specifying the Deep SVDD objective (either 'one-class' or 'soft-boundary'). 15 | nu: Deep SVDD hyperparameter nu (must be 0 < nu <= 1). 16 | R: Hypersphere radius R. 17 | c: Hypersphere center c. 18 | net_name: A string indicating the name of the neural network to use. 19 | net: The neural network \phi. 20 | ae_net: The autoencoder network corresponding to \phi for network weights pretraining. 21 | trainer: DeepSVDDTrainer to train a Deep SVDD model. 
22 | optimizer_name: A string indicating the optimizer to use for training the Deep SVDD network. 23 | ae_trainer: AETrainer to train an autoencoder in pretraining. 24 | ae_optimizer_name: A string indicating the optimizer to use for pretraining the autoencoder. 25 | results: A dictionary to save the results. 26 | """ 27 | 28 | def __init__(self, n_vars, objective: str = 'one-class', nu: float = 0.1): 29 | """Inits DeepSVDD with one of the two objectives and hyperparameter nu.""" 30 | 31 | 32 | self.n_vars = n_vars 33 | 34 | assert objective in ('one-class', 'soft-boundary'), "Objective must be either 'one-class' or 'soft-boundary'." 35 | self.objective = objective 36 | assert (0 < nu) & (nu <= 1), "For hyperparameter nu, it must hold: 0 < nu <= 1." 37 | self.nu = nu 38 | self.R = 0.0 # hypersphere radius R 39 | self.c = None # hypersphere center c 40 | 41 | self.net_name = None 42 | self.net = None # neural network \phi 43 | 44 | self.trainer = None 45 | self.optimizer_name = None 46 | 47 | self.ae_net = None # autoencoder network for pretraining 48 | self.ae_trainer = None 49 | self.ae_optimizer_name = None 50 | 51 | self.results = { 52 | 'train_time': None, 53 | 'test_auc': None, 54 | 'test_time': None, 55 | 'test_scores': None, 56 | } 57 | 58 | def set_networks(self, pretrain, n_layers, shrinkage_factor): 59 | """Builds the neural network \phi.""" 60 | self.net, self.ae_net = self.build_networks(pretrain, n_layers, shrinkage_factor) 61 | 62 | def build_networks(self, pretrain, n_layers, shrinkage_factor): 63 | """Builds the neural networks.""" 64 | 65 | net = network(self.n_vars, n_layers, shrinkage_factor) 66 | 67 | if pretrain: 68 | ae_net = auto_encoder(self.n_vars, n_layers, shrinkage_factor) 69 | 70 | else: 71 | ae_net = None 72 | 73 | return net, ae_net 74 | 75 | 76 | 77 | def train(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 50, 78 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 79 | n_jobs_dataloader: int = 0): 80 | """Trains the Deep SVDD model on the training data.""" 81 | 82 | self.optimizer_name = optimizer_name 83 | self.trainer = DeepSVDDTrainer(self.objective, self.R, self.c, self.nu, optimizer_name, lr=lr, 84 | n_epochs=n_epochs, lr_milestones=lr_milestones, batch_size=batch_size, 85 | weight_decay=weight_decay, device=device, n_jobs_dataloader=n_jobs_dataloader) 86 | # Get the model 87 | self.net = self.trainer.train(dataset, self.net) 88 | self.R = float(self.trainer.R.cpu().data.numpy()) # get float 89 | self.c = self.trainer.c.cpu().data.numpy().tolist() # get list 90 | self.results['train_time'] = self.trainer.train_time 91 | 92 | def test(self, dataset: BaseADDataset, device: str = 'cuda', n_jobs_dataloader: int = 0): 93 | """Tests the Deep SVDD model on the test data.""" 94 | 95 | if self.trainer is None: 96 | self.trainer = DeepSVDDTrainer(self.objective, self.R, self.c, self.nu, 97 | device=device, n_jobs_dataloader=n_jobs_dataloader) 98 | 99 | self.trainer.test(dataset, self.net) 100 | # Get results 101 | self.results['test_auc'] = self.trainer.test_auc 102 | self.results['test_time'] = self.trainer.test_time 103 | self.results['test_scores'] = self.trainer.test_scores 104 | 105 | def pretrain(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 100, 106 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 107 | n_jobs_dataloader: int = 0): 108 | """Pretrains the 
weights for the Deep SVDD network \phi via autoencoder.""" 109 | 110 | self.ae_optimizer_name = optimizer_name 111 | self.ae_trainer = AETrainer(optimizer_name, lr=lr, n_epochs=n_epochs, lr_milestones=lr_milestones, 112 | batch_size=batch_size, weight_decay=weight_decay, device=device, 113 | n_jobs_dataloader=n_jobs_dataloader) 114 | self.ae_net = self.ae_trainer.train(dataset, self.ae_net) 115 | self.ae_trainer.test(dataset, self.ae_net) 116 | self.init_network_weights_from_pretraining() 117 | 118 | def init_network_weights_from_pretraining(self): 119 | """Initialize the Deep SVDD network weights from the encoder weights of the pretraining autoencoder.""" 120 | 121 | net_dict = self.net.state_dict() 122 | ae_net_dict = self.ae_net.state_dict() 123 | 124 | # Filter out decoder network keys 125 | ae_net_dict = {k: v for k, v in ae_net_dict.items() if k in net_dict} 126 | # Overwrite values in the existing state_dict 127 | net_dict.update(ae_net_dict) 128 | # Load the new state_dict 129 | self.net.load_state_dict(net_dict) 130 | 131 | def save_model(self, export_model, save_ae=True): 132 | """Save Deep SVDD model to export_model.""" 133 | 134 | net_dict = self.net.state_dict() 135 | ae_net_dict = self.ae_net.state_dict() if save_ae and self.ae_net is not None else None 136 | 137 | torch.save({'R': self.R, 138 | 'c': self.c, 139 | 'net_dict': net_dict, 140 | 'ae_net_dict': ae_net_dict}, export_model) 141 | 142 | def load_model(self, model_path, load_ae=False): 143 | """Load Deep SVDD model from model_path.""" 144 | 145 | model_dict = torch.load(model_path) 146 | 147 | self.R = model_dict['R'] 148 | self.c = model_dict['c'] 149 | self.net.load_state_dict(model_dict['net_dict']) 150 | if load_ae: 151 | if self.ae_net is None: 152 | self.ae_net = build_autoencoder(self.net_name) 153 | self.ae_net.load_state_dict(model_dict['ae_net_dict']) 154 | 155 | def save_results(self, export_json): 156 | """Save results dict to a JSON-file.""" 157 | with open(export_json, 'w') as fp: 158 | json.dump(self.results, fp) 159 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/main.py: -------------------------------------------------------------------------------- 1 | import click 2 | import torch 3 | import logging 4 | import random 5 | import numpy as np 6 | import os 7 | 8 | from utils.config import Config 9 | from deepSVDD import DeepSVDD 10 | from datasets.main import load_dataset 11 | 12 | 13 | ################################################################################ 14 | # Settings 15 | ################################################################################ 16 | @click.command() 17 | @click.argument('dataset_name', type=str) 18 | #@click.argument('net_name', type=click.Choice(['mnist_LeNet', 'cifar10_LeNet', 'cifar10_LeNet_ELU'])) 19 | @click.argument('n_layers', type=int) 20 | @click.argument('shrinkage_factor', type=float) 21 | @click.argument('xp_path', type=click.Path(exists=False)) 22 | @click.argument('data_path', type=click.Path(exists=False)) 23 | @click.argument("target_scorefile_path", type=click.Path(exists=False)) 24 | @click.option('--load_config', type=click.Path(exists=True), default=None, 25 | help='Config JSON-file path (default: None).') 26 | @click.option('--load_model', type=click.Path(exists=True), default=None, 27 | help='Model file path (default: None).') 28 | @click.option('--objective', type=click.Choice(['one-class', 'soft-boundary']), default='one-class', 29 | help='Specify Deep SVDD objective 
("one-class" or "soft-boundary").') 30 | @click.option('--nu', type=float, default=0.1, help='Deep SVDD hyperparameter nu (must be 0 < nu <= 1).') 31 | @click.option('--device', type=str, default='cuda', help='Computation device to use ("cpu", "cuda", "cuda:2", etc.).') 32 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.') 33 | @click.option('--optimizer_name', type=click.Choice(['adam', 'amsgrad']), default='adam', 34 | help='Name of the optimizer to use for Deep SVDD network training.') 35 | @click.option('--lr', type=float, default=0.001, 36 | help='Initial learning rate for Deep SVDD network training. Default=0.001') 37 | @click.option('--n_epochs', type=int, default=50, help='Number of epochs to train.') 38 | @click.option('--lr_milestone', type=int, default=0, multiple=True, 39 | help='Lr scheduler milestones at which lr is multiplied by 0.1. Can be multiple and must be increasing.') 40 | @click.option('--batch_size', type=int, default=128, help='Batch size for mini-batch training.') 41 | @click.option('--weight_decay', type=float, default=1e-6, 42 | help='Weight decay (L2 penalty) hyperparameter for Deep SVDD objective.') 43 | @click.option('--pretrain', type=bool, default=True, 44 | help='Pretrain neural network parameters via autoencoder.') 45 | @click.option('--ae_optimizer_name', type=click.Choice(['adam', 'amsgrad']), default='adam', 46 | help='Name of the optimizer to use for autoencoder pretraining.') 47 | @click.option('--ae_lr', type=float, default=0.001, 48 | help='Initial learning rate for autoencoder pretraining. Default=0.001') 49 | @click.option('--ae_n_epochs', type=int, default=100, help='Number of epochs to train autoencoder.') 50 | @click.option('--ae_lr_milestone', type=int, default=0, multiple=True, 51 | help='Lr scheduler milestones at which lr is multiplied by 0.1. Can be multiple and must be increasing.') 52 | @click.option('--ae_batch_size', type=int, default=128, help='Batch size for mini-batch autoencoder training.') 53 | @click.option('--ae_weight_decay', type=float, default=1e-6, 54 | help='Weight decay (L2 penalty) hyperparameter for autoencoder objective.') 55 | @click.option('--n_jobs_dataloader', type=int, default=0, 56 | help='Number of workers for data loading. 0 means that the data will be loaded in the main process.') 57 | @click.option('--normal_class', type=int, default=0, 58 | help='Specify the normal class of the dataset (all other classes are considered anomalous).') 59 | def main(dataset_name, n_layers, shrinkage_factor, xp_path, data_path, target_scorefile_path, load_config, load_model, objective, nu, device, seed, 60 | optimizer_name, lr, n_epochs, lr_milestone, batch_size, weight_decay, pretrain, ae_optimizer_name, ae_lr, 61 | ae_n_epochs, ae_lr_milestone, ae_batch_size, ae_weight_decay, n_jobs_dataloader, normal_class): 62 | """ 63 | Deep SVDD, a fully deep method for anomaly detection. 64 | 65 | :arg DATASET_NAME: Name of the dataset to load. 66 | :arg N_LAYERS: Number of hidden layers used for network. If auto-encoder pretraining is used, the auto-encoder will have n_layers*2-1 hidden layers. 67 | :arg SHRINKAGE_FACTOR: Factor by which the neurons per layer will decay between each size. Must be between 0 and 1. Shrinkage is reversed for the auto-encoder after the bottleneck layer 68 | :arg XP_PATH: Export path for logging the experiment. 69 | :arg DATA_PATH: Root path of data. 
70 | """ 71 | 72 | 73 | # Get configuration 74 | cfg = Config(locals().copy()) 75 | 76 | # Set up logging 77 | logging.basicConfig(level=logging.INFO) 78 | logger = logging.getLogger() 79 | logger.setLevel(logging.INFO) 80 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 81 | if not os.path.exists(xp_path): 82 | os.makedirs(xp_path) 83 | 84 | log_file = xp_path + '/log.txt' 85 | file_handler = logging.FileHandler(log_file) 86 | file_handler.setLevel(logging.INFO) 87 | file_handler.setFormatter(formatter) 88 | logger.addHandler(file_handler) 89 | 90 | # Print arguments 91 | logger.info('Log file is %s.' % log_file) 92 | logger.info('Data path is %s.' % data_path) 93 | logger.info('Export path is %s.' % xp_path) 94 | 95 | logger.info('Dataset: %s' % dataset_name) 96 | logger.info('Normal class: %d' % normal_class) 97 | logger.info('Network (n_layers, shrinkage_factor): %s, %s' % (n_layers, shrinkage_factor)) 98 | 99 | # If specified, load experiment config from JSON-file 100 | if load_config: 101 | cfg.load_config(import_json=load_config) 102 | logger.info('Loaded configuration from %s.' % load_config) 103 | 104 | # Print configuration 105 | logger.info('Deep SVDD objective: %s' % cfg.settings['objective']) 106 | logger.info('Nu-parameter: %.2f' % cfg.settings['nu']) 107 | 108 | # Set seed 109 | if cfg.settings['seed'] != -1: 110 | random.seed(cfg.settings['seed']) 111 | np.random.seed(cfg.settings['seed']) 112 | torch.manual_seed(cfg.settings['seed']) 113 | logger.info('Set seed to %d.' % cfg.settings['seed']) 114 | 115 | # Default device to 'cpu' if cuda is not available 116 | if not torch.cuda.is_available(): 117 | device = 'cpu' 118 | logger.info('Computation device: %s' % device) 119 | logger.info('Number of dataloader workers: %d' % n_jobs_dataloader) 120 | 121 | # Load data 122 | dataset = load_dataset(dataset_name, data_path, normal_class) 123 | n_vars = dataset.train_set.X.shape[1] 124 | 125 | # Initialize DeepSVDD model and set neural network \phi 126 | deep_SVDD = DeepSVDD(n_vars, cfg.settings['objective'], cfg.settings['nu']) 127 | deep_SVDD.set_networks(cfg.settings['pretrain'], n_layers, shrinkage_factor) 128 | # If specified, load Deep SVDD model (radius R, center c, network weights, and possibly autoencoder weights) 129 | if load_model: 130 | deep_SVDD.load_model(model_path=load_model, load_ae=True) 131 | logger.info('Loading model from %s.' 
% load_model) 132 | 133 | logger.info('Pretraining: %s' % pretrain) 134 | if pretrain: 135 | # Log pretraining details 136 | logger.info('Pretraining optimizer: %s' % cfg.settings['ae_optimizer_name']) 137 | logger.info('Pretraining learning rate: %g' % cfg.settings['ae_lr']) 138 | logger.info('Pretraining epochs: %d' % cfg.settings['ae_n_epochs']) 139 | logger.info('Pretraining learning rate scheduler milestones: %s' % (cfg.settings['ae_lr_milestone'],)) 140 | logger.info('Pretraining batch size: %d' % cfg.settings['ae_batch_size']) 141 | logger.info('Pretraining weight decay: %g' % cfg.settings['ae_weight_decay']) 142 | 143 | # Pretrain model on dataset (via autoencoder) 144 | deep_SVDD.pretrain(dataset, 145 | optimizer_name=cfg.settings['ae_optimizer_name'], 146 | lr=cfg.settings['ae_lr'], 147 | n_epochs=cfg.settings['ae_n_epochs'], 148 | lr_milestones=cfg.settings['ae_lr_milestone'], 149 | batch_size=cfg.settings['ae_batch_size'], 150 | weight_decay=cfg.settings['ae_weight_decay'], 151 | device=device, 152 | n_jobs_dataloader=n_jobs_dataloader) 153 | 154 | # Log training details 155 | logger.info('Training optimizer: %s' % cfg.settings['optimizer_name']) 156 | logger.info('Training learning rate: %g' % cfg.settings['lr']) 157 | logger.info('Training epochs: %d' % cfg.settings['n_epochs']) 158 | logger.info('Training learning rate scheduler milestones: %s' % (cfg.settings['lr_milestone'],)) 159 | logger.info('Training batch size: %d' % cfg.settings['batch_size']) 160 | logger.info('Training weight decay: %g' % cfg.settings['weight_decay']) 161 | 162 | # Train model on dataset 163 | deep_SVDD.train(dataset, 164 | optimizer_name=cfg.settings['optimizer_name'], 165 | lr=cfg.settings['lr'], 166 | n_epochs=cfg.settings['n_epochs'], 167 | lr_milestones=cfg.settings['lr_milestone'], 168 | batch_size=cfg.settings['batch_size'], 169 | weight_decay=cfg.settings['weight_decay'], 170 | device=device, 171 | n_jobs_dataloader=n_jobs_dataloader) 172 | 173 | # Test model 174 | deep_SVDD.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) 175 | 176 | # Plot most anomalous and most normal (within-class) test samples 177 | indices, labels, scores = zip(*deep_SVDD.results['test_scores']) 178 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores) 179 | idx_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # sorted from lowest to highest anomaly score 180 | 181 | # Save results, model, and configuration 182 | deep_SVDD.save_results(export_json=xp_path + '/results.json') 183 | deep_SVDD.save_model(export_model=xp_path + '/model.tar') 184 | cfg.save_config(export_json=xp_path + '/config.json') 185 | 186 | test_scores_index_ordered = scores[indices] 187 | 188 | np.savetxt(target_scorefile_path, test_scores_index_ordered) 189 | 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .networks import network, auto_encoder 2 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/cifar10_LeNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class CIFAR10_LeNet(BaseNet): 9 | 10 | def 
__init__(self): 11 | super().__init__() 12 | 13 | self.rep_dim = 128 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 17 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 19 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 20 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 21 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 22 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 23 | 24 | def forward(self, x): 25 | x = self.conv1(x) 26 | x = self.pool(F.leaky_relu(self.bn2d1(x))) 27 | x = self.conv2(x) 28 | x = self.pool(F.leaky_relu(self.bn2d2(x))) 29 | x = self.conv3(x) 30 | x = self.pool(F.leaky_relu(self.bn2d3(x))) 31 | x = x.view(x.size(0), -1) 32 | x = self.fc1(x) 33 | return x 34 | 35 | 36 | class CIFAR10_LeNet_Autoencoder(BaseNet): 37 | 38 | def __init__(self): 39 | super().__init__() 40 | 41 | self.rep_dim = 128 42 | self.pool = nn.MaxPool2d(2, 2) 43 | 44 | # Encoder (must match the Deep SVDD network above) 45 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 46 | nn.init.xavier_uniform_(self.conv1.weight, gain=nn.init.calculate_gain('leaky_relu')) 47 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 48 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 49 | nn.init.xavier_uniform_(self.conv2.weight, gain=nn.init.calculate_gain('leaky_relu')) 50 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 51 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 52 | nn.init.xavier_uniform_(self.conv3.weight, gain=nn.init.calculate_gain('leaky_relu')) 53 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 54 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 55 | self.bn1d = nn.BatchNorm1d(self.rep_dim, eps=1e-04, affine=False) 56 | 57 | # Decoder 58 | self.deconv1 = nn.ConvTranspose2d(int(self.rep_dim / (4 * 4)), 128, 5, bias=False, padding=2) 59 | nn.init.xavier_uniform_(self.deconv1.weight, gain=nn.init.calculate_gain('leaky_relu')) 60 | self.bn2d4 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 61 | self.deconv2 = nn.ConvTranspose2d(128, 64, 5, bias=False, padding=2) 62 | nn.init.xavier_uniform_(self.deconv2.weight, gain=nn.init.calculate_gain('leaky_relu')) 63 | self.bn2d5 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 64 | self.deconv3 = nn.ConvTranspose2d(64, 32, 5, bias=False, padding=2) 65 | nn.init.xavier_uniform_(self.deconv3.weight, gain=nn.init.calculate_gain('leaky_relu')) 66 | self.bn2d6 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 67 | self.deconv4 = nn.ConvTranspose2d(32, 3, 5, bias=False, padding=2) 68 | nn.init.xavier_uniform_(self.deconv4.weight, gain=nn.init.calculate_gain('leaky_relu')) 69 | 70 | def forward(self, x): 71 | x = self.conv1(x) 72 | x = self.pool(F.leaky_relu(self.bn2d1(x))) 73 | x = self.conv2(x) 74 | x = self.pool(F.leaky_relu(self.bn2d2(x))) 75 | x = self.conv3(x) 76 | x = self.pool(F.leaky_relu(self.bn2d3(x))) 77 | x = x.view(x.size(0), -1) 78 | x = self.bn1d(self.fc1(x)) 79 | x = x.view(x.size(0), int(self.rep_dim / (4 * 4)), 4, 4) 80 | x = F.leaky_relu(x) 81 | x = self.deconv1(x) 82 | x = F.interpolate(F.leaky_relu(self.bn2d4(x)), scale_factor=2) 83 | x = self.deconv2(x) 84 | x = F.interpolate(F.leaky_relu(self.bn2d5(x)), scale_factor=2) 85 | x = self.deconv3(x) 86 | x = F.interpolate(F.leaky_relu(self.bn2d6(x)), scale_factor=2) 87 | x = self.deconv4(x) 88 | x = torch.sigmoid(x) 89 | return x 90 | 
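A quick way to see what this encoder produces is a dummy forward pass. The sketch below is not part of the repository; it assumes torchvision-style CIFAR-10 batches of shape (N, 3, 32, 32) and that it is run from the src/ directory (the import path is otherwise an assumption). Each conv layer preserves the 32x32 resolution (kernel 5, padding 2) and each max-pool halves it, so three stages leave 4x4 feature maps that the final linear layer maps to the 128-dimensional representation.

```Python
# Sanity-check sketch (not part of the repository): confirm the encoder's output size.
import torch

from networks.cifar10_LeNet import CIFAR10_LeNet  # assumes running from src/

net = CIFAR10_LeNet()
x = torch.randn(8, 3, 32, 32)   # dummy batch of 8 CIFAR-10-sized images
z = net(x)                      # spatial size 32 -> 16 -> 8 -> 4 after the three pool stages
print(z.shape)                  # expected: torch.Size([8, 128]), i.e. (batch, rep_dim)
```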
-------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/cifar10_LeNet_elu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class CIFAR10_LeNet_ELU(BaseNet): 9 | 10 | def __init__(self): 11 | super().__init__() 12 | 13 | self.rep_dim = 128 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 17 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 19 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 20 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 21 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 22 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 23 | 24 | def forward(self, x): 25 | x = self.conv1(x) 26 | x = self.pool(F.elu(self.bn2d1(x))) 27 | x = self.conv2(x) 28 | x = self.pool(F.elu(self.bn2d2(x))) 29 | x = self.conv3(x) 30 | x = self.pool(F.elu(self.bn2d3(x))) 31 | x = x.view(x.size(0), -1) 32 | x = self.fc1(x) 33 | return x 34 | 35 | 36 | class CIFAR10_LeNet_ELU_Autoencoder(BaseNet): 37 | 38 | def __init__(self): 39 | super().__init__() 40 | 41 | self.rep_dim = 128 42 | self.pool = nn.MaxPool2d(2, 2) 43 | 44 | # Encoder (must match the Deep SVDD network above) 45 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 46 | nn.init.xavier_uniform_(self.conv1.weight) 47 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 48 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 49 | nn.init.xavier_uniform_(self.conv2.weight) 50 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 51 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 52 | nn.init.xavier_uniform_(self.conv3.weight) 53 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 54 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 55 | self.bn1d = nn.BatchNorm1d(self.rep_dim, eps=1e-04, affine=False) 56 | 57 | # Decoder 58 | self.deconv1 = nn.ConvTranspose2d(int(self.rep_dim / (4 * 4)), 128, 5, bias=False, padding=2) 59 | nn.init.xavier_uniform_(self.deconv1.weight) 60 | self.bn2d4 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 61 | self.deconv2 = nn.ConvTranspose2d(128, 64, 5, bias=False, padding=2) 62 | nn.init.xavier_uniform_(self.deconv2.weight) 63 | self.bn2d5 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 64 | self.deconv3 = nn.ConvTranspose2d(64, 32, 5, bias=False, padding=2) 65 | nn.init.xavier_uniform_(self.deconv3.weight) 66 | self.bn2d6 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 67 | self.deconv4 = nn.ConvTranspose2d(32, 3, 5, bias=False, padding=2) 68 | nn.init.xavier_uniform_(self.deconv4.weight) 69 | 70 | def forward(self, x): 71 | x = self.conv1(x) 72 | x = self.pool(F.elu(self.bn2d1(x))) 73 | x = self.conv2(x) 74 | x = self.pool(F.elu(self.bn2d2(x))) 75 | x = self.conv3(x) 76 | x = self.pool(F.elu(self.bn2d3(x))) 77 | x = x.view(x.size(0), -1) 78 | x = self.bn1d(self.fc1(x)) 79 | x = x.view(x.size(0), int(self.rep_dim / (4 * 4)), 4, 4) 80 | x = F.elu(x) 81 | x = self.deconv1(x) 82 | x = F.interpolate(F.elu(self.bn2d4(x)), scale_factor=2) 83 | x = self.deconv2(x) 84 | x = F.interpolate(F.elu(self.bn2d5(x)), scale_factor=2) 85 | x = self.deconv3(x) 86 | x = F.interpolate(F.elu(self.bn2d6(x)), scale_factor=2) 87 | x = self.deconv4(x) 88 | x = torch.sigmoid(x) 89 | return x 
90 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/mnist_LeNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class MNIST_LeNet(BaseNet): 9 | 10 | def __init__(self): 11 | super().__init__() 12 | 13 | self.rep_dim = 32 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(1, 8, 5, bias=False, padding=2) 17 | self.bn1 = nn.BatchNorm2d(8, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(8, 4, 5, bias=False, padding=2) 19 | self.bn2 = nn.BatchNorm2d(4, eps=1e-04, affine=False) 20 | self.fc1 = nn.Linear(4 * 7 * 7, self.rep_dim, bias=False) 21 | 22 | def forward(self, x): 23 | x = self.conv1(x) 24 | x = self.pool(F.leaky_relu(self.bn1(x))) 25 | x = self.conv2(x) 26 | x = self.pool(F.leaky_relu(self.bn2(x))) 27 | x = x.view(x.size(0), -1) 28 | x = self.fc1(x) 29 | return x 30 | 31 | 32 | class MNIST_LeNet_Autoencoder(BaseNet): 33 | 34 | def __init__(self): 35 | super().__init__() 36 | 37 | self.rep_dim = 32 38 | self.pool = nn.MaxPool2d(2, 2) 39 | 40 | # Encoder (must match the Deep SVDD network above) 41 | self.conv1 = nn.Conv2d(1, 8, 5, bias=False, padding=2) 42 | self.bn1 = nn.BatchNorm2d(8, eps=1e-04, affine=False) 43 | self.conv2 = nn.Conv2d(8, 4, 5, bias=False, padding=2) 44 | self.bn2 = nn.BatchNorm2d(4, eps=1e-04, affine=False) 45 | self.fc1 = nn.Linear(4 * 7 * 7, self.rep_dim, bias=False) 46 | 47 | # Decoder 48 | self.deconv1 = nn.ConvTranspose2d(2, 4, 5, bias=False, padding=2) 49 | self.bn3 = nn.BatchNorm2d(4, eps=1e-04, affine=False) 50 | self.deconv2 = nn.ConvTranspose2d(4, 8, 5, bias=False, padding=3) 51 | self.bn4 = nn.BatchNorm2d(8, eps=1e-04, affine=False) 52 | self.deconv3 = nn.ConvTranspose2d(8, 1, 5, bias=False, padding=2) 53 | 54 | def forward(self, x): 55 | x = self.conv1(x) 56 | x = self.pool(F.leaky_relu(self.bn1(x))) 57 | x = self.conv2(x) 58 | x = self.pool(F.leaky_relu(self.bn2(x))) 59 | x = x.view(x.size(0), -1) 60 | x = self.fc1(x) 61 | x = x.view(x.size(0), int(self.rep_dim / 16), 4, 4) 62 | x = F.interpolate(F.leaky_relu(x), scale_factor=2) 63 | x = self.deconv1(x) 64 | x = F.interpolate(F.leaky_relu(self.bn3(x)), scale_factor=2) 65 | x = self.deconv2(x) 66 | x = F.interpolate(F.leaky_relu(self.bn4(x)), scale_factor=2) 67 | x = self.deconv3(x) 68 | x = torch.sigmoid(x) 69 | 70 | return x 71 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/networks.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from base.base_net import BaseNet 8 | 9 | 10 | class network(BaseNet): 11 | 12 | def __init__(self, n_vars, n_layers, shrinkage_factor): 13 | super().__init__() 14 | 15 | layer_sizes = [math.ceil(n_vars * (1-shrinkage_factor)**(i)) for i in range(n_layers+1)] 16 | self.rep_dim = math.ceil(layer_sizes[-1] * (1-shrinkage_factor)) 17 | 18 | self.layers = [] 19 | 20 | for i in range(n_layers): 21 | self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1], bias=False)) 22 | if i is not n_layers-1: 23 | self.layers.append(nn.BatchNorm1d(layer_sizes[i+1], eps=1e-04, affine=False)) 24 | self.layers.append(nn.LeakyReLU()) 25 | 26 | #bottleneck layer 27 | self.layers.append(nn.Linear(layer_sizes[-1], 
self.rep_dim, bias=False)) 28 | 29 | self.encoder = nn.Sequential(*self.layers) 30 | 31 | def forward(self, x): 32 | x = self.encoder(x) 33 | return x 34 | 35 | 36 | class auto_encoder(BaseNet): 37 | 38 | def __init__(self, n_vars, n_layers, shrinkage_factor): 39 | super().__init__() 40 | 41 | layer_sizes = [math.ceil(n_vars * (1-shrinkage_factor)**(i)) for i in range(n_layers+1)] 42 | self.rep_dim = math.ceil(layer_sizes[-1] * (1-shrinkage_factor)) 43 | 44 | #encoder 45 | self.layers = [] 46 | 47 | for i in range(n_layers): 48 | self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1], bias=False)) 49 | 50 | if i is not n_layers-1: 51 | nn.init.xavier_uniform_(self.layers[-1].weight, gain=nn.init.calculate_gain('leaky_relu')) 52 | self.layers.append(nn.BatchNorm1d(layer_sizes[i+1], eps=1e-04, affine=False)) 53 | self.layers.append(nn.LeakyReLU()) 54 | 55 | #bottleneck layer 56 | self.layers.append(nn.Linear(layer_sizes[-1], self.rep_dim, bias=False)) 57 | 58 | self.encoder = nn.Sequential(*self.layers) 59 | 60 | #decoder 61 | 62 | reverse_layer_sizes = [self.rep_dim] + list(reversed(layer_sizes)) 63 | self.layers = [] 64 | 65 | for i in range(n_layers+1): 66 | self.layers.append(nn.Linear(reverse_layer_sizes[i], reverse_layer_sizes[i+1], bias=False)) 67 | if i < n_layers: 68 | nn.init.xavier_uniform_(self.layers[-1].weight, gain=nn.init.calculate_gain('leaky_relu')) 69 | self.layers.append(nn.BatchNorm1d(reverse_layer_sizes[i+1], eps=1e-04, affine=False)) 70 | self.layers.append(nn.LeakyReLU()) 71 | 72 | 73 | 74 | #self.layers.append(nn.Linear(reverse_layer_sizes[-1], n_vars, bias=False)) 75 | 76 | self.decoder = nn.Sequential(*self.layers) 77 | 78 | def forward(self, x): 79 | x = self.encoder(x) 80 | x = self.decoder(x) 81 | return x 82 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepSVDD_trainer import DeepSVDDTrainer 2 | from .ae_trainer import AETrainer 3 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/optim/ae_trainer.py: -------------------------------------------------------------------------------- 1 | from base.base_trainer import BaseTrainer 2 | from base.base_dataset import BaseADDataset 3 | from base.base_net import BaseNet 4 | from sklearn.metrics import roc_auc_score 5 | 6 | import logging 7 | import time 8 | import torch 9 | import torch.optim as optim 10 | import numpy as np 11 | 12 | 13 | class AETrainer(BaseTrainer): 14 | 15 | def __init__(self, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, lr_milestones: tuple = (), 16 | batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', n_jobs_dataloader: int = 0): 17 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device, 18 | n_jobs_dataloader) 19 | 20 | def train(self, dataset: BaseADDataset, ae_net: BaseNet): 21 | logger = logging.getLogger() 22 | 23 | # Set device for network 24 | ae_net = ae_net.to(self.device) 25 | 26 | # Get train data loader 27 | train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 28 | 29 | # Set optimizer (Adam optimizer for now) 30 | optimizer = optim.Adam(ae_net.parameters(), lr=self.lr, weight_decay=self.weight_decay, 31 | amsgrad=self.optimizer_name == 'amsgrad') 32 | 33 | # Set learning rate scheduler 34 | scheduler = 
optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1) 35 | 36 | # Training 37 | logger.info('Starting pretraining...') 38 | start_time = time.time() 39 | ae_net.train() 40 | for epoch in range(self.n_epochs): 41 | 42 | scheduler.step() 43 | if epoch in self.lr_milestones: 44 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0])) 45 | 46 | loss_epoch = 0.0 47 | n_batches = 0 48 | epoch_start_time = time.time() 49 | for data in train_loader: 50 | inputs, _, _ = data 51 | inputs = inputs.to(self.device) 52 | 53 | # Zero the network parameter gradients 54 | optimizer.zero_grad() 55 | 56 | # Update network parameters via backpropagation: forward + backward + optimize 57 | outputs = ae_net(inputs) 58 | scores = torch.sum((outputs - inputs) ** 2, dim=tuple(range(1, outputs.dim()))) 59 | loss = torch.mean(scores) 60 | loss.backward() 61 | optimizer.step() 62 | 63 | loss_epoch += loss.item() 64 | n_batches += 1 65 | 66 | # log epoch statistics 67 | epoch_train_time = time.time() - epoch_start_time 68 | logger.info(' Epoch {}/{}\t Time: {:.3f}\t Loss: {:.8f}' 69 | .format(epoch + 1, self.n_epochs, epoch_train_time, loss_epoch / n_batches)) 70 | 71 | pretrain_time = time.time() - start_time 72 | logger.info('Pretraining time: %.3f' % pretrain_time) 73 | logger.info('Finished pretraining.') 74 | 75 | return ae_net 76 | 77 | def test(self, dataset: BaseADDataset, ae_net: BaseNet): 78 | logger = logging.getLogger() 79 | 80 | # Set device for network 81 | ae_net = ae_net.to(self.device) 82 | 83 | # Get test data loader 84 | _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 85 | 86 | # Testing 87 | logger.info('Testing autoencoder...') 88 | loss_epoch = 0.0 89 | n_batches = 0 90 | start_time = time.time() 91 | idx_label_score = [] 92 | ae_net.eval() 93 | with torch.no_grad(): 94 | for data in test_loader: 95 | inputs, labels, idx = data 96 | inputs = inputs.to(self.device) 97 | outputs = ae_net(inputs) 98 | scores = torch.sum((outputs - inputs) ** 2, dim=tuple(range(1, outputs.dim()))) 99 | loss = torch.mean(scores) 100 | 101 | # Save triple of (idx, label, score) in a list 102 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(), 103 | labels.cpu().data.numpy().tolist(), 104 | scores.cpu().data.numpy().tolist())) 105 | 106 | loss_epoch += loss.item() 107 | n_batches += 1 108 | 109 | logger.info('Test set Loss: {:.8f}'.format(loss_epoch / n_batches)) 110 | 111 | _, labels, scores = zip(*idx_label_score) 112 | labels = np.array(labels) 113 | scores = np.array(scores) 114 | 115 | auc = roc_auc_score(labels, scores) 116 | logger.info('Test set AUC: {:.2f}%'.format(100. 
* auc)) 117 | 118 | test_time = time.time() - start_time 119 | logger.info('Autoencoder testing time: %.3f' % test_time) 120 | logger.info('Finished testing autoencoder.') 121 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/optim/deepSVDD_trainer.py: -------------------------------------------------------------------------------- 1 | from base.base_trainer import BaseTrainer 2 | from base.base_dataset import BaseADDataset 3 | from base.base_net import BaseNet 4 | from torch.utils.data.dataloader import DataLoader 5 | from sklearn.metrics import roc_auc_score 6 | 7 | import logging 8 | import time 9 | import torch 10 | import torch.optim as optim 11 | import numpy as np 12 | 13 | 14 | class DeepSVDDTrainer(BaseTrainer): 15 | 16 | def __init__(self, objective, R, c, nu: float, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, 17 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 18 | n_jobs_dataloader: int = 0): 19 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device, 20 | n_jobs_dataloader) 21 | 22 | assert objective in ('one-class', 'soft-boundary'), "Objective must be either 'one-class' or 'soft-boundary'." 23 | self.objective = objective 24 | 25 | # Deep SVDD parameters 26 | self.R = torch.tensor(R, device=self.device) # radius R initialized with 0 by default. 27 | self.c = torch.tensor(c, device=self.device) if c is not None else None 28 | self.nu = nu 29 | 30 | # Optimization parameters 31 | self.warm_up_n_epochs = 10 # number of training epochs for soft-boundary Deep SVDD before radius R gets updated 32 | 33 | # Results 34 | self.train_time = None 35 | self.test_auc = None 36 | self.test_time = None 37 | self.test_scores = None 38 | 39 | def train(self, dataset: BaseADDataset, net: BaseNet): 40 | logger = logging.getLogger() 41 | 42 | # Set device for network 43 | net = net.to(self.device) 44 | 45 | # Get train data loader 46 | train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 47 | 48 | # Set optimizer (Adam optimizer for now) 49 | optimizer = optim.Adam(net.parameters(), lr=self.lr, weight_decay=self.weight_decay, 50 | amsgrad=self.optimizer_name == 'amsgrad') 51 | 52 | # Set learning rate scheduler 53 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1) 54 | 55 | # Initialize hypersphere center c (if c not loaded) 56 | if self.c is None: 57 | logger.info('Initializing center c...') 58 | self.c = self.init_center_c(train_loader, net) 59 | logger.info('Center c initialized at %s', self.c) 60 | 61 | # Training 62 | logger.info('Starting training...') 63 | start_time = time.time() 64 | net.train() 65 | for epoch in range(self.n_epochs): 66 | 67 | scheduler.step() 68 | if epoch in self.lr_milestones: 69 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0])) 70 | 71 | loss_epoch = 0.0 72 | n_batches = 0 73 | epoch_start_time = time.time() 74 | for data in train_loader: 75 | inputs, _, _ = data 76 | inputs = inputs.to(self.device) 77 | 78 | # Zero the network parameter gradients 79 | optimizer.zero_grad() 80 | 81 | # Update network parameters via backpropagation: forward + backward + optimize 82 | outputs = net(inputs) 83 | dist = torch.sum((outputs - self.c) ** 2, dim=1) 84 | if self.objective == 'soft-boundary': 85 | scores = dist - self.R ** 2 86 | loss = self.R ** 2 + (1 / self.nu) 
* torch.mean(torch.max(torch.zeros_like(scores), scores)) 87 | else: 88 | loss = torch.mean(dist) 89 | loss.backward() 90 | optimizer.step() 91 | 92 | # Update hypersphere radius R on mini-batch distances 93 | if (self.objective == 'soft-boundary') and (epoch >= self.warm_up_n_epochs): 94 | self.R.data = torch.tensor(get_radius(dist, self.nu), device=self.device) 95 | 96 | loss_epoch += loss.item() 97 | n_batches += 1 98 | 99 | # log epoch statistics 100 | epoch_train_time = time.time() - epoch_start_time 101 | logger.info(' Epoch {}/{}\t Time: {:.3f}\t Loss: {:.8f}' 102 | .format(epoch + 1, self.n_epochs, epoch_train_time, loss_epoch / n_batches)) 103 | 104 | self.train_time = time.time() - start_time 105 | logger.info('Training time: %.3f' % self.train_time) 106 | 107 | logger.info('Finished training.') 108 | 109 | return net 110 | 111 | def test(self, dataset: BaseADDataset, net: BaseNet): 112 | logger = logging.getLogger() 113 | 114 | # Set device for network 115 | net = net.to(self.device) 116 | 117 | # Get test data loader 118 | _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 119 | 120 | # Testing 121 | logger.info('Starting testing...') 122 | start_time = time.time() 123 | idx_label_score = [] 124 | net.eval() 125 | with torch.no_grad(): 126 | for data in test_loader: 127 | inputs, labels, idx = data 128 | inputs = inputs.to(self.device) 129 | outputs = net(inputs) 130 | dist = torch.sum((outputs - self.c) ** 2, dim=1) 131 | if self.objective == 'soft-boundary': 132 | scores = dist - self.R ** 2 133 | else: 134 | scores = dist 135 | 136 | # Save triples of (idx, label, score) in a list 137 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(), 138 | labels.cpu().data.numpy().tolist(), 139 | scores.cpu().data.numpy().tolist())) 140 | 141 | self.test_time = time.time() - start_time 142 | logger.info('Testing time: %.3f' % self.test_time) 143 | 144 | self.test_scores = idx_label_score 145 | 146 | # Compute AUC 147 | _, labels, scores = zip(*idx_label_score) 148 | labels = np.array(labels) 149 | scores = np.array(scores) 150 | 151 | self.test_auc = roc_auc_score(labels, scores) 152 | logger.info('Test set AUC: {:.2f}%'.format(100. * self.test_auc)) 153 | 154 | logger.info('Finished testing.') 155 | 156 | def init_center_c(self, train_loader: DataLoader, net: BaseNet, eps=0.1): 157 | """Initialize hypersphere center c as the mean from an initial forward pass on the data.""" 158 | n_samples = 0 159 | c = torch.zeros(net.rep_dim, device=self.device) 160 | 161 | net.eval() 162 | with torch.no_grad(): 163 | for data in train_loader: 164 | # get the inputs of the batch 165 | inputs, _, _ = data 166 | inputs = inputs.to(self.device) 167 | outputs = net(inputs) 168 | n_samples += outputs.shape[0] 169 | c += torch.sum(outputs, dim=0) 170 | 171 | c /= n_samples 172 | 173 | # If c_i is too close to 0, set to +-eps. Reason: a zero unit can be trivially matched with zero weights. 
174 | c[(abs(c) < eps) & (c < 0)] = -eps 175 | c[(abs(c) < eps) & (c > 0)] = eps 176 | 177 | return c 178 | 179 | 180 | def get_radius(dist: torch.Tensor, nu: float): 181 | """Optimally solve for radius R via the (1-nu)-quantile of distances.""" 182 | return np.quantile(np.sqrt(dist.clone().data.cpu().numpy()), 1 - nu) 183 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/utils/collect_results.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | 5 | base_path = '/Users/lukasruff/Repos/Deep-SVDD-PyTorch/log/mnist/test/mnist/soft_deepSVDD' 6 | n_exps = 3 7 | n_seeds = 3 8 | 9 | exps = range(n_exps) 10 | seeds = range(1, n_seeds) 11 | 12 | for exp in exps: 13 | 14 | exp_folder = str(exp) + 'vsall' 15 | aucs = np.zeros(n_seeds, dtype=np.float32) 16 | 17 | for seed in seeds: 18 | 19 | seed_folder = 'seed_' + str(seed) 20 | file_name = 'results.json' 21 | file_path = base_path + '/' + exp_folder + '/' + seed_folder + '/' + file_name 22 | 23 | with open(file_path, 'r') as fp: 24 | results = json.load(fp) 25 | 26 | aucs[seed - 1] = results['test_auc'] 27 | 28 | mean = np.mean(aucs[aucs > 0]) 29 | std = np.std(aucs[aucs > 0]) 30 | 31 | # Write results 32 | log_file = '{}/result.txt'.format(base_path) 33 | log = open(log_file, 'a') 34 | log.write('Experiment: {}\n'.format(exp_folder)) 35 | log.write('Test Set AUC [mean]: {} %\n'.format(round(float(mean * 100), 4))) 36 | log.write('Test Set AUC [std]: {} %\n'.format(round(float(std * 100), 4))) 37 | log.write('\n') 38 | 39 | log.write('\n') 40 | log.close() 41 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/utils/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class Config(object): 5 | """Base class for experimental setting/configuration.""" 6 | 7 | def __init__(self, settings): 8 | self.settings = settings 9 | 10 | def load_config(self, import_json): 11 | """Load settings dict from import_json (path/filename.json) JSON-file.""" 12 | 13 | with open(import_json, 'r') as fp: 14 | settings = json.load(fp) 15 | 16 | for key, value in settings.items(): 17 | self.settings[key] = value 18 | 19 | def save_config(self, export_json): 20 | """Save settings dict to export_json (path/filename.json) JSON-file.""" 21 | 22 | with open(export_json, 'w') as fp: 23 | json.dump(self.settings, fp) 24 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/utils/visualization/plot_images_grid.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # import matplotlib 3 | # matplotlib.use('Agg') # or 'PS', 'PDF', 'SVG' 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from torchvision.utils import make_grid 8 | 9 | 10 | def plot_images_grid(x: torch.tensor, export_img, title: str = '', nrow=8, padding=2, normalize=False, pad_value=0): 11 | """Plot 4D Tensor of images of shape (B x C x H x W) as a grid.""" 12 | 13 | grid = make_grid(x, nrow=nrow, padding=padding, normalize=normalize, pad_value=pad_value) 14 | npgrid = 
grid.cpu().numpy() 15 | 16 | plt.imshow(np.transpose(npgrid, (1, 2, 0)), interpolation='nearest') 17 | 18 | ax = plt.gca() 19 | ax.xaxis.set_visible(False) 20 | ax.yaxis.set_visible(False) 21 | 22 | if not (title == ''): 23 | plt.title(title) 24 | 25 | plt.savefig(export_img, bbox_inches='tight', pad_inches=0.1) 26 | plt.clf() 27 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/test_instruction.txt: -------------------------------------------------------------------------------- 1 | python main.py "wine.pickle" 3 0.2 ../log/mnist_test ../../../formatted_data test.csv --objective one-class --lr 0.00001 --n_epochs 1500 --lr_milestone 500 --batch_size 200 --weight_decay 0.5e-6 --pretrain True --ae_lr 0.00001 --ae_n_epochs 1500 --ae_lr_milestone 500 --ae_batch_size 200 --ae_weight_decay 0.5e-3 --normal_class 0; 2 | -------------------------------------------------------------------------------- /additional_methods/ODIN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Outlier Detection using Indegree Number (ODIN) Algorithm 3 | """ 4 | # Author: Roel Bouman 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from sklearn.neighbors import kneighbors_graph 11 | from sklearn.utils.validation import check_is_fitted 12 | from sklearn.utils.validation import check_array 13 | 14 | from pyod.models.base import BaseDetector 15 | from pyod.utils.utility import invert_order 16 | 17 | import numpy as np 18 | 19 | #Note, PREDICT is not implemented properly yet. It looks only to 1 matrix of input data at a time. 20 | class ODIN(BaseDetector): 21 | """ 22 | """ 23 | def __init__(self, n_neighbors=20, 24 | metric='minkowski', p=2, metric_params=None, 25 | contamination=0.1, n_jobs=None): 26 | super(ODIN, self).__init__(contamination=contamination) 27 | self.n_neighbors = n_neighbors 28 | self.metric = metric 29 | self.p = p 30 | self.metric_params = metric_params 31 | self.n_jobs = n_jobs 32 | 33 | 34 | def fit(self, X, y=None): 35 | """Fit detector. y is ignored in unsupervised methods. 36 | Parameters 37 | ---------- 38 | X : numpy array of shape (n_samples, n_features) 39 | The input samples. 40 | y : Ignored 41 | Not used, present for API consistency by convention. 42 | Returns 43 | ------- 44 | self : object 45 | Fitted estimator. 46 | """ 47 | # validate inputs X and y (optional) 48 | X = check_array(X) 49 | 50 | self.knn_graph_ = kneighbors_graph(X, n_neighbors=self.n_neighbors, 51 | metric=self.metric, 52 | p=self.p, 53 | metric_params=self.metric_params, 54 | n_jobs=self.n_jobs, 55 | include_self=False) 56 | 57 | 58 | 59 | 60 | 61 | # Invert decision_scores_. Outliers comes with higher outlier scores 62 | self.decision_scores_ = invert_order(np.asarray(np.sum(self.knn_graph_, axis=0)).flatten()) 63 | self._process_decision_scores() 64 | return self 65 | 66 | def decision_function(self, X): 67 | """TEMP 68 | """ 69 | X = check_array(X) 70 | 71 | self.knn_graph_ = kneighbors_graph(X, n_neighbors=self.n_neighbors, 72 | metric=self.metric, 73 | p=self.p, 74 | metric_params=self.metric_params, 75 | n_jobs=self.n_jobs, 76 | include_self=False) 77 | 78 | 79 | 80 | 81 | # Invert decision_scores_. 
Outliers comes with higher outlier scores 82 | self.decision_scores_ = invert_order(np.asarray(np.sum(self.knn_graph_, axis=0)).flatten()) 83 | self._process_decision_scores() 84 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/backups/codestyle.ini.bak: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | edge_line = True 4 | edge_line_columns = 79 5 | 6 | [main] 7 | version = 0.2.0 8 | 9 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/backups/encoding.ini.bak: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | [main] 5 | version = 0.2.0 6 | 7 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/backups/vcs.ini.bak: -------------------------------------------------------------------------------- 1 | [vcs] 2 | use_version_control = False 3 | version_control_system = 4 | 5 | [main] 6 | version = 0.2.0 7 | 8 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/backups/workspace.ini.bak: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | [main] 8 | version = 0.2.0 9 | recent_files = ['src\\BaseSVDD.py'] 10 | 11 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/codestyle.ini: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | edge_line = True 4 | edge_line_columns = 79 5 | 6 | [main] 7 | version = 0.2.0 8 | 9 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/defaults/defaults-codestyle-0.2.0.ini: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | edge_line = True 4 | edge_line_columns = 79 5 | 6 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/defaults/defaults-encoding-0.2.0.ini: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/defaults/defaults-vcs-0.2.0.ini: -------------------------------------------------------------------------------- 1 | [vcs] 2 | use_version_control = False 3 | version_control_system = 4 | 5 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/defaults/defaults-workspace-0.2.0.ini: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | 
save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/encoding.ini: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | [main] 5 | version = 0.2.0 6 | 7 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/vcs.ini: -------------------------------------------------------------------------------- 1 | [vcs] 2 | use_version_control = False 3 | version_control_system = 4 | 5 | [main] 6 | version = 0.2.0 7 | 8 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/workspace.ini: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | [main] 8 | version = 0.2.0 9 | recent_files = [] 10 | 11 | -------------------------------------------------------------------------------- /additional_methods/SVDD/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Kepeng Qiu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /additional_methods/SVDD/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 | Support Vector Data Description (SVDD)
6 | 7 | Python code for abnormal detection or fault detection using Support Vector Data Description (SVDD)
8 | Version 1.1, 11-NOV-2021
9 | Email: iqiukp@outlook.com
10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 23 | ## Main features 24 | 25 | - SVDD BaseEstimator based on sklearn.base for one-class or binary classification 26 | - Multiple kinds of kernel functions (linear, gaussian, polynomial, sigmoid) 27 | - Visualization of decision boundaries for 2D data 28 | 29 | ## Requirements 30 | 31 | - cvxopt 32 | - matplotlib 33 | - numpy 34 | - scikit_learn 35 | - scikit-opt (optional, only used for parameter optimization) 36 | 37 | ## Notices 38 | 39 | - The label must be 1 for a positive sample or -1 for a negative sample. 40 | - For detailed applications, please see the examples. 41 | - This code is for reference only. 42 | 43 | ## Examples 44 | 45 | ### 01. svdd_example_unlabeled_data.py 46 | 47 | An example for SVDD model fitting using unlabeled data. 48 | 49 |
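The snippet below mirrors the shipped examples/svdd_example_unlabeled_data.py script: the model is fitted without labels, and the decision boundary and the distance curve are plotted.

```Python
import sys
sys.path.append("..")
import numpy as np
from src.BaseSVDD import BaseSVDD

# create 100 points with 2 dimensions
n = 100
dim = 2
X = np.r_[np.random.randn(n, dim)]

# svdd object using rbf kernel
svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on')

# fit the SVDD model
svdd.fit(X)

# predict the label
y_predict = svdd.predict(X)

# plot the boundary
svdd.plot_boundary(X)

# plot the distance
radius = svdd.radius
distance = svdd.get_distance(X)
svdd.plot_distance(radius, distance)
```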

50 | 51 | 52 |

53 | 54 | ### 02. svdd_example_hybrid_data.py 55 | 56 | An example for SVDD model fitting with negataive samples. 57 | 58 | ```Python 59 | import sys 60 | sys.path.append("..") 61 | from sklearn.datasets import load_wine 62 | from src.BaseSVDD import BaseSVDD, BananaDataset 63 | 64 | # Banana-shaped dataset generation and partitioning 65 | X, y = BananaDataset.generate(number=100, display='on') 66 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 67 | 68 | # 69 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 70 | 71 | # 72 | svdd.fit(X_train, y_train) 73 | 74 | # 75 | svdd.plot_boundary(X_train, y_train) 76 | 77 | # 78 | y_test_predict = svdd.predict(X_test, y_test) 79 | 80 | # 81 | radius = svdd.radius 82 | distance = svdd.get_distance(X_test) 83 | svdd.plot_distance(radius, distance) 84 | ``` 85 | 86 |

87 | 88 | 89 |

90 | 91 | ### 03. svdd_example_kernel.py 92 | 93 | An example for SVDD model fitting using different kernels. 94 | 95 | ```Python 96 | import sys 97 | sys.path.append("..") 98 | from src.BaseSVDD import BaseSVDD, BananaDataset 99 | 100 | # Banana-shaped dataset generation and partitioning 101 | X, y = BananaDataset.generate(number=100, display='on') 102 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 103 | 104 | # kernel list 105 | kernelList = {"1": BaseSVDD(C=0.9, kernel='rbf', gamma=0.3, display='on'), 106 | "2": BaseSVDD(C=0.9, kernel='poly',degree=2, display='on'), 107 | "3": BaseSVDD(C=0.9, kernel='linear', display='on') 108 | } 109 | 110 | # 111 | for i in range(len(kernelList)): 112 | svdd = kernelList.get(str(i+1)) 113 | svdd.fit(X_train, y_train) 114 | svdd.plot_boundary(X_train, y_train) 115 | ``` 116 | 117 |

118 | 119 | 120 | 121 |

122 | 123 | 124 | ### 04. svdd_example_KPCA.py 125 | 126 | An example for SVDD model fitting using nonlinear principal component. 127 | 128 | The KPCA algorithm is used to reduce the dimension of the original data. 129 | 130 | ```Python 131 | import sys 132 | sys.path.append("..") 133 | import numpy as np 134 | from src.BaseSVDD import BaseSVDD 135 | from sklearn.decomposition import KernelPCA 136 | 137 | 138 | # create 100 points with 5 dimensions 139 | X = np.r_[np.random.randn(50, 5) + 1, np.random.randn(50, 5)] 140 | y = np.append(np.ones((50, 1), dtype=np.int64), 141 | -np.ones((50, 1), dtype=np.int64), 142 | axis=0) 143 | 144 | # number of the dimensionality 145 | kpca = KernelPCA(n_components=2, kernel="rbf", gamma=0.1, fit_inverse_transform=True) 146 | X_kpca = kpca.fit_transform(X) 147 | 148 | # fit the SVDD model 149 | svdd = BaseSVDD(C=0.9, gamma=10, kernel='rbf', display='on') 150 | 151 | # fit and predict 152 | svdd.fit(X_kpca, y) 153 | y_test_predict = svdd.predict(X_kpca, y) 154 | 155 | # plot the distance curve 156 | radius = svdd.radius 157 | distance = svdd.get_distance(X_kpca) 158 | svdd.plot_distance(radius, distance) 159 | 160 | # plot the boundary 161 | svdd.plot_boundary(X_kpca, y) 162 | ``` 163 | 164 |

165 | 166 | 167 |

168 | 169 | ### 05. svdd_example_PSO.py 170 | 171 | An example for parameter optimization using PSO. 172 | 173 | "scikit-opt" is required in this example. 174 | 175 | https://github.com/guofei9987/scikit-opt 176 | 177 | 178 | ```Python 179 | import sys 180 | sys.path.append("..") 181 | from src.BaseSVDD import BaseSVDD, BananaDataset 182 | from sko.PSO import PSO 183 | import matplotlib.pyplot as plt 184 | 185 | 186 | # Banana-shaped dataset generation and partitioning 187 | X, y = BananaDataset.generate(number=100, display='off') 188 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 189 | 190 | # objective function 191 | def objective_func(x): 192 | x1, x2 = x 193 | svdd = BaseSVDD(C=x1, gamma=x2, kernel='rbf', display='off') 194 | y = 1-svdd.fit(X_train, y_train).accuracy 195 | return y 196 | 197 | # Do PSO 198 | pso = PSO(func=objective_func, n_dim=2, pop=10, max_iter=20, 199 | lb=[0.01, 0.01], ub=[1, 3], w=0.8, c1=0.5, c2=0.5) 200 | pso.run() 201 | 202 | print('best_x is', pso.gbest_x) 203 | print('best_y is', pso.gbest_y) 204 | 205 | # plot the result 206 | fig = plt.figure(figsize=(6, 4)) 207 | ax = fig.add_subplot(1, 1, 1) 208 | ax.plot(pso.gbest_y_hist) 209 | ax.yaxis.grid() 210 | plt.show() 211 | ``` 212 | 213 |

214 | 215 |

216 | 217 | ### 06. svdd_example_confusion_matrix.py 218 | 219 | An example for drawing the confusion matrix and ROC curve. 220 | 221 |

222 | 223 | 224 |

225 | 226 | ### 07. svdd_example_cross_validation.py 227 | 228 | An example for cross validation. 229 | 230 | ```Python 231 | import sys 232 | sys.path.append("..") 233 | from src.BaseSVDD import BaseSVDD, BananaDataset 234 | from sklearn.model_selection import cross_val_score 235 | 236 | 237 | # Banana-shaped dataset generation and partitioning 238 | X, y = BananaDataset.generate(number=100, display='on') 239 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 240 | 241 | # 242 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 243 | 244 | 245 | # cross validation (k-fold) 246 | k = 5 247 | scores = cross_val_score(svdd, X_train, y_train, cv=k, scoring='accuracy') 248 | 249 | # 250 | print("Cross validation scores:") 251 | for scores_ in scores: 252 | print(scores_) 253 | 254 | print("Mean cross validation score: {:4f}".format(scores.mean())) 255 | ``` 256 | Results 257 | ``` 258 | Cross validation scores: 259 | 0.5714285714285714 260 | 0.75 261 | 0.9642857142857143 262 | 1.0 263 | 1.0 264 | Mean cross validation score: 0.857143 265 | ``` 266 | 267 | ### 08. svdd_example_grid_search.py 268 | 269 | An example for parameter selection using grid search. 270 | 271 | ```Python 272 | import sys 273 | sys.path.append("..") 274 | from sklearn.datasets import load_wine 275 | from src.BaseSVDD import BaseSVDD, BananaDataset 276 | from sklearn.model_selection import KFold, LeaveOneOut, ShuffleSplit 277 | from sklearn.model_selection import learning_curve, GridSearchCV 278 | 279 | # Banana-shaped dataset generation and partitioning 280 | X, y = BananaDataset.generate(number=100, display='off') 281 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 282 | 283 | param_grid = [ 284 | {"kernel": ["rbf"], "gamma": [0.1, 0.2, 0.5], "C": [0.1, 0.5, 1]}, 285 | {"kernel": ["linear"], "C": [0.1, 0.5, 1]}, 286 | {"kernel": ["poly"], "C": [0.1, 0.5, 1], "degree": [2, 3, 4, 5]}, 287 | ] 288 | 289 | svdd = GridSearchCV(BaseSVDD(display='off'), param_grid, cv=5, scoring="accuracy") 290 | svdd.fit(X_train, y_train) 291 | print("best parameters:") 292 | print(svdd.best_params_) 293 | print("\n") 294 | 295 | # 296 | best_model = svdd.best_estimator_ 297 | means = svdd.cv_results_["mean_test_score"] 298 | stds = svdd.cv_results_["std_test_score"] 299 | 300 | for mean, std, params in zip(means, stds, svdd.cv_results_["params"]): 301 | print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) 302 | print() 303 | 304 | ``` 305 | Results 306 | ```Python 307 | best parameters: 308 | {'C': 0.5, 'gamma': 0.1, 'kernel': 'rbf'} 309 | 310 | 311 | 0.921 (+/-0.159) for {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'} 312 | 0.893 (+/-0.192) for {'C': 0.1, 'gamma': 0.2, 'kernel': 'rbf'} 313 | 0.857 (+/-0.296) for {'C': 0.1, 'gamma': 0.5, 'kernel': 'rbf'} 314 | 0.950 (+/-0.086) for {'C': 0.5, 'gamma': 0.1, 'kernel': 'rbf'} 315 | 0.921 (+/-0.131) for {'C': 0.5, 'gamma': 0.2, 'kernel': 'rbf'} 316 | 0.864 (+/-0.273) for {'C': 0.5, 'gamma': 0.5, 'kernel': 'rbf'} 317 | 0.950 (+/-0.086) for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'} 318 | 0.921 (+/-0.131) for {'C': 1, 'gamma': 0.2, 'kernel': 'rbf'} 319 | 0.864 (+/-0.273) for {'C': 1, 'gamma': 0.5, 'kernel': 'rbf'} 320 | 0.807 (+/-0.246) for {'C': 0.1, 'kernel': 'linear'} 321 | 0.821 (+/-0.278) for {'C': 0.5, 'kernel': 'linear'} 322 | 0.793 (+/-0.273) for {'C': 1, 'kernel': 'linear'} 323 | 0.879 (+/-0.184) for {'C': 0.1, 'degree': 2, 'kernel': 'poly'} 324 | 0.836 (+/-0.305) for {'C': 0.1, 'degree': 3, 'kernel': 'poly'} 325 | 0.771 (+/-0.416) 
for {'C': 0.1, 'degree': 4, 'kernel': 'poly'} 326 | 0.757 (+/-0.448) for {'C': 0.1, 'degree': 5, 'kernel': 'poly'} 327 | 0.871 (+/-0.224) for {'C': 0.5, 'degree': 2, 'kernel': 'poly'} 328 | 0.814 (+/-0.311) for {'C': 0.5, 'degree': 3, 'kernel': 'poly'} 329 | 0.800 (+/-0.390) for {'C': 0.5, 'degree': 4, 'kernel': 'poly'} 330 | 0.764 (+/-0.432) for {'C': 0.5, 'degree': 5, 'kernel': 'poly'} 331 | 0.871 (+/-0.224) for {'C': 1, 'degree': 2, 'kernel': 'poly'} 332 | 0.850 (+/-0.294) for {'C': 1, 'degree': 3, 'kernel': 'poly'} 333 | 0.800 (+/-0.390) for {'C': 1, 'degree': 4, 'kernel': 'poly'} 334 | 0.771 (+/-0.416) for {'C': 1, 'degree': 5, 'kernel': 'poly'} 335 | ``` 336 | -------------------------------------------------------------------------------- /additional_methods/SVDD/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Use this section to tell people about which versions of your project are 6 | currently being supported with security updates. 7 | 8 | | Version | Supported | 9 | | ------- | ------------------ | 10 | | 5.1.x | :white_check_mark: | 11 | | 5.0.x | :x: | 12 | | 4.0.x | :white_check_mark: | 13 | | < 4.0 | :x: | 14 | 15 | ## Reporting a Vulnerability 16 | 17 | Use this section to tell people how to report a vulnerability. 18 | 19 | Tell them where to go, how often they can expect to get an update on a 20 | reported vulnerability, what to expect if the vulnerability is accepted or 21 | declined, etc. 22 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_KPCA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for SVDD model fitting using nonlinear principal component. 5 | 6 | The KPCA algorithm is used to reduce the dimension of the original data. 7 | 8 | """ 9 | 10 | import sys 11 | sys.path.append("..") 12 | import numpy as np 13 | from src.BaseSVDD import BaseSVDD 14 | from sklearn.decomposition import KernelPCA 15 | 16 | 17 | # create 100 points with 5 dimensions 18 | X = np.r_[np.random.randn(50, 5) + 1, np.random.randn(50, 5)] 19 | y = np.append(np.ones((50, 1), dtype=np.int64), 20 | -np.ones((50, 1), dtype=np.int64), 21 | axis=0) 22 | 23 | # number of the dimensionality 24 | kpca = KernelPCA(n_components=2, kernel="rbf", gamma=0.1, fit_inverse_transform=True) 25 | X_kpca = kpca.fit_transform(X) 26 | 27 | # fit the SVDD model 28 | svdd = BaseSVDD(C=0.9, gamma=10, kernel='rbf', display='on') 29 | 30 | # fit and predict 31 | svdd.fit(X_kpca, y) 32 | y_test_predict = svdd.predict(X_kpca, y) 33 | 34 | # plot the distance curve 35 | radius = svdd.radius 36 | distance = svdd.get_distance(X_kpca) 37 | svdd.plot_distance(radius, distance) 38 | 39 | # plot the boundary 40 | svdd.plot_boundary(X_kpca, y) 41 | 42 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_PSO.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for parameter optimization using PSO. 5 | 6 | "scikit-opt" is required in this examples. 
7 | 8 | https://github.com/guofei9987/scikit-opt 9 | 10 | """ 11 | 12 | import sys 13 | sys.path.append("..") 14 | from src.BaseSVDD import BaseSVDD, BananaDataset 15 | from sko.PSO import PSO 16 | import matplotlib.pyplot as plt 17 | 18 | 19 | # Banana-shaped dataset generation and partitioning 20 | X, y = BananaDataset.generate(number=100, display='off') 21 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 22 | 23 | # objective function 24 | def objective_func(x): 25 | x1, x2 = x 26 | svdd = BaseSVDD(C=x1, gamma=x2, kernel='rbf', display='off') 27 | y = 1-svdd.fit(X_train, y_train).accuracy 28 | return y 29 | 30 | # Do PSO 31 | pso = PSO(func=objective_func, n_dim=2, pop=10, max_iter=20, 32 | lb=[0.01, 0.01], ub=[1, 3], w=0.8, c1=0.5, c2=0.5) 33 | pso.run() 34 | 35 | print('best_x is', pso.gbest_x) 36 | print('best_y is', pso.gbest_y) 37 | 38 | # plot the result 39 | fig = plt.figure(figsize=(6, 4)) 40 | ax = fig.add_subplot(1, 1, 1) 41 | ax.plot(pso.gbest_y_hist) 42 | ax.yaxis.grid() 43 | plt.show() 44 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_confusion_matrix.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | An example for drawing the confusion matrix and ROC curve 4 | 5 | """ 6 | import sys 7 | sys.path.append("..") 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from src.BaseSVDD import BaseSVDD 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import ConfusionMatrixDisplay 13 | from sklearn.metrics import roc_curve, auc 14 | from sklearn.model_selection import train_test_split 15 | 16 | # generate data 17 | n = 100 18 | dim = 5 19 | X = np.r_[np.random.randn(n, dim) + 1, np.random.randn(n, dim)] 20 | y = np.append(np.ones((n, 1), dtype=np.int64), 21 | -np.ones((n, 1), dtype=np.int64), 22 | axis=0) 23 | 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) 25 | 26 | # SVDD model 27 | svdd = BaseSVDD(C=0.9, gamma=0.1, kernel='rbf', display='on') 28 | svdd.fit(X_train, y_train) 29 | y_test_predict = svdd.predict(X_test, y_test) 30 | 31 | # plot the distance curve 32 | radius = svdd.radius 33 | distance = svdd.get_distance(X_test) 34 | svdd.plot_distance(radius, distance) 35 | 36 | # confusion matrix and ROC curve 37 | cm = confusion_matrix(y_test, y_test_predict) 38 | cm_display = ConfusionMatrixDisplay(cm).plot() 39 | y_score = svdd.decision_function(X_test) 40 | 41 | fpr, tpr, _ = roc_curve(y_test, y_score) 42 | roc_auc = auc(fpr, tpr) 43 | 44 | plt.figure() 45 | plt.plot(fpr, tpr, color="darkorange", lw=3, label="ROC curve (area = %0.2f)" % roc_auc) 46 | plt.plot([0, 1], [0, 1], color="navy", lw=3, linestyle="--") 47 | plt.xlim([0.0, 1.0]) 48 | plt.ylim([0.0, 1.05]) 49 | plt.xlabel("False Positive Rate") 50 | plt.ylabel("True Positive Rate") 51 | plt.title("Receiver operating characteristic") 52 | plt.legend(loc="lower right") 53 | plt.grid() 54 | plt.show() 55 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_cross_validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | An example for cross validation 4 | 5 | """ 6 | import sys 7 | sys.path.append("..") 8 | from src.BaseSVDD import BaseSVDD, BananaDataset 9 | from sklearn.model_selection import cross_val_score 
10 | 11 | 12 | # Banana-shaped dataset generation and partitioning 13 | X, y = BananaDataset.generate(number=100, display='on') 14 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 15 | 16 | # 17 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 18 | 19 | 20 | # cross validation (k-fold) 21 | k = 5 22 | scores = cross_val_score(svdd, X_train, y_train, cv=k, scoring='accuracy') 23 | 24 | # 25 | print("Cross validation scores:") 26 | for scores_ in scores: 27 | print(scores_) 28 | 29 | print("Mean cross validation score: {:4f}".format(scores.mean())) 30 | 31 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_grid_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for parameter selection using grid search 5 | 6 | """ 7 | import sys 8 | sys.path.append("..") 9 | from sklearn.datasets import load_wine 10 | from src.BaseSVDD import BaseSVDD, BananaDataset 11 | from sklearn.model_selection import KFold, LeaveOneOut, ShuffleSplit 12 | from sklearn.model_selection import learning_curve, GridSearchCV 13 | 14 | # Banana-shaped dataset generation and partitioning 15 | X, y = BananaDataset.generate(number=100, display='off') 16 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 17 | 18 | param_grid = [ 19 | {"kernel": ["rbf"], "gamma": [0.1, 0.2, 0.5], "C": [0.1, 0.5, 1]}, 20 | {"kernel": ["linear"], "C": [0.1, 0.5, 1]}, 21 | {"kernel": ["poly"], "C": [0.1, 0.5, 1], "degree": [2, 3, 4, 5]}, 22 | ] 23 | 24 | svdd = GridSearchCV(BaseSVDD(display='off'), param_grid, cv=5, scoring="accuracy") 25 | svdd.fit(X_train, y_train) 26 | print("best parameters:") 27 | print(svdd.best_params_) 28 | print("\n") 29 | 30 | # 31 | best_model = svdd.best_estimator_ 32 | means = svdd.cv_results_["mean_test_score"] 33 | stds = svdd.cv_results_["std_test_score"] 34 | 35 | for mean, std, params in zip(means, stds, svdd.cv_results_["params"]): 36 | print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) 37 | print() 38 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_hybrid_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for SVDD model fitting with negataive samples 5 | 6 | """ 7 | import sys 8 | sys.path.append("..") 9 | from sklearn.datasets import load_wine 10 | from src.BaseSVDD import BaseSVDD, BananaDataset 11 | 12 | # Banana-shaped dataset generation and partitioning 13 | X, y = BananaDataset.generate(number=100, display='on') 14 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 15 | 16 | # 17 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 18 | 19 | # 20 | svdd.fit(X_train, y_train) 21 | 22 | # 23 | svdd.plot_boundary(X_train, y_train) 24 | 25 | # 26 | y_test_predict = svdd.predict(X_test, y_test) 27 | 28 | # 29 | radius = svdd.radius 30 | distance = svdd.get_distance(X_test) 31 | svdd.plot_distance(radius, distance) -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_kernel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for SVDD model fitting using different kernels 5 | 6 | """ 7 | import sys 8 | 
sys.path.append("..") 9 | from src.BaseSVDD import BaseSVDD, BananaDataset 10 | 11 | # Banana-shaped dataset generation and partitioning 12 | X, y = BananaDataset.generate(number=100, display='on') 13 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 14 | 15 | # kernel list 16 | kernelList = {"1": BaseSVDD(C=0.9, kernel='rbf', gamma=0.3, display='on'), 17 | "2": BaseSVDD(C=0.9, kernel='poly',degree=2, display='on'), 18 | "3": BaseSVDD(C=0.9, kernel='linear', display='on') 19 | } 20 | 21 | # 22 | for i in range(len(kernelList)): 23 | svdd = kernelList.get(str(i+1)) 24 | svdd.fit(X_train, y_train) 25 | svdd.plot_boundary(X_train, y_train) 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_unlabeled_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for SVDD model fitting with negataive samples 5 | 6 | """ 7 | import sys 8 | sys.path.append("..") 9 | import numpy as np 10 | from src.BaseSVDD import BaseSVDD 11 | 12 | # create 100 points with 2 dimensions 13 | n = 100 14 | dim = 2 15 | X = np.r_[np.random.randn(n, dim)] 16 | 17 | # svdd object using rbf kernel 18 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 19 | 20 | # fit the SVDD model 21 | svdd.fit(X) 22 | 23 | # predict the label 24 | y_predict = svdd.predict(X) 25 | 26 | # plot the boundary 27 | svdd.plot_boundary(X) 28 | 29 | # plot the distance 30 | radius = svdd.radius 31 | distance = svdd.get_distance(X) 32 | svdd.plot_distance(radius, distance) -------------------------------------------------------------------------------- /additional_methods/SVDD/requirements.txt: -------------------------------------------------------------------------------- 1 | cvxopt==1.2.7 2 | matplotlib==3.4.2 3 | numpy==1.22.0 4 | scikit_learn==1.0.1 5 | -------------------------------------------------------------------------------- /additional_methods/abod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Angle-based Outlier Detector (ABOD) 3 | """ 4 | # Author: Yue Zhao 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import warnings 11 | from itertools import combinations 12 | 13 | import numpy as np 14 | from numba import njit 15 | from sklearn.neighbors import KDTree 16 | from sklearn.neighbors import NearestNeighbors 17 | from sklearn.utils import check_array 18 | from sklearn.utils.validation import check_is_fitted 19 | 20 | from pyod.models.base import BaseDetector 21 | from pyod.utils.utility import check_parameter 22 | 23 | 24 | @njit 25 | def _wcos(curr_pt, a, b): # pragma: no cover 26 | """Internal function to calculate weighted cosine using optimized 27 | numba code. 28 | 29 | Parameters 30 | ---------- 31 | curr_pt : numpy array of shape (n_samples, n_features) 32 | Current sample to be calculated. 33 | 34 | a : numpy array of shape (n_samples, n_features) 35 | Training sample a. 36 | 37 | b : numpy array of shape (n_samples, n_features) 38 | Training sample b. 39 | 40 | Returns 41 | ------- 42 | wcos : float in range [-1, 1] 43 | Cosine similarity between a-curr_pt and b-curr_pt. 
44 | 45 | """ 46 | 47 | a_curr = a - curr_pt 48 | b_curr = b - curr_pt 49 | 50 | # wcos = (/((|a_curr|*|b_curr|)^2) 51 | wcos = np.dot(a_curr, b_curr) / ( 52 | np.linalg.norm(a_curr, 2) ** 2) / ( 53 | np.linalg.norm(b_curr, 2) ** 2) 54 | return wcos 55 | 56 | 57 | def _calculate_wocs(curr_pt, X, X_ind): 58 | """Calculated the variance of weighted cosine of a point. 59 | wcos = (/((|a_curr|*|b_curr|)^2) 60 | 61 | Parameters 62 | ---------- 63 | curr_pt : numpy array, shape (1, n_features) 64 | The sample to be calculated. 65 | 66 | X : numpy array of shape (n_samples, n_features) 67 | The training dataset. 68 | 69 | X_ind : list 70 | The valid index of the training data. 71 | 72 | Returns 73 | ------- 74 | cos_angle_var : float 75 | The variance of cosine angle 76 | 77 | """ 78 | wcos_list = [] 79 | curr_pair_inds = list(combinations(X_ind, 2)) 80 | for j, (a_ind, b_ind) in enumerate(curr_pair_inds): 81 | a = X[a_ind, :] 82 | b = X[b_ind, :] 83 | 84 | # skip if no angle can be formed 85 | if np.array_equal(a, curr_pt) or np.array_equal(b, curr_pt): 86 | wcos_list.append(0) 87 | else: 88 | # add the weighted cosine to the list 89 | wcos_list.append(_wcos(curr_pt, a, b)) 90 | return np.var(wcos_list) 91 | 92 | 93 | # noinspection PyPep8Naming 94 | class ABOD(BaseDetector): 95 | """ABOD class for Angle-base Outlier Detection. 96 | For an observation, the variance of its weighted cosine scores to all 97 | neighbors could be viewed as the outlying score. 98 | See :cite:`kriegel2008angle` for details. 99 | 100 | Two version of ABOD are supported: 101 | 102 | - Fast ABOD: use k nearest neighbors to approximate. 103 | - Original ABOD: consider all training points with high time complexity at 104 | O(n^3). 105 | 106 | Parameters 107 | ---------- 108 | contamination : float in (0., 0.5), optional (default=0.1) 109 | The amount of contamination of the data set, i.e. 110 | the proportion of outliers in the data set. Used when fitting to 111 | define the threshold on the decision function. 112 | 113 | n_neighbors : int, optional (default=10) 114 | Number of neighbors to use by default for k neighbors queries. 115 | 116 | method: str, optional (default='fast') 117 | Valid values for metric are: 118 | 119 | - 'fast': fast ABOD. Only consider n_neighbors of training points 120 | - 'default': original ABOD with all training points, which could be 121 | slow 122 | 123 | Attributes 124 | ---------- 125 | decision_scores_ : numpy array of shape (n_samples,) 126 | The outlier scores of the training data. 127 | The higher, the more abnormal. Outliers tend to have higher 128 | scores. This value is available once the detector is 129 | fitted. 130 | 131 | threshold_ : float 132 | The threshold is based on ``contamination``. It is the 133 | ``n_samples * contamination`` most abnormal samples in 134 | ``decision_scores_``. The threshold is calculated for generating 135 | binary outlier labels. 136 | 137 | labels_ : int, either 0 or 1 138 | The binary labels of the training data. 0 stands for inliers 139 | and 1 for outliers/anomalies. It is generated by applying 140 | ``threshold_`` on ``decision_scores_``. 141 | """ 142 | 143 | def __init__(self, contamination=0.1, n_neighbors=5, method='fast'): 144 | super(ABOD, self).__init__(contamination=contamination) 145 | self.method = method 146 | self.n_neighbors = n_neighbors 147 | 148 | def fit(self, X, y=None): 149 | """Fit detector. y is ignored in unsupervised methods. 
150 | 151 | Parameters 152 | ---------- 153 | X : numpy array of shape (n_samples, n_features) 154 | The input samples. 155 | 156 | y : Ignored 157 | Not used, present for API consistency by convention. 158 | 159 | Returns 160 | ------- 161 | self : object 162 | Fitted estimator. 163 | """ 164 | # validate inputs X and y (optional) 165 | X = check_array(X) 166 | self._set_n_classes(y) 167 | 168 | self.X_train_ = X 169 | self.n_train_ = X.shape[0] 170 | self.decision_scores_ = np.zeros([self.n_train_, 1]) 171 | 172 | if self.method == 'fast': 173 | self._fit_fast() 174 | elif self.method == 'default': 175 | self._fit_default() 176 | else: 177 | raise ValueError(self.method, "is not a valid method") 178 | 179 | # flip the scores 180 | self.decision_scores_ = self.decision_scores_.ravel() * -1 181 | self._process_decision_scores() 182 | return self 183 | 184 | def _fit_default(self): 185 | """Default ABOD method. Use all training points with high complexity 186 | O(n^3). For internal use only. 187 | """ 188 | for i in range(self.n_train_): 189 | curr_pt = self.X_train_[i, :] 190 | 191 | # get the index pairs of the neighbors, remove itself from index 192 | X_ind = list(range(0, self.n_train_)) 193 | X_ind.remove(i) 194 | 195 | self.decision_scores_[i, 0] = _calculate_wocs(curr_pt, 196 | self.X_train_, 197 | X_ind) 198 | return self 199 | 200 | def _fit_fast(self): 201 | """Fast ABOD method. Only use n_neighbors for angle calculation. 202 | Internal use only 203 | """ 204 | 205 | # make sure the n_neighbors is in the range 206 | if self.n_neighbors >= self.n_train_: 207 | self.n_neighbors = self.n_train_ - 1 208 | warnings.warn("n_neighbors is set to the number of " 209 | "training points minus 1: {0}".format(self.n_train_)) 210 | 211 | check_parameter(self.n_neighbors, 1, self.n_train_, 212 | include_left=True, include_right=True) 213 | 214 | self.tree_ = KDTree(self.X_train_) 215 | 216 | neigh = NearestNeighbors(n_neighbors=self.n_neighbors) 217 | neigh.fit(self.X_train_) 218 | ind_arr = neigh.kneighbors(n_neighbors=self.n_neighbors, 219 | return_distance=False) 220 | 221 | for i in range(self.n_train_): 222 | curr_pt = self.X_train_[i, :] 223 | X_ind = ind_arr[i, :] 224 | self.decision_scores_[i, 0] = _calculate_wocs(curr_pt, 225 | self.X_train_, 226 | X_ind) 227 | return self 228 | 229 | # noinspection PyPep8Naming 230 | def decision_function(self, X): 231 | """Predict raw anomaly score of X using the fitted detector. 232 | 233 | The anomaly score of an input sample is computed based on different 234 | detector algorithms. For consistency, outliers are assigned with 235 | larger anomaly scores. 236 | 237 | Parameters 238 | ---------- 239 | X : numpy array of shape (n_samples, n_features) 240 | The training input samples. Sparse matrices are accepted only 241 | if they are supported by the base estimator. 242 | 243 | Returns 244 | ------- 245 | anomaly_scores : numpy array of shape (n_samples,) 246 | The anomaly score of the input samples. 247 | """ 248 | 249 | check_is_fitted(self, ['X_train_', 'n_train_', 'decision_scores_', 250 | 'threshold_', 'labels_']) 251 | X = check_array(X) 252 | 253 | if self.method == 'fast': # fast ABOD 254 | # outliers have higher outlier scores 255 | return self._decision_function_fast(X) * -1 256 | else: # default ABOD 257 | return self._decision_function_default(X) * -1 258 | 259 | def _decision_function_default(self, X): 260 | """Internal method for predicting outlier scores using default ABOD. 
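        For every query sample this variant enumerates all pairs of training
        points (``itertools.combinations`` over the full training index set), so it
        performs on the order of ``n_train_ ** 2`` weighted-cosine evaluations per
        sample, which is why the 'default' method is so much slower than 'fast'.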
261 | 262 | Parameters 263 | ---------- 264 | X : numpy array of shape (n_samples, n_features) 265 | The training input samples. 266 | 267 | Returns 268 | ------- 269 | pred_score : array, shape (n_samples,) 270 | The anomaly score of the input samples. 271 | 272 | """ 273 | # initialize the output score 274 | pred_score = np.zeros([X.shape[0], 1]) 275 | 276 | for i in range(X.shape[0]): 277 | curr_pt = X[i, :] 278 | # get the index pairs of the neighbors 279 | X_ind = list(range(0, self.n_train_)) 280 | pred_score[i, :] = _calculate_wocs(curr_pt, self.X_train_, X_ind) 281 | 282 | return pred_score.ravel() 283 | 284 | def _decision_function_fast(self, X): 285 | """Internal method for predicting outlier scores using Fast ABOD. 286 | 287 | Parameters 288 | ---------- 289 | X : numpy array of shape (n_samples, n_features) 290 | The training input samples. 291 | 292 | Returns 293 | ------- 294 | pred_score : array, shape (n_samples,) 295 | The anomaly score of the input samples. 296 | 297 | """ 298 | 299 | check_is_fitted(self, ['tree_']) 300 | # initialize the output score 301 | pred_score = np.zeros([X.shape[0], 1]) 302 | 303 | # get the indexes of the X's k nearest training points 304 | _, ind_arr = self.tree_.query(X, k=self.n_neighbors) 305 | 306 | for i in range(X.shape[0]): 307 | curr_pt = X[i, :] 308 | X_ind = ind_arr[i, :] 309 | pred_score[i, :] = _calculate_wocs(curr_pt, self.X_train_, X_ind) 310 | 311 | return pred_score.ravel() -------------------------------------------------------------------------------- /additional_methods/ensemble.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils import check_array 2 | 3 | from pyod.models.base import BaseDetector 4 | from pyod.models.combination import average 5 | from pyod.models.lof import LOF 6 | 7 | import numpy as np 8 | 9 | class Ensemble(BaseDetector): 10 | 11 | def __init__(self, estimators=[LOF()], combination_function=average, contamination=0.1, **kwargs): 12 | super(Ensemble, self).__init__(contamination=contamination) 13 | self.estimators = estimators 14 | self.n_estimators_ = len(estimators) 15 | self.combination_function = combination_function 16 | self.kwargs = kwargs 17 | 18 | def fit(self, X, y=None): 19 | X = check_array(X) 20 | n_samples = X.shape[0] 21 | 22 | all_scores = np.zeros((n_samples,self.n_estimators_)) 23 | 24 | for i, estimator in enumerate(self.estimators): 25 | estimator.fit(X) 26 | all_scores[:,i] = estimator.decision_scores_ 27 | 28 | self.decision_scores_ = self.combination_function(all_scores, **self.kwargs) 29 | 30 | return self 31 | 32 | def decision_function(self, X): 33 | n_samples = X.shape[0] 34 | 35 | all_scores = np.zeros((n_samples,self.n_estimators_)) 36 | 37 | for i, estimator in enumerate(self.estimators): 38 | all_scores[:,i] = estimator.decision_function(X) 39 | 40 | return self.combination_function(all_scores, **self.kwargs) -------------------------------------------------------------------------------- /additional_methods/gen2out/gen2out.py: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Author: Jeremy (Meng-Chieh) Lee # 3 | # Email : mengchil@cs.cmu.edu # 4 | #################################### 5 | 6 | 7 | import numpy as np 8 | from scipy.spatial.distance import cityblock 9 | 10 | from sklearn.cluster import DBSCAN 11 | from sklearn.linear_model import LinearRegression 12 | 13 | import time 14 | from joblib import Parallel, delayed 15 | from tqdm 
import tqdm 16 | 17 | from .iforest import IsolationForest 18 | 19 | from pyod.utils.utility import invert_order 20 | 21 | 22 | class gen2Out: 23 | def __init__(self, lower_bound=9, upper_bound=12, max_depth=7, 24 | rotate=True, contamination='auto', random_state=None): 25 | self.lower_bound = lower_bound 26 | self.upper_bound = upper_bound 27 | self.max_depth = max_depth 28 | self.rotate = rotate 29 | self.contamination = contamination if contamination == 'auto' else float(contamination) 30 | self.random_state = random_state 31 | 32 | def func(self, Xs, i): 33 | ### Fit forest with full-grown trees 34 | clf = IsolationForest(random_state=self.random_state, 35 | max_samples=len(Xs), 36 | contamination=self.contamination, 37 | rotate=self.rotate).fit(Xs, max_depth=100000000) 38 | depths = np.mean(clf._compute_actual_depth(Xs), axis=0) 39 | bins = np.arange(int(depths.min()), int(depths.max() + 2)) 40 | y, x = np.histogram(depths, bins=bins) 41 | return i, x[np.argmax(y)] 42 | 43 | def fit(self, X, y=None): 44 | if self.random_state: 45 | np.random.seed(self.random_state) 46 | self.n_sample = X.shape[0] 47 | 48 | params_arr = Parallel(n_jobs=self.upper_bound-self.lower_bound)( 49 | [delayed(self.func)(X[np.random.choice(self.n_sample, 2 ** i, replace=True)], i) 50 | for i in np.arange(self.lower_bound, self.upper_bound)]) 51 | x_arr, y_arr = np.array(params_arr).T 52 | 53 | self.reg = LinearRegression(fit_intercept=False).fit(x_arr.reshape(-1, 1), y_arr) 54 | self.clf = IsolationForest(random_state=self.random_state, 55 | max_samples=len(X), 56 | contamination=self.contamination, 57 | rotate=self.rotate).fit(X, max_depth=self.max_depth) 58 | 59 | return self 60 | 61 | def average_path_length(self, n): 62 | n = np.array(n) 63 | apl = self.reg.predict(np.log2([n]).T) 64 | apl[apl < 0] = 0 65 | return apl 66 | 67 | def decision_function(self, X): 68 | depths, leaves = self.clf._compute_actual_depth_leaf(X) 69 | 70 | new_depths = np.zeros(X.shape[0]) 71 | for d, l in zip(depths, leaves): 72 | new_depths += d + self.average_path_length(l) 73 | 74 | scores = 2 ** (-new_depths 75 | / (len(self.clf.estimators_) 76 | * self.average_path_length([self.n_sample]))) 77 | 78 | return invert_order(scores) 79 | 80 | def point_anomaly_scores(self, X): 81 | self = self.fit(X) 82 | return self.decision_function(X) 83 | 84 | def group_anomaly_scores(self, X, trials=10): 85 | ### Fit a sequence of gen2Out0 86 | self.min_rate = int(np.log2(len(X)) - 8) + 1 87 | self.scores = np.zeros((self.min_rate, trials, len(X))) 88 | 89 | print('Fitting gen2Out0...') 90 | for i in tqdm(range(self.min_rate)): 91 | for j in range(trials): 92 | X_sampled = X[np.random.choice(len(X), int(len(X) * (1 / (2 ** i))))] 93 | clf = self.fit(X_sampled) 94 | self.scores[i][j] = clf.decision_function(X) 95 | 96 | ### Create X-ray plot 97 | smax = np.max(np.mean(self.scores, axis=1), axis=0) 98 | self.threshold = np.mean(smax) + 3 * np.std(smax) 99 | 100 | sr_list = [] 101 | xrays = np.max(np.mean(self.scores, axis=1), axis=0) 102 | for idx, xray in enumerate(xrays): 103 | if xray >= self.threshold: 104 | sr_list.append(idx) 105 | sr_list = np.array(sr_list) 106 | 107 | ### Outlier grouping 108 | groups = DBSCAN().fit_predict(X[sr_list]) 109 | 110 | self.labels = -np.ones(len(X)).astype(int) 111 | for idx, g in zip(sr_list, groups): 112 | if g != -1: 113 | self.labels[idx] = g + 1 114 | 115 | ### Compute iso-curves 116 | xline = 1 / (2 ** np.arange(0, self.min_rate)) 117 | self.s_arr = [[] for l in np.unique(self.labels) if l != -1] 118 | 
xrays_max = np.argmax(np.mean(self.scores, axis=1), axis=0) 119 | for idx in sr_list: 120 | if self.labels[idx] != -1: 121 | dis = cityblock([np.log2(xrays_max[idx]) / 10 + 1, xrays[idx]], [1, 1]) 122 | self.s_arr[self.labels[idx]-1].append((2 - dis) / 2) 123 | 124 | ga_scores = np.array([np.median(s) for s in self.s_arr]) 125 | ga_indices = [np.where(self.labels == l)[0] for l in np.unique(self.labels) if l != -1] 126 | 127 | return ga_scores, ga_indices 128 | -------------------------------------------------------------------------------- /additional_methods/gen2out/main.py: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Author: Jeremy (Meng-Chieh) Lee # 3 | # Email : mengchil@cs.cmu.edu # 4 | #################################### 5 | 6 | 7 | import numpy as np 8 | import time 9 | import argparse 10 | 11 | from .gen2out import gen2Out 12 | from .utils import sythetic_group_anomaly, plot_results 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | parser = argparse.ArgumentParser(description='Parameters for gen2Out') 18 | parser.add_argument('--lower_bound', default=9, type=int, help='Lower bound of sampling (2^i)') 19 | parser.add_argument('--upper_bound', default=12, type=int, help='Upper bound of sampling (2^i)') 20 | parser.add_argument('--max_depth', default=7, type=int, help='Maximum depth of each tree') 21 | parser.add_argument('--rotate', default=True, type=bool, help='Whether to use the rotated IF or not') 22 | parser.add_argument('--contamination', default='auto', type=str, help='Contamination rate of the dataset') 23 | parser.add_argument('--random_state', default=0, type=int, help='Control the randomness') 24 | args = parser.parse_args() 25 | 26 | model = gen2Out(lower_bound=args.lower_bound, 27 | upper_bound=args.upper_bound, 28 | max_depth=args.max_depth, 29 | rotate=args.rotate, 30 | contamination=args.contamination, 31 | random_state=args.random_state) 32 | 33 | X = sythetic_group_anomaly() 34 | 35 | print('Start point anomaly detection:') 36 | t1 = time.time() 37 | pscores = model.point_anomaly_scores(X) 38 | t2 = time.time() 39 | print('Finish in %.1f seconds!\n' % (t2 - t1)) 40 | 41 | print('Start group anomaly detection:') 42 | t1 = time.time() 43 | gscores = model.group_anomaly_scores(X) 44 | t2 = time.time() 45 | print('Finish in %.1f seconds!\n' % (t2 - t1)) 46 | 47 | print('Generating plots...') 48 | plot_results(X, model) 49 | print('Finish!') 50 | 51 | 52 | -------------------------------------------------------------------------------- /additional_methods/gen2out/utils.py: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Author: Jeremy (Meng-Chieh) Lee # 3 | # Email : mengchil@cs.cmu.edu # 4 | #################################### 5 | 6 | 7 | import numpy as np 8 | from scipy.spatial.distance import cityblock, euclidean 9 | 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def uni_disk(n, low=0, high=1): 14 | r = np.random.uniform(low=low, high=high, size=n) # radius 15 | theta = np.random.uniform(low=0, high=2*np.pi, size=n) # angle 16 | x = np.sqrt(r) * np.cos(theta) 17 | y = np.sqrt(r) * np.sin(theta) 18 | return x, y 19 | 20 | def sythetic_group_anomaly(seed=0): 21 | np.random.seed(seed) 22 | 23 | x1, y1 = uni_disk(100000) 24 | x1 *= 5 25 | y1 *= 5 26 | 27 | x2, y2 = uni_disk(1000) 28 | x2 = x2 * 1.5 + 10 29 | y2 = y2 * 1.5 + 5 30 | 31 | x3, y3 = uni_disk(2000) 32 | x3 = x3 * 6 + 3 33 | y3 = y3 - 10 34 
| 35 | x4 = [11, -2, 13, 14] 36 | y4 = [0, 9, -10, 10] 37 | 38 | x = np.concatenate([x1, x2, x3, x4]) 39 | y = np.concatenate([y1, y2, y3, y4]) 40 | X_norm = np.array([x, y]).T 41 | 42 | return X_norm 43 | 44 | def plot_xray(X, model, idx_arr, line=False): 45 | plt.scatter(1, 1, s=100, c='k', marker='*') 46 | xline = 1 / (2 ** np.arange(0, model.min_rate)) 47 | 48 | for idx in idx_arr: 49 | s = model.scores.T[idx].T 50 | std, mean = np.std(s, axis=1), np.mean(s, axis=1) 51 | if line: 52 | plt.plot(xline, mean, c='k', alpha=0.7) 53 | plt.fill_between(xline, mean-std, mean+std, color='grey', alpha=0.2) 54 | 55 | max_idx = np.argmax(mean) 56 | plt.scatter(xline[max_idx], mean[max_idx], s=20, c='k') 57 | 58 | plt.plot([2 ** (-(model.min_rate - 0.7)), 1.2], [model.threshold, model.threshold], '--', label='Mean + 3 * Std', alpha=0.8, c='r') 59 | plt.ylim(-0.05, 1.05) 60 | plt.xlim(2 ** (-(model.min_rate - 0.7)), 1.2) 61 | plt.xscale('log', base=2) 62 | plt.xlabel('Qualification Rate', fontsize=20) 63 | plt.ylabel('Anomaly Score', fontsize=20) 64 | plt.legend(fontsize=12) 65 | 66 | def plot_results(X, model): 67 | ### Randomly sample when plotting 68 | idx_arr = np.concatenate([np.arange(300), 69 | np.arange(100000, 100300), 70 | np.arange(101000, 101300), 71 | np.arange(103000, 103004)]) 72 | 73 | ### Plot heatmap 74 | plt.figure(figsize=(4.8, 4)) 75 | plt.hexbin(X[:, 0], X[:, 1], cmap='cool', gridsize=30, bins='log', mincnt=1) 76 | plt.colorbar() 77 | plt.tight_layout() 78 | plt.savefig('results/step0_heatmap.png') 79 | 80 | ### Step 1: X-ray plot 81 | plt.figure(figsize=(4, 4)) 82 | plot_xray(X, model, idx_arr, line=True) 83 | plt.tight_layout() 84 | plt.savefig('results/step1_xray_plot.png') 85 | 86 | ### Step 2: Apex extraction 87 | plt.figure(figsize=(4, 4)) 88 | plot_xray(X, model, idx_arr, line=False) 89 | plt.tight_layout() 90 | plt.savefig('results/step2_apex_extraction.png') 91 | 92 | ### Step 3: Outlier grouping 93 | c_arr = ['', 'b', 'r', 'y', 'm', 'g', 'c'] 94 | plt.figure(figsize=(4, 4)) 95 | plt.scatter(X[:, 0], X[:, 1], c='lightgrey', alpha=0.5) 96 | 97 | for l in np.unique(model.labels): 98 | if l != -1: 99 | idx = np.where(model.labels == l)[0] 100 | plt.scatter(X[idx, 0], X[idx, 1], c=c_arr[l], label='GA ' + str(l)) 101 | 102 | plt.legend(fontsize=12) 103 | plt.tight_layout() 104 | plt.savefig('results/step3_outlier_grouping.png') 105 | 106 | ### Step 4: Anomaly iso-curves 107 | man_x, man_y, man_dis = [], [], [] 108 | for i in np.arange(0, model.min_rate, 0.01): 109 | for j in np.arange(0, 1.01, 0.01): 110 | ix = 1 / (2 ** i) 111 | man_x.append(ix) 112 | man_y.append(j) 113 | man_dis.append(cityblock([np.log2(ix) / 10, j], [1, 1])) 114 | man_x, man_y, man_dis = np.array(man_x), np.array(man_y), np.array(man_dis) 115 | 116 | plt.figure(figsize=(4.8, 4)) 117 | plt.scatter(man_x, man_y, c=man_dis, cmap='gist_rainbow', alpha=0.1) 118 | plt.colorbar() 119 | plt.scatter(1, 1, s=100, c='k', marker='*') 120 | 121 | xline = 1 / (2 ** np.arange(0, model.min_rate)) 122 | for idx in idx_arr: 123 | if model.labels[idx] != -1: 124 | c = c_arr[model.labels[idx]] 125 | s = model.scores.T[idx].T 126 | std, mean = np.std(s, axis=1), np.mean(s, axis=1) 127 | plt.plot(xline, mean, c=c, alpha=0.05) 128 | max_idx = np.argmax(mean) 129 | plt.scatter(xline[max_idx], mean[max_idx], s=20, c=c) 130 | for l in np.unique(model.labels): 131 | if l != -1: 132 | plt.plot([], [], '-o', c=c_arr[l], label='GA ' + str(l)) 133 | 134 | plt.xscale('log', base=2) 135 | plt.ylim(-0.05, 1.05) 136 | plt.xlim(2 ** 
(-(model.min_rate - 0.7)), 1.2) 137 | plt.xlabel('Qualification Rate', fontsize=20) 138 | plt.ylabel('Anomaly Score', fontsize=20) 139 | plt.legend(fontsize=12, loc=4) 140 | plt.tight_layout() 141 | plt.savefig('results/step4_anomaly_isocurves.png') 142 | 143 | ### Step 5: Scoring 144 | plt.figure(figsize=(4.4, 4)) 145 | 146 | for idx, s in enumerate(model.s_arr): 147 | ymin, ymax = np.min(s), np.max(s) 148 | Q1, Q3 = np.percentile(s, 25), np.percentile(s, 75) 149 | m = np.median(s) 150 | plt.scatter([idx, idx], [ymin, ymax], facecolors='none', edgecolors='lightgrey') 151 | plt.plot([idx, idx], [Q1, Q3], c='grey', linewidth=0.9) 152 | plt.plot([idx-0.12, idx+0.12], [Q1, Q1], c='grey', linewidth=0.9) 153 | plt.plot([idx-0.12, idx+0.12], [Q3, Q3], c='grey', linewidth=0.9) 154 | plt.plot([idx-0.24, idx+0.24], [m, m], c='r', linewidth=3) 155 | 156 | plt.xticks(np.arange(len(model.s_arr)), ['GA '+str(i+1) for i in range(len(model.s_arr))], fontsize=12) 157 | plt.xlabel('Generalized Anomaly ID', fontsize=20) 158 | plt.ylabel('Distribution of\nAnomaly Score', fontsize=20) 159 | plt.tight_layout() 160 | plt.savefig('results/step5_scoring.png') -------------------------------------------------------------------------------- /additional_methods/lmdd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Linear Model Deviation-base outlier detection (LMDD). 3 | """ 4 | # Author: Yahya Almardeny 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from numba import njit, prange 12 | from scipy import stats 13 | from sklearn.utils import check_array, check_random_state 14 | 15 | from pyod.utils import check_parameter 16 | from pyod.models.base import BaseDetector 17 | 18 | 19 | @njit 20 | def _aad(X): 21 | """Internal Function to Calculate Average Absolute Deviation 22 | (a.k.a Mean Absolute Deviation) 23 | """ 24 | return np.mean(np.absolute(X - np.mean(X))) 25 | 26 | @njit(parallel=True) 27 | def _dis(X, dis_measure_=_aad): 28 | """ 29 | Internal function to calculate for 30 | dissimilarity in a sequence of sets. 31 | """ 32 | n = X.shape[0] 33 | res_ = np.zeros((n,)) 34 | _var = np.zeros((n,)) 35 | var_max, j = -np.inf, 0 36 | # this can be vectorized but just for comforting memory 37 | for i in prange(1, n): 38 | _var[i] = dis_measure_(X[:i + 1]) - dis_measure_(X[:i]) 39 | 40 | j = np.argmax(_var) 41 | var_max = _var[j] 42 | 43 | if var_max > res_[j]: 44 | res_[j] = var_max 45 | 46 | for k in prange(j + 1, n): 47 | dk_diff = dis_measure_(np.vstack((X[:j], np.expand_dims(X[k], axis=0)))) - dis_measure_(np.vstack((X[:j + 1], np.expand_dims(X[k], axis=0)))) 48 | 49 | if dk_diff >= 0: 50 | res_[k] = dk_diff + var_max 51 | 52 | return res_ 53 | 54 | def _check_params(n_iter, dis_measure, random_state): 55 | """Internal function to check for and validate class parameters. 56 | Also, to return random state instance and the appropriate dissimilarity 57 | measure if valid. 
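        For instance, ``_check_params(50, 'aad', 0)`` returns a seeded
        ``RandomState`` together with the ``_aad`` function, while ``'var'`` and
        ``'iqr'`` map to ``numpy.var`` and ``scipy.stats.iqr`` respectively.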
58 | """ 59 | if isinstance(n_iter, int): 60 | check_parameter(n_iter, low=1, param_name='n_iter') 61 | else: 62 | raise TypeError("n_iter should be int, got %s" % n_iter) 63 | 64 | if isinstance(dis_measure, str): 65 | if dis_measure not in ('aad', 'var', 'iqr'): 66 | raise ValueError("Unknown dissimilarity measure type, " 67 | "dis_measure should be in " 68 | "(\'aad\', \'var\', \'iqr\'), " 69 | "got %s" % dis_measure) 70 | # TO-DO: 'mad': Median Absolute Deviation to be added 71 | # once Scipy stats version 1.3.0 is released 72 | else: 73 | raise TypeError("dis_measure should be str, got %s" % dis_measure) 74 | 75 | return check_random_state(random_state), _aad if dis_measure == 'aad' \ 76 | else (np.var if dis_measure == 'var' 77 | else (stats.iqr if dis_measure == 'iqr' else None)) 78 | 79 | 80 | class LMDD(BaseDetector): 81 | """Linear Method for Deviation-based Outlier Detection. 82 | 83 | LMDD employs the concept of the smoothing factor which 84 | indicates how much the dissimilarity can be reduced by 85 | removing a subset of elements from the data-set. 86 | Read more in the :cite:`arning1996linear`. 87 | 88 | Note: this implementation has minor modification to make it output scores 89 | instead of labels. 90 | 91 | Parameters 92 | ---------- 93 | contamination : float in (0., 0.5), optional (default=0.1) 94 | The amount of contamination of the data set, i.e. 95 | the proportion of outliers in the data set. Used when fitting to 96 | define the threshold on the decision function. 97 | 98 | n_iter : int, optional (default=50) 99 | Number of iterations where in each iteration, 100 | the process is repeated after randomizing the order of the input. 101 | Note that n_iter is a very important factor that affects the accuracy. 102 | The higher the better the accuracy and the longer the execution. 103 | 104 | dis_measure: str, optional (default='aad') 105 | Dissimilarity measure to be used in calculating the smoothing factor 106 | for points, options available: 107 | 108 | - 'aad': Average Absolute Deviation 109 | - 'var': Variance 110 | - 'iqr': Interquartile Range 111 | 112 | random_state : int, RandomState instance or None, optional (default=None) 113 | If int, random_state is the seed used by the random number generator; 114 | If RandomState instance, random_state is the random number generator; 115 | If None, the random number generator is the RandomState instance used 116 | by `np.random`. 117 | 118 | Attributes 119 | ---------- 120 | decision_scores_ : numpy array of shape (n_samples,) 121 | The outlier scores of the training data. 122 | The higher, the more abnormal. Outliers tend to have higher 123 | scores. This value is available once the detector is fitted. 124 | 125 | threshold_ : float 126 | The threshold is based on ``contamination``. It is the 127 | ``n_samples * contamination`` most abnormal samples in 128 | ``decision_scores_``. The threshold is calculated for generating 129 | binary outlier labels. 130 | 131 | labels_ : int, either 0 or 1 132 | The binary labels of the training data. 0 stands for inliers 133 | and 1 for outliers/anomalies. It is generated by applying 134 | ``threshold_`` on ``decision_scores_``. 
135 | """ 136 | 137 | def __init__(self, contamination=0.1, n_iter=50, dis_measure='aad', 138 | random_state=None): 139 | super(LMDD, self).__init__(contamination=contamination) 140 | self.n_iter, self.n_iter_ = n_iter, n_iter 141 | self.dis_measure, self.dis_measure_ = dis_measure, dis_measure 142 | 143 | # add this assignment to prevent clone error; not being used. 144 | self.random_state = random_state 145 | self.random_state_, self.dis_measure_ = _check_params(n_iter, 146 | dis_measure, 147 | random_state) 148 | 149 | def fit(self, X, y=None): 150 | """Fit detector. y is ignored in unsupervised methods. 151 | 152 | Parameters 153 | ---------- 154 | X : numpy array of shape (n_samples, n_features) 155 | The input samples. 156 | 157 | y : Ignored 158 | Not used, present for API consistency by convention. 159 | 160 | Returns 161 | ------- 162 | self : object 163 | Fitted estimator. 164 | """ 165 | X = check_array(X) 166 | self._set_n_classes(y) 167 | self.decision_scores_ = self.decision_function(X) 168 | self._process_decision_scores() 169 | return self 170 | 171 | def decision_function(self, X): 172 | """Predict raw anomaly score of X using the fitted detector. 173 | 174 | The anomaly score of an input sample is computed based on different 175 | detector algorithms. For consistency, outliers are assigned with 176 | larger anomaly scores. 177 | 178 | Parameters 179 | ---------- 180 | X : numpy array of shape (n_samples, n_features) 181 | The training input samples. Sparse matrices are accepted only 182 | if they are supported by the base estimator. 183 | 184 | Returns 185 | ------- 186 | anomaly_scores : numpy array of shape (n_samples,) 187 | The anomaly score of the input samples. 188 | """ 189 | return self.__sf(X) 190 | 191 | # def __dis(self, X): 192 | # """ 193 | # Internal function to calculate for 194 | # dissimilarity in a sequence of sets. 195 | # """ 196 | # res_ = np.zeros(shape=(X.shape[0],)) 197 | # var_max, j = -np.inf, 0 198 | # # this can be vectorized but just for comforting memory 199 | # test = [] 200 | # for i in range(1, X.shape[0]): 201 | # _var = self.dis_measure_(X[:i + 1]) - self.dis_measure_(X[:i]) 202 | # test.append(_var) 203 | # if _var > var_max: 204 | # var_max = _var 205 | # j = i 206 | 207 | 208 | 209 | # if var_max > res_[j]: 210 | # res_[j] = var_max 211 | 212 | # for k in range(j + 1, X.shape[0]): 213 | # dk_diff = self.dis_measure_(np.vstack((X[:j], X[k])))\ 214 | # - self.dis_measure_(np.vstack((X[:j + 1], X[k]))) 215 | # if dk_diff >= 0: 216 | # res_[k] = dk_diff + var_max 217 | 218 | # return res_ 219 | 220 | 221 | 222 | 223 | def __sf(self, X): 224 | """Internal function to calculate for Smoothing Factors of data points 225 | Repeated n_iter_ of times in randomized mode. 226 | """ 227 | dis_ = np.zeros(shape=(X.shape[0],)) 228 | card_ = np.zeros(shape=(X.shape[0],)) 229 | # perform one process with the original input order 230 | itr_res = _dis(X) 231 | np.put(card_, X.shape[0] - sum([i > 0. for i in itr_res]), 232 | np.where(itr_res > 0.)) 233 | 234 | # create a copy of random state to preserve original state for 235 | # future fits (if any) 236 | random_state = np.random.RandomState( 237 | seed=self.random_state_.get_state()[1][0]) 238 | indices = np.arange(X.shape[0]) 239 | for _ in range(self.n_iter_): 240 | ind_ = indices 241 | random_state.shuffle(ind_) 242 | _x = X[indices] 243 | # get dissimilarity of this iteration and restore original order 244 | itr_res = _dis(_x)[np.argsort(ind_)] 245 | current_card = X.shape[0] - sum([i > 0. 
for i in itr_res]) 246 | # compare with previous iteration to get the maximal dissimilarity 247 | for i, j in enumerate(itr_res): 248 | if j > dis_[i]: 249 | dis_[i] = j 250 | card_[i] = current_card 251 | # Increase random state seed by one to reorder input next iteration 252 | random_state.seed(random_state.get_state()[1][0] + 1) 253 | 254 | return np.multiply(dis_, card_) 255 | -------------------------------------------------------------------------------- /additional_methods/sod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Subspace Outlier Detection (SOD) 3 | """ 4 | # Author: Yahya Almardeny 5 | # License: BSD 2 clause 6 | 7 | import numba as nb 8 | import numpy as np 9 | from sklearn.neighbors import NearestNeighbors 10 | from sklearn.utils import check_array 11 | 12 | from pyod.models.base import BaseDetector 13 | from pyod.utils.utility import check_parameter 14 | 15 | 16 | @nb.jit(nopython=True, parallel=True) 17 | def _snn_imp(ind, ref_set_): 18 | n = ind.shape[0] 19 | _count = np.zeros((n, ref_set_), dtype=np.uint32) 20 | 21 | for i in nb.prange(n): 22 | temp = np.zeros(n, dtype=np.uint32) 23 | test_element_set = ind[i] 24 | 25 | for j in range(n): 26 | count = 0 27 | for idx in ind[j]: 28 | if idx in test_element_set: 29 | count += 1 30 | temp[j] = count 31 | 32 | temp[i] = np.iinfo(np.uint32).max 33 | _count[i] = np.argsort(temp)[::-1][1:ref_set_ + 1] 34 | 35 | return _count 36 | 37 | class SOD(BaseDetector): 38 | """Subspace outlier detection (SOD) schema aims to detect outlier in 39 | varying subspaces of a high dimensional feature space. For each data 40 | object, SOD explores the axis-parallel subspace spanned by the data 41 | object's neighbors and determines how much the object deviates from the 42 | neighbors in this subspace. 43 | 44 | See :cite:`kriegel2009outlier` for details. 45 | 46 | Parameters 47 | ---------- 48 | n_neighbors : int, optional (default=20) 49 | Number of neighbors to use by default for k neighbors queries. 50 | 51 | ref_set: int, optional (default=10) 52 | specifies the number of shared nearest neighbors to create the 53 | reference set. Note that ref_set must be smaller than n_neighbors. 54 | 55 | alpha: float in (0., 1.), optional (default=0.8) 56 | specifies the lower limit for selecting subspace. 57 | 0.8 is set as default as suggested in the original paper. 58 | 59 | contamination : float in (0., 0.5), optional (default=0.1) 60 | The amount of contamination of the data set, i.e. 61 | the proportion of outliers in the data set. Used when fitting to 62 | define the threshold on the decision function. 63 | 64 | Attributes 65 | ---------- 66 | decision_scores_ : numpy array of shape (n_samples,) 67 | The outlier scores of the training data. 68 | The higher, the more abnormal. Outliers tend to have higher 69 | scores. This value is available once the detector is 70 | fitted. 71 | 72 | threshold_ : float 73 | The threshold is based on ``contamination``. It is the 74 | ``n_samples * contamination`` most abnormal samples in 75 | ``decision_scores_``. The threshold is calculated for generating 76 | binary outlier labels. 77 | 78 | labels_ : int, either 0 or 1 79 | The binary labels of the training data. 0 stands for inliers 80 | and 1 for outliers/anomalies. It is generated by applying 81 | ``threshold_`` on ``decision_scores_``. 
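        A minimal usage sketch (the ``from sod import SOD`` import path is an
        assumption; note that ``ref_set`` must stay below ``n_neighbors``):

        >>> import numpy as np
        >>> from sod import SOD
        >>> X = np.random.randn(300, 10)
        >>> clf = SOD(n_neighbors=20, ref_set=10, alpha=0.8).fit(X)
        >>> scores = clf.decision_scores_   # higher = further from the reference subspace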
82 | """ 83 | 84 | def __init__(self, contamination=0.1, n_neighbors=20, ref_set=10, 85 | alpha=0.8): 86 | super(SOD, self).__init__(contamination=contamination) 87 | if isinstance(n_neighbors, int): 88 | check_parameter(n_neighbors, low=1, param_name='n_neighbors') 89 | else: 90 | raise ValueError( 91 | "n_neighbors should be int. Got %s" % type(n_neighbors)) 92 | 93 | if isinstance(ref_set, int): 94 | check_parameter(ref_set, low=1, high=n_neighbors, 95 | param_name='ref_set') 96 | else: 97 | raise ValueError("ref_set should be int. Got %s" % type(ref_set)) 98 | 99 | if isinstance(alpha, float): 100 | check_parameter(alpha, low=0.0, high=1.0, param_name='alpha') 101 | else: 102 | raise ValueError("alpha should be float. Got %s" % type(alpha)) 103 | 104 | self.n_neighbors = n_neighbors 105 | self.ref_set = ref_set 106 | self.alpha = alpha 107 | 108 | def fit(self, X, y=None): 109 | """Fit detector. y is ignored in unsupervised methods. 110 | 111 | Parameters 112 | ---------- 113 | X : numpy array of shape (n_samples, n_features) 114 | The input samples. 115 | 116 | y : Ignored 117 | Not used, present for API consistency by convention. 118 | 119 | Returns 120 | ------- 121 | self : object 122 | Fitted estimator. 123 | """ 124 | 125 | # validate inputs X and y (optional) 126 | X = check_array(X) 127 | self._set_n_classes(y) 128 | self.decision_scores_ = self.decision_function(X) 129 | self._process_decision_scores() 130 | 131 | return self 132 | 133 | def decision_function(self, X): 134 | """Predict raw anomaly score of X using the fitted detector. 135 | The anomaly score of an input sample is computed based on different 136 | detector algorithms. For consistency, outliers are assigned with 137 | larger anomaly scores. 138 | 139 | Parameters 140 | ---------- 141 | X : numpy array of shape (n_samples, n_features) 142 | The training input samples. Sparse matrices are accepted only 143 | if they are supported by the base estimator. 144 | 145 | Returns 146 | ------- 147 | anomaly_scores : numpy array of shape (n_samples,) 148 | The anomaly score of the input samples. 149 | """ 150 | return self._sod(X) 151 | 152 | def _snn(self, X): 153 | """This function is called internally to calculate the shared nearest 154 | neighbors (SNN). SNN is reported to be more robust than k nearest 155 | neighbors. 156 | 157 | Returns 158 | ------- 159 | snn_indices : numpy array of shape (n_shared_nearest_neighbors,) 160 | The indices of top k shared nearest neighbors for each observation. 161 | """ 162 | knn = NearestNeighbors(n_neighbors=self.n_neighbors) 163 | knn.fit(X) 164 | # Get the knn index 165 | ind = knn.kneighbors(return_distance=False) 166 | return _snn_imp(ind, self.ref_set) 167 | 168 | def _sod(self, X): 169 | """This function is called internally to perform subspace outlier 170 | detection algorithm. 171 | 172 | Returns 173 | ------- 174 | anomaly_scores : numpy array of shape (n_samples,) 175 | The anomaly score of the input samples. 
176 | """ 177 | ref_inds = self._snn(X) 178 | anomaly_scores = np.zeros(shape=(X.shape[0],)) 179 | for i in range(X.shape[0]): 180 | obs = X[i] 181 | ref = X[ref_inds[i,],] 182 | means = np.mean(ref, axis=0) # mean of each column 183 | # average squared distance of the reference to the mean 184 | var_total = np.sum(np.sum(np.square(ref - means))) / self.ref_set 185 | var_expect = self.alpha * var_total / X.shape[1] 186 | var_actual = np.var(ref, axis=0) # variance of each attribute 187 | var_inds = [1 if (j < var_expect) else 0 for j in var_actual] 188 | rel_dim = np.sum(var_inds) 189 | if rel_dim != 0: 190 | anomaly_scores[i] = np.sqrt( 191 | np.dot(var_inds, np.square(obs - means)) / rel_dim) 192 | 193 | return anomaly_scores 194 | -------------------------------------------------------------------------------- /additional_methods/wrappers/AE.py: -------------------------------------------------------------------------------- 1 | from pyod.models.auto_encoder import AutoEncoder 2 | import math 3 | 4 | class AE_wrapper(AutoEncoder): 5 | def __init__(self, n_layers=1, shrinkage_factor=0.3, **args): 6 | 7 | self.n_layers = n_layers 8 | self.shrinkage_factor = shrinkage_factor 9 | 10 | try: 11 | del args["encoder_neurons"] 12 | except KeyError: 13 | pass 14 | 15 | try: 16 | del args["decoder_neurons"] 17 | except KeyError: 18 | pass 19 | 20 | self.args = args 21 | 22 | def fit(self, X, y=None): 23 | 24 | n_features = X.shape[1] 25 | 26 | self.encoder_neurons = [math.ceil(n_features * (1-self.shrinkage_factor)**(i+1)) for i in range(self.n_layers)] 27 | 28 | self.decoder_neurons = list(reversed(self.encoder_neurons)) 29 | 30 | self.hidden_neurons = self.encoder_neurons + self.decoder_neurons 31 | 32 | super().__init__(hidden_neurons=self.hidden_neurons, **self.args) 33 | 34 | super().fit(X, y) -------------------------------------------------------------------------------- /additional_methods/wrappers/ALAD.py: -------------------------------------------------------------------------------- 1 | from pyod.models.alad import ALAD 2 | import math 3 | 4 | class ALAD_wrapper(ALAD): 5 | def __init__(self, n_layers=1, shrinkage_factor=0.3, **args): 6 | 7 | self.n_layers = n_layers 8 | self.shrinkage_factor = shrinkage_factor 9 | 10 | try: 11 | del args["encoder_neurons"] 12 | except KeyError: 13 | pass 14 | 15 | try: 16 | del args["decoder_neurons"] 17 | except KeyError: 18 | pass 19 | 20 | self.args = args 21 | 22 | def fit(self, X, y=None): 23 | 24 | n_features = X.shape[1] 25 | 26 | self.encoder_neurons = [math.ceil(n_features * (1-self.shrinkage_factor)**(i+1)) for i in range(self.n_layers)] 27 | 28 | self.decoder_neurons = list(reversed(self.encoder_neurons)) 29 | 30 | 31 | super().__init__(dec_layers=self.decoder_neurons, enc_layers=self.encoder_neurons, disc_xx_layers=self.encoder_neurons, disc_zz_layers=self.encoder_neurons, disc_xz_layers=self.encoder_neurons, **self.args) 32 | 33 | super().fit(X, y) -------------------------------------------------------------------------------- /additional_methods/wrappers/AnoGAN.py: -------------------------------------------------------------------------------- 1 | from pyod.models.anogan import AnoGAN 2 | import math 3 | 4 | class AnoGAN_wrapper(AnoGAN): 5 | def __init__(self, D_n_layers=1, G_n_layers=1, G_shrinkage_factor=0.3, D_shrinkage_factor=0.3, **args): 6 | 7 | self.D_n_layers = D_n_layers 8 | self.G_n_layers = G_n_layers 9 | self.D_shrinkage_factor = D_shrinkage_factor 10 | self.G_shrinkage_factor = G_shrinkage_factor 11 | 12 | try: 13 | del 
args["G_layers"] 14 | except KeyError: 15 | pass 16 | 17 | try: 18 | del args["D_layers"] 19 | except KeyError: 20 | pass 21 | 22 | self.args = args 23 | 24 | def fit(self, X, y=None): 25 | 26 | 27 | n_features = X.shape[1] 28 | 29 | self.G_encoder_neurons = [math.ceil(n_features * (1-self.G_shrinkage_factor)**(i+1)) for i in range(self.G_n_layers)] 30 | 31 | self.G_decoder_neurons = list(reversed(self.G_encoder_neurons)) 32 | 33 | self.G_layers = self.G_encoder_neurons + self.G_decoder_neurons 34 | 35 | self.D_layers = [math.ceil(n_features * (1-self.D_shrinkage_factor)**(i+1)) for i in range(self.D_n_layers)] 36 | 37 | super().__init__(G_layers=self.G_layers, D_layers=self.D_layers, **self.args) 38 | 39 | super().fit(X, y) -------------------------------------------------------------------------------- /additional_methods/wrappers/ExtendedIForest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 31 23:59:31 2022 4 | 5 | @author: Roel 6 | """ 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import eif as iso 11 | 12 | from pyod.models.base import BaseDetector 13 | 14 | 15 | from sklearn.utils import check_array 16 | 17 | 18 | class ExtendedIForest(BaseDetector): 19 | """ 20 | 21 | """ 22 | 23 | def __init__(self, n_estimators=100, 24 | max_samples=256, 25 | contamination=0.1, 26 | extension_level=1, 27 | verbose=0): 28 | super(ExtendedIForest, self).__init__(contamination=contamination) 29 | 30 | self.n_estimators = n_estimators 31 | self.max_samples = max_samples 32 | self.extension_level = extension_level 33 | 34 | 35 | def fit(self, X, y=None): 36 | """Fit detector. y is ignored in unsupervised methods. 37 | Parameters 38 | ---------- 39 | X : numpy array of shape (n_samples, n_features) 40 | The input samples. 41 | y : Ignored 42 | Not used, present for API consistency by convention. 43 | Returns 44 | ------- 45 | self : object 46 | Fitted estimator. 47 | """ 48 | # validate inputs X and y (optional) 49 | X = check_array(X) 50 | self._set_n_classes(y) 51 | 52 | max_samples = min(X.shape[0], self.max_samples) 53 | self.detector_ = iso.iForest(X, ntrees=self.n_estimators, sample_size=max_samples, ExtensionLevel=self.extension_level) 54 | 55 | 56 | self.decision_scores_ = self.decision_function(X) 57 | 58 | return self 59 | 60 | def decision_function(self, X): 61 | """Predict raw anomaly score of X using the fitted detector. 62 | The anomaly score of an input sample is computed based on different 63 | detector algorithms. For consistency, outliers are assigned with 64 | larger anomaly scores. 65 | Parameters 66 | ---------- 67 | X : numpy array of shape (n_samples, n_features) 68 | The training input samples. Sparse matrices are accepted only 69 | if they are supported by the base estimator. 70 | Returns 71 | ------- 72 | anomaly_scores : numpy array of shape (n_samples,) 73 | The anomaly score of the input samples. 74 | """ 75 | 76 | 77 | return self.detector_.compute_paths(X_in=X) 78 | 79 | 80 | @property 81 | def max_samples_(self): 82 | """The actual number of samples. 83 | Decorator for scikit-learn Isolation Forest attributes. 
84 | """ 85 | return self.detector_.max_samples_ -------------------------------------------------------------------------------- /additional_methods/wrappers/HBOS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Dec 12 17:53:21 2023 5 | 6 | @author: rbouman 7 | """ 8 | 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | from ..HBOS.hbos import HBOS 13 | from sklearn.utils import check_array 14 | 15 | import pandas as pd 16 | 17 | 18 | from pyod.models.base import BaseDetector 19 | 20 | 21 | class DynamicHBOS(BaseDetector): 22 | """ 23 | 24 | """ 25 | 26 | def __init__(self, contamination=0.1): 27 | super(DynamicHBOS, self).__init__(contamination=contamination) 28 | 29 | 30 | 31 | def fit(self, X, y=None): 32 | """Fit detector. y is ignored in unsupervised methods. 33 | Parameters 34 | ---------- 35 | X : numpy array of shape (n_samples, n_features) 36 | The input samples. 37 | y : Ignored 38 | Not used, present for API consistency by convention. 39 | Returns 40 | ------- 41 | self : object 42 | Fitted estimator. 43 | """ 44 | # validate inputs X and y (optional) 45 | X = check_array(X) 46 | self._set_n_classes(y) 47 | 48 | self.detector_ = HBOS() 49 | 50 | self.detector_.fit(pd.DataFrame(X)) 51 | self.decision_scores_ = self.decision_function(X) 52 | 53 | return self 54 | 55 | def decision_function(self, X): 56 | """Predict raw anomaly score of X using the fitted detector. 57 | The anomaly score of an input sample is computed based on different 58 | detector algorithms. For consistency, outliers are assigned with 59 | larger anomaly scores. 60 | Parameters 61 | ---------- 62 | X : numpy array of shape (n_samples, n_features) 63 | The training input samples. Sparse matrices are accepted only 64 | if they are supported by the base estimator. 65 | Returns 66 | ------- 67 | anomaly_scores : numpy array of shape (n_samples,) 68 | The anomaly score of the input samples. 69 | """ 70 | 71 | 72 | return self.detector_.predict(pd.DataFrame(X)) 73 | 74 | 75 | @property 76 | def max_samples_(self): 77 | """The actual number of samples. 78 | Decorator for scikit-learn Isolation Forest attributes. 
79 | """ 80 | return self.detector_.max_samples_ -------------------------------------------------------------------------------- /additional_methods/wrappers/VAE.py: -------------------------------------------------------------------------------- 1 | from pyod.models.vae import VAE 2 | import math 3 | 4 | class VAE_wrapper(VAE): 5 | def __init__(self, n_layers=1, shrinkage_factor=0.3, **args): 6 | 7 | self.n_layers = n_layers 8 | self.shrinkage_factor = shrinkage_factor 9 | 10 | try: 11 | del args["encoder_neurons"] 12 | except KeyError: 13 | pass 14 | 15 | try: 16 | del args["decoder_neurons"] 17 | except KeyError: 18 | pass 19 | 20 | self.args = args 21 | 22 | def fit(self, X, y=None): 23 | 24 | n_features = X.shape[1] 25 | 26 | self.encoder_neurons = [math.ceil(n_features * (1-self.shrinkage_factor)**(i+1)) for i in range(self.n_layers)] 27 | 28 | self.decoder_neurons = list(reversed(self.encoder_neurons)) 29 | 30 | super().__init__(encoder_neurons=self.encoder_neurons, decoder_neurons=self.decoder_neurons, **self.args) 31 | 32 | super().fit(X, y) -------------------------------------------------------------------------------- /additional_methods/wrappers/rrcf.py: -------------------------------------------------------------------------------- 1 | from pyod.models.base import BaseDetector 2 | import rrcf 3 | import numpy as np 4 | import pandas as pd 5 | 6 | class rrcf_wrapper(): 7 | def __init__(self, n_trees, tree_size): 8 | 9 | self.n_trees = n_trees 10 | self.tree_size = tree_size 11 | 12 | # based on example batch code from: https://github.com/kLabUM/rrcf 13 | def fit(self, X, y=None): 14 | 15 | n = X.shape[0] 16 | 17 | tree_size = min(self.tree_size, n) 18 | 19 | forest = [] 20 | 21 | if self.n_trees * tree_size < n: 22 | self.n_trees = np.ceil(n / tree_size) #increase n_trees if not all samples are covered. 
23 | while len(forest) < self.n_trees: 24 | # Select random subsets of points uniformly from point set 25 | ixs = np.random.choice(n, size=(n//tree_size, tree_size), 26 | replace=False) 27 | # Add sampled trees to forest 28 | trees = [rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs] 29 | forest.extend(trees) 30 | 31 | 32 | # Compute average CoDisp 33 | avg_codisp = pd.Series(0.0, index=np.arange(n)) 34 | index = np.zeros(n) 35 | for tree in forest: 36 | codisp = pd.Series({leaf : tree.codisp(leaf) for leaf in tree.leaves}) 37 | avg_codisp[codisp.index] += codisp 38 | np.add.at(index, codisp.index.values, 1) 39 | avg_codisp /= index 40 | 41 | self.decision_scores_ = avg_codisp -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: OD_benchmark 2 | dependencies: 3 | - python=3.8 4 | - numpy 5 | - pandas 6 | - scikit-learn=1.0.2 7 | - matplotlib 8 | - seaborn 9 | - scipy 10 | - cvxopt 11 | - pytorch 12 | - pip 13 | - pip: 14 | - cython 15 | - tensorflow 16 | - pyod 17 | - eif 18 | - combo 19 | - tqdm 20 | - rrcf 21 | -------------------------------------------------------------------------------- /evaluation_metrics.py: -------------------------------------------------------------------------------- 1 | from pyod.utils.utility import get_label_n #precision_n_scores with n=None is equal to the R-precision measure 2 | from sklearn.utils import column_or_1d 3 | from sklearn.metrics import precision_score, average_precision_score 4 | import numpy as np 5 | 6 | #copied from pyod, but changed default behaviour of precision_score warnings when y_pred is all zeroes 7 | def precision_n_scores(y, y_pred, n=None): 8 | """Utility function to calculate precision @ rank n. 9 | 10 | Parameters 11 | ---------- 12 | y : list or numpy array of shape (n_samples,) 13 | The ground truth. Binary (0: inliers, 1: outliers). 14 | 15 | y_pred : list or numpy array of shape (n_samples,) 16 | The raw outlier scores as returned by a fitted model. 17 | 18 | n : int, optional (default=None) 19 | The number of outliers. if not defined, infer using ground truth. 20 | 21 | Returns 22 | ------- 23 | precision_at_rank_n : float 24 | Precision at rank n score. 
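        For example, with ``y = [0, 0, 0, 1, 1]``, raw scores
        ``y_pred = [0.1, 0.2, 0.3, 0.9, 0.8]`` and ``n=None``, n is inferred as 2,
        the two highest-scoring samples are exactly the two true outliers, and the
        returned precision at rank 2 is 1.0.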
25 | 26 | """ 27 | 28 | # turn raw prediction decision scores into binary labels 29 | y_pred = get_label_n(y, y_pred, n) 30 | 31 | # enforce formats of y and labels_ 32 | y = column_or_1d(y) 33 | y_pred = column_or_1d(y_pred) 34 | 35 | return precision_score(y, y_pred, zero_division=0) 36 | 37 | def adjusted_precision_n_scores(y_true, y_pred, n=None): 38 | 39 | p_at_n = precision_n_scores(y_true, y_pred, n=n) 40 | 41 | # calculate the percentage of outliers 42 | if n is not None: 43 | outliers_fraction = n /len(y_true) 44 | else: 45 | outliers_fraction = np.count_nonzero(y_true) / len(y_true) 46 | 47 | adjusted_p_at_n = (p_at_n - outliers_fraction)/(1 - outliers_fraction) 48 | 49 | return(adjusted_p_at_n) 50 | 51 | def adjusted_average_precision(y_true, y_pred): 52 | 53 | ap = average_precision_score(y_true, y_pred) 54 | 55 | # calculate the percentage of outliers 56 | outliers_fraction = np.count_nonzero(y_true) / len(y_true) 57 | 58 | adjusted_average_precision = (ap - outliers_fraction)/(1 - outliers_fraction) 59 | 60 | return(adjusted_average_precision) 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /figures/.gitignore: -------------------------------------------------------------------------------- 1 | *.eps 2 | *.png 3 | *.pdf 4 | -------------------------------------------------------------------------------- /formatted_data/aloi.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/aloi.npz -------------------------------------------------------------------------------- /formatted_data/annthyroid.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/annthyroid.npz -------------------------------------------------------------------------------- /formatted_data/arrhythmia.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/arrhythmia.npz -------------------------------------------------------------------------------- /formatted_data/breastw.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/breastw.npz -------------------------------------------------------------------------------- /formatted_data/campaign.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/campaign.npz -------------------------------------------------------------------------------- /formatted_data/cardio.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/cardio.npz -------------------------------------------------------------------------------- /formatted_data/cover.npz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/cover.npz -------------------------------------------------------------------------------- /formatted_data/donors.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/donors.npz -------------------------------------------------------------------------------- /formatted_data/fault.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/fault.npz -------------------------------------------------------------------------------- /formatted_data/glass.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/glass.npz -------------------------------------------------------------------------------- /formatted_data/hepatitis.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/hepatitis.npz -------------------------------------------------------------------------------- /formatted_data/hrss_anomalous_optimized.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/hrss_anomalous_optimized.npz -------------------------------------------------------------------------------- /formatted_data/hrss_anomalous_standard.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/hrss_anomalous_standard.npz -------------------------------------------------------------------------------- /formatted_data/http.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/http.npz -------------------------------------------------------------------------------- /formatted_data/internetads.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/internetads.npz -------------------------------------------------------------------------------- /formatted_data/ionosphere.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/ionosphere.npz -------------------------------------------------------------------------------- /formatted_data/landsat.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/landsat.npz -------------------------------------------------------------------------------- /formatted_data/letter.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/letter.npz -------------------------------------------------------------------------------- /formatted_data/magic.gamma.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/magic.gamma.npz -------------------------------------------------------------------------------- /formatted_data/mammography.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/mammography.npz -------------------------------------------------------------------------------- /formatted_data/mi-f.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/mi-f.npz -------------------------------------------------------------------------------- /formatted_data/mi-v.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/mi-v.npz -------------------------------------------------------------------------------- /formatted_data/mnist.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/mnist.npz -------------------------------------------------------------------------------- /formatted_data/musk.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/musk.npz -------------------------------------------------------------------------------- /formatted_data/nasa.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/nasa.npz -------------------------------------------------------------------------------- /formatted_data/optdigits.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/optdigits.npz -------------------------------------------------------------------------------- /formatted_data/pageblocks.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pageblocks.npz -------------------------------------------------------------------------------- /formatted_data/parkinson.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/parkinson.npz -------------------------------------------------------------------------------- /formatted_data/pen-global.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pen-global.npz -------------------------------------------------------------------------------- /formatted_data/pen-local.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pen-local.npz -------------------------------------------------------------------------------- /formatted_data/pendigits.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pendigits.npz -------------------------------------------------------------------------------- /formatted_data/pima.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pima.npz -------------------------------------------------------------------------------- /formatted_data/satellite.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/satellite.npz -------------------------------------------------------------------------------- /formatted_data/satimage-2.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/satimage-2.npz -------------------------------------------------------------------------------- /formatted_data/seismic-bumps.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/seismic-bumps.npz -------------------------------------------------------------------------------- /formatted_data/shuttle.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/shuttle.npz -------------------------------------------------------------------------------- /formatted_data/skin.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/skin.npz -------------------------------------------------------------------------------- /formatted_data/smtp.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/smtp.npz -------------------------------------------------------------------------------- /formatted_data/spambase.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/spambase.npz -------------------------------------------------------------------------------- /formatted_data/speech.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/speech.npz -------------------------------------------------------------------------------- /formatted_data/stamps.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/stamps.npz -------------------------------------------------------------------------------- /formatted_data/thyroid.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/thyroid.npz -------------------------------------------------------------------------------- /formatted_data/vertebral.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/vertebral.npz -------------------------------------------------------------------------------- /formatted_data/vowels.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/vowels.npz -------------------------------------------------------------------------------- /formatted_data/waveform.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/waveform.npz -------------------------------------------------------------------------------- /formatted_data/wbc.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wbc.npz -------------------------------------------------------------------------------- /formatted_data/wbc2.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wbc2.npz -------------------------------------------------------------------------------- /formatted_data/wilt.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wilt.npz -------------------------------------------------------------------------------- /formatted_data/wine.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wine.npz -------------------------------------------------------------------------------- /formatted_data/wpbc.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wpbc.npz -------------------------------------------------------------------------------- /formatted_data/yeast.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/yeast.npz -------------------------------------------------------------------------------- /formatted_data/yeast6.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/yeast6.npz -------------------------------------------------------------------------------- /generate_and_plot_types_of_anomalies.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | sns.set_style("white") 7 | 8 | 9 | #%% peripheral point 10 | 11 | # Peripheral point plot 12 | distribution_1 = np.random.randn(1000, 2) + np.array([1, 1]) 13 | anomalies_1 = np.array([[-3, 5], [1, -3.5], [-4, -3]], dtype=np.float32) 14 | 15 | fig, axes = plt.subplots(1, 2, figsize=(18, 6)) 16 | 17 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[0]) 18 | sns.scatterplot(x=anomalies_1[:, 0], y=anomalies_1[:, 1], marker="X", color="red", s=100, ax=axes[0]) 19 | axes[0].set_xlim(-6, 10) 20 | axes[0].set_ylim(-6, 10) 21 | axes[0].set_xlabel("X$_1$") 22 | axes[0].set_ylabel("X$_2$") 23 | axes[0].set_title("Peripheral Anomalies", fontsize=20) 24 | 25 | # Enclosed point plot 26 | distribution_1 = np.random.randn(1000, 2) + np.array([-4, -4]) 27 | distribution_2 = np.random.randn(1000, 2) + np.array([-4, 4]) 28 | distribution_3 = np.random.randn(1000, 2) + np.array([4, -4]) 29 | distribution_4 = np.random.randn(1000, 2) + np.array([4, 4]) 30 | 31 | anomalies_1 = np.array([[0, 0], [-0.5, 0.3], [0.4, -0.7]], dtype=np.float32) 32 | 33 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[1]) 34 | sns.scatterplot(x=distribution_2[:, 0], y=distribution_2[:, 1], color="blue", alpha=0.2, ax=axes[1]) 35 | sns.scatterplot(x=distribution_3[:, 0], y=distribution_3[:, 1], color="blue", alpha=0.2, ax=axes[1]) 36 | sns.scatterplot(x=distribution_4[:, 0], y=distribution_4[:, 1], color="blue", alpha=0.2, ax=axes[1]) 37 | sns.scatterplot(x=anomalies_1[:, 0], y=anomalies_1[:, 1], marker="X", color="red", s=100, ax=axes[1]) 38 | axes[1].set_xlim(-8, 8) 39 | axes[1].set_ylim(-8, 8) 40 | axes[1].set_xlabel("X$_1$") 41 | axes[1].set_ylabel("X$_2$") 42 | axes[1].set_title("Enclosed Anomalies", fontsize=20) 43 | 44 | # Adjust layout to prevent clipping of titles 45 | plt.tight_layout() 46 | 47 | # Show the plot 48 | 49 | 50 | fig.savefig("figures/enclosed-peripheral_point_example.pdf", format="pdf") 51 | 52 | plt.show() 53 | #%% local_outlier_plot 54 | 55 | # Local outlier plot 56 | local_outlier_plot = plt.figure() 57 | 58 | distribution_1 = np.random.randn(1000, 2) + np.array([1, 1]) 59 | distribution_2 = np.random.randn(1000, 2) / 5 + np.array([7, 7]) 60 | 61 | anomalies = np.array([[6.2, 6.5], [7.2, 8], [7.9, 6.3]]) 62 | 63 | fig, axes = plt.subplots(1, 2, figsize=(18, 6)) 64 | 65 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[0]) 66 | sns.scatterplot(x=distribution_2[:, 0], y=distribution_2[:, 1], color="blue", alpha=0.2, ax=axes[0]) 67 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[0]) 68 | 
axes[0].set_xlim(-6, 10) 69 | axes[0].set_ylim(-6, 10) 70 | axes[0].set_xlabel("X$_1$") 71 | axes[0].set_ylabel("X$_2$") 72 | axes[0].set_title("Local Density Anomalies", fontsize=20) 73 | 74 | # Global outlier plot 75 | anomalies = np.array([[8, 0], [7.5, 1]]) 76 | 77 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[1]) 78 | sns.scatterplot(x=distribution_2[:, 0], y=distribution_2[:, 1], color="blue", alpha=0.2, ax=axes[1]) 79 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[1]) 80 | axes[1].set_xlim(-6, 10) 81 | axes[1].set_ylim(-6, 10) 82 | axes[1].set_xlabel("X$_1$") 83 | axes[1].set_ylabel("X$_2$") 84 | axes[1].set_title("Global Density Anomalies", fontsize=20) 85 | 86 | # Adjust layout to prevent clipping of titles 87 | plt.tight_layout() 88 | 89 | # Show the plot 90 | 91 | 92 | fig.savefig("figures/global-local_outlier_example.pdf", format="pdf") 93 | 94 | plt.show() 95 | #%% clustered outliers 96 | 97 | # Clustered outliers plot 98 | local_outlier_plot = plt.figure() 99 | 100 | distribution_1 = np.random.randn(1000, 2) + np.array([1, 1]) 101 | anomalies_2 = np.random.randn(10, 2) / 5 + np.array([7, 7]) 102 | 103 | fig, axes = plt.subplots(1, 2, figsize=(18, 6)) 104 | 105 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[0]) 106 | sns.scatterplot(x=anomalies_2[:, 0], y=anomalies_2[:, 1], marker="X", color="red", s=100, ax=axes[0]) 107 | axes[0].set_xlim(-6, 10) 108 | axes[0].set_ylim(-6, 10) 109 | axes[0].set_xlabel("X$_1$") 110 | axes[0].set_ylabel("X$_2$") 111 | axes[0].set_title("Clustered Anomalies", fontsize=20) 112 | 113 | # Isolated outliers plot 114 | anomalies = np.array([[7, 7], [-4, -4], [-4, 6]]) 115 | 116 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[1]) 117 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[1]) 118 | axes[1].set_xlim(-6, 10) 119 | axes[1].set_ylim(-6, 10) 120 | axes[1].set_xlabel("X$_1$") 121 | axes[1].set_ylabel("X$_2$") 122 | axes[1].set_title("Isolated Anomalies", fontsize=20) 123 | 124 | # Adjust layout to prevent clipping of titles 125 | plt.tight_layout() 126 | 127 | # Show the plot 128 | 129 | 130 | fig.savefig("figures/isolated-clustered_outlier_example.pdf", format="pdf") 131 | plt.show() 132 | #%% univariate outliers 133 | 134 | # Univariate outliers plot 135 | distribution_1 = np.random.multivariate_normal([0, 0], [[3, 0], [0, 1]], 1000) 136 | anomalies = np.array([[0, 6], [8, 0]]) 137 | 138 | fig, axes = plt.subplots(1, 2, figsize=(18, 6)) 139 | 140 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[0]) 141 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[0]) 142 | axes[0].set_xlim(-9, 9) 143 | axes[0].set_ylim(-4, 7) 144 | axes[0].set_xlabel("X$_1$") 145 | axes[0].set_ylabel("X$_2$") 146 | axes[0].set_title("Univariate Anomalies", fontsize=20) 147 | 148 | # Multivariate outliers plot 149 | distribution_1 = np.random.multivariate_normal([0, 0], [[0.5, 0], [4, 4]], 1000) 150 | anomalies = np.array([[-3, 3], [4, -2]]) 151 | 152 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[1]) 153 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[1]) 154 | axes[1].set_xlim(-7, 7) 155 | axes[1].set_ylim(-7, 7) 156 | 
axes[1].set_xlabel("X$_1$") 157 | axes[1].set_ylabel("X$_2$") 158 | axes[1].set_title("Multivariate Anomalies", fontsize=20) 159 | 160 | # Adjust layout to prevent clipping of titles 161 | plt.tight_layout() 162 | 163 | # Show the plot 164 | 165 | 166 | fig.savefig("figures/multivariate-univariate_outlier_example.pdf", format="pdf") 167 | plt.show() 168 | -------------------------------------------------------------------------------- /invert_labels_calculate_metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import pickle 4 | import numpy as np 5 | 6 | from sklearn.metrics import roc_auc_score, average_precision_score 7 | from pyod.utils.utility import precision_n_scores 8 | from evaluation_metrics import adjusted_precision_n_scores, adjusted_average_precision 9 | 10 | 11 | #define score functions: 12 | score_functions = {"ROC/AUC": roc_auc_score, 13 | "R_precision": precision_n_scores, 14 | "adjusted_R_precision": adjusted_precision_n_scores, 15 | "average_precision": average_precision_score, 16 | "adjusted_average_precision": adjusted_average_precision} 17 | 18 | # In case anomaly detection problems are wrongly defined, labels can be switched in order to recalculate metrics 19 | 20 | # inverted datasets: 21 | # Skin originally has 1 being the skin pixel class, and 0 being the noise class. The skin class is, however, more homogeneous, so the labels should be flipped. 22 | # Vertebral consists of 3 classes: the normal class, and disk hernia/spondylolisthesis. The latter two classes are combined and originally defined as 0 in ODDS, but they are conceptually the anomalies. 23 | # yeast is poorly documented. We've replaced it with yeast6 from EOAD 24 | inverted_datasets = ["yeast", "skin", "vertebral"] 25 | 26 | pickle_dir = "formatted_data" 27 | score_dir = "results/score_dir" 28 | csv_result_dir = "results/csvresult_dir" 29 | result_dir = "results/result_dir" 30 | figure_dir = "figures" 31 | table_dir = "tables" 32 | 33 | #uncomment if new metrics need to be calculated: 34 | #all_datasets = set(os.listdir(result_dir)) 35 | 36 | #set inverted_datasets to all_datasets if new metrics need to be calculated 37 | for dataset in inverted_datasets: 38 | print(dataset) 39 | 40 | full_path_filename = os.path.join(pickle_dir, dataset+".npz") 41 | 42 | data = np.load(open(full_path_filename, 'rb')) 43 | X, y = data["X"], np.squeeze(data["y"]) 44 | 45 | #invert y: 46 | if dataset in inverted_datasets: 47 | y_inverted = np.zeros(y.shape) 48 | y_inverted[y==0] = 1 49 | y = y_inverted 50 | 51 | for method_name in os.listdir(os.path.join(score_dir, dataset)): 52 | print(method_name) 53 | score_folder_path = os.path.join(score_dir, dataset, method_name) 54 | 55 | hyperparameter_csvs = os.listdir(score_folder_path) 56 | hyperparameter_settings = [filename.replace(".csv", "") for filename in hyperparameter_csvs] 57 | 58 | results_per_setting = {} 59 | for hyperparameter_csv, hyperparameter_setting in zip(hyperparameter_csvs, hyperparameter_settings): 60 | print(hyperparameter_csv) 61 | full_path_filename = os.path.join(score_folder_path, hyperparameter_csv) 62 | 63 | outlier_scores = pd.read_csv(full_path_filename, header=None) 64 | 65 | method_performance = {method_name:{score_name: score_function(y,outlier_scores) for (score_name, score_function) in score_functions.items()}} 66 | method_performance_df = pd.DataFrame(method_performance).transpose() 67 | 68 | metric_pickle_file = os.path.join(result_dir, dataset, 
method_name, hyperparameter_csv.replace(".csv", ".pickle")) 69 | with open(metric_pickle_file, 'wb') as handle: 70 | pickle.dump(method_performance_df, handle, protocol=pickle.HIGHEST_PROTOCOL) 71 | 72 | metric_csv_file = os.path.join(csv_result_dir, dataset, method_name, hyperparameter_csv) 73 | 74 | #also write csv files for easy manual inspection 75 | method_performance_df.to_csv(metric_csv_file) 76 | -------------------------------------------------------------------------------- /method_example.py: -------------------------------------------------------------------------------- 1 | from preprocess_detect_outliers import preprocess_detect_outliers 2 | #%% Define parameter settings and methods 3 | 4 | 5 | from pyod.models.knn import KNN 6 | 7 | #dict of methods and functions 8 | methods = { 9 | "kNN":KNN 10 | } 11 | 12 | #dict of methods and parameters 13 | method_parameters = { 14 | "kNN":{"n_neighbors":range(5,31), "method":["mean"]} 15 | } 16 | 17 | 18 | #%% run method over all datasets 19 | 20 | preprocess_detect_outliers(methods, method_parameters) -------------------------------------------------------------------------------- /minimal_environment.yml: -------------------------------------------------------------------------------- 1 | name: OD_benchmark_minimal 2 | dependencies: 3 | - python=3.8 4 | - numpy 5 | - pandas 6 | - scikit-learn=1.0.2 7 | - matplotlib 8 | - seaborn 9 | - scipy 10 | - pip 11 | - pip: 12 | - pyod 13 | - combo 14 | -------------------------------------------------------------------------------- /preprocess_detect_outliers.py: -------------------------------------------------------------------------------- 1 | #%% setup 2 | import pickle 3 | import os 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.metrics import roc_auc_score, average_precision_score 7 | from pyod.utils.utility import precision_n_scores 8 | from sklearn.pipeline import make_pipeline 9 | from sklearn.preprocessing import RobustScaler 10 | from sklearn.model_selection import ParameterGrid 11 | from evaluation_metrics import adjusted_precision_n_scores, adjusted_average_precision 12 | 13 | 14 | formatted_data_dir = "formatted_data" 15 | base_result_dir = "results" 16 | result_dir = "result_dir" 17 | csvresult_dir = "csvresult_dir" 18 | score_dir = "score_dir" 19 | 20 | #define score function: 21 | score_functions = {"ROC/AUC": roc_auc_score, 22 | "R_precision": precision_n_scores, 23 | "adjusted_R_precision": adjusted_precision_n_scores, 24 | "average_precision": average_precision_score, 25 | "adjusted_average_precision": adjusted_average_precision} 26 | 27 | 28 | verbose = True 29 | input_type = "npz" 30 | 31 | #%% 32 | 33 | def preprocess_detect_outliers(methods, method_parameters, verbose=True, input_type="npz"): 34 | 35 | #sort dataset_names based on size: https://stackoverflow.com/questions/20252669/get-files-from-directory-argument-sorting-by-size 36 | # make a generator for all file paths within dirpath 37 | all_files = ( os.path.join(basedir, filename) for basedir, dirs, files in os.walk(formatted_data_dir) for filename in files ) 38 | sorted_files = sorted(all_files, key = os.path.getsize) 39 | dataset_names = [filename.replace(formatted_data_dir+os.path.sep,"") for filename in sorted_files] 40 | dataset_names = [dataset_name for dataset_name in dataset_names if dataset_name.endswith(input_type)] 41 | 42 | all_methods_to_run = methods 43 | 44 | #%% loop over all data, but do not reproduce existing results 45 | 46 | target_dir = os.path.join(base_result_dir, 
result_dir) 47 | target_csvdir = os.path.join(base_result_dir, csvresult_dir) 48 | score_csvdir = os.path.join(base_result_dir, score_dir) 49 | 50 | if not os.path.exists(score_csvdir): 51 | os.makedirs(score_csvdir) 52 | 53 | for dataset_name in dataset_names: 54 | 55 | #print name for reporting purposes 56 | print("______"+dataset_name+"______") 57 | 58 | full_path_filename = os.path.join(formatted_data_dir, dataset_name) 59 | 60 | if input_type == "pickle": 61 | data = pickle.load(open(full_path_filename, 'rb')) 62 | elif input_type == "npz": 63 | data = np.load(open(full_path_filename, 'rb')) 64 | 65 | X, y = data["X"], np.squeeze(data["y"]) 66 | 67 | #loop over all methods: 68 | 69 | for method_name, OD_class in all_methods_to_run.items(): 70 | print("-" + method_name) 71 | hyperparameter_grid = method_parameters[method_name] 72 | hyperparameter_list = list(ParameterGrid(hyperparameter_grid)) 73 | 74 | #loop over hyperparameter settings 75 | for hyperparameter_setting in hyperparameter_list: 76 | 77 | hyperparameter_string = str(hyperparameter_setting) 78 | 79 | if verbose: 80 | print(hyperparameter_string) 81 | 82 | #check whether results have been calculated 83 | full_target_dir = os.path.join(target_dir, dataset_name.replace("."+input_type, ""), method_name) 84 | target_file_name = os.path.join(target_dir, dataset_name.replace("."+input_type, ""), method_name, hyperparameter_string+".pickle") 85 | if os.path.exists(target_file_name) and os.path.getsize(target_file_name) > 0: 86 | if verbose: 87 | print(" results already calculated, skipping recalculation") 88 | else: 89 | 90 | OD_method = OD_class(**hyperparameter_setting) 91 | 92 | pipeline = make_pipeline(RobustScaler(), OD_method) 93 | 94 | pipeline.fit(X) 95 | 96 | outlier_scores = pipeline[1].decision_scores_ 97 | 98 | method_performance = {method_name:{score_name: score_function(y,outlier_scores) for (score_name, score_function) in score_functions.items()}} 99 | method_performance_df = pd.DataFrame(method_performance).transpose() 100 | 101 | os.makedirs(full_target_dir, exist_ok=True) 102 | with open(target_file_name, 'wb') as handle: 103 | pickle.dump(method_performance_df, handle, protocol=pickle.HIGHEST_PROTOCOL) 104 | 105 | #also write csv files for easy manual inspection 106 | full_target_csvdir = os.path.join(target_csvdir, dataset_name.replace("."+input_type, ""), method_name) 107 | os.makedirs(full_target_csvdir, exist_ok=True) 108 | target_csvfile_name = os.path.join(full_target_csvdir, hyperparameter_string+".csv") 109 | method_performance_df.to_csv(target_csvfile_name) 110 | 111 | full_target_scoredir = os.path.join(score_csvdir, dataset_name.replace("."+input_type, ""), method_name) 112 | os.makedirs(full_target_scoredir, exist_ok=True) 113 | target_scorefile_name = os.path.join(full_target_scoredir, hyperparameter_string+".csv") 114 | np.savetxt(target_scorefile_name, outlier_scores) 115 | 116 | 117 | -------------------------------------------------------------------------------- /raw_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/.gitkeep -------------------------------------------------------------------------------- /raw_data/ADBench_data_raw/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ADBench_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Annthyroid/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Annthyroid/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Arrhythmia/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Arrhythmia/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Cardiotocography/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Cardiotocography/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/HeartDisease/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/HeartDisease/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Hepatitis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Hepatitis/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/InternetAds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/InternetAds/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/PageBlocks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/PageBlocks/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Parkinson/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Parkinson/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Pima/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Pima/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/SpamBase/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/SpamBase/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Stamps/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Stamps/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Wilt/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Wilt/.gitkeep -------------------------------------------------------------------------------- /raw_data/GAAL_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/GAAL_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/Goldstein_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/Goldstein_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/ODDS_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ODDS_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/ODDS_data_raw/categorical_variables_per_dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "lympho": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 3 | } -------------------------------------------------------------------------------- /raw_data/ODDS_data_raw/matfile_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ODDS_data_raw/matfile_data/.gitkeep -------------------------------------------------------------------------------- /raw_data/ODDS_data_raw/other_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ODDS_data_raw/other_data/.gitkeep -------------------------------------------------------------------------------- /raw_data/extended_AE_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/extended_AE_data_raw/.gitkeep 
-------------------------------------------------------------------------------- /raw_data/extended_AE_data_raw/CNC-kaggle/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/extended_AE_data_raw/CNC-kaggle/.gitkeep -------------------------------------------------------------------------------- /tables/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | -------------------------------------------------------------------------------- /testnewmethods.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Dec 14 17:23:39 2023 5 | 6 | @author: rbouman 7 | """ 8 | 9 | from sklearn.datasets import load_breast_cancer 10 | 11 | from pyod.models.lmdd import LMDD 12 | from additional_methods.lmdd import LMDD as LMDD2 13 | 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | 17 | import os 18 | 19 | from sklearn.metrics import roc_auc_score 20 | 21 | 22 | formatted_data_dir = "formatted_data" 23 | dataset_name = "wbc.npz" 24 | 25 | 26 | full_path_filename = os.path.join(formatted_data_dir, dataset_name) 27 | 28 | data = np.load(open(full_path_filename, 'rb')) 29 | 30 | X, y = data["X"], np.squeeze(data["y"]) 31 | 32 | #add duplicates to X and y: 33 | 34 | X = np.concatenate([X]*10) 35 | y = np.concatenate([y]*10) 36 | 37 | 38 | plt.figure() 39 | model = LMDD2(n_iter=5, dis_measure="aad") 40 | 41 | model.fit(X) 42 | 43 | dec_scores = model.decision_scores_ 44 | 45 | plt.hist(dec_scores) 46 | 47 | plt.show() 48 | 49 | print(roc_auc_score(y, dec_scores)) 50 | 51 | plt.figure() 52 | 53 | model2 = LMDD(n_iter=5, dis_measure="aad") 54 | 55 | model2.fit(X) 56 | 57 | dec_scores2 = model2.decision_scores_ 58 | 59 | plt.hist(dec_scores2) 60 | 61 | plt.show() 62 | 63 | print(roc_auc_score(y, dec_scores2)) --------------------------------------------------------------------------------
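A minimal end-to-end usage sketch tying the pieces above together. This is not a file in the repository: it assumes pyod is installed (see minimal_environment.yml), reuses formatted_data/wbc.npz, the RobustScaler-plus-detector pipeline pattern from preprocess_detect_outliers.py, and the metrics from evaluation_metrics.py; the kNN hyperparameters are purely illustrative.

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score
from pyod.models.knn import KNN
from evaluation_metrics import adjusted_average_precision

# formatted .npz files store the feature matrix under "X" and the 0/1 outlier labels under "y"
data = np.load(open("formatted_data/wbc.npz", "rb"))
X, y = data["X"], np.squeeze(data["y"])

# same preprocessing + detection pattern as preprocess_detect_outliers.py (illustrative hyperparameters)
pipeline = make_pipeline(RobustScaler(), KNN(n_neighbors=10, method="mean"))
pipeline.fit(X)
outlier_scores = pipeline[1].decision_scores_

print("ROC/AUC:", roc_auc_score(y, outlier_scores))
print("adjusted average precision:", adjusted_average_precision(y, outlier_scores))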